Add support for Unicode variation sequences

These are character pairs that should be treated as tokens with a
buffer delta of 2 and a screen delta of 1.
This commit is contained in:
Kevin Sawicki
2014-09-17 09:26:52 -07:00
parent b6faffe2db
commit 878da262d2
3 changed files with 73 additions and 41 deletions

View File

@@ -4,34 +4,59 @@ isHighSurrogate = (string, index) ->
isLowSurrogate = (string, index) ->
  # Is the UTF-16 code unit at `index` a low (trailing) surrogate?
  #
  # * `string` The {String} to inspect.
  # * `index` The {Number} index of the code unit to test.
  #
  # Returns a {Boolean}. An out-of-range index yields NaN from `charCodeAt`,
  # which fails both comparisons, so the result is `false`.
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xDC00 and codeUnit <= 0xDFFF
isVariationSelector = (string, index) ->
  # Is the UTF-16 code unit at `index` a variation selector, i.e. within
  # U+FE00..U+FE0F inclusive?
  #
  # * `string` The {String} to inspect.
  # * `index` The {Number} index of the code unit to test.
  #
  # Returns a {Boolean}. An out-of-range index yields NaN from `charCodeAt`,
  # which fails both comparisons, so the result is `false`.
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xFE00 and codeUnit <= 0xFE0F
# Is the character at the given index the start of a high/low surrogate pair?
#
# string - The {String} to check for a surrogate pair.
# index - The {Number} index to look for a surrogate pair at.
# * `string` The {String} to check for a surrogate pair.
# * `index` The {Number} index to look for a surrogate pair at.
#
# Return a {Boolean}.
isSurrogatePair = (string, index=0) ->
  # A pair starts at `index` only when that code unit is a high surrogate
  # and the code unit right after it is a low surrogate.
  return false unless isHighSurrogate(string, index)
  isLowSurrogate(string, index + 1)
# Get the number of characters in the string accounting for surrogate pairs.
# Is the character at the given index the start of a variation sequence?
#
# This method counts high/low surrogate pairs as a single character and will
# always returns a value less than or equal to `string.length`.
# * `string` The {String} to check for a variation sequence.
# * `index` The {Number} index to look for a variation sequence at.
#
# string - The {String} to count the number of full characters in.
# Returns a {Boolean}.
isVariationSequence = (string, index=0) ->
  # A variation sequence is a base character immediately followed by a
  # variation selector.
  #
  # * `string` The {String} to check for a variation sequence.
  # * `index` The {Number} index of the candidate base character.
  #
  # Returns a {Boolean}.

  # A selector cannot itself be the base of a sequence. Without this guard,
  # two consecutive selectors would each be reported as starting a pair, so
  # code that counts one pair per matching index would over-count them.
  return false if isVariationSelector(string, index)
  isVariationSelector(string, index + 1)
# Is the character at the given index the start of a high/low surrogate pair
# or a variation sequence?
#
# * `string` The {String} to check for a surrogate pair or variation sequence.
# * `index` The {Number} index to look for a surrogate pair or variation sequence at.
#
# Returns a {Boolean}.
isPairedCharacter = (string, index=0) ->
  # Two code units starting at `index` form one paired character when they
  # are either a surrogate pair or a variation sequence. The surrogate check
  # runs first; the variation check only runs if it fails.
  return true if isSurrogatePair(string, index)
  isVariationSequence(string, index)
# Get the number of characters in the string accounting for surrogate pairs and
# variation sequences.
#
# This method counts high/low surrogate pairs and variation sequences as a
# single character and will always return a value less than or equal to
# `string.length`.
#
# * `string` The {String} to count the number of full characters in.
#
# Returns a {Number}.
getCharacterCount = (string) ->
count = string.length
count-- for index in [0...string.length] when isSurrogatePair(string, index)
count-- for index in [0...string.length] when isPairedCharacter(string, index)
count
# Does the given string contain at least one surrogate pair?
# Does the given string contain at least one surrogate pair or variation sequence?
#
# string - The {String} to check for the presence of surrogate pairs.
# * `string` The {String} to check for the presence of paired characters.
#
# Returns a {Boolean}.
hasSurrogatePair = (string) ->
hasPairedCharacter = (string) ->
string.length isnt getCharacterCount(string)
module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePair}
module.exports = {getCharacterCount, isPairedCharacter, hasPairedCharacter}

View File

@@ -12,7 +12,7 @@ MaxTokenLength = 20000
module.exports =
class Token
value: null
hasSurrogatePair: false
hasPairedCharacter: false
scopes: null
isAtomic: null
isHardTab: null
@@ -23,7 +23,7 @@ class Token
constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
@screenDelta = @value.length
@bufferDelta ?= @screenDelta
@hasSurrogatePair = textUtils.hasSurrogatePair(@value)
@hasPairedCharacter = textUtils.hasPairedCharacter(@value)
isEqual: (other) ->
  # Two tokens are equal when their values match, their scopes are deeply
  # equal according to `_.isEqual`, and they agree on atomicity.
  return false unless @value == other.value
  return false unless _.isEqual(@scopes, other.scopes)
  # Coerce to booleans so a null/undefined `isAtomic` compares equal to false.
  Boolean(@isAtomic) == Boolean(other.isAtomic)
@@ -57,11 +57,11 @@ class Token
WhitespaceRegexesByTabLength[tabLength] ?= new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
breakOutAtomicTokens: (tabLength, breakOutLeadingSoftTabs, startColumn) ->
if @hasSurrogatePair
if @hasPairedCharacter
outputTokens = []
column = startColumn
for token in @breakOutSurrogatePairs()
for token in @breakOutPairedCharacters()
if token.isAtomic
outputTokens.push(token)
else
@@ -98,27 +98,27 @@ class Token
outputTokens
breakOutSurrogatePairs: ->
breakOutPairedCharacters: ->
outputTokens = []
index = 0
nonSurrogatePairStart = 0
nonPairStart = 0
while index < @value.length
if textUtils.isSurrogatePair(@value, index)
if nonSurrogatePairStart isnt index
outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
outputTokens.push(@buildSurrogatePairToken(@value, index))
if textUtils.isPairedCharacter(@value, index)
if nonPairStart isnt index
outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))
outputTokens.push(@buildPairedCharacterToken(@value, index))
index += 2
nonSurrogatePairStart = index
nonPairStart = index
else
index++
if nonSurrogatePairStart isnt index
outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
if nonPairStart isnt index
outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))
outputTokens
buildSurrogatePairToken: (value, index) ->
buildPairedCharacterToken: (value, index) ->
new Token(
value: value[index..index + 1]
scopes: @scopes