mirror of
https://github.com/atom/atom.git
synced 2026-04-06 03:02:13 -04:00
Add support for Unicode variation sequences
These are character pairs that should be treated as tokens with a buffer delta of 2 and a screen delta of 1.
This commit is contained in:
@@ -4,34 +4,59 @@ isHighSurrogate = (string, index) ->
|
||||
isLowSurrogate = (string, index) ->
|
||||
0xDC00 <= string.charCodeAt(index) <= 0xDFFF
|
||||
|
||||
isVariationSelector = (string, index) ->
|
||||
0xFE00 <= string.charCodeAt(index) <= 0xFE0F
|
||||
|
||||
# Is the character at the given index the start of a high/low surrogate pair?
|
||||
#
|
||||
# string - The {String} to check for a surrogate pair.
|
||||
# index - The {Number} index to look for a surrogate pair at.
|
||||
# * `string` The {String} to check for a surrogate pair.
|
||||
# * `index` The {Number} index to look for a surrogate pair at.
|
||||
#
|
||||
# Return a {Boolean}.
|
||||
isSurrogatePair = (string, index=0) ->
|
||||
isHighSurrogate(string, index) and isLowSurrogate(string, index + 1)
|
||||
|
||||
# Get the number of characters in the string accounting for surrogate pairs.
|
||||
# Is the character at the given index the start of a variation sequence?
|
||||
#
|
||||
# This method counts high/low surrogate pairs as a single character and will
|
||||
# always returns a value less than or equal to `string.length`.
|
||||
# * `string` The {String} to check for a variation sequence.
|
||||
# * `index` The {Number} index to look for a variation sequence at.
|
||||
#
|
||||
# string - The {String} to count the number of full characters in.
|
||||
# Return a {Boolean}.
|
||||
isVariationSequence = (string, index=0) ->
|
||||
isVariationSelector(string, index + 1)
|
||||
|
||||
# Is the character at the given index the start of a high/low surrogate pair
|
||||
# or a variation sequence?
|
||||
#
|
||||
# * `string` The {String} to check for a paired character.
|
||||
# * `index` The {Number} index to look for a paired character at.
|
||||
#
|
||||
# Return a {Boolean}.
|
||||
|
||||
isPairedCharacter = (string, index=0) ->
|
||||
isSurrogatePair(string, index) or isVariationSequence(string, index)
|
||||
|
||||
# Get the number of characters in the string accounting for surrogate pairs and
|
||||
# variation sequences.
|
||||
#
|
||||
# This method counts high/low surrogate pairs and variation sequences as a
|
||||
# single character and will always return a value less than or equal to
|
||||
# `string.length`.
|
||||
#
|
||||
# * `string` The {String} to count the number of full characters in.
|
||||
#
|
||||
# Returns a {Number}.
|
||||
getCharacterCount = (string) ->
|
||||
count = string.length
|
||||
count-- for index in [0...string.length] when isSurrogatePair(string, index)
|
||||
count-- for index in [0...string.length] when isPairedCharacter(string, index)
|
||||
count
|
||||
|
||||
# Does the given string contain at least one surrogate pair?
|
||||
# Does the given string contain at least one surrogate pair or variation sequence?
|
||||
#
|
||||
# string - The {String} to check for the presence of surrogate pairs.
|
||||
# * `string` The {String} to check for the presence of paired characters.
|
||||
#
|
||||
# Returns a {Boolean}.
|
||||
hasSurrogatePair = (string) ->
|
||||
hasPairedCharacter = (string) ->
|
||||
string.length isnt getCharacterCount(string)
|
||||
|
||||
module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePair}
|
||||
module.exports = {getCharacterCount, isPairedCharacter, hasPairedCharacter}
|
||||
|
||||
@@ -12,7 +12,7 @@ MaxTokenLength = 20000
|
||||
module.exports =
|
||||
class Token
|
||||
value: null
|
||||
hasSurrogatePair: false
|
||||
hasPairedCharacter: false
|
||||
scopes: null
|
||||
isAtomic: null
|
||||
isHardTab: null
|
||||
@@ -23,7 +23,7 @@ class Token
|
||||
constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
|
||||
@screenDelta = @value.length
|
||||
@bufferDelta ?= @screenDelta
|
||||
@hasSurrogatePair = textUtils.hasSurrogatePair(@value)
|
||||
@hasPairedCharacter = textUtils.hasPairedCharacter(@value)
|
||||
|
||||
isEqual: (other) ->
|
||||
@value == other.value and _.isEqual(@scopes, other.scopes) and !!@isAtomic == !!other.isAtomic
|
||||
@@ -57,11 +57,11 @@ class Token
|
||||
WhitespaceRegexesByTabLength[tabLength] ?= new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
|
||||
|
||||
breakOutAtomicTokens: (tabLength, breakOutLeadingSoftTabs, startColumn) ->
|
||||
if @hasSurrogatePair
|
||||
if @hasPairedCharacter
|
||||
outputTokens = []
|
||||
column = startColumn
|
||||
|
||||
for token in @breakOutSurrogatePairs()
|
||||
for token in @breakOutPairedCharacters()
|
||||
if token.isAtomic
|
||||
outputTokens.push(token)
|
||||
else
|
||||
@@ -98,27 +98,27 @@ class Token
|
||||
|
||||
outputTokens
|
||||
|
||||
breakOutSurrogatePairs: ->
|
||||
breakOutPairedCharacters: ->
|
||||
outputTokens = []
|
||||
index = 0
|
||||
nonSurrogatePairStart = 0
|
||||
nonPairStart = 0
|
||||
|
||||
while index < @value.length
|
||||
if textUtils.isSurrogatePair(@value, index)
|
||||
if nonSurrogatePairStart isnt index
|
||||
outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
|
||||
outputTokens.push(@buildSurrogatePairToken(@value, index))
|
||||
if textUtils.isPairedCharacter(@value, index)
|
||||
if nonPairStart isnt index
|
||||
outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))
|
||||
outputTokens.push(@buildPairedCharacterToken(@value, index))
|
||||
index += 2
|
||||
nonSurrogatePairStart = index
|
||||
nonPairStart = index
|
||||
else
|
||||
index++
|
||||
|
||||
if nonSurrogatePairStart isnt index
|
||||
outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
|
||||
if nonPairStart isnt index
|
||||
outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))
|
||||
|
||||
outputTokens
|
||||
|
||||
buildSurrogatePairToken: (value, index) ->
|
||||
buildPairedCharacterToken: (value, index) ->
|
||||
new Token(
|
||||
value: value[index..index + 1]
|
||||
scopes: @scopes
|
||||
|
||||
Reference in New Issue
Block a user