Add support for Unicode variation sequences

These are character pairs that should be treated as tokens with a buffer delta of 2 and a screen delta of 1.
2026-04-06 03:02:13 -04:00 · 2014-09-17 09:26:52 -07:00
parent b6faffe2db
commit 878da262d2
3 changed files with 73 additions and 41 deletions
--- a/src/text-utils.coffee
+++ b/src/text-utils.coffee
@@ -4,34 +4,59 @@ isHighSurrogate = (string, index) ->
 isLowSurrogate = (string, index) ->
  0xDC00 <= string.charCodeAt(index) <= 0xDFFF

+isVariationSelector = (string, index) ->
+  0xFE00 <= string.charCodeAt(index) <= 0xFE0F
+
 # Is the character at the given index the start of a high/low surrogate pair?
 #
-# string - The {String} to check for a surrogate pair.
-# index - The {Number} index to look for a surrogate pair at.
+# * `string` The {String} to check for a surrogate pair.
+# * `index`  The {Number} index to look for a surrogate pair at.
 #
 # Return a {Boolean}.
 isSurrogatePair = (string, index=0) ->
  isHighSurrogate(string, index) and isLowSurrogate(string, index + 1)

-# Get the number of characters in the string accounting for surrogate pairs.
+# Is the character at the given index the start of a variation sequence?
 #
-# This method counts high/low surrogate pairs as a single character and will
-# always returns a value less than or equal to `string.length`.
+# * `string` The {String} to check for a variation sequence.
+# * `index`  The {Number} index to look for a variation sequence at.
 #
-# string - The {String} to count the number of full characters in.
+# Return a {Boolean}.
+isVariationSequence = (string, index=0) ->
+  isVariationSelector(string, index + 1)
+
+# Is the character at the given index the start of a high/low surrogate pair
+# or a variation sequence?
+#
+# * `string` The {String} to check for a paired character.
+# * `index`  The {Number} index to look for a paired character at.
+#
+# Return a {Boolean}.
+
+isPairedCharacter = (string, index=0) ->
+  isSurrogatePair(string, index) or isVariationSequence(string, index)
+
+# Get the number of characters in the string accounting for surrogate pairs and
+# variation sequences.
+#
+# This method counts high/low surrogate pairs and variation sequences as a
+# single character and will always return a value less than or equal to
+# `string.length`.
+#
+# * `string` The {String} to count the number of full characters in.
 #
 # Returns a {Number}.
 getCharacterCount = (string) ->
  count = string.length
-  count-- for index in [0...string.length] when isSurrogatePair(string, index)
+  count-- for index in [0...string.length] when isPairedCharacter(string, index)
  count

-# Does the given string contain at least one surrogate pair?
+# Does the given string contain at least one surrogate pair or variation sequence?
 #
-# string - The {String} to check for the presence of surrogate pairs.
+# * `string` The {String} to check for the presence of paired characters.
 #
 # Returns a {Boolean}.
-hasSurrogatePair = (string) ->
+hasPairedCharacter = (string) ->
  string.length isnt getCharacterCount(string)

-module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePair}
+module.exports = {getCharacterCount, isPairedCharacter, hasPairedCharacter}
--- a/src/token.coffee
+++ b/src/token.coffee
@@ -12,7 +12,7 @@ MaxTokenLength = 20000
 module.exports =
 class Token
  value: null
-  hasSurrogatePair: false
+  hasPairedCharacter: false
  scopes: null
  isAtomic: null
  isHardTab: null
@@ -23,7 +23,7 @@ class Token
  constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
    @screenDelta = @value.length
    @bufferDelta ?= @screenDelta
-    @hasSurrogatePair = textUtils.hasSurrogatePair(@value)
+    @hasPairedCharacter = textUtils.hasPairedCharacter(@value)

  isEqual: (other) ->
    @value == other.value and _.isEqual(@scopes, other.scopes) and !!@isAtomic == !!other.isAtomic
@@ -57,11 +57,11 @@ class Token
    WhitespaceRegexesByTabLength[tabLength] ?= new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")

  breakOutAtomicTokens: (tabLength, breakOutLeadingSoftTabs, startColumn) ->
-    if @hasSurrogatePair
+    if @hasPairedCharacter
      outputTokens = []
      column = startColumn

-      for token in @breakOutSurrogatePairs()
+      for token in @breakOutPairedCharacters()
        if token.isAtomic
          outputTokens.push(token)
        else
@@ -98,27 +98,27 @@ class Token

      outputTokens

-  breakOutSurrogatePairs: ->
+  breakOutPairedCharacters: ->
    outputTokens = []
    index = 0
-    nonSurrogatePairStart = 0
+    nonPairStart = 0

    while index < @value.length
-      if textUtils.isSurrogatePair(@value, index)
-        if nonSurrogatePairStart isnt index
-          outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
-        outputTokens.push(@buildSurrogatePairToken(@value, index))
+      if textUtils.isPairedCharacter(@value, index)
+        if nonPairStart isnt index
+          outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))
+        outputTokens.push(@buildPairedCharacterToken(@value, index))
        index += 2
-        nonSurrogatePairStart = index
+        nonPairStart = index
      else
        index++

-    if nonSurrogatePairStart isnt index
-      outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
+    if nonPairStart isnt index
+      outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes}))

    outputTokens

-  buildSurrogatePairToken: (value, index) ->
+  buildPairedCharacterToken: (value, index) ->
    new Token(
      value: value[index..index + 1]
      scopes: @scopes