Add support for surrogate pairs

Surrogate pairs, the UTF-16 code unit pairs that encode characters outside the Basic Multilingual Plane, are now broken out as atomic tokens. Closes #567
2026-02-18 10:31:54 -05:00 · 2013-06-03 19:16:44 -07:00
parent 9f4fc49790
commit a109a3317e
8 changed files with 181 additions and 19 deletions
--- a/src/app/text-buffer.coffee
+++ b/src/app/text-buffer.coffee
@@ -356,9 +356,9 @@ class Buffer
  # or if its column goes beyond a line's length, this "sanitizes" the value
  # to a real range.
  #
-  # range - The {Point} to clip
+  # range - The {Range} to clip
  #
-  # Returns the new, clipped {Point}. Note that this could be the same as `range` if no clipping was performed.
+  # Returns the new, clipped {Range}. Note that this could be the same as `range` if no clipping was performed.
  clipRange: (range) ->
    range = Range.fromObject(range)
    new Range(@clipPosition(range.start), @clipPosition(range.end))
--- a/src/app/token.coffee
+++ b/src/app/token.coffee
@@ -1,8 +1,10 @@
 _ = require 'underscore'
+textUtils = require 'text-utils'

 module.exports =
 class Token
  value: null
+  hasSurrogatePairs: false
  scopes: null
  isAtomic: null
  isHardTab: null
@@ -12,6 +14,7 @@ class Token
  constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
    @screenDelta = @value.length
    @bufferDelta ?= @screenDelta
+    @hasSurrogatePairs = textUtils.hasSurrogatePairs(@value)

  ### Public ###

@@ -27,27 +30,67 @@ class Token
    [new Token(value: value1, scopes: @scopes), new Token(value: value2, scopes: @scopes)]

  breakOutAtomicTokens: (tabLength, breakOutLeadingWhitespace) ->
-    if breakOutLeadingWhitespace
-      return [this] unless /^[ ]|\t/.test(@value)
+    if @hasSurrogatePairs
+      outputTokens = []
+
+      for token in @breakOutSurrogatePairs()
+        if token.isAtomic
+          outputTokens.push(token)
+        else
+          outputTokens.push(token.breakOutAtomicTokens(tabLength, breakOutLeadingWhitespace)...)
+        breakOutLeadingWhitespace = token.isOnlyWhitespace() if breakOutLeadingWhitespace
+
+      outputTokens
    else
-      return [this] unless /\t/.test(@value)
-
-    outputTokens = []
-    regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
-
-    while match = regex.exec(@value)
-      [fullMatch, softTab, hardTab] = match
-      if softTab and breakOutLeadingWhitespace
-        outputTokens.push(@buildSoftTabToken(tabLength, false))
-      else if hardTab
-        breakOutLeadingWhitespace = false
-        outputTokens.push(@buildHardTabToken(tabLength, true))
+      if breakOutLeadingWhitespace
+        return [this] unless /^[ ]|\t/.test(@value)
      else
-        breakOutLeadingWhitespace = false
-        outputTokens.push(new Token(value: match[0], scopes: @scopes))
+        return [this] unless /\t/.test(@value)
+
+      outputTokens = []
+      regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
+
+      while match = regex.exec(@value)
+        [fullMatch, softTab, hardTab] = match
+        if softTab and breakOutLeadingWhitespace
+          outputTokens.push(@buildSoftTabToken(tabLength, false))
+        else if hardTab
+          breakOutLeadingWhitespace = false
+          outputTokens.push(@buildHardTabToken(tabLength, true))
+        else
+          breakOutLeadingWhitespace = false
+          value = match[0]
+          outputTokens.push(new Token({value, @scopes}))
+
+      outputTokens
+
+  breakOutSurrogatePairs: ->
+    outputTokens = []
+    index = 0
+    nonSurrogatePairStart = 0
+
+    while index < @value.length
+      if textUtils.isSurrogatePair(@value, index)
+        if nonSurrogatePairStart isnt index
+          outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))
+        outputTokens.push(@buildSurrogatePairToken(@value, index))
+        index += 2
+        nonSurrogatePairStart = index
+      else
+        index++
+
+    if nonSurrogatePairStart isnt index
+      outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes}))

    outputTokens

+  buildSurrogatePairToken: (value, index) ->
+    new Token(
+      value: value[index..index + 1]
+      scopes: @scopes
+      isAtomic: true
+    )
+
  buildHardTabToken: (tabLength) ->
    @buildTabToken(tabLength, true)

--- a/src/stdlib/text-utils.coffee
+++ b/src/stdlib/text-utils.coffee
@@ -0,0 +1,19 @@
+isHighSurrogate = (string, index) ->
+  0xD800 <= string.charCodeAt(index) <= 0xDBFF
+
+isLowSurrogate = (string, index) ->
+  0xDC00 <= string.charCodeAt(index) <= 0xDFFF
+
+isSurrogatePair = (string, index) ->
+  isHighSurrogate(string, index) and isLowSurrogate(string, index + 1)
+
+getCharacterCount = (string) ->
+  count = string.length
+  for index in [0...string.length] when isSurrogatePair(string, index)
+    count--
+  count
+
+hasSurrogatePairs = (string) ->
+  string.length isnt getCharacterCount(string)
+
+module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePairs}