From a109a3317e089122ffa6559a0835dde4de07c3ec Mon Sep 17 00:00:00 2001
From: Kevin Sawicki <kevin@github.com>
Date: Mon, 3 Jun 2013 19:16:44 -0700
Subject: [PATCH] Add support for surrogate pairs

Surrogate pairs, meaning characters outside the Basic
Multilingual Plane, are now broken out as atomic tokens.

Closes #567
---
 package.json                           |  2 +-
 spec/app/edit-session-spec.coffee      | 41 ++++++++++++++
 spec/app/text-mate-grammar-spec.coffee | 13 +++++
 spec/app/tokenized-buffer-spec.coffee  | 28 ++++++++++
 spec/stdlib/text-utils-spec.coffee     | 18 +++++++
 src/app/text-buffer.coffee             |  4 +-
 src/app/token.coffee                   | 75 ++++++++++++++++++++------
 src/stdlib/text-utils.coffee           | 19 +++++++
 8 files changed, 181 insertions(+), 19 deletions(-)
 create mode 100644 spec/stdlib/text-utils-spec.coffee
 create mode 100644 src/stdlib/text-utils.coffee

diff --git a/package.json b/package.json
index 08755f0dc..b9a44dbf1 100644
--- a/package.json
+++ b/package.json
@@ -9,7 +9,7 @@
   "dependencies": {
     "coffee-script": "1.6.2",
     "ctags": "0.3.0",
-    "oniguruma": "0.11.0",
+    "oniguruma": "0.14.0",
     "mkdirp": "0.3.5",
     "git-utils": "0.17.0",
     "underscore": "1.4.4",
diff --git a/spec/app/edit-session-spec.coffee b/spec/app/edit-session-spec.coffee
index cf315ebb8..9d040694e 100644
--- a/spec/app/edit-session-spec.coffee
+++ b/spec/app/edit-session-spec.coffee
@@ -2460,3 +2460,44 @@ describe "EditSession", ->
       expect(editSession.shouldPromptToSave()).toBeFalsy()
       editSession2.destroy()
       expect(editSession.shouldPromptToSave()).toBeTruthy()
+
+  describe "when the edit session contains surrogate pair characters", ->
+    it "correctly backspaces over them", ->
+      editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') # three surrogate pairs: six UTF-16 code units
+      editSession.moveCursorToBottom()
+      editSession.backspace() # each backspace is expected to remove a whole pair
+      expect(editSession.getText()).toBe '\uD835\uDF97\uD835\uDF97'
+      editSession.backspace()
+      expect(editSession.getText()).toBe '\uD835\uDF97'
+      editSession.backspace()
+      expect(editSession.getText()).toBe ''
+
+    it "correctly deletes over them", ->
+      editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') # three surrogate pairs: six UTF-16 code units
+      editSession.moveCursorToTop()
+      editSession.delete() # each forward delete is expected to remove a whole pair
+      expect(editSession.getText()).toBe '\uD835\uDF97\uD835\uDF97'
+      editSession.delete()
+      expect(editSession.getText()).toBe '\uD835\uDF97'
+      editSession.delete()
+      expect(editSession.getText()).toBe ''
+
+    it "correctly moves over them", ->
+      editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97\n') # the trailing newline adds a second row to move onto
+      editSession.moveCursorToTop()
+      editSession.moveCursorRight()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 2] # expected columns advance by 2: one move per pair
+      editSession.moveCursorRight()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 4]
+      editSession.moveCursorRight()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 6]
+      editSession.moveCursorRight()
+      expect(editSession.getCursorBufferPosition()).toEqual [1, 0] # from the end of row 0 onto row 1
+      editSession.moveCursorLeft()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 6] # moving left retraces the same positions in reverse
+      editSession.moveCursorLeft()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 4]
+      editSession.moveCursorLeft()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 2]
+      editSession.moveCursorLeft()
+      expect(editSession.getCursorBufferPosition()).toEqual [0, 0]
diff --git a/spec/app/text-mate-grammar-spec.coffee b/spec/app/text-mate-grammar-spec.coffee
index 9537e2199..78a83fe32 100644
--- a/spec/app/text-mate-grammar-spec.coffee
+++ b/spec/app/text-mate-grammar-spec.coffee
@@ -614,3 +614,16 @@ describe "TextMateGrammar", ->
         expect(tokens[0].value).toEqual '//'
         expect(tokens[1].scopes).toEqual ["source.java", "comment.line.double-slash.java"]
         expect(tokens[1].value).toEqual 'comment'
+
+    describe "Surrogate pair characters", ->
+      beforeEach ->
+        atom.activatePackage('javascript-tmbundle', sync: true)
+        grammar = syntax.selectGrammar('main.js')
+        lines = grammar.tokenizeLines "'\uD835\uDF97'" # a quoted string whose only content is one surrogate pair
+
+      it "correctly parses JavaScript strings containing surrogate pair characters", ->
+        tokens = lines[0]
+        expect(tokens.length).toBe 3 # opening quote, the pair, closing quote
+        expect(tokens[0].value).toBe "'"
+        expect(tokens[1].value).toBe "\uD835\uDF97" # both code units of the pair stay in one token
+        expect(tokens[2].value).toBe "'"
diff --git a/spec/app/tokenized-buffer-spec.coffee b/spec/app/tokenized-buffer-spec.coffee
index 7c35de244..dd6129f43 100644
--- a/spec/app/tokenized-buffer-spec.coffee
+++ b/spec/app/tokenized-buffer-spec.coffee
@@ -325,6 +325,34 @@ describe "TokenizedBuffer", ->
 
         expect(tokenizedBuffer.lineForScreenRow(2).text).toBe "#{tabAsSpaces} buy()#{tabAsSpaces}while supply > demand"
 
+  describe "when the buffer contains surrogate pairs", ->
+    beforeEach ->
+      atom.activatePackage('javascript-tmbundle', sync: true)
+      buffer = new Buffer('sample-with-pairs.js', "'abc\uD835\uDF97def'") # quoted string with one surrogate pair between "abc" and "def"
+      tokenizedBuffer = new TokenizedBuffer(buffer)
+      tokenizedBuffer.setVisible(true)
+
+    afterEach ->
+      tokenizedBuffer.destroy()
+      buffer.release()
+
+    describe "when the buffer is fully tokenized", ->
+      beforeEach ->
+        fullyTokenize(tokenizedBuffer)
+
+      it "renders each surrogate pair as its own atomic token with a value of size 1", ->
+        screenLine0 = tokenizedBuffer.lineForScreenRow(0)
+        expect(screenLine0.text).toBe "'abc\uD835\uDF97def'"
+        { tokens } = screenLine0
+
+        expect(tokens.length).toBe 5 # quote, "abc", the pair, "def", quote
+        expect(tokens[0].value).toBe "'"
+        expect(tokens[1].value).toBe "abc"
+        expect(tokens[2].value).toBe "\uD835\uDF97" # both code units of the pair stay in one token
+        expect(tokens[2].isAtomic).toBeTruthy()
+        expect(tokens[3].value).toBe "def"
+        expect(tokens[4].value).toBe "'"
+
   describe "when the grammar is updated because a grammar it includes is activated", ->
     it "retokenizes the buffer", ->
       atom.activatePackage('ruby-tmbundle', sync: true)
diff --git a/spec/stdlib/text-utils-spec.coffee b/spec/stdlib/text-utils-spec.coffee
new file mode 100644
index 000000000..5732fa09e
--- /dev/null
+++ b/spec/stdlib/text-utils-spec.coffee
@@ -0,0 +1,18 @@
+textUtils = require 'text-utils'
+
+describe 'text utilities', ->
+  # "\uD835\uDF97" is a single character stored as two UTF-16 code units.
+  pairText = 'a\uD835\uDF97b\uD835\uDF97c'
+
+  describe '.getCharacterCount(string)', ->
+    it 'returns the number of full characters in the string', ->
+      for [text, count] in [['abc', 3], [pairText, 5], ['\uD835\uDF97', 1]]
+        expect(textUtils.getCharacterCount(text)).toBe count
+
+  describe '.isSurrogatePair(string, index)', ->
+    it 'returns true when the index is the start of a high/low surrogate pair', ->
+      # Pairs start at code-unit indices 1 and 4; no other index starts one.
+      pairStarts = [1, 4]
+      for index in [0..6]
+        startsPair = index in pairStarts
+        expect(textUtils.isSurrogatePair(pairText, index)).toBe startsPair
diff --git a/src/app/text-buffer.coffee b/src/app/text-buffer.coffee
index e038a1591..72d128b76 100644
--- a/src/app/text-buffer.coffee
+++ b/src/app/text-buffer.coffee
@@ -356,9 +356,9 @@ class Buffer
   # or if its column goes beyond a line's length, this "sanitizes" the value
   # to a real range.
   #
-  # range - The {Point} to clip
+  # range - The {Range} to clip
   #
-  # Returns the new, clipped {Point}. Note that this could be the same as `range` if no clipping was performed.
+  # Returns the new, clipped {Range}. Note that this could be the same as `range` if no clipping was performed.
   clipRange: (range) ->
     range = Range.fromObject(range)
     new Range(@clipPosition(range.start), @clipPosition(range.end))
diff --git a/src/app/token.coffee b/src/app/token.coffee
index ca49e893b..b4cee6fd6 100644
--- a/src/app/token.coffee
+++ b/src/app/token.coffee
@@ -1,8 +1,10 @@
 _ = require 'underscore'
+textUtils = require 'text-utils'
 
 module.exports =
 class Token
   value: null
+  hasSurrogatePairs: false
   scopes: null
   isAtomic: null
   isHardTab: null
@@ -12,6 +14,7 @@ class Token
   constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
     @screenDelta = @value.length
     @bufferDelta ?= @screenDelta
+    @hasSurrogatePairs = textUtils.hasSurrogatePairs(@value) # computed once per token; breakOutAtomicTokens branches on it
 
   ### Public ###
 
@@ -27,27 +30,67 @@ class Token
     [new Token(value: value1, scopes: @scopes), new Token(value: value2, scopes: @scopes)]
 
   breakOutAtomicTokens: (tabLength, breakOutLeadingWhitespace) ->
-    if breakOutLeadingWhitespace
-      return [this] unless /^[ ]|\t/.test(@value)
+    if @hasSurrogatePairs # split the pairs out first, then run each plain segment through this method again
+      outputTokens = []
+
+      for token in @breakOutSurrogatePairs()
+        if token.isAtomic
+          outputTokens.push(token) # atomic tokens are kept whole
+        else
+          outputTokens.push(token.breakOutAtomicTokens(tabLength, breakOutLeadingWhitespace)...)
+        breakOutLeadingWhitespace = token.isOnlyWhitespace() if breakOutLeadingWhitespace # stays on only while each token so far reports isOnlyWhitespace()
+
+      outputTokens
     else
-      return [this] unless /\t/.test(@value)
-
-    outputTokens = []
-    regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
-
-    while match = regex.exec(@value)
-      [fullMatch, softTab, hardTab] = match
-      if softTab and breakOutLeadingWhitespace
-        outputTokens.push(@buildSoftTabToken(tabLength, false))
-      else if hardTab
-        breakOutLeadingWhitespace = false
-        outputTokens.push(@buildHardTabToken(tabLength, true))
+      if breakOutLeadingWhitespace
+        return [this] unless /^[ ]|\t/.test(@value) # no leading space and no tab anywhere: nothing to break out
       else
-        breakOutLeadingWhitespace = false
-        outputTokens.push(new Token(value: match[0], scopes: @scopes))
+        return [this] unless /\t/.test(@value) # no tab anywhere: nothing to break out
+
+      outputTokens = []
+      regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g") # group 1: tabLength spaces, group 2: one tab, group 3: a run without tabs
+
+      while match = regex.exec(@value)
+        [fullMatch, softTab, hardTab] = match
+        if softTab and breakOutLeadingWhitespace # soft tabs are only broken out while still in leading whitespace
+          outputTokens.push(@buildSoftTabToken(tabLength, false))
+        else if hardTab
+          breakOutLeadingWhitespace = false
+          outputTokens.push(@buildHardTabToken(tabLength, true))
+        else
+          breakOutLeadingWhitespace = false
+          value = match[0]
+          outputTokens.push(new Token({value, @scopes}))
+
+      outputTokens
+
+  # Splits @value into ordinary text tokens and atomic surrogate-pair tokens,
+  # in their original order; every token produced reuses this token's scopes.
+  breakOutSurrogatePairs: ->
+    outputTokens = []
+    runStart = cursor = 0
+    flushRun = (end) =>
+      outputTokens.push(new Token({value: @value[runStart...end], @scopes})) if runStart isnt end
+
+    while cursor < @value.length
+      if textUtils.isSurrogatePair(@value, cursor)
+        flushRun(cursor)
+        outputTokens.push(@buildSurrogatePairToken(@value, cursor))
+        runStart = cursor += 2
+      else
+        cursor++
+
+    flushRun(cursor)
+
     outputTokens
 
+  # Builds the atomic token for the surrogate pair that starts at `index` in
+  # `value` (the code units at index and index + 1), reusing this token's scopes.
+  buildSurrogatePairToken: (value, index) ->
+    pairText = value[index..index + 1]
+    attributes = {value: pairText, scopes: @scopes, isAtomic: true}
+    new Token(attributes)
+
   buildHardTabToken: (tabLength) ->
     @buildTabToken(tabLength, true)
 
diff --git a/src/stdlib/text-utils.coffee b/src/stdlib/text-utils.coffee
new file mode 100644
index 000000000..b43af787a
--- /dev/null
+++ b/src/stdlib/text-utils.coffee
@@ -0,0 +1,19 @@
+# Helpers for UTF-16 surrogate pairs (one character stored as two code units).
+isHighSurrogate = (string, index) -> 0xD800 <= string.charCodeAt(index) <= 0xDBFF
+isLowSurrogate = (string, index) -> 0xDC00 <= string.charCodeAt(index) <= 0xDFFF
+
+# True when a high surrogate at `index` is directly followed by a low surrogate.
+isSurrogatePair = (string, index) ->
+  isHighSurrogate(string, index) and isLowSurrogate(string, index + 1)
+
+# Number of characters in `string`, counting each surrogate pair once.
+getCharacterCount = (string) ->
+  pairCount = 0
+  for offset in [0...string.length]
+    pairCount++ if isSurrogatePair(string, offset)
+  string.length - pairCount
+
+# True when `string` holds at least one surrogate pair.
+hasSurrogatePairs = (string) -> getCharacterCount(string) isnt string.length
+
+module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePairs}