From a109a3317e089122ffa6559a0835dde4de07c3ec Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Mon, 3 Jun 2013 19:16:44 -0700 Subject: [PATCH] Add support for surrogate pairs Surrogate pairs, meaning characters outside the Basic Multilingual Plane, are now broken out as atomic tokens. Closes #567 --- package.json | 2 +- spec/app/edit-session-spec.coffee | 41 ++++++++++++++ spec/app/text-mate-grammar-spec.coffee | 13 +++++ spec/app/tokenized-buffer-spec.coffee | 28 ++++++++++ spec/stdlib/text-utils-spec.coffee | 18 +++++++ src/app/text-buffer.coffee | 4 +- src/app/token.coffee | 75 ++++++++++++++++++++------ src/stdlib/text-utils.coffee | 19 +++++++ 8 files changed, 181 insertions(+), 19 deletions(-) create mode 100644 spec/stdlib/text-utils-spec.coffee create mode 100644 src/stdlib/text-utils.coffee diff --git a/package.json b/package.json index 08755f0dc..b9a44dbf1 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ "dependencies": { "coffee-script": "1.6.2", "ctags": "0.3.0", - "oniguruma": "0.11.0", + "oniguruma": "0.14.0", "mkdirp": "0.3.5", "git-utils": "0.17.0", "underscore": "1.4.4", diff --git a/spec/app/edit-session-spec.coffee b/spec/app/edit-session-spec.coffee index cf315ebb8..9d040694e 100644 --- a/spec/app/edit-session-spec.coffee +++ b/spec/app/edit-session-spec.coffee @@ -2460,3 +2460,44 @@ describe "EditSession", -> expect(editSession.shouldPromptToSave()).toBeFalsy() editSession2.destroy() expect(editSession.shouldPromptToSave()).toBeTruthy() + + describe "when the edit session contains surrogate pair characters", -> + it "correctly backspaces over them", -> + editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') + editSession.moveCursorToBottom() + editSession.backspace() + expect(editSession.getText()).toBe '\uD835\uDF97\uD835\uDF97' + editSession.backspace() + expect(editSession.getText()).toBe '\uD835\uDF97' + editSession.backspace() + expect(editSession.getText()).toBe '' + + it "correctly deletes over them", -> + editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') + editSession.moveCursorToTop() + editSession.delete() + expect(editSession.getText()).toBe '\uD835\uDF97\uD835\uDF97' + editSession.delete() + expect(editSession.getText()).toBe '\uD835\uDF97' + editSession.delete() + expect(editSession.getText()).toBe '' + + it "correctly moves over them", -> + editSession.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97\n') + editSession.moveCursorToTop() + editSession.moveCursorRight() + expect(editSession.getCursorBufferPosition()).toEqual [0, 2] + editSession.moveCursorRight() + expect(editSession.getCursorBufferPosition()).toEqual [0, 4] + editSession.moveCursorRight() + expect(editSession.getCursorBufferPosition()).toEqual [0, 6] + editSession.moveCursorRight() + expect(editSession.getCursorBufferPosition()).toEqual [1, 0] + editSession.moveCursorLeft() + expect(editSession.getCursorBufferPosition()).toEqual [0, 6] + editSession.moveCursorLeft() + expect(editSession.getCursorBufferPosition()).toEqual [0, 4] + editSession.moveCursorLeft() + expect(editSession.getCursorBufferPosition()).toEqual [0, 2] + editSession.moveCursorLeft() + expect(editSession.getCursorBufferPosition()).toEqual [0, 0] diff --git a/spec/app/text-mate-grammar-spec.coffee b/spec/app/text-mate-grammar-spec.coffee index 9537e2199..78a83fe32 100644 --- a/spec/app/text-mate-grammar-spec.coffee +++ b/spec/app/text-mate-grammar-spec.coffee @@ -614,3 +614,16 @@ describe "TextMateGrammar", -> expect(tokens[0].value).toEqual '//' expect(tokens[1].scopes).toEqual ["source.java", "comment.line.double-slash.java"] expect(tokens[1].value).toEqual 'comment' + + describe "Surrogate pair characters", -> + beforeEach -> + atom.activatePackage('javascript-tmbundle', sync: true) + grammar = syntax.selectGrammar('main.js') + lines = grammar.tokenizeLines "'\uD835\uDF97'" + + it "correctly parses JavaScript strings containing surrogate pair characters", -> + tokens = lines[0] + expect(tokens.length).toBe 3 + expect(tokens[0].value).toBe "'" + expect(tokens[1].value).toBe "\uD835\uDF97" + expect(tokens[2].value).toBe "'" diff --git a/spec/app/tokenized-buffer-spec.coffee b/spec/app/tokenized-buffer-spec.coffee index 7c35de244..dd6129f43 100644 --- a/spec/app/tokenized-buffer-spec.coffee +++ b/spec/app/tokenized-buffer-spec.coffee @@ -325,6 +325,34 @@ describe "TokenizedBuffer", -> expect(tokenizedBuffer.lineForScreenRow(2).text).toBe "#{tabAsSpaces} buy()#{tabAsSpaces}while supply > demand" + describe "when the buffer contains surrogate pairs", -> + beforeEach -> + atom.activatePackage('javascript-tmbundle', sync: true) + buffer = new Buffer('sample-with-pairs.js', "'abc\uD835\uDF97def'") + tokenizedBuffer = new TokenizedBuffer(buffer) + tokenizedBuffer.setVisible(true) + + afterEach -> + tokenizedBuffer.destroy() + buffer.release() + + describe "when the buffer is fully tokenized", -> + beforeEach -> + fullyTokenize(tokenizedBuffer) + + it "renders each surrogate pair as its own atomic token with a value of size 1", -> + screenLine0 = tokenizedBuffer.lineForScreenRow(0) + expect(screenLine0.text).toBe "'abc\uD835\uDF97def'" + { tokens } = screenLine0 + + expect(tokens.length).toBe 5 + expect(tokens[0].value).toBe "'" + expect(tokens[1].value).toBe "abc" + expect(tokens[2].value).toBe "\uD835\uDF97" + expect(tokens[2].isAtomic).toBeTruthy() + expect(tokens[3].value).toBe "def" + expect(tokens[4].value).toBe "'" + describe "when the grammar is updated because a grammar it includes is activated", -> it "retokenizes the buffer", -> atom.activatePackage('ruby-tmbundle', sync: true) diff --git a/spec/stdlib/text-utils-spec.coffee b/spec/stdlib/text-utils-spec.coffee new file mode 100644 index 000000000..5732fa09e --- /dev/null +++ b/spec/stdlib/text-utils-spec.coffee @@ -0,0 +1,18 @@ +textUtils = require 'text-utils' + +describe 'text utilities', -> + describe '.getCharacterCount(string)', -> + it 'returns the number of full characters in the string', -> + expect(textUtils.getCharacterCount('abc')).toBe 3 + expect(textUtils.getCharacterCount('a\uD835\uDF97b\uD835\uDF97c')).toBe 5 + expect(textUtils.getCharacterCount('\uD835\uDF97')).toBe 1 + + describe '.isSurrogatePair(string, index)', -> + it 'returns true when the index is the start of a high/low surrogate pair', -> + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 0)).toBe false + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 1)).toBe true + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 2)).toBe false + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 3)).toBe false + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 4)).toBe true + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 5)).toBe false + expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 6)).toBe false diff --git a/src/app/text-buffer.coffee b/src/app/text-buffer.coffee index e038a1591..72d128b76 100644 --- a/src/app/text-buffer.coffee +++ b/src/app/text-buffer.coffee @@ -356,9 +356,9 @@ class Buffer # or if its column goes beyond a line's length, this "sanitizes" the value # to a real range. # - # range - The {Point} to clip + # range - The {Range} to clip # - # Returns the new, clipped {Point}. Note that this could be the same as `range` if no clipping was performed. + # Returns the new, clipped {Range}. Note that this could be the same as `range` if no clipping was performed. clipRange: (range) -> range = Range.fromObject(range) new Range(@clipPosition(range.start), @clipPosition(range.end)) diff --git a/src/app/token.coffee b/src/app/token.coffee index ca49e893b..b4cee6fd6 100644 --- a/src/app/token.coffee +++ b/src/app/token.coffee @@ -1,8 +1,10 @@ _ = require 'underscore' +textUtils = require 'text-utils' module.exports = class Token value: null + hasSurrogatePairs: false scopes: null isAtomic: null isHardTab: null @@ -12,6 +14,7 @@ class Token constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) -> @screenDelta = @value.length @bufferDelta ?= @screenDelta + @hasSurrogatePairs = textUtils.hasSurrogatePairs(@value) ### Public ### @@ -27,27 +30,67 @@ class Token [new Token(value: value1, scopes: @scopes), new Token(value: value2, scopes: @scopes)] breakOutAtomicTokens: (tabLength, breakOutLeadingWhitespace) -> - if breakOutLeadingWhitespace - return [this] unless /^[ ]|\t/.test(@value) + if @hasSurrogatePairs + outputTokens = [] + + for token in @breakOutSurrogatePairs() + if token.isAtomic + outputTokens.push(token) + else + outputTokens.push(token.breakOutAtomicTokens(tabLength, breakOutLeadingWhitespace)...) + breakOutLeadingWhitespace = token.isOnlyWhitespace() if breakOutLeadingWhitespace + + outputTokens else - return [this] unless /\t/.test(@value) - - outputTokens = [] - regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g") - - while match = regex.exec(@value) - [fullMatch, softTab, hardTab] = match - if softTab and breakOutLeadingWhitespace - outputTokens.push(@buildSoftTabToken(tabLength, false)) - else if hardTab - breakOutLeadingWhitespace = false - outputTokens.push(@buildHardTabToken(tabLength, true)) + if breakOutLeadingWhitespace + return [this] unless /^[ ]|\t/.test(@value) else - breakOutLeadingWhitespace = false - outputTokens.push(new Token(value: match[0], scopes: @scopes)) + return [this] unless /\t/.test(@value) + + outputTokens = [] + regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g") + + while match = regex.exec(@value) + [fullMatch, softTab, hardTab] = match + if softTab and breakOutLeadingWhitespace + outputTokens.push(@buildSoftTabToken(tabLength, false)) + else if hardTab + breakOutLeadingWhitespace = false + outputTokens.push(@buildHardTabToken(tabLength, true)) + else + breakOutLeadingWhitespace = false + value = match[0] + outputTokens.push(new Token({value, @scopes})) + + outputTokens + + breakOutSurrogatePairs: -> + outputTokens = [] + index = 0 + nonSurrogatePairStart = 0 + + while index < @value.length + if textUtils.isSurrogatePair(@value, index) + if nonSurrogatePairStart isnt index + outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes})) + outputTokens.push(@buildSurrogatePairToken(@value, index)) + index += 2 + nonSurrogatePairStart = index + else + index++ + + if nonSurrogatePairStart isnt index + outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes})) outputTokens + buildSurrogatePairToken: (value, index) -> + new Token( + value: value[index..index + 1] + scopes: @scopes + isAtomic: true + ) + buildHardTabToken: (tabLength) -> @buildTabToken(tabLength, true) diff --git a/src/stdlib/text-utils.coffee b/src/stdlib/text-utils.coffee new file mode 100644 index 000000000..b43af787a --- /dev/null +++ b/src/stdlib/text-utils.coffee @@ -0,0 +1,19 @@ +isHighSurrogate = (string, index) -> + 0xD800 <= string.charCodeAt(index) <= 0xDBFF + +isLowSurrogate = (string, index) -> + 0xDC00 <= string.charCodeAt(index) <= 0xDFFF + +isSurrogatePair = (string, index) -> + isHighSurrogate(string, index) and isLowSurrogate(string, index + 1) + +getCharacterCount = (string) -> + count = string.length + for index in [0...string.length] when isSurrogatePair(string, index) + count-- + count + +hasSurrogatePairs = (string) -> + string.length isnt getCharacterCount(string) + +module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePairs}