From 878da262d2b263222ce4cda628c58ddc974ec7cd Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:26:52 -0700 Subject: [PATCH 1/8] Add support for Unicode variation sequences These are character pairs that should be treated as tokens with a buffer delta of 2 and a screen delta of 1. --- spec/text-utils-spec.coffee | 39 +++++++++++++++++------------- src/text-utils.coffee | 47 ++++++++++++++++++++++++++++--------- src/token.coffee | 28 +++++++++++----------- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/spec/text-utils-spec.coffee b/spec/text-utils-spec.coffee index 36ac0b356..31f49cf72 100644 --- a/spec/text-utils-spec.coffee +++ b/spec/text-utils-spec.coffee @@ -6,25 +6,32 @@ describe 'text utilities', -> expect(textUtils.getCharacterCount('abc')).toBe 3 expect(textUtils.getCharacterCount('a\uD835\uDF97b\uD835\uDF97c')).toBe 5 expect(textUtils.getCharacterCount('\uD835\uDF97')).toBe 1 + expect(textUtils.getCharacterCount('\u2714\uFE0E')).toBe 1 expect(textUtils.getCharacterCount('\uD835')).toBe 1 expect(textUtils.getCharacterCount('\uDF97')).toBe 1 - describe '.hasSurrogatePair(string)', -> + describe '.hasPairedCharacter(string)', -> it 'returns true when the string contains a surrogate pair', -> - expect(textUtils.hasSurrogatePair('abc')).toBe false - expect(textUtils.hasSurrogatePair('a\uD835\uDF97b\uD835\uDF97c')).toBe true - expect(textUtils.hasSurrogatePair('\uD835\uDF97')).toBe true - expect(textUtils.hasSurrogatePair('\uD835')).toBe false - expect(textUtils.hasSurrogatePair('\uDF97')).toBe false + expect(textUtils.hasPairedCharacter('abc')).toBe false + expect(textUtils.hasPairedCharacter('a\uD835\uDF97b\uD835\uDF97c')).toBe true + expect(textUtils.hasPairedCharacter('\uD835\uDF97')).toBe true + expect(textUtils.hasPairedCharacter('\u2714\uFE0E')).toBe true + expect(textUtils.hasPairedCharacter('\uD835')).toBe false + expect(textUtils.hasPairedCharacter('\uDF97')).toBe false + expect(textUtils.hasPairedCharacter('\uFE0E')).toBe false - describe '.isSurrogatePair(string, index)', -> + describe '.isPairedCharacter(string, index)', -> it 'returns true when the index is the start of a high/low surrogate pair', -> - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 0)).toBe false - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 1)).toBe true - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 2)).toBe false - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 3)).toBe false - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 4)).toBe true - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 5)).toBe false - expect(textUtils.isSurrogatePair('a\uD835\uDF97b\uD835\uDF97c', 6)).toBe false - expect(textUtils.isSurrogatePair('\uD835')).toBe false - expect(textUtils.isSurrogatePair('\uDF97')).toBe false + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 0)).toBe false + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 1)).toBe true + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 2)).toBe false + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 3)).toBe false + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 4)).toBe true + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 5)).toBe false + expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 6)).toBe false + expect(textUtils.isPairedCharacter('a\u2714\uFE0E', 0)).toBe false + expect(textUtils.isPairedCharacter('a\u2714\uFE0E', 1)).toBe true + expect(textUtils.isPairedCharacter('a\u2714\uFE0E', 2)).toBe false + expect(textUtils.isPairedCharacter('a\u2714\uFE0E', 3)).toBe false + expect(textUtils.isPairedCharacter('\uD835')).toBe false + expect(textUtils.isPairedCharacter('\uDF97')).toBe false diff --git a/src/text-utils.coffee b/src/text-utils.coffee index a043d7c73..eff284d6d 100644 --- a/src/text-utils.coffee +++ b/src/text-utils.coffee @@ -4,34 +4,59 @@ isHighSurrogate = (string, index) -> isLowSurrogate = (string, index) -> 0xDC00 <= string.charCodeAt(index) <= 0xDFFF +isVariationSelector = (string, index) -> + 0xFE00 <= string.charCodeAt(index) <= 0xFE0F + # Is the character at the given index the start of a high/low surrogate pair? # -# string - The {String} to check for a surrogate pair. -# index - The {Number} index to look for a surrogate pair at. +# * `string` The {String} to check for a surrogate pair. +# * `index` The {Number} index to look for a surrogate pair at. # # Return a {Boolean}. isSurrogatePair = (string, index=0) -> isHighSurrogate(string, index) and isLowSurrogate(string, index + 1) -# Get the number of characters in the string accounting for surrogate pairs. +# Is the character at the given index the start of a variation sequence? # -# This method counts high/low surrogate pairs as a single character and will -# always returns a value less than or equal to `string.length`. +# * `string` The {String} to check for a surrogate pair. +# * `index` The {Number} index to look for a surrogate pair at. # -# string - The {String} to count the number of full characters in. +# Return a {Boolean}. +isVariationSequence = (string, index=0) -> + isVariationSelector(string, index + 1) + +# Is the character at the given index the start of high/low surrogate pair +# or a variation sequence? +# +# * `string` The {String} to check for a surrogate pair. +# * `index` The {Number} index to look for a surrogate pair at. +# +# Return a {Boolean}. + +isPairedCharacter = (string, index=0) -> + isSurrogatePair(string, index) or isVariationSequence(string, index) + +# Get the number of characters in the string accounting for surrogate pairs and +# variation sequences. +# +# This method counts high/low surrogate pairs and variation sequences as a +# single character and will always returns a value less than or equal to +# `string.length`. +# +# * `string` The {String} to count the number of full characters in. # # Returns a {Number}. getCharacterCount = (string) -> count = string.length - count-- for index in [0...string.length] when isSurrogatePair(string, index) + count-- for index in [0...string.length] when isPairedCharacter(string, index) count -# Does the given string contain at least one surrogate pair? +# Does the given string contain at least surrogate pair or variation sequence? # -# string - The {String} to check for the presence of surrogate pairs. +# * `string` The {String} to check for the presence of paired characters. # # Returns a {Boolean}. -hasSurrogatePair = (string) -> +hasPairedCharacter = (string) -> string.length isnt getCharacterCount(string) -module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePair} +module.exports = {getCharacterCount, isPairedCharacter, hasPairedCharacter} diff --git a/src/token.coffee b/src/token.coffee index 5366f33cc..a36117ea8 100644 --- a/src/token.coffee +++ b/src/token.coffee @@ -12,7 +12,7 @@ MaxTokenLength = 20000 module.exports = class Token value: null - hasSurrogatePair: false + hasPairedCharacter: false scopes: null isAtomic: null isHardTab: null @@ -23,7 +23,7 @@ class Token constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) -> @screenDelta = @value.length @bufferDelta ?= @screenDelta - @hasSurrogatePair = textUtils.hasSurrogatePair(@value) + @hasPairedCharacter = textUtils.hasPairedCharacter(@value) isEqual: (other) -> @value == other.value and _.isEqual(@scopes, other.scopes) and !!@isAtomic == !!other.isAtomic @@ -57,11 +57,11 @@ class Token WhitespaceRegexesByTabLength[tabLength] ?= new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g") breakOutAtomicTokens: (tabLength, breakOutLeadingSoftTabs, startColumn) -> - if @hasSurrogatePair + if @hasPairedCharacter outputTokens = [] column = startColumn - for token in @breakOutSurrogatePairs() + for token in @breakOutPairedCharacters() if token.isAtomic outputTokens.push(token) else @@ -98,27 +98,27 @@ class Token outputTokens - breakOutSurrogatePairs: -> + breakOutPairedCharacters: -> outputTokens = [] index = 0 - nonSurrogatePairStart = 0 + nonPairStart = 0 while index < @value.length - if textUtils.isSurrogatePair(@value, index) - if nonSurrogatePairStart isnt index - outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes})) - outputTokens.push(@buildSurrogatePairToken(@value, index)) + if textUtils.isPairedCharacter(@value, index) + if nonPairStart isnt index + outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes})) + outputTokens.push(@buildPairedCharacterToken(@value, index)) index += 2 - nonSurrogatePairStart = index + nonPairStart = index else index++ - if nonSurrogatePairStart isnt index - outputTokens.push(new Token({value: @value[nonSurrogatePairStart...index], @scopes})) + if nonPairStart isnt index + outputTokens.push(new Token({value: @value[nonPairStart...index], @scopes})) outputTokens - buildSurrogatePairToken: (value, index) -> + buildPairedCharacterToken: (value, index) -> new Token( value: value[index..index + 1] scopes: @scopes From fb7061f500acd21bd59879485787d75a1f4d9a0f Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:28:24 -0700 Subject: [PATCH 2/8] :memo: Mention variation sequence in specs --- spec/text-utils-spec.coffee | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/text-utils-spec.coffee b/spec/text-utils-spec.coffee index 31f49cf72..83aa9a274 100644 --- a/spec/text-utils-spec.coffee +++ b/spec/text-utils-spec.coffee @@ -11,7 +11,7 @@ describe 'text utilities', -> expect(textUtils.getCharacterCount('\uDF97')).toBe 1 describe '.hasPairedCharacter(string)', -> - it 'returns true when the string contains a surrogate pair', -> + it 'returns true when the string contains a surrogate pair or variation sequence', -> expect(textUtils.hasPairedCharacter('abc')).toBe false expect(textUtils.hasPairedCharacter('a\uD835\uDF97b\uD835\uDF97c')).toBe true expect(textUtils.hasPairedCharacter('\uD835\uDF97')).toBe true @@ -21,7 +21,7 @@ describe 'text utilities', -> expect(textUtils.hasPairedCharacter('\uFE0E')).toBe false describe '.isPairedCharacter(string, index)', -> - it 'returns true when the index is the start of a high/low surrogate pair', -> + it 'returns true when the index is the start of a high/low surrogate pair or variation sequence', -> expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 0)).toBe false expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 1)).toBe true expect(textUtils.isPairedCharacter('a\uD835\uDF97b\uD835\uDF97c', 2)).toBe false From df68ae26a2cf7e623730e5d0793e4758bf25bfef Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:30:26 -0700 Subject: [PATCH 3/8] Add specs for variation sequences --- spec/editor-spec.coffee | 43 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/spec/editor-spec.coffee b/spec/editor-spec.coffee index 6ff0aa304..c43b2bae8 100644 --- a/spec/editor-spec.coffee +++ b/spec/editor-spec.coffee @@ -3343,7 +3343,7 @@ describe "Editor", -> editor2.destroy() expect(editor.shouldPromptToSave()).toBeTruthy() - describe "when the edit session contains surrogate pair characters", -> + describe "when the editor text contains surrogate pair characters", -> it "correctly backspaces over them", -> editor.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') editor.moveToBottom() @@ -3384,6 +3384,47 @@ describe "Editor", -> editor.moveLeft() expect(editor.getCursorBufferPosition()).toEqual [0, 0] + describe "when the editor contains variation sequence character pairs", -> + it "correctly backspaces over them", -> + editor.setText('\u2714\uFE0E\u2714\uFE0E\u2714\uFE0E') + editor.moveToBottom() + editor.backspace() + expect(editor.getText()).toBe '\u2714\uFE0E\u2714\uFE0E' + editor.backspace() + expect(editor.getText()).toBe '\u2714\uFE0E' + editor.backspace() + expect(editor.getText()).toBe '' + + it "correctly deletes over them", -> + editor.setText('\u2714\uFE0E\u2714\uFE0E\u2714\uFE0E') + editor.moveToTop() + editor.delete() + expect(editor.getText()).toBe '\u2714\uFE0E\u2714\uFE0E' + editor.delete() + expect(editor.getText()).toBe '\u2714\uFE0E' + editor.delete() + expect(editor.getText()).toBe '' + + it "correctly moves over them", -> + editor.setText('\u2714\uFE0E\u2714\uFE0E\u2714\uFE0E\n') + editor.moveToTop() + editor.moveRight() + expect(editor.getCursorBufferPosition()).toEqual [0, 2] + editor.moveRight() + expect(editor.getCursorBufferPosition()).toEqual [0, 4] + editor.moveRight() + expect(editor.getCursorBufferPosition()).toEqual [0, 6] + editor.moveRight() + expect(editor.getCursorBufferPosition()).toEqual [1, 0] + editor.moveLeft() + expect(editor.getCursorBufferPosition()).toEqual [0, 6] + editor.moveLeft() + expect(editor.getCursorBufferPosition()).toEqual [0, 4] + editor.moveLeft() + expect(editor.getCursorBufferPosition()).toEqual [0, 2] + editor.moveLeft() + expect(editor.getCursorBufferPosition()).toEqual [0, 0] + describe ".setIndentationForBufferRow", -> describe "when the editor uses soft tabs but the row has hard tabs", -> it "only replaces whitespace characters", -> From 3acddf3e711e67faa92481a628a63f9cb7de93d5 Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:32:18 -0700 Subject: [PATCH 4/8] :memo: Drop text --- spec/editor-spec.coffee | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/editor-spec.coffee b/spec/editor-spec.coffee index c43b2bae8..27a0dc099 100644 --- a/spec/editor-spec.coffee +++ b/spec/editor-spec.coffee @@ -3343,7 +3343,7 @@ describe "Editor", -> editor2.destroy() expect(editor.shouldPromptToSave()).toBeTruthy() - describe "when the editor text contains surrogate pair characters", -> + describe "when the editor contains surrogate pair characters", -> it "correctly backspaces over them", -> editor.setText('\uD835\uDF97\uD835\uDF97\uD835\uDF97') editor.moveToBottom() From c1aa5c9e48d255d237dcfbf352386cea978d0965 Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:38:29 -0700 Subject: [PATCH 5/8] :memo: Mention variation sequence in comment --- src/text-utils.coffee | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/text-utils.coffee b/src/text-utils.coffee index eff284d6d..16a5427d8 100644 --- a/src/text-utils.coffee +++ b/src/text-utils.coffee @@ -18,8 +18,8 @@ isSurrogatePair = (string, index=0) -> # Is the character at the given index the start of a variation sequence? # -# * `string` The {String} to check for a surrogate pair. -# * `index` The {Number} index to look for a surrogate pair at. +# * `string` The {String} to check for a variation sequence. +# * `index` The {Number} index to look for a variation sequence at. # # Return a {Boolean}. isVariationSequence = (string, index=0) -> @@ -28,7 +28,7 @@ isVariationSequence = (string, index=0) -> # Is the character at the given index the start of high/low surrogate pair # or a variation sequence? # -# * `string` The {String} to check for a surrogate pair. +# * `string` The {String} to check for a surrogate pair or variation sequence. # * `index` The {Number} index to look for a surrogate pair at. # # Return a {Boolean}. From e343b0e0fca4c38b715a928dd2d17e8d10b52794 Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 09:40:49 -0700 Subject: [PATCH 6/8] Don't treat consecutive variation selectors as a sequence --- spec/text-utils-spec.coffee | 6 ++++++ src/text-utils.coffee | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spec/text-utils-spec.coffee b/spec/text-utils-spec.coffee index 83aa9a274..9c20408e4 100644 --- a/spec/text-utils-spec.coffee +++ b/spec/text-utils-spec.coffee @@ -9,6 +9,8 @@ describe 'text utilities', -> expect(textUtils.getCharacterCount('\u2714\uFE0E')).toBe 1 expect(textUtils.getCharacterCount('\uD835')).toBe 1 expect(textUtils.getCharacterCount('\uDF97')).toBe 1 + expect(textUtils.getCharacterCount('\uFE0E')).toBe 1 + expect(textUtils.getCharacterCount('\uFE0E\uFE0E')).toBe 2 describe '.hasPairedCharacter(string)', -> it 'returns true when the string contains a surrogate pair or variation sequence', -> @@ -19,6 +21,7 @@ describe 'text utilities', -> expect(textUtils.hasPairedCharacter('\uD835')).toBe false expect(textUtils.hasPairedCharacter('\uDF97')).toBe false expect(textUtils.hasPairedCharacter('\uFE0E')).toBe false + expect(textUtils.hasPairedCharacter('\uFE0E\uFE0E')).toBe false describe '.isPairedCharacter(string, index)', -> it 'returns true when the index is the start of a high/low surrogate pair or variation sequence', -> @@ -35,3 +38,6 @@ describe 'text utilities', -> expect(textUtils.isPairedCharacter('a\u2714\uFE0E', 3)).toBe false expect(textUtils.isPairedCharacter('\uD835')).toBe false expect(textUtils.isPairedCharacter('\uDF97')).toBe false + expect(textUtils.isPairedCharacter('\uFE0E')).toBe false + expect(textUtils.isPairedCharacter('\uFE0E')).toBe false + expect(textUtils.isPairedCharacter('\uFE0E\uFE0E')).toBe false diff --git a/src/text-utils.coffee b/src/text-utils.coffee index 16a5427d8..fdf3f0128 100644 --- a/src/text-utils.coffee +++ b/src/text-utils.coffee @@ -23,7 +23,7 @@ isSurrogatePair = (string, index=0) -> # # Return a {Boolean}. isVariationSequence = (string, index=0) -> - isVariationSelector(string, index + 1) + not isVariationSelector(string, index) and isVariationSelector(string, index + 1) # Is the character at the given index the start of high/low surrogate pair # or a variation sequence? From f1fd13b0b2fea79ea4c7bdcec1009ef97aeeb6cd Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 10:35:33 -0700 Subject: [PATCH 7/8] Return as soon as first paired character is found Previously the character count of the entire string was taken even though it was only checking for the presence of a paired character. Now hasPairedCharacter returns as early as possible and the now unused getCharacterCount has been removed. --- spec/text-utils-spec.coffee | 11 ----------- src/text-utils.coffee | 23 ++++++----------------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/spec/text-utils-spec.coffee b/spec/text-utils-spec.coffee index 9c20408e4..89cf34aca 100644 --- a/spec/text-utils-spec.coffee +++ b/spec/text-utils-spec.coffee @@ -1,17 +1,6 @@ textUtils = require '../src/text-utils' describe 'text utilities', -> - describe '.getCharacterCount(string)', -> - it 'returns the number of full characters in the string', -> - expect(textUtils.getCharacterCount('abc')).toBe 3 - expect(textUtils.getCharacterCount('a\uD835\uDF97b\uD835\uDF97c')).toBe 5 - expect(textUtils.getCharacterCount('\uD835\uDF97')).toBe 1 - expect(textUtils.getCharacterCount('\u2714\uFE0E')).toBe 1 - expect(textUtils.getCharacterCount('\uD835')).toBe 1 - expect(textUtils.getCharacterCount('\uDF97')).toBe 1 - expect(textUtils.getCharacterCount('\uFE0E')).toBe 1 - expect(textUtils.getCharacterCount('\uFE0E\uFE0E')).toBe 2 - describe '.hasPairedCharacter(string)', -> it 'returns true when the string contains a surrogate pair or variation sequence', -> expect(textUtils.hasPairedCharacter('abc')).toBe false diff --git a/src/text-utils.coffee b/src/text-utils.coffee index fdf3f0128..fc4870f5c 100644 --- a/src/text-utils.coffee +++ b/src/text-utils.coffee @@ -36,27 +36,16 @@ isVariationSequence = (string, index=0) -> isPairedCharacter = (string, index=0) -> isSurrogatePair(string, index) or isVariationSequence(string, index) -# Get the number of characters in the string accounting for surrogate pairs and -# variation sequences. -# -# This method counts high/low surrogate pairs and variation sequences as a -# single character and will always returns a value less than or equal to -# `string.length`. -# -# * `string` The {String} to count the number of full characters in. -# -# Returns a {Number}. -getCharacterCount = (string) -> - count = string.length - count-- for index in [0...string.length] when isPairedCharacter(string, index) - count - # Does the given string contain at least surrogate pair or variation sequence? # # * `string` The {String} to check for the presence of paired characters. # # Returns a {Boolean}. hasPairedCharacter = (string) -> - string.length isnt getCharacterCount(string) + index = 0 + while index < string.length + return true if isPairedCharacter(string, index) + index++ + false -module.exports = {getCharacterCount, isPairedCharacter, hasPairedCharacter} +module.exports = {isPairedCharacter, hasPairedCharacter} From 146e8c2a0bb967ce94a43a6462338cbd5b0f58cc Mon Sep 17 00:00:00 2001 From: Kevin Sawicki Date: Wed, 17 Sep 2014 10:40:12 -0700 Subject: [PATCH 8/8] :lipstick: Remove extra newline --- src/text-utils.coffee | 1 - 1 file changed, 1 deletion(-) diff --git a/src/text-utils.coffee b/src/text-utils.coffee index fc4870f5c..90bf220ff 100644 --- a/src/text-utils.coffee +++ b/src/text-utils.coffee @@ -32,7 +32,6 @@ isVariationSequence = (string, index=0) -> # * `index` The {Number} index to look for a surrogate pair at. # # Return a {Boolean}. - isPairedCharacter = (string, index=0) -> isSurrogatePair(string, index) or isVariationSequence(string, index)