Add support for surrogate pairs

Surrogate pairs, i.e. the two UTF-16 code units that together encode
a character outside the Basic Multilingual Plane, are now broken out
as atomic tokens.

Closes #567
This commit is contained in:
Kevin Sawicki
2013-06-03 19:16:44 -07:00
parent 9f4fc49790
commit a109a3317e
8 changed files with 181 additions and 19 deletions

View File

@@ -356,9 +356,9 @@ class Buffer
# or if its column goes beyond a line's length, this "sanitizes" the value
# to a real range.
#
# range - The {Point} to clip
# range - The {Range} to clip
#
# Returns the new, clipped {Point}. Note that this could be the same as `range` if no clipping was performed.
# Returns the new, clipped {Range}. Note that this could be the same as `range` if no clipping was performed.
clipRange: (range) ->
  # Normalize the argument through Range.fromObject, clip each endpoint
  # independently, then assemble a fresh Range from the clipped endpoints.
  normalized = Range.fromObject(range)
  clippedStart = @clipPosition(normalized.start)
  clippedEnd = @clipPosition(normalized.end)
  new Range(clippedStart, clippedEnd)

View File

@@ -1,8 +1,10 @@
_ = require 'underscore'
textUtils = require 'text-utils'
module.exports =
class Token
value: null
hasSurrogatePairs: false
scopes: null
isAtomic: null
isHardTab: null
@@ -12,6 +14,7 @@ class Token
constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
  # The screen width of the token is the number of UTF-16 code units in
  # its value.
  @screenDelta = @value.length
  # Fall back to the screen width when no explicit buffer delta was given.
  @bufferDelta = @screenDelta unless @bufferDelta?
  # Record up front whether the value contains any surrogate pair.
  @hasSurrogatePairs = textUtils.hasSurrogatePairs(@value)
### Public ###
@@ -27,27 +30,67 @@ class Token
[new Token(value: value1, scopes: @scopes), new Token(value: value2, scopes: @scopes)]
# Splits this token into a list of tokens so that surrogate pairs, hard tabs
# and (optionally) leading soft tabs each end up in a token of their own.
#
# tabLength - Number of spaces that make up one soft tab; also forwarded to
#             the soft/hard tab token builders.
# breakOutLeadingWhitespace - When truthy, runs of `tabLength` spaces are
#             emitted as soft tab tokens until the first match that is not
#             such a run.
#
# Returns an {Array} of tokens; `[this]` when there is nothing to break out.
#
# NOTE(review): the lines below appear to interleave the removed and the
# added version of this method (diff markers and indentation are missing),
# so the control flow cannot be read top to bottom as written - confirm
# against the real file before relying on it.
breakOutAtomicTokens: (tabLength, breakOutLeadingWhitespace) ->
if breakOutLeadingWhitespace
return [this] unless /^[ ]|\t/.test(@value)
# Surrogate-pair path: split on the pairs first, keep atomic pieces as they
# are and recurse into every other piece.
if @hasSurrogatePairs
outputTokens = []
for token in @breakOutSurrogatePairs()
if token.isAtomic
outputTokens.push(token)
else
outputTokens.push(token.breakOutAtomicTokens(tabLength, breakOutLeadingWhitespace)...)
breakOutLeadingWhitespace = token.isOnlyWhitespace() if breakOutLeadingWhitespace
outputTokens
else
return [this] unless /\t/.test(@value)
outputTokens = []
# Alternatives, in order: a run of exactly `tabLength` spaces, a single tab,
# or a run of non-tab characters.
regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
while match = regex.exec(@value)
[fullMatch, softTab, hardTab] = match
if softTab and breakOutLeadingWhitespace
outputTokens.push(@buildSoftTabToken(tabLength, false))
else if hardTab
breakOutLeadingWhitespace = false
outputTokens.push(@buildHardTabToken(tabLength, true))
if breakOutLeadingWhitespace
return [this] unless /^[ ]|\t/.test(@value)
else
breakOutLeadingWhitespace = false
outputTokens.push(new Token(value: match[0], scopes: @scopes))
return [this] unless /\t/.test(@value)
outputTokens = []
regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
while match = regex.exec(@value)
[fullMatch, softTab, hardTab] = match
if softTab and breakOutLeadingWhitespace
outputTokens.push(@buildSoftTabToken(tabLength, false))
else if hardTab
breakOutLeadingWhitespace = false
outputTokens.push(@buildHardTabToken(tabLength, true))
else
breakOutLeadingWhitespace = false
value = match[0]
outputTokens.push(new Token({value, @scopes}))
outputTokens
# Splits the token's value at surrogate pairs.
#
# Every surrogate pair becomes its own atomic token (built by
# buildSurrogatePairToken); each stretch of text between pairs becomes a
# plain token carrying this token's scopes.
#
# Returns an {Array} of tokens in their original order.
breakOutSurrogatePairs: ->
  pieces = []
  runStart = 0
  cursor = 0

  # Emits the pending stretch of ordinary text, if there is one.
  flushRun = (runEnd) =>
    return if runStart is runEnd
    pieces.push(new Token({value: @value[runStart...runEnd], @scopes}))

  until cursor >= @value.length
    unless textUtils.isSurrogatePair(@value, cursor)
      cursor++
      continue
    flushRun(cursor)
    pieces.push(@buildSurrogatePairToken(@value, cursor))
    # A pair occupies two code units; the next run starts right after it.
    cursor += 2
    runStart = cursor

  flushRun(cursor)
  pieces
# Builds the atomic token for the two code units starting at `index`.
#
# value - The {String} to take the two code units from.
# index - Offset of the first code unit of the pair.
#
# Returns a new atomic token that carries this token's scopes.
buildSurrogatePairToken: (value, index) ->
  pairText = value[index..index + 1]
  options =
    value: pairText
    scopes: @scopes
    isAtomic: true
  new Token(options)
# Builds the token for a hard tab by delegating to buildTabToken, passing
# `true` as its second argument.
#
# tabLength - Forwarded unchanged to buildTabToken.
buildHardTabToken: (tabLength) ->
@buildTabToken(tabLength, true)

View File

@@ -0,0 +1,19 @@
# Is the code unit at `index` a high (leading) surrogate, i.e. within
# 0xD800..0xDBFF? An out-of-range index yields NaN and therefore false.
isHighSurrogate = (string, index) ->
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xD800 and codeUnit <= 0xDBFF
# Is the code unit at `index` a low (trailing) surrogate, i.e. within
# 0xDC00..0xDFFF? An out-of-range index yields NaN and therefore false.
isLowSurrogate = (string, index) ->
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xDC00 and codeUnit <= 0xDFFF
# Do the code units at `index` and `index + 1` form a surrogate pair: a high
# surrogate immediately followed by a low surrogate?
isSurrogatePair = (string, index) ->
  return false unless isHighSurrogate(string, index)
  isLowSurrogate(string, index + 1)
# Counts the characters in `string`, treating every surrogate pair as a
# single character: start from the code-unit length and subtract one for
# each pair found.
getCharacterCount = (string) ->
  remaining = string.length
  position = 0
  while position < string.length
    remaining -= 1 if isSurrogatePair(string, position)
    position += 1
  remaining
# Does `string` contain at least one surrogate pair?
#
# Stops at the first pair found instead of counting every pair in the
# string; the answer is the same as comparing the code-unit length with the
# character count, but long pair-free prefixes are the only text scanned.
#
# string - The {String} to inspect.
#
# Returns a {Boolean}.
hasSurrogatePairs = (string) ->
  for index in [0...string.length]
    return true if isSurrogatePair(string, index)
  false
module.exports = {getCharacterCount, isSurrogatePair, hasSurrogatePairs}