mirror of
https://github.com/atom/atom.git
synced 2026-01-22 21:38:10 -05:00
Add support for surrogate pairs
Surrogate pairs, i.e. the UTF-16 code unit pairs that encode characters outside the Basic Multilingual Plane, are now broken out as atomic tokens. Closes #567
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
"dependencies": {
|
||||
"coffee-script": "1.6.2",
|
||||
"ctags": "0.3.0",
|
||||
"oniguruma": "0.11.0",
|
||||
"oniguruma": "0.14.0",
|
||||
"mkdirp": "0.3.5",
|
||||
"git-utils": "0.17.0",
|
||||
"underscore": "1.4.4",
|
||||
|
||||
@@ -2460,3 +2460,44 @@ describe "EditSession", ->
|
||||
expect(editSession.shouldPromptToSave()).toBeFalsy()
|
||||
editSession2.destroy()
|
||||
expect(editSession.shouldPromptToSave()).toBeTruthy()
|
||||
|
||||
describe "when the edit session contains surrogate pair characters", ->
  # U+1D797, written as its UTF-16 high/low surrogate pair.
  surrogatePairText = '\uD835\uDF97'

  # Returns a string made of `count` back-to-back copies of the pair.
  repeatSurrogatePair = (count) ->
    new Array(count + 1).join(surrogatePairText)

  it "correctly backspaces over them", ->
    editSession.setText(repeatSurrogatePair(3))
    editSession.moveCursorToBottom()
    # Every backspace has to remove one whole pair, never half of one.
    for remainingPairs in [2, 1, 0]
      editSession.backspace()
      expect(editSession.getText()).toBe repeatSurrogatePair(remainingPairs)
    return

  it "correctly deletes over them", ->
    editSession.setText(repeatSurrogatePair(3))
    editSession.moveCursorToTop()
    # Every forward delete has to remove one whole pair as well.
    for remainingPairs in [2, 1, 0]
      editSession.delete()
      expect(editSession.getText()).toBe repeatSurrogatePair(remainingPairs)
    return

  it "correctly moves over them", ->
    editSession.setText(repeatSurrogatePair(3) + '\n')
    editSession.moveCursorToTop()

    # Columns advance two at a time (one pair per move); the fourth move
    # lands on the start of the following row.
    for expectedPosition in [[0, 2], [0, 4], [0, 6], [1, 0]]
      editSession.moveCursorRight()
      expect(editSession.getCursorBufferPosition()).toEqual expectedPosition

    # Walking back retraces the same stops in reverse.
    for expectedPosition in [[0, 6], [0, 4], [0, 2], [0, 0]]
      editSession.moveCursorLeft()
      expect(editSession.getCursorBufferPosition()).toEqual expectedPosition
    return
|
||||
|
||||
@@ -614,3 +614,16 @@ describe "TextMateGrammar", ->
|
||||
expect(tokens[0].value).toEqual '//'
|
||||
expect(tokens[1].scopes).toEqual ["source.java", "comment.line.double-slash.java"]
|
||||
expect(tokens[1].value).toEqual 'comment'
|
||||
|
||||
describe "Surrogate pair characters", ->
  beforeEach ->
    atom.activatePackage('javascript-tmbundle', sync: true)
    grammar = syntax.selectGrammar('main.js')
    # A quoted string whose only content is one surrogate pair.
    lines = grammar.tokenizeLines "'\uD835\uDF97'"

  it "correctly parses JavaScript strings containing surrogate pair characters", ->
    firstLineTokens = lines[0]
    expect(firstLineTokens.length).toBe 3

    # Opening quote, the intact pair, closing quote.
    firstLineValues = (lineToken.value for lineToken in firstLineTokens)
    expect(firstLineValues).toEqual ["'", "\uD835\uDF97", "'"]
|
||||
|
||||
@@ -325,6 +325,34 @@ describe "TokenizedBuffer", ->
|
||||
|
||||
expect(tokenizedBuffer.lineForScreenRow(2).text).toBe "#{tabAsSpaces} buy()#{tabAsSpaces}while supply > demand"
|
||||
|
||||
describe "when the buffer contains surrogate pairs", ->
  beforeEach ->
    atom.activatePackage('javascript-tmbundle', sync: true)
    # A quoted string with one surrogate pair between "abc" and "def".
    buffer = new Buffer('sample-with-pairs.js', "'abc\uD835\uDF97def'")
    tokenizedBuffer = new TokenizedBuffer(buffer)
    tokenizedBuffer.setVisible(true)

  afterEach ->
    tokenizedBuffer.destroy()
    buffer.release()

  describe "when the buffer is fully tokenized", ->
    beforeEach -> fullyTokenize(tokenizedBuffer)

    it "renders each surrogate pair as its own atomic token with a value of size 1", ->
      firstScreenLine = tokenizedBuffer.lineForScreenRow(0)
      expect(firstScreenLine.text).toBe "'abc\uD835\uDF97def'"

      screenLineTokens = firstScreenLine.tokens
      expect(screenLineTokens.length).toBe 5

      # The pair must be split out from the text on either side of it.
      screenLineValues = (lineToken.value for lineToken in screenLineTokens)
      expect(screenLineValues).toEqual ["'", "abc", "\uD835\uDF97", "def", "'"]
      expect(screenLineTokens[2].isAtomic).toBeTruthy()
|
||||
|
||||
describe "when the grammar is updated because a grammar it includes is activated", ->
|
||||
it "retokenizes the buffer", ->
|
||||
atom.activatePackage('ruby-tmbundle', sync: true)
|
||||
|
||||
18
spec/stdlib/text-utils-spec.coffee
Normal file
18
spec/stdlib/text-utils-spec.coffee
Normal file
@@ -0,0 +1,18 @@
|
||||
textUtils = require 'text-utils'
|
||||
|
||||
describe 'text utilities', ->
  describe '.getCharacterCount(string)', ->
    it 'returns the number of full characters in the string', ->
      # [input, expected count] -- a surrogate pair counts as one character.
      expectedCounts = [
        ['abc', 3]
        ['a\uD835\uDF97b\uD835\uDF97c', 5]
        ['\uD835\uDF97', 1]
      ]
      for [text, count] in expectedCounts
        expect(textUtils.getCharacterCount(text)).toBe count
      return

  describe '.isSurrogatePair(string, index)', ->
    it 'returns true when the index is the start of a high/low surrogate pair', ->
      sample = 'a\uD835\uDF97b\uD835\uDF97c'
      # Only the indexes holding a high surrogate start a pair; the low
      # surrogates (2 and 5) and the plain letters (0, 3 and 6) do not.
      pairStartIndexes = [1, 4]
      for index in [0..6]
        expect(textUtils.isSurrogatePair(sample, index)).toBe(index in pairStartIndexes)
      return
|
||||
@@ -356,9 +356,9 @@ class Buffer
|
||||
# or if its column goes beyond a line's length, this "sanitizes" the value
|
||||
# to a real range.
|
||||
#
|
||||
# range - The {Point} to clip
|
||||
# range - The {Range} to clip
|
||||
#
|
||||
# Returns the new, clipped {Point}. Note that this could be the same as `range` if no clipping was performed.
|
||||
# Returns the new, clipped {Range}. Note that this could be the same as `range` if no clipping was performed.
|
||||
clipRange: (range) ->
|
||||
range = Range.fromObject(range)
|
||||
new Range(@clipPosition(range.start), @clipPosition(range.end))
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
_ = require 'underscore'
|
||||
textUtils = require 'text-utils'
|
||||
|
||||
module.exports =
|
||||
class Token
|
||||
value: null
|
||||
hasSurrogatePairs: false
|
||||
scopes: null
|
||||
isAtomic: null
|
||||
isHardTab: null
|
||||
@@ -12,6 +14,7 @@ class Token
|
||||
# Creates a token from an options object.
#
# options - An {Object} whose `value`, `scopes`, `isAtomic`, `bufferDelta` and
#           `isHardTab` keys are copied onto the token.
#
# `screenDelta` is the length of `value` in UTF-16 code units, `bufferDelta`
# falls back to `screenDelta` when it is not supplied, and `hasSurrogatePairs`
# records whether `value` contains at least one surrogate pair.
constructor: ({@value, @scopes, @isAtomic, @bufferDelta, @isHardTab}) ->
  @hasSurrogatePairs = textUtils.hasSurrogatePairs(@value)
  @screenDelta = @value.length
  @bufferDelta = @screenDelta unless @bufferDelta?
|
||||
|
||||
### Public ###
|
||||
|
||||
@@ -27,27 +30,67 @@ class Token
|
||||
[new Token(value: value1, scopes: @scopes), new Token(value: value2, scopes: @scopes)]
|
||||
|
||||
breakOutAtomicTokens: (tabLength, breakOutLeadingWhitespace) ->
|
||||
if breakOutLeadingWhitespace
|
||||
return [this] unless /^[ ]|\t/.test(@value)
|
||||
if @hasSurrogatePairs
|
||||
outputTokens = []
|
||||
|
||||
for token in @breakOutSurrogatePairs()
|
||||
if token.isAtomic
|
||||
outputTokens.push(token)
|
||||
else
|
||||
outputTokens.push(token.breakOutAtomicTokens(tabLength, breakOutLeadingWhitespace)...)
|
||||
breakOutLeadingWhitespace = token.isOnlyWhitespace() if breakOutLeadingWhitespace
|
||||
|
||||
outputTokens
|
||||
else
|
||||
return [this] unless /\t/.test(@value)
|
||||
|
||||
outputTokens = []
|
||||
regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
|
||||
|
||||
while match = regex.exec(@value)
|
||||
[fullMatch, softTab, hardTab] = match
|
||||
if softTab and breakOutLeadingWhitespace
|
||||
outputTokens.push(@buildSoftTabToken(tabLength, false))
|
||||
else if hardTab
|
||||
breakOutLeadingWhitespace = false
|
||||
outputTokens.push(@buildHardTabToken(tabLength, true))
|
||||
if breakOutLeadingWhitespace
|
||||
return [this] unless /^[ ]|\t/.test(@value)
|
||||
else
|
||||
breakOutLeadingWhitespace = false
|
||||
outputTokens.push(new Token(value: match[0], scopes: @scopes))
|
||||
return [this] unless /\t/.test(@value)
|
||||
|
||||
outputTokens = []
|
||||
regex = new RegExp("([ ]{#{tabLength}})|(\t)|([^\t]+)", "g")
|
||||
|
||||
while match = regex.exec(@value)
|
||||
[fullMatch, softTab, hardTab] = match
|
||||
if softTab and breakOutLeadingWhitespace
|
||||
outputTokens.push(@buildSoftTabToken(tabLength, false))
|
||||
else if hardTab
|
||||
breakOutLeadingWhitespace = false
|
||||
outputTokens.push(@buildHardTabToken(tabLength, true))
|
||||
else
|
||||
breakOutLeadingWhitespace = false
|
||||
value = match[0]
|
||||
outputTokens.push(new Token({value, @scopes}))
|
||||
|
||||
outputTokens
|
||||
|
||||
# Splits this token's value at every surrogate pair.
#
# Returns an {Array} of {Token}s in their original order: each surrogate pair
# becomes its own atomic token and each run of text between pairs becomes a
# regular token. Every returned token shares this token's scopes.
breakOutSurrogatePairs: ->
  pieces = []
  runStart = 0
  cursor = 0

  while cursor < @value.length
    unless textUtils.isSurrogatePair(@value, cursor)
      cursor++
      continue

    # Flush the plain text collected since the previous pair, if there is any.
    if cursor > runStart
      pieces.push(new Token({value: @value[runStart...cursor], @scopes}))

    pieces.push(@buildSurrogatePairToken(@value, cursor))
    # Skip both code units of the pair and start a fresh plain-text run.
    cursor += 2
    runStart = cursor

  # Text after the last pair (or the whole value when no pair was found).
  if cursor > runStart
    pieces.push(new Token({value: @value[runStart...cursor], @scopes}))

  pieces
|
||||
|
||||
# Builds the atomic token for a single surrogate pair.
#
# value - The {String} that contains the pair
# index - The {Number} index of the pair's first code unit within `value`
#
# Returns a new atomic {Token} holding the code units at `index` and
# `index + 1`, carrying this token's scopes.
buildSurrogatePairToken: (value, index) ->
  pairText = value[index..index + 1]
  new Token({value: pairText, @scopes, isAtomic: true})
|
||||
|
||||
buildHardTabToken: (tabLength) ->
|
||||
@buildTabToken(tabLength, true)
|
||||
|
||||
|
||||
19
src/stdlib/text-utils.coffee
Normal file
19
src/stdlib/text-utils.coffee
Normal file
@@ -0,0 +1,19 @@
|
||||
# Is the UTF-16 code unit at `index` a high (leading) surrogate?
#
# string - The {String} to inspect
# index - The {Number} index of the code unit to test
#
# Returns a {Boolean}. An out-of-range index yields false because
# `charCodeAt` returns NaN there and both comparisons fail.
isHighSurrogate = (string, index) ->
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xD800 and codeUnit <= 0xDBFF
|
||||
|
||||
# Is the UTF-16 code unit at `index` a low (trailing) surrogate?
#
# string - The {String} to inspect
# index - The {Number} index of the code unit to test
#
# Returns a {Boolean}. An out-of-range index yields false because
# `charCodeAt` returns NaN there and both comparisons fail.
isLowSurrogate = (string, index) ->
  codeUnit = string.charCodeAt(index)
  codeUnit >= 0xDC00 and codeUnit <= 0xDFFF
|
||||
|
||||
# Does a surrogate pair start at `index`?
#
# string - The {String} to inspect
# index - The {Number} index of the candidate pair's first code unit
#
# Returns true only when the code unit at `index` is a high surrogate and the
# one immediately after it is a low surrogate.
isSurrogatePair = (string, index) ->
  return false unless isHighSurrogate(string, index)
  isLowSurrogate(string, index + 1)
|
||||
|
||||
# Counts the characters in a string, treating each surrogate pair as one.
#
# string - The {String} to measure
#
# Returns the {Number} of UTF-16 code units minus the number of surrogate
# pairs, so every pair contributes 1 instead of 2.
getCharacterCount = (string) ->
  pairCount = 0
  index = 0
  while index < string.length
    pairCount++ if isSurrogatePair(string, index)
    index++
  string.length - pairCount
|
||||
|
||||
# Does the string contain at least one surrogate pair?
#
# string - The {String} to inspect
#
# Stops scanning at the first pair found instead of counting every pair in the
# string, since only a yes/no answer is needed.
#
# Returns a {Boolean}.
hasSurrogatePairs = (string) ->
  for index in [0...string.length]
    return true if isSurrogatePair(string, index)
  false
|
||||
|
||||
# Public surface of the module; the single-code-unit surrogate predicates stay
# private to this file.
module.exports =
  getCharacterCount: getCharacterCount
  isSurrogatePair: isSurrogatePair
  hasSurrogatePairs: hasSurrogatePairs
|
||||
Reference in New Issue
Block a user