From 83fd84745d18f9ee9c902407ae821348d888f599 Mon Sep 17 00:00:00 2001 From: Stan Angeloff Date: Sat, 6 Mar 2010 22:16:37 +0200 Subject: [PATCH] Rewriting string tokenizer; allowing nested double-quoted strings inside expression interpolations. --- lib/lexer.js | 63 +++++++++++++++++++++++++-- src/lexer.coffee | 32 +++++++++++++- test/test_string_interpolation.coffee | 5 +++ 3 files changed, 95 insertions(+), 5 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index 35fe07d4..ef182c6a 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -1,5 +1,5 @@ (function(){ - var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING, STRING_NEWLINES, WHITESPACE, compact, count, include; + var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include; // The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt // matches against the beginning of the source code. When a match is found, // a token is produced, we consume the match, and start again. Tokens are in the @@ -33,7 +33,6 @@ // Token matching regexes. IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; - STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/; HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; INTERPOLATION = /(^|[\s\S]*?(?:[\\]|\\\\)?)\$([a-zA-Z_@]\w*|{[\s\S]*?(?:[^\\]|\\\\)})/; JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; @@ -169,7 +168,11 @@ // Matches strings, including multi-line strings. Lexer.prototype.string_token = function string_token() { var string; - if (!((string = this.match(STRING, 1)))) { + string = this.balanced_group(['"'], ['${', '}']); + if (string === false) { + string = this.balanced_group(["'"]); + } + if (!(string)) { return false; } this.interpolate_string(string.replace(STRING_NEWLINES, " \\\n")); @@ -212,6 +215,60 @@ this.i += regex.length; return true; }; + // Matches a balanced group such as a single or double-quoted string. + Lexer.prototype.balanced_group = function balanced_group() { + var _a, _b, _c, _d, _e, _f, delimited, each, escaped, i, levels, next, type; + delimited = Array.prototype.slice.call(arguments, 0); + _a = delimited; + for (_b = 0, _c = _a.length; _b < _c; _b++) { + each = _a[_b]; + !(typeof (_d = each[1]) !== "undefined" && _d !== null) ? ((each[1] = each[0])) : null; + } + escaped = '\\'; + next = (function(__this) { + var __func = function(length) { + return this.chunk.substring(i, i + length); + }; + return (function next() { + return __func.apply(__this, arguments); + }); + })(this); + levels = []; + i = 0; + while (i < this.chunk.length) { + if (next(1) === escaped) { + i += 1; + } else { + _e = delimited; + for (type = 0, _f = _e.length; type < _f; type++) { + each = _e[type]; + if (levels.length && next(each[1].length) === each[1] && levels[levels.length - 1] === type) { + levels.pop(); + i += each[1].length - 1; + if (!(levels.length)) { + i += 1; + } + break; + } else if (next(each[0].length) === each[0]) { + levels.push(type); + i += each[0].length - 1; + break; + } + } + } + if (!(levels.length)) { + break; + } + i += 1; + } + if (levels.length) { + throw new Error("SyntaxError: Unterminated " + (delimited[levels.pop()][0]) + " starting on line " + this.line); + } + if (i === 0) { + return false; + } + return this.chunk.substring(0, i); + }; // Matches and conumes comments. Lexer.prototype.comment_token = function comment_token() { var comment, lines; diff --git a/src/lexer.coffee b/src/lexer.coffee index e88db1a1..4fd3bb60 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -58,7 +58,6 @@ JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED # Token matching regexes. IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i -STRING : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/ HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ INTERPOLATION : /(^|[\s\S]*?(?:[\\]|\\\\)?)\$([a-zA-Z_@]\w*|{[\s\S]*?(?:[^\\]|\\\\)})/ JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/ @@ -167,7 +166,9 @@ exports.Lexer: class Lexer # Matches strings, including multi-line strings. string_token: -> - return false unless string: @match STRING, 1 + string: @balanced_group ['"'], ['${', '}'] + string: @balanced_group ["'"] if string is false + return false unless string @interpolate_string string.replace STRING_NEWLINES, " \\\n" @line += count string, "\n" @i += string.length @@ -197,6 +198,33 @@ exports.Lexer: class Lexer @i += regex.length true + # Matches a balanced group such as a single or double-quoted string. + balanced_group: (delimited...) -> + (each[1]: each[0]) for each in delimited when not each[1]? + escaped: '\\' + next: (length) => @chunk.substring i, i + length + levels: [] + i: 0 + while i < @chunk.length + if next(1) is escaped + i += 1 + else + for each, type in delimited + if levels.length and next(each[1].length) is each[1] and levels[levels.length - 1] is type + levels.pop() + i += each[1].length - 1 + i += 1 unless levels.length + break + else if next(each[0].length) is each[0] + levels.push(type) + i += each[0].length - 1 + break + break unless levels.length + i += 1 + throw new Error "SyntaxError: Unterminated ${delimited[levels.pop()][0]} starting on line $@line" if levels.length + return false if i is 0 + return @chunk.substring(0, i) + # Matches and conumes comments. comment_token: -> return false unless comment: @match COMMENT, 1 diff --git a/test/test_string_interpolation.coffee b/test/test_string_interpolation.coffee index 526ca24d..bde36f56 100644 --- a/test/test_string_interpolation.coffee +++ b/test/test_string_interpolation.coffee @@ -48,3 +48,8 @@ obj: { hi: -> "Hello $@name." } ok obj.hi() is "Hello Joe." + +ok "I can has ${"cheeze"}" is 'I can has cheeze' +ok 'I can has ${"cheeze"}' is 'I can has ${"cheeze"}' + +ok "Where is ${obj["name"] + '?'}" is 'Where is Joe?'