From 3e0c35bd0fd403ac7b09d5130bd777b58ffaf0fd Mon Sep 17 00:00:00 2001 From: satyr Date: Sun, 26 Sep 2010 07:06:14 +0900 Subject: [PATCH] lexer: enabled multiline interpolations --- lib/lexer.js | 238 ++++++++++++++++++++------------------ src/lexer.coffee | 97 +++++++++------- test/test_heredocs.coffee | 8 ++ 3 files changed, 184 insertions(+), 159 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index 896aa9f7..c8892e96 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -23,7 +23,7 @@ this.indents = []; this.tokens = []; while ((this.chunk = code.slice(this.i))) { - this.extractNextToken(); + this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); } this.closeIndentation(); if (o.rewrite === false) { @@ -31,14 +31,12 @@ } return (new Rewriter()).rewrite(this.tokens); }; - Lexer.prototype.extractNextToken = function() { - return this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); - }; Lexer.prototype.identifierToken = function() { - var closeIndex, forcedIdentifier, id, tag; - if (!(id = this.match(IDENTIFIER))) { + var closeIndex, forcedIdentifier, id, match, tag; + if (!(match = IDENTIFIER.exec(this.chunk))) { return false; } + id = match[0]; this.i += id.length; if (id === 'all' && this.tag() === 'FOR') { this.token('ALL', id); @@ -86,10 +84,11 @@ return true; }; Lexer.prototype.numberToken = function() { - var number; - if (!(number = this.match(NUMBER))) { + var match, number; + if (!(match = NUMBER.exec(this.chunk))) { return false; } + number = match[0]; if (this.tag() === '.' && number.charAt(0) === '.') { return false; } @@ -98,19 +97,19 @@ return true; }; Lexer.prototype.stringToken = function() { - var string; + var match, string; switch (this.chunk.charAt(0)) { case "'": - if (!(string = this.match(SIMPLESTR))) { + if (!(match = SIMPLESTR.exec(this.chunk))) { return false; } - this.token('STRING', string.replace(MULTILINER, '\\\n')); + this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n')); break; case '"': if (!(string = this.balancedToken(['"', '"'], ['#{', '}']))) { return false; } - this.interpolateString(string.replace(MULTILINER, '\\\n')); + this.interpolateString(string); break; default: return false; @@ -127,7 +126,8 @@ heredoc = match[0]; quote = heredoc.charAt(0); doc = this.sanitizeHeredoc(match[2], { - quote: quote + quote: quote, + indent: null }); this.interpolateString(quote + doc + quote, { heredoc: true @@ -156,11 +156,11 @@ return true; }; Lexer.prototype.jsToken = function() { - var script; - if (!(this.chunk.charAt(0) === '`' && (script = this.match(JSTOKEN)))) { + var match, script; + if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) { return false; } - this.token('JS', script.slice(1, -1)); + this.token('JS', (script = match[0]).slice(1, -1)); this.i += script.length; return true; }; @@ -205,16 +205,17 @@ return this.balancedString(this.chunk, delimited); }; Lexer.prototype.lineToken = function() { - var diff, indent, nextCharacter, noNewlines, prev, size; - if (!(indent = this.match(MULTI_DENT))) { + var diff, indent, match, nextCharacter, noNewlines, prev, size; + if (!(match = MULTI_DENT.exec(this.chunk))) { return false; } + indent = match[0]; this.line += count(indent, '\n'); this.i += indent.length; prev = this.prev(2); size = indent.length - 1 - indent.lastIndexOf('\n'); - nextCharacter = this.match(NEXT_CHARACTER, 1); - noNewlines = nextCharacter === '.' || nextCharacter === ',' || this.unfinished(); + nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1]; + noNewlines = (('.' === nextCharacter || ',' === nextCharacter)) || this.unfinished(); if (size - this.indebt === this.indent) { if (noNewlines) { return this.suppressNewlines(); @@ -265,15 +266,15 @@ return true; }; Lexer.prototype.whitespaceToken = function() { - var prev, space; - if (!(space = this.match(WHITESPACE))) { + var match, prev; + if (!(match = WHITESPACE.exec(this.chunk))) { return false; } prev = this.prev(); if (prev) { prev.spaced = true; } - this.i += space.length; + this.i += match[0].length; return true; }; Lexer.prototype.newlineToken = function(newlines) { @@ -369,25 +370,32 @@ return accessor ? 'accessor' : false; }; Lexer.prototype.sanitizeHeredoc = function(doc, options) { - var _ref2, attempt, indent, match; - indent = options.indent; - if (options.herecomment && !include(doc, '\n')) { + var _ref2, attempt, herecomment, indent, match; + _ref2 = options; + indent = _ref2.indent; + herecomment = _ref2.herecomment; + if (herecomment && !include(doc, '\n')) { return doc; } - if (!(options.herecomment)) { + if (!(herecomment)) { while ((match = HEREDOC_INDENT.exec(doc))) { - attempt = (typeof (_ref2 = match[1]) !== "undefined" && _ref2 !== null) ? match[1] : match[2]; - if (!(typeof indent !== "undefined" && indent !== null) || (0 < attempt.length) && (attempt.length < indent.length)) { + attempt = match[1]; + if (indent === null || (0 < attempt.length) && (attempt.length < indent.length)) { indent = attempt; } } } - indent || (indent = ''); - doc = doc.replace(new RegExp('^' + indent, 'gm'), ''); - if (options.herecomment) { + if (indent) { + doc = doc.replace(new RegExp("\\n" + (indent), "g"), '\n'); + } + if (herecomment) { return doc; } - return doc.replace(/^\n/, '').replace(MULTILINER, '\\n').replace(new RegExp(options.quote, 'g'), "\\" + (options.quote)); + doc = doc.replace(/^\n/, '').replace(new RegExp("" + (options.quote), "g"), '\\$&'); + if (options.quote === "'") { + doc = this.oldline(doc, true); + } + return doc; }; Lexer.prototype.tagParameters = function() { var i, tok; @@ -469,83 +477,84 @@ return !i ? false : str.slice(0, i); }; Lexer.prototype.interpolateString = function(str, options) { - var _len, _ref2, _ref3, end, escaped, expr, i, idx, inner, interpolated, lexer, nested, pi, quote, tag, tok, token, tokens, value; + var _len, _ref2, _ref3, end, escaped, expr, i, idx, inner, interpolated, lexer, nested, pi, push, quote, s, tag, tok, token, tokens, value; options || (options = {}); - if (str.length < 3 || str.charAt(0) !== '"') { + quote = str.charAt(0); + if (quote !== '"' || str.length < 3) { return this.token('STRING', str); - } else { - lexer = new Lexer(); - tokens = []; - quote = str.charAt(0); - _ref2 = [1, 1]; - i = _ref2[0]; - pi = _ref2[1]; - end = str.length - 1; - while (i < end) { - if (str.charAt(i) === '\\') { - i += 1; - } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) { - if (pi < i) { - tokens.push(['STRING', quote + str.slice(pi, i) + quote]); - } - inner = expr.slice(2, -1); - if (inner.length) { - if (options.heredoc) { - inner = inner.replace(new RegExp('\\\\' + quote, 'g'), quote); - } - nested = lexer.tokenize("(" + (inner) + ")", { - line: this.line - }); - _ref2 = nested; - for (idx = 0, _len = _ref2.length; idx < _len; idx++) { - tok = _ref2[idx]; - if (tok[0] === 'CALL_END') { - (tok[0] = ')'); - } - } - nested.pop(); - tokens.push(['TOKENS', nested]); - } else { - tokens.push(['STRING', quote + quote]); - } - i += expr.length - 1; - pi = i + 1; - } - i += 1; - } - if ((i > pi) && (pi < str.length - 1)) { - tokens.push(['STRING', quote + str.slice(pi, i) + quote]); - } - if (tokens[0][0] !== 'STRING') { - tokens.unshift(['STRING', '""']); - } - interpolated = tokens.length > 1; - if (interpolated) { - this.token('(', '('); - } - _ref2 = tokens; - for (i = 0, _len = _ref2.length; i < _len; i++) { - token = _ref2[i]; - _ref3 = token; - tag = _ref3[0]; - value = _ref3[1]; - if (tag === 'TOKENS') { - this.tokens = this.tokens.concat(value); - } else if (tag === 'STRING' && options.escapeQuotes) { - escaped = value.slice(1, -1).replace(/"/g, '\\"'); - this.token(tag, "\"" + (escaped) + "\""); - } else { - this.token(tag, value); - } - if (i < tokens.length - 1) { - this.token('+', '+'); - } - } - if (interpolated) { - this.token(')', ')'); - } - return tokens; } + lexer = new Lexer(); + tokens = []; + i = (pi = 1); + end = str.length - 1; + while (i < end) { + if (str.charAt(i) === '\\') { + i += 1; + } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) { + if (pi < i) { + s = quote + this.oldline(str.slice(pi, i), options.heredoc) + quote; + tokens.push(['STRING', s]); + } + inner = expr.slice(2, -1).replace(/^\s+/, ''); + if (inner.length) { + if (options.heredoc) { + inner = inner.replace(RegExp('\\\\' + quote, 'g'), quote); + } + nested = lexer.tokenize("(" + (inner) + ")", { + line: this.line + }); + _ref2 = nested; + for (idx = 0, _len = _ref2.length; idx < _len; idx++) { + tok = _ref2[idx]; + if (tok[0] === 'CALL_END') { + (tok[0] = ')'); + } + } + nested.pop(); + tokens.push(['TOKENS', nested]); + } else { + tokens.push(['STRING', quote + quote]); + } + i += expr.length - 1; + pi = i + 1; + } + i += 1; + } + if ((i > pi) && (pi < str.length - 1)) { + s = str.slice(pi, i).replace(MULTILINER, options.heredoc ? '\\n' : ''); + tokens.push(['STRING', quote + s + quote]); + } + if (tokens[0][0] !== 'STRING') { + tokens.unshift(['STRING', '""']); + } + interpolated = tokens.length > 1; + if (interpolated) { + this.token('(', '('); + } + _ref2 = tokens; + push = _ref2.push; + _ref2 = tokens; + for (i = 0, _len = _ref2.length; i < _len; i++) { + token = _ref2[i]; + _ref3 = token; + tag = _ref3[0]; + value = _ref3[1]; + if (tag === 'TOKENS') { + push.apply(this.tokens, value); + } else if (tag === 'STRING' && options.escapeQuotes) { + escaped = value.slice(1, -1).replace(/"/g, '\\"'); + this.token(tag, "\"" + (escaped) + "\""); + } else { + this.token(tag, value); + } + if (i < tokens.length - 1) { + this.token('+', '+'); + } + } + if (interpolated) { + this.token(')', ')'); + } + return tokens; }; Lexer.prototype.token = function(tag, value) { return this.tokens.push([tag, value, this.line]); @@ -579,9 +588,10 @@ }; Lexer.prototype.unfinished = function() { var prev, value; - prev = this.prev(2); - value = this.value(); - return value && NO_NEWLINE.test(value) && prev && prev[0] !== '.' && !CODE.test(value) && !ASSIGNED.test(this.chunk); + return (prev = this.prev(2)) && prev[0] !== '.' && (value = this.value()) && NO_NEWLINE.test(value) && !CODE.test(value) && !ASSIGNED.test(this.chunk); + }; + Lexer.prototype.oldline = function(str, heredoc) { + return str.replace(MULTILINER, heredoc ? '\\n' : ''); }; return Lexer; })(); @@ -591,11 +601,11 @@ RESERVED = ['case', 'default', 'do', 'function', 'var', 'void', 'with', 'const', 'let', 'enum', 'export', 'import', 'native', '__hasProp', '__extends', '__slice']; JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED); IDENTIFIER = /^[a-zA-Z_$][\w$]*/; - NUMBER = /^(?:0x[\da-f]+)|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i; + NUMBER = /^0x[\da-f]+|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i; HEREDOC = /^("""|''')([\s\S]*?)\n?[ \t]*\1/; OPERATOR = /^(?:-[-=>]?|\+[+=]?|[*&|\/%=<>^:!?]+)(?=([ \t]*))/; WHITESPACE = /^[ \t]+/; - COMMENT = /^###([^#][\s\S]*?)(?:###[ \t]*\n|(?:###)?$)|^(?:\s*#(?!##[^#])[^\n]*)+/; + COMMENT = /^###([^#][\s\S]*?)(?:###[ \t]*\n|(?:###)?$)|^(?:\s*#(?!##[^#]).*)+/; CODE = /^[-=]>/; MULTI_DENT = /^(?:\n[ \t]*)+/; SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/; @@ -606,9 +616,9 @@ REGEX_ESCAPE = /\\[^#]/g; MULTILINER = /\n/g; NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|not|delete|typeof|instanceof)$/; - HEREDOC_INDENT = /\n+([ \t]*)|^([ \t]+)/g; + HEREDOC_INDENT = /\n+([ \t]*)/g; ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/; - NEXT_CHARACTER = /^\s*(\S)/; + NEXT_CHARACTER = /^\s*(\S?)/; COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|=']; UNARY = ['UMINUS', 'UPLUS', '!', '!!', '~', 'TYPEOF', 'DELETE']; LOGIC = ['&', '|', '^', '&&', '||']; diff --git a/src/lexer.coffee b/src/lexer.coffee index 605a021f..a5d91c9f 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -125,7 +125,7 @@ exports.Lexer = class Lexer @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n' when '"' return false unless string = @balancedToken ['"', '"'], ['#{', '}'] - @interpolateString string.replace MULTILINER, '\\\n' + @interpolateString string else return false @line += count string, '\n' @@ -339,9 +339,9 @@ exports.Lexer = class Lexer indent = attempt if indent is null or 0 < attempt.length < indent.length doc = doc.replace /\n#{ indent }/g, '\n' if indent return doc if herecomment - doc.replace(/^\n/, '') - .replace(MULTILINER, '\\n') - .replace(/#{ options.quote }/g, '\\$&') + doc = doc.replace(/^\n/, '').replace(/#{ options.quote }/g, '\\$&') + doc = @oldline doc, on if options.quote is "'" + doc # A source of ambiguity in our grammar used to be parameter lists in function # definitions versus argument lists in function calls. Walk backwards, tagging @@ -406,7 +406,7 @@ exports.Lexer = class Lexer if not i then false else str[0...i] # Expand variables and expressions inside double-quoted strings using - # [ECMA Harmony's interpolation syntax](http://wiki.ecmascript.org/doku.php?id=strawman:string_interpolation) + # Ruby-like notation # for substitution of bare variables as well as arbitrary expressions. # # "Hello #{name.capitalize()}." @@ -415,48 +415,51 @@ exports.Lexer = class Lexer # new Lexer, tokenize the interpolated contents, and merge them into the # token stream. interpolateString: (str, options) -> - options or= {} - if str.length < 3 or str.charAt(0) isnt '"' - @token 'STRING', str - else - lexer = new Lexer - tokens = [] - quote = str.charAt 0 - [i, pi] = [1, 1] - end = str.length - 1 - while i < end - if str.charAt(i) is '\\' - i += 1 - else if expr = @balancedString str[i..], [['#{', '}']] - tokens.push ['STRING', quote + str[pi...i] + quote] if pi < i - inner = expr.slice 2, -1 - if inner.length - inner = inner.replace new RegExp('\\\\' + quote, 'g'), quote if options.heredoc - nested = lexer.tokenize "(#{inner})", line: @line - (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END' - nested.pop() - tokens.push ['TOKENS', nested] - else - tokens.push ['STRING', quote + quote] - i += expr.length - 1 - pi = i + 1 + {heredoc, escapeQuotes} = options or {} + quote = str.charAt 0 + return @token 'STRING', str if quote isnt '"' or str.length < 3 + lexer = new Lexer + tokens = [] + i = pi = 1 + end = str.length - 1 + while i < end + if str.charAt(i) is '\\' i += 1 - tokens.push ['STRING', quote + str[pi...i] + quote] if i > pi < str.length - 1 - tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING' - interpolated = tokens.length > 1 - @token '(', '(' if interpolated - for token, i in tokens - [tag, value] = token - if tag is 'TOKENS' - @tokens = @tokens.concat value - else if tag is 'STRING' and options.escapeQuotes - escaped = value.slice(1, -1).replace(/"/g, '\\"') - @token tag, "\"#{escaped}\"" + else if expr = @balancedString str[i..], [['#{', '}']] + if pi < i + s = quote + @oldline(str[pi...i], heredoc) + quote + tokens.push ['STRING', s] + inner = expr.slice(2, -1).replace /^[ \t]*\n/, '' + if inner.length + inner = inner.replace RegExp('\\\\' + quote, 'g'), quote if heredoc + nested = lexer.tokenize "(#{inner})", line: @line + (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END' + nested.pop() + tokens.push ['TOKENS', nested] else - @token tag, value - @token '+', '+' if i < tokens.length - 1 - @token ')', ')' if interpolated - tokens + tokens.push ['STRING', quote + quote] + i += expr.length - 1 + pi = i + 1 + i += 1 + if i > pi < str.length - 1 + s = str[pi...i].replace MULTILINER, if heredoc then '\\n' else '' + tokens.push ['STRING', quote + s + quote] + tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING' + interpolated = tokens.length > 1 + @token '(', '(' if interpolated + {push} = tokens + for token, i in tokens + [tag, value] = token + if tag is 'TOKENS' + push.apply @tokens, value + else if tag is 'STRING' and escapeQuotes + escaped = value.slice(1, -1).replace(/"/g, '\\"') + @token tag, "\"#{escaped}\"" + else + @token tag, value + @token '+', '+' if i < tokens.length - 1 + @token ')', ')' if interpolated + tokens # Helpers # ------- @@ -487,6 +490,10 @@ exports.Lexer = class Lexer (value = @value()) and NO_NEWLINE.test(value) and not CODE.test(value) and not ASSIGNED.test(@chunk) + # Converts newlines for string literals + oldline: (str, heredoc) -> + str.replace MULTILINER, if heredoc then '\\n' else '' + # Constants # --------- diff --git a/test/test_heredocs.coffee b/test/test_heredocs.coffee index 7389fb09..5c727483 100644 --- a/test/test_heredocs.coffee +++ b/test/test_heredocs.coffee @@ -98,3 +98,11 @@ equal ''' line 0 should not be relevant\n to the indent level ' + + +equal 'multiline nested interpolations work', """multiline #{ + "nested #{(-> + ok yes + "interpolations" + )()}" +} work"""