diff --git a/lib/lexer.js b/lib/lexer.js index 3316c893..e39825f1 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -1,5 +1,5 @@ (function(){ - var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include; + var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include; // The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt // matches against the beginning of the source code. When a match is found, // a token is produced, we consume the match, and start again. Tokens are in the @@ -35,7 +35,6 @@ NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; INTERPOLATION = /(^|[\s\S]*?(?:[\\]|\\\\)?)\$([a-zA-Z_@]\w*|{[\s\S]*?(?:[^\\]|\\\\)})/; - JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; WHITESPACE = /^([ \t]+)/; COMMENT = /^(((\n?[ \t]*)?#[^\n]*)+)/; @@ -77,12 +76,13 @@ Lexer = function Lexer() { }; // Scan by attempting to match tokens one at a time. Slow and steady. Lexer.prototype.tokenize = function tokenize(code, options) { - options = options || {}; + var o; + o = options || {}; this.code = code; // The remainder of the source code. this.i = 0; // Current character position we're parsing. - this.line = 0; + this.line = o.line || 0; // The current line. this.indent = 0; // The current indent level. @@ -95,7 +95,7 @@ this.extract_next_token(); } this.close_indentation(); - if (options.rewrite === false) { + if (o.rewrite === false) { return this.tokens; } return (new Rewriter()).rewrite(this.tokens); @@ -168,9 +168,9 @@ // Matches strings, including multi-line strings. Lexer.prototype.string_token = function string_token() { var string; - string = this.balanced_group(['"'], ['${', '}']); + string = this.balanced_token(['"', '"'], ['${', '}']); if (string === false) { - string = this.balanced_group(["'"]); + string = this.balanced_token(["'", "'"]); } if (!(string)) { return false; @@ -195,7 +195,7 @@ // Matches interpolated JavaScript. Lexer.prototype.js_token = function js_token() { var script; - if (!((script = this.match(JS, 1)))) { + if (!((script = this.balanced_token(['`', '`'])))) { return false; } this.token('JS', script.replace(JS_CLEANER, '')); @@ -215,28 +215,20 @@ this.i += regex.length; return true; }; - // Matches a balanced group such as a single or double-quoted string. - Lexer.prototype.balanced_group = function balanced_group() { - var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, delimited, each, i, levels, type; + // Matches a balanced group such as a single or double-quoted string. Pass in + // a series of delimiters, all of which must be balanced correctly within the + // token's contents. + Lexer.prototype.balanced_token = function balanced_token() { + var _a, _b, delimited, each, i, levels, type; delimited = Array.prototype.slice.call(arguments, 0); - _a = delimited; - for (_b = 0, _c = _a.length; _b < _c; _b++) { - each = _a[_b]; - !(typeof (_d = each[1]) !== "undefined" && _d !== null) ? ((each[1] = each[0])) : null; - } - _e = delimited; - for (_f = 0, _g = _e.length; _f < _g; _f++) { - each = _e[_f]; - !(typeof (_h = each[2]) !== "undefined" && _h !== null) ? ((each[2] = '\\')) : null; - } levels = []; i = 0; while (i < this.chunk.length) { - _i = delimited; - for (type = 0, _j = _i.length; type < _j; type++) { - each = _i[type]; - if (each[2] !== false && this.chunk.substring(i, i + each[2].length) === each[2]) { - i += each[2].length; + _a = delimited; + for (type = 0, _b = _a.length; type < _b; type++) { + each = _a[type]; + if (levels.length && this.chunk.substring(i, i + 1) === '\\') { + i += 1; break; } else if (levels.length && this.chunk.substring(i, i + each[1].length) === each[1] && levels[levels.length - 1] === type) { levels.pop(); @@ -485,7 +477,8 @@ if (interp.substring(0, 1) === '{') { inner = interp.substring(1, interp.length - 1); nested = lexer.tokenize("(" + inner + ")", { - rewrite: false + rewrite: false, + line: this.line }); nested.pop(); tokens.push(['TOKENS', nested]); diff --git a/src/lexer.coffee b/src/lexer.coffee index f351b6d8..b04cedb1 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -60,7 +60,6 @@ IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ INTERPOLATION : /(^|[\s\S]*?(?:[\\]|\\\\)?)\$([a-zA-Z_@]\w*|{[\s\S]*?(?:[^\\]|\\\\)})/ -JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/ OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ WHITESPACE : /^([ \t]+)/ COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/ @@ -113,18 +112,18 @@ exports.Lexer: class Lexer # Scan by attempting to match tokens one at a time. Slow and steady. tokenize: (code, options) -> - options ||= {} - @code : code # The remainder of the source code. - @i : 0 # Current character position we're parsing. - @line : 0 # The current line. - @indent : 0 # The current indent level. - @indents : [] # The stack of all indent levels we are currently within. - @tokens : [] # Collection of all parsed tokens in the form ['TOKEN_TYPE', value, line] + o : options or {} + @code : code # The remainder of the source code. + @i : 0 # Current character position we're parsing. + @line : o.line or 0 # The current line. + @indent : 0 # The current indent level. + @indents : [] # The stack of all indent levels we are currently within. + @tokens : [] # Collection of all parsed tokens in the form ['TOKEN_TYPE', value, line] while @i < @code.length @chunk: @code.slice(@i) @extract_next_token() @close_indentation() - return @tokens if options.rewrite is no + return @tokens if o.rewrite is no (new Rewriter()).rewrite @tokens # At every position, run through this list of attempted matches, @@ -166,8 +165,8 @@ exports.Lexer: class Lexer # Matches strings, including multi-line strings. string_token: -> - string: @balanced_group ['"'], ['${', '}'] - string: @balanced_group ["'"] if string is false + string: @balanced_token ['"', '"'], ['${', '}'] + string: @balanced_token ["'", "'"] if string is false return false unless string @interpolate_string string.replace STRING_NEWLINES, " \\\n" @line += count string, "\n" @@ -185,7 +184,7 @@ exports.Lexer: class Lexer # Matches interpolated JavaScript. js_token: -> - return false unless script: @match JS, 1 + return false unless script: @balanced_token ['`', '`'] @token 'JS', script.replace(JS_CLEANER, '') @i += script.length true @@ -198,16 +197,16 @@ exports.Lexer: class Lexer @i += regex.length true - # Matches a balanced group such as a single or double-quoted string. - balanced_group: (delimited...) -> - (each[1]: each[0]) for each in delimited when not each[1]? - (each[2]: '\\') for each in delimited when not each[2]? + # Matches a balanced group such as a single or double-quoted string. Pass in + # a series of delimiters, all of which must be balanced correctly within the + # token's contents. + balanced_token: (delimited...) -> levels: [] i: 0 while i < @chunk.length for each, type in delimited - if each[2] isnt false and @chunk.substring(i, i + each[2].length) is each[2] - i += each[2].length + if levels.length and @chunk.substring(i, i + 1) is '\\' + i += 1 break else if levels.length and @chunk.substring(i, i + each[1].length) is each[1] and levels[levels.length - 1] is type levels.pop() @@ -394,7 +393,7 @@ exports.Lexer: class Lexer tokens.push ['STRING', "$quote$before$quote"] if before.length if interp.substring(0, 1) is '{' inner: interp.substring(1, interp.length - 1) - nested: lexer.tokenize "($inner)", {rewrite: no} + nested: lexer.tokenize "($inner)", {rewrite: no, line: @line} nested.pop() tokens.push ['TOKENS', nested] else