diff --git a/lib/lexer.js b/lib/lexer.js
index 2d149c90..ba366926 100644
--- a/lib/lexer.js
+++ b/lib/lexer.js
@@ -14,10 +14,11 @@
     return Lexer;
   })();
   Lexer.prototype.tokenize = function(code, options) {
-    var i, o;
+    var o;
     code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
     o = options || {};
     this.code = code;
+    this.i = 0;
     this.line = o.line || 0;
     this.indent = 0;
     this.indebt = 0;
@@ -25,9 +26,8 @@
     this.indents = [];
     this.tokens = [];
     this.seenFor = this.seenFrom = false;
-    i = 0;
-    while (this.chunk = code.slice(i)) {
-      i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
+    while (this.chunk = code.slice(this.i)) {
+      this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
     }
     this.closeIndentation();
     if (o.rewrite === false) {
@@ -38,23 +38,24 @@
   Lexer.prototype.identifierToken = function() {
     var _ref2, colon, forcedIdentifier, id, input, match, tag;
     if (!(match = IDENTIFIER.exec(this.chunk))) {
-      return 0;
+      return false;
     }
     input = match[0], id = match[1], colon = match[2];
+    this.i += input.length;
     if (id === 'all' && this.tag() === 'FOR') {
       this.token('ALL', id);
-      return 3;
+      return true;
     }
     if (id === 'from' && this.tag(1) === 'FOR') {
       this.seenFor = false;
       this.seenFrom = true;
       this.token('FROM', id);
-      return 4;
+      return true;
     }
     if (id === 'to' && this.seenFrom) {
       this.seenFrom = false;
       this.token('TO', id);
-      return 2;
+      return true;
     }
     forcedIdentifier = colon || this.tagAccessor();
     tag = 'IDENTIFIER';
@@ -105,32 +106,33 @@
     if (colon) {
       this.token(':', ':');
     }
-    return input.length;
+    return true;
   };
   Lexer.prototype.numberToken = function() {
     var match, number;
     if (!(match = NUMBER.exec(this.chunk))) {
-      return 0;
+      return false;
     }
     number = match[0];
     if (this.tag() === '.' && number.charAt(0) === '.') {
-      return 0;
+      return false;
     }
+    this.i += number.length;
     this.token('NUMBER', number);
-    return number.length;
+    return true;
   };
   Lexer.prototype.stringToken = function() {
     var match, string;
     switch (this.chunk.charAt(0)) {
       case "'":
         if (!(match = SIMPLESTR.exec(this.chunk))) {
-          return 0;
+          return false;
         }
         this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
         break;
       case '"':
         if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) {
-          return 0;
+          return false;
         }
         if (0 < string.indexOf('#{', 1)) {
           this.interpolateString(string.slice(1, -1));
@@ -139,15 +141,16 @@
         }
         break;
       default:
-        return 0;
+        return false;
     }
     this.line += count(string, '\n');
-    return string.length;
+    this.i += string.length;
+    return true;
   };
   Lexer.prototype.heredocToken = function() {
     var doc, heredoc, match, quote;
     if (!(match = HEREDOC.exec(this.chunk))) {
-      return 0;
+      return false;
     }
     heredoc = match[0];
     quote = heredoc.charAt(0);
@@ -163,15 +166,17 @@
       this.token('STRING', this.makeString(doc, quote, true));
     }
     this.line += count(heredoc, '\n');
-    return heredoc.length;
+    this.i += heredoc.length;
+    return true;
   };
   Lexer.prototype.commentToken = function() {
     var comment, here, match;
     if (!(match = this.chunk.match(COMMENT))) {
-      return 0;
+      return false;
     }
     comment = match[0], here = match[1];
     this.line += count(comment, '\n');
+    this.i += comment.length;
    if (here) {
       this.token('HERECOMMENT', this.sanitizeHeredoc(here, {
         herecomment: true,
@@ -179,41 +184,44 @@
       }));
       this.token('TERMINATOR', '\n');
     }
-    return comment.length;
+    return true;
   };
   Lexer.prototype.jsToken = function() {
     var match, script;
     if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) {
-      return 0;
+      return false;
     }
     this.token('JS', (script = match[0]).slice(1, -1));
-    return script.length;
+    this.i += script.length;
+    return true;
   };
   Lexer.prototype.regexToken = function() {
     var _ref2, match, regex;
     if (this.chunk.charAt(0) !== '/') {
-      return 0;
+      return false;
     }
     if (match = HEREGEX.exec(this.chunk)) {
       return this.heregexToken(match);
     }
     if ((_ref2 = this.tag(), __indexOf.call(NOT_REGEX, _ref2) >= 0)) {
-      return 0;
+      return false;
     }
     if (!(match = REGEX.exec(this.chunk))) {
-      return 0;
+      return false;
    }
     regex = match[0];
     this.token('REGEX', regex === '//' ? '/(?:)/' : regex);
-    return regex.length;
+    this.i += regex.length;
+    return true;
   };
   Lexer.prototype.heregexToken = function(match) {
     var _i, _len, _ref2, _ref3, _this, body, flags, heregex, re, tag, tokens, value;
     heregex = match[0], body = match[1], flags = match[2];
+    this.i += heregex.length;
     if (0 > body.indexOf('#{')) {
       re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/');
       this.token('REGEX', "/" + (re || '(?:)') + "/" + flags);
-      return heregex.length;
+      return true;
     }
     this.token('IDENTIFIER', 'RegExp');
     this.tokens.push(['CALL_START', '(']);
@@ -244,32 +252,29 @@
       this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
     }
     this.token(')', ')');
-    return heregex.length;
+    return true;
   };
   Lexer.prototype.lineToken = function() {
     var diff, indent, match, nextCharacter, noNewlines, prev, size;
     if (!(match = MULTI_DENT.exec(this.chunk))) {
-      return 0;
+      return false;
     }
     indent = match[0];
     this.line += count(indent, '\n');
+    this.i += indent.length;
     prev = last(this.tokens, 1);
     size = indent.length - 1 - indent.lastIndexOf('\n');
     nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1];
     noNewlines = ((nextCharacter === '.' || nextCharacter === ',') && !NEXT_ELLIPSIS.test(this.chunk)) || this.unfinished();
     if (size - this.indebt === this.indent) {
       if (noNewlines) {
-        this.suppressNewlines();
-      } else {
-        this.newlineToken();
+        return this.suppressNewlines();
       }
-      return indent.length;
-    }
-    if (size > this.indent) {
+      return this.newlineToken(indent);
+    } else if (size > this.indent) {
       if (noNewlines) {
         this.indebt = size - this.indent;
-        this.suppressNewlines();
-        return indent.length;
+        return this.suppressNewlines();
       }
       diff = size - this.indent + this.outdebt;
       this.token('INDENT', diff);
@@ -280,7 +285,7 @@
       this.outdentToken(this.indent - size, noNewlines);
     }
     this.indent = size;
-    return indent.length;
+    return true;
   };
   Lexer.prototype.outdentToken = function(moveOut, noNewlines, close) {
     var dent, len;
@@ -307,30 +312,33 @@
     if (!(this.tag() === 'TERMINATOR' || noNewlines)) {
       this.token('TERMINATOR', '\n');
     }
-    return this;
+    return true;
   };
   Lexer.prototype.whitespaceToken = function() {
     var match, nline, prev;
-    if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.charAt(0) === '\n'))) {
-      return 0;
+    if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.substring(0, 1) === '\n'))) {
+      return false;
     }
     prev = last(this.tokens);
     if (prev) {
       prev[match ? 'spaced' : 'newLine'] = true;
     }
-    return match ? match[0].length : 0;
+    if (match) {
+      this.i += match[0].length;
+    }
+    return !!match;
   };
-  Lexer.prototype.newlineToken = function() {
+  Lexer.prototype.newlineToken = function(newlines) {
     if (this.tag() !== 'TERMINATOR') {
       this.token('TERMINATOR', '\n');
     }
-    return this;
+    return true;
   };
   Lexer.prototype.suppressNewlines = function() {
     if (this.value() === '\\') {
       this.tokens.pop();
     }
-    return this;
+    return true;
   };
   Lexer.prototype.literalToken = function() {
     var _ref2, _ref3, _ref4, _ref5, match, prev, tag, value;
@@ -342,6 +350,7 @@
     } else {
       value = this.chunk.charAt(0);
     }
+    this.i += value.length;
     tag = value;
     prev = last(this.tokens);
     if (value === '=' && prev) {
@@ -351,11 +360,13 @@
       if ((_ref3 = prev[1]) === '||' || _ref3 === '&&') {
         prev[0] = 'COMPOUND_ASSIGN';
         prev[1] += '=';
-        return 1;
+        return true;
       }
     }
     if (value === ';') {
       tag = 'TERMINATOR';
+    } else if (__indexOf.call(LOGIC, value) >= 0) {
+      tag = 'LOGIC';
     } else if (__indexOf.call(MATH, value) >= 0) {
       tag = 'MATH';
     } else if (__indexOf.call(COMPARE, value) >= 0) {
@@ -366,7 +377,7 @@
       tag = 'UNARY';
     } else if (__indexOf.call(SHIFT, value) >= 0) {
       tag = 'SHIFT';
-    } else if (__indexOf.call(LOGIC, value) >= 0 || value === '?' && ((prev != null) ? prev.spaced : undefined)) {
+    } else if (value === '?' && ((prev != null) ? prev.spaced : undefined)) {
      tag = 'LOGIC';
     } else if (prev && !prev.spaced) {
       if (value === '(' && (_ref4 = prev[0], __indexOf.call(CALLABLE, _ref4) >= 0)) {
@@ -387,7 +398,7 @@
       }
     }
     this.token(tag, value);
-    return value.length;
+    return true;
   };
   Lexer.prototype.tagAccessor = function() {
     var prev;
@@ -433,7 +444,7 @@
   Lexer.prototype.tagParameters = function() {
     var i, tok;
     if (this.tag() !== ')') {
-      return this;
+      return;
     }
     i = this.tokens.length;
     while (tok = this.tokens[--i]) {
@@ -450,7 +461,7 @@
           return true;
       }
     }
-    return this;
+    return true;
   };
   Lexer.prototype.closeIndentation = function() {
     return this.outdentToken(this.indent);
diff --git a/src/lexer.coffee b/src/lexer.coffee
index dc487c3c..7e03e1f2 100644
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -26,8 +26,9 @@ exports.Lexer = class Lexer
   # (for interpolations). When the next token has been recorded, we move forward
   # within the code past the token, and begin again.
   #
-  # Each tokenizing method is responsible for returning the number of characters
-  # it has consumed.
+  # Each tokenizing method is responsible for incrementing `@i` by the number of
+  # characters it has consumed. `@i` can be thought of as our finger on the page
+  # of source.
   #
   # Before returning the token stream, run it through the [Rewriter](rewriter.html)
   # unless explicitly asked not to.
@@ -35,6 +36,7 @@ exports.Lexer = class Lexer
     code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
     o = options or {}
     @code = code # The remainder of the source code.
+    @i = 0 # Current character position we're parsing.
     @line = o.line or 0 # The current line.
     @indent = 0 # The current indentation level.
     @indebt = 0 # The over-indentation at the current level.
@@ -46,18 +48,17 @@
     # At every position, run through this list of attempted matches,
     # short-circuiting if any of them succeed. Their order determines precedence:
     # `@literalToken` is the fallback catch-all.
-    i = 0
-    while @chunk = code.slice i
-      i += @identifierToken() or
-           @commentToken() or
-           @whitespaceToken() or
-           @lineToken() or
-           @heredocToken() or
-           @stringToken() or
-           @numberToken() or
-           @regexToken() or
-           @jsToken() or
-           @literalToken()
+    while @chunk = code.slice @i
+      @identifierToken() or
+      @commentToken() or
+      @whitespaceToken() or
+      @lineToken() or
+      @heredocToken() or
+      @stringToken() or
+      @numberToken() or
+      @regexToken() or
+      @jsToken() or
+      @literalToken()
     @closeIndentation()
     return @tokens if o.rewrite is off
     (new Rewriter).rewrite @tokens
@@ -72,20 +73,21 @@
   # referenced as property names here, so you can still do `jQuery.is()` even
   # though `is` means `===` otherwise.
   identifierToken: ->
-    return 0 unless match = IDENTIFIER.exec @chunk
+    return false unless match = IDENTIFIER.exec @chunk
     [input, id, colon] = match
+    @i += input.length
     if id is 'all' and @tag() is 'FOR'
       @token 'ALL', id
-      return 3
+      return true
     if id is 'from' and @tag(1) is 'FOR'
       @seenFor = no
       @seenFrom = yes
       @token 'FROM', id
-      return 4
+      return true
     if id is 'to' and @seenFrom
       @seenFrom = no
       @token 'TO', id
-      return 2
+      return true
     forcedIdentifier = colon or @tagAccessor()
     tag = 'IDENTIFIER'
     if id in JS_KEYWORDS or
@@ -124,39 +126,41 @@
       tag = 'BOOL'
     @token tag, id
     @token ':', ':' if colon
-    input.length
+    true
 
   # Matches numbers, including decimals, hex, and exponential notation.
   # Be careful not to interfere with ranges-in-progress.
   numberToken: ->
-    return 0 unless match = NUMBER.exec @chunk
+    return false unless match = NUMBER.exec @chunk
     number = match[0]
-    return 0 if @tag() is '.' and number.charAt(0) is '.'
+    return false if @tag() is '.' and number.charAt(0) is '.'
+    @i += number.length
     @token 'NUMBER', number
-    number.length
+    true
 
   # Matches strings, including multi-line strings. Ensures that quotation marks
   # are balanced within the string's contents, and within nested interpolations.
   stringToken: ->
     switch @chunk.charAt 0
       when "'"
-        return 0 unless match = SIMPLESTR.exec @chunk
+        return false unless match = SIMPLESTR.exec @chunk
         @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
       when '"'
-        return 0 unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
+        return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
         if 0 < string.indexOf '#{', 1
           @interpolateString string.slice 1, -1
         else
          @token 'STRING', @escapeLines string
       else
-        return 0
+        return false
     @line += count string, '\n'
-    string.length
+    @i += string.length
+    true
 
   # Matches heredocs, adjusting indentation to the correct level, as heredocs
   # preserve whitespace, but ignore indentation to the left.
   heredocToken: ->
-    return 0 unless match = HEREDOC.exec @chunk
+    return false unless match = HEREDOC.exec @chunk
     heredoc = match[0]
     quote = heredoc.charAt 0
     doc = @sanitizeHeredoc match[2], {quote, indent: null}
@@ -165,44 +169,49 @@
     else
       @token 'STRING', @makeString doc, quote, yes
     @line += count heredoc, '\n'
-    heredoc.length
+    @i += heredoc.length
+    true
 
   # Matches and consumes comments.
   commentToken: ->
-    return 0 unless match = @chunk.match COMMENT
+    return false unless match = @chunk.match COMMENT
     [comment, here] = match
     @line += count comment, '\n'
+    @i += comment.length
     if here
       @token 'HERECOMMENT', @sanitizeHeredoc here,
         herecomment: true, indent: Array(@indent + 1).join(' ')
       @token 'TERMINATOR', '\n'
-    comment.length
+    true
 
   # Matches JavaScript interpolated directly into the source via backticks.
   jsToken: ->
-    return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
+    return false unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
     @token 'JS', (script = match[0]).slice 1, -1
-    script.length
+    @i += script.length
+    true
 
   # Matches regular expression literals. Lexing regular expressions is difficult
   # to distinguish from division, so we borrow some basic heuristics from
   # JavaScript and Ruby.
   regexToken: ->
-    return 0 if @chunk.charAt(0) isnt '/'
+    return false if @chunk.charAt(0) isnt '/'
     return @heregexToken match if match = HEREGEX.exec @chunk
-    return 0 if @tag() in NOT_REGEX
-    return 0 unless match = REGEX.exec @chunk
+    return false if @tag() in NOT_REGEX
+    return false unless match = REGEX.exec @chunk
     [regex] = match
     @token 'REGEX', if regex is '//' then '/(?:)/' else regex
-    regex.length
+    @i += regex.length
+    true
 
   # Matches experimental, multiline and extended regular expression literals.
   heregexToken: (match) ->
     [heregex, body, flags] = match
+    @i += heregex.length
     if 0 > body.indexOf '#{'
       re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
       @token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
-      return heregex.length
+      return true
     @token 'IDENTIFIER', 'RegExp'
     @tokens.push ['CALL_START', '(']
@@ -219,7 +228,7 @@
     @tokens.push tokens...
     @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
     @token ')', ')'
-    heregex.length
+    true
 
   # Matches newlines, indents, and outdents, and determines which is which.
   # If we can detect that the current line is continued onto the the next line,
@@ -232,21 +241,21 @@
   # Keeps track of the level of indentation, because a single outdent token
   # can close multiple indents, so we need to know how far in we happen to be.
   lineToken: ->
-    return 0 unless match = MULTI_DENT.exec @chunk
+    return false unless match = MULTI_DENT.exec @chunk
     indent = match[0]
     @line += count indent, '\n'
+    @i += indent.length
     prev = last @tokens, 1
     size = indent.length - 1 - indent.lastIndexOf '\n'
     nextCharacter = NEXT_CHARACTER.exec(@chunk)[1]
     noNewlines = (nextCharacter in ['.', ','] and not NEXT_ELLIPSIS.test(@chunk)) or @unfinished()
     if size - @indebt is @indent
-      if noNewlines then @suppressNewlines() else @newlineToken()
-      return indent.length
-    if size > @indent
+      return @suppressNewlines() if noNewlines
+      return @newlineToken indent
+    else if size > @indent
       if noNewlines
         @indebt = size - @indent
-        @suppressNewlines()
-        return indent.length
+        return @suppressNewlines()
       diff = size - @indent + @outdebt
       @token 'INDENT', diff
       @indents.push diff
@@ -255,7 +264,7 @@
       @indebt = 0
       @outdentToken @indent - size, noNewlines
     @indent = size
-    indent.length
+    true
 
   # Record an outdent token or multiple tokens, if we happen to be moving back
   # inwards past several recorded indents.
@@ -277,27 +286,27 @@
     @token 'OUTDENT', dent
     @outdebt -= moveOut if dent
     @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
-    this
+    true
 
   # Matches and consumes non-meaningful whitespace. Tag the previous token
   # as being "spaced", because there are some cases where it makes a difference.
   whitespaceToken: ->
-    return 0 unless (match = WHITESPACE.exec @chunk) or
-                    (nline = @chunk.charAt(0) is '\n')
+    return false unless (match = WHITESPACE.exec @chunk) or nline = @chunk.substring(0, 1) is '\n'
     prev = last @tokens
     prev[if match then 'spaced' else 'newLine'] = true if prev
-    if match then match[0].length else 0
+    @i += match[0].length if match
+    !!match
 
   # Generate a newline token. Consecutive newlines get merged together.
-  newlineToken: ->
+  newlineToken: (newlines) ->
     @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
-    this
+    true
 
   # Use a `\` at a line-ending to suppress the newline.
   # The slash is removed here once its job is done.
   suppressNewlines: ->
     @tokens.pop() if @value() is '\\'
-    this
+    true
 
   # We treat all other single characters as a token. Eg.: `( ) , . !`
   # Multi-character operators are also literal tokens, so that Jison can assign
@@ -310,21 +319,23 @@
       @tagParameters() if CODE.test value
     else
       value = @chunk.charAt 0
-    tag = value
+    @i += value.length
+    tag = value
     prev = last @tokens
     if value is '=' and prev
       @assignmentError() if not prev[1].reserved and prev[1] in JS_FORBIDDEN
       if prev[1] in ['||', '&&']
         prev[0] = 'COMPOUND_ASSIGN'
         prev[1] += '='
-        return 1
+        return true
     if value is ';' then tag = 'TERMINATOR'
+    else if value in LOGIC then tag = 'LOGIC'
     else if value in MATH then tag = 'MATH'
     else if value in COMPARE then tag = 'COMPARE'
     else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
     else if value in UNARY then tag = 'UNARY'
     else if value in SHIFT then tag = 'SHIFT'
-    else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC'
+    else if value is '?' and prev?.spaced then tag = 'LOGIC'
    else if prev and not prev.spaced
       if value is '(' and prev[0] in CALLABLE
         prev[0] = 'FUNC_EXIST' if prev[0] is '?'
@@ -335,7 +346,7 @@
           when '?' then prev[0] = 'INDEX_SOAK'
           when '::' then prev[0] = 'INDEX_PROTO'
     @token tag, value
-    value.length
+    true
 
   # Token Manipulators
   # ------------------
@@ -350,7 +361,7 @@
     else if prev[1] is '.' and @value(1) isnt '.'
       if @tag(1) is '?'
         @tag 0, 'SOAK_ACCESS'
-        @tokens.splice -2, 1
+        @tokens.splice(-2, 1)
       else
         @tag 0, 'PROPERTY_ACCESS'
     else
@@ -374,14 +385,14 @@
   # definitions versus argument lists in function calls. Walk backwards, tagging
   # parameters specially in order to make things easier for the parser.
   tagParameters: ->
-    return this if @tag() isnt ')'
+    return if @tag() isnt ')'
     i = @tokens.length
     while tok = @tokens[--i]
       switch tok[0]
         when 'IDENTIFIER' then tok[0] = 'PARAM'
         when ')' then tok[0] = 'PARAM_END'
         when '(', 'CALL_START' then tok[0] = 'PARAM_START'; return true
-    this
+    true
 
   # Close up all remaining open blocks at the end of the file.
   closeIndentation: ->
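
For reference, here is a minimal sketch of the tokenizing contract this patch moves to: every matcher advances `@i` past whatever it consumed and returns a boolean, so `tokenize` simply tries the matchers in order until one succeeds. The `MiniLexer` class, its regexes, and its token names below are illustrative only and are not part of the CoffeeScript lexer.

```coffee
# Hypothetical, stripped-down illustration of the "increment @i, return a
# boolean" contract; the real Lexer tracks far more state (lines, indentation,
# interpolation, and so on).
class MiniLexer
  NUMBER = /^\d+/
  WS     = /^[ \t]+/

  tokenize: (code) ->
    @i      = 0            # our "finger on the page" of source
    @tokens = []
    while chunk = code.slice @i
      # Each matcher moves @i itself and reports success as a boolean.
      @numberToken(chunk) or @whitespaceToken(chunk) or @literalToken(chunk)
    @tokens

  numberToken: (chunk) ->
    return false unless match = NUMBER.exec chunk
    @i += match[0].length
    @tokens.push ['NUMBER', match[0]]
    true

  whitespaceToken: (chunk) ->
    return false unless match = WS.exec chunk
    @i += match[0].length  # consumed, but no token emitted
    true

  literalToken: (chunk) ->
    @i += 1                # fallback: consume a single character
    @tokens.push ['LITERAL', chunk.charAt 0]
    true

# (new MiniLexer).tokenize '12 + 34'
# => [['NUMBER', '12'], ['LITERAL', '+'], ['NUMBER', '34']]
```

Because the fallback matcher always consumes at least one character, the dispatch loop is guaranteed to make progress even though it no longer sums returned lengths.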