From a9e95fa43bdcf256ebacd03d143c869a7262dd32 Mon Sep 17 00:00:00 2001 From: satyr Date: Fri, 22 Oct 2010 14:48:26 +0900 Subject: [PATCH] lexer: simplified tokenizers' responsibility --- lib/lexer.js | 111 ++++++++++++++++++---------------------- src/lexer.coffee | 129 ++++++++++++++++++++++------------------------- 2 files changed, 109 insertions(+), 131 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index ba366926..2d149c90 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -14,11 +14,10 @@ return Lexer; })(); Lexer.prototype.tokenize = function(code, options) { - var o; + var i, o; code = code.replace(/\r/g, '').replace(TRAILING_SPACES, ''); o = options || {}; this.code = code; - this.i = 0; this.line = o.line || 0; this.indent = 0; this.indebt = 0; @@ -26,8 +25,9 @@ this.indents = []; this.tokens = []; this.seenFor = this.seenFrom = false; - while (this.chunk = code.slice(this.i)) { - this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); + i = 0; + while (this.chunk = code.slice(i)) { + i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); } this.closeIndentation(); if (o.rewrite === false) { @@ -38,24 +38,23 @@ Lexer.prototype.identifierToken = function() { var _ref2, colon, forcedIdentifier, id, input, match, tag; if (!(match = IDENTIFIER.exec(this.chunk))) { - return false; + return 0; } input = match[0], id = match[1], colon = match[2]; - this.i += input.length; if (id === 'all' && this.tag() === 'FOR') { this.token('ALL', id); - return true; + return 3; } if (id === 'from' && this.tag(1) === 'FOR') { this.seenFor = false; this.seenFrom = true; this.token('FROM', id); - return true; + return 4; } if (id === 'to' && this.seenFrom) { this.seenFrom = false; this.token('TO', id); - return true; + return 2; } forcedIdentifier = colon || this.tagAccessor(); tag = 'IDENTIFIER'; @@ -106,33 +105,32 @@ if (colon) { this.token(':', ':'); } - return true; + return input.length; }; Lexer.prototype.numberToken = function() { var match, number; if (!(match = NUMBER.exec(this.chunk))) { - return false; + return 0; } number = match[0]; if (this.tag() === '.' 
&& number.charAt(0) === '.') { - return false; + return 0; } - this.i += number.length; this.token('NUMBER', number); - return true; + return number.length; }; Lexer.prototype.stringToken = function() { var match, string; switch (this.chunk.charAt(0)) { case "'": if (!(match = SIMPLESTR.exec(this.chunk))) { - return false; + return 0; } this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n')); break; case '"': if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) { - return false; + return 0; } if (0 < string.indexOf('#{', 1)) { this.interpolateString(string.slice(1, -1)); @@ -141,16 +139,15 @@ } break; default: - return false; + return 0; } this.line += count(string, '\n'); - this.i += string.length; - return true; + return string.length; }; Lexer.prototype.heredocToken = function() { var doc, heredoc, match, quote; if (!(match = HEREDOC.exec(this.chunk))) { - return false; + return 0; } heredoc = match[0]; quote = heredoc.charAt(0); @@ -166,17 +163,15 @@ this.token('STRING', this.makeString(doc, quote, true)); } this.line += count(heredoc, '\n'); - this.i += heredoc.length; - return true; + return heredoc.length; }; Lexer.prototype.commentToken = function() { var comment, here, match; if (!(match = this.chunk.match(COMMENT))) { - return false; + return 0; } comment = match[0], here = match[1]; this.line += count(comment, '\n'); - this.i += comment.length; if (here) { this.token('HERECOMMENT', this.sanitizeHeredoc(here, { herecomment: true, @@ -184,44 +179,41 @@ })); this.token('TERMINATOR', '\n'); } - return true; + return comment.length; }; Lexer.prototype.jsToken = function() { var match, script; if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) { - return false; + return 0; } this.token('JS', (script = match[0]).slice(1, -1)); - this.i += script.length; - return true; + return script.length; }; Lexer.prototype.regexToken = function() { var _ref2, match, regex; if (this.chunk.charAt(0) !== '/') { - return false; + return 0; } if (match = HEREGEX.exec(this.chunk)) { return this.heregexToken(match); } if ((_ref2 = this.tag(), __indexOf.call(NOT_REGEX, _ref2) >= 0)) { - return false; + return 0; } if (!(match = REGEX.exec(this.chunk))) { - return false; + return 0; } regex = match[0]; this.token('REGEX', regex === '//' ? '/(?:)/' : regex); - this.i += regex.length; - return true; + return regex.length; }; Lexer.prototype.heregexToken = function(match) { var _i, _len, _ref2, _ref3, _this, body, flags, heregex, re, tag, tokens, value; heregex = match[0], body = match[1], flags = match[2]; - this.i += heregex.length; if (0 > body.indexOf('#{')) { re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/'); this.token('REGEX', "/" + (re || '(?:)') + "/" + flags); - return true; + return heregex.length; } this.token('IDENTIFIER', 'RegExp'); this.tokens.push(['CALL_START', '(']); @@ -252,29 +244,32 @@ this.tokens.push([',', ','], ['STRING', '"' + flags + '"']); } this.token(')', ')'); - return true; + return heregex.length; }; Lexer.prototype.lineToken = function() { var diff, indent, match, nextCharacter, noNewlines, prev, size; if (!(match = MULTI_DENT.exec(this.chunk))) { - return false; + return 0; } indent = match[0]; this.line += count(indent, '\n'); - this.i += indent.length; prev = last(this.tokens, 1); size = indent.length - 1 - indent.lastIndexOf('\n'); nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1]; noNewlines = ((nextCharacter === '.' 
|| nextCharacter === ',') && !NEXT_ELLIPSIS.test(this.chunk)) || this.unfinished(); if (size - this.indebt === this.indent) { if (noNewlines) { - return this.suppressNewlines(); + this.suppressNewlines(); + } else { + this.newlineToken(); } - return this.newlineToken(indent); - } else if (size > this.indent) { + return indent.length; + } + if (size > this.indent) { if (noNewlines) { this.indebt = size - this.indent; - return this.suppressNewlines(); + this.suppressNewlines(); + return indent.length; } diff = size - this.indent + this.outdebt; this.token('INDENT', diff); @@ -285,7 +280,7 @@ this.outdentToken(this.indent - size, noNewlines); } this.indent = size; - return true; + return indent.length; }; Lexer.prototype.outdentToken = function(moveOut, noNewlines, close) { var dent, len; @@ -312,33 +307,30 @@ if (!(this.tag() === 'TERMINATOR' || noNewlines)) { this.token('TERMINATOR', '\n'); } - return true; + return this; }; Lexer.prototype.whitespaceToken = function() { var match, nline, prev; - if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.substring(0, 1) === '\n'))) { - return false; + if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.charAt(0) === '\n'))) { + return 0; } prev = last(this.tokens); if (prev) { prev[match ? 'spaced' : 'newLine'] = true; } - if (match) { - this.i += match[0].length; - } - return !!match; + return match ? match[0].length : 0; }; - Lexer.prototype.newlineToken = function(newlines) { + Lexer.prototype.newlineToken = function() { if (this.tag() !== 'TERMINATOR') { this.token('TERMINATOR', '\n'); } - return true; + return this; }; Lexer.prototype.suppressNewlines = function() { if (this.value() === '\\') { this.tokens.pop(); } - return true; + return this; }; Lexer.prototype.literalToken = function() { var _ref2, _ref3, _ref4, _ref5, match, prev, tag, value; @@ -350,7 +342,6 @@ } else { value = this.chunk.charAt(0); } - this.i += value.length; tag = value; prev = last(this.tokens); if (value === '=' && prev) { @@ -360,13 +351,11 @@ if ((_ref3 = prev[1]) === '||' || _ref3 === '&&') { prev[0] = 'COMPOUND_ASSIGN'; prev[1] += '='; - return true; + return 1; } } if (value === ';') { tag = 'TERMINATOR'; - } else if (__indexOf.call(LOGIC, value) >= 0) { - tag = 'LOGIC'; } else if (__indexOf.call(MATH, value) >= 0) { tag = 'MATH'; } else if (__indexOf.call(COMPARE, value) >= 0) { @@ -377,7 +366,7 @@ tag = 'UNARY'; } else if (__indexOf.call(SHIFT, value) >= 0) { tag = 'SHIFT'; - } else if (value === '?' && ((prev != null) ? prev.spaced : undefined)) { + } else if (__indexOf.call(LOGIC, value) >= 0 || value === '?' && ((prev != null) ? prev.spaced : undefined)) { tag = 'LOGIC'; } else if (prev && !prev.spaced) { if (value === '(' && (_ref4 = prev[0], __indexOf.call(CALLABLE, _ref4) >= 0)) { @@ -398,7 +387,7 @@ } } this.token(tag, value); - return true; + return value.length; }; Lexer.prototype.tagAccessor = function() { var prev; @@ -444,7 +433,7 @@ Lexer.prototype.tagParameters = function() { var i, tok; if (this.tag() !== ')') { - return; + return this; } i = this.tokens.length; while (tok = this.tokens[--i]) { @@ -461,7 +450,7 @@ return true; } } - return true; + return this; }; Lexer.prototype.closeIndentation = function() { return this.outdentToken(this.indent); diff --git a/src/lexer.coffee b/src/lexer.coffee index 7e03e1f2..dc487c3c 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -26,9 +26,8 @@ exports.Lexer = class Lexer # (for interpolations). 
When the next token has been recorded, we move forward # within the code past the token, and begin again. # - # Each tokenizing method is responsible for incrementing `@i` by the number of - # characters it has consumed. `@i` can be thought of as our finger on the page - # of source. + # Each tokenizing method is responsible for returning the number of characters + # it has consumed. # # Before returning the token stream, run it through the [Rewriter](rewriter.html) # unless explicitly asked not to. @@ -36,7 +35,6 @@ exports.Lexer = class Lexer code = code.replace(/\r/g, '').replace TRAILING_SPACES, '' o = options or {} @code = code # The remainder of the source code. - @i = 0 # Current character position we're parsing. @line = o.line or 0 # The current line. @indent = 0 # The current indentation level. @indebt = 0 # The over-indentation at the current level. @@ -48,17 +46,18 @@ exports.Lexer = class Lexer # At every position, run through this list of attempted matches, # short-circuiting if any of them succeed. Their order determines precedence: # `@literalToken` is the fallback catch-all. - while @chunk = code.slice @i - @identifierToken() or - @commentToken() or - @whitespaceToken() or - @lineToken() or - @heredocToken() or - @stringToken() or - @numberToken() or - @regexToken() or - @jsToken() or - @literalToken() + i = 0 + while @chunk = code.slice i + i += @identifierToken() or + @commentToken() or + @whitespaceToken() or + @lineToken() or + @heredocToken() or + @stringToken() or + @numberToken() or + @regexToken() or + @jsToken() or + @literalToken() @closeIndentation() return @tokens if o.rewrite is off (new Rewriter).rewrite @tokens @@ -73,21 +72,20 @@ exports.Lexer = class Lexer # referenced as property names here, so you can still do `jQuery.is()` even # though `is` means `===` otherwise. identifierToken: -> - return false unless match = IDENTIFIER.exec @chunk + return 0 unless match = IDENTIFIER.exec @chunk [input, id, colon] = match - @i += input.length if id is 'all' and @tag() is 'FOR' @token 'ALL', id - return true + return 3 if id is 'from' and @tag(1) is 'FOR' @seenFor = no @seenFrom = yes @token 'FROM', id - return true + return 4 if id is 'to' and @seenFrom @seenFrom = no @token 'TO', id - return true + return 2 forcedIdentifier = colon or @tagAccessor() tag = 'IDENTIFIER' if id in JS_KEYWORDS or @@ -126,41 +124,39 @@ exports.Lexer = class Lexer tag = 'BOOL' @token tag, id @token ':', ':' if colon - true + input.length # Matches numbers, including decimals, hex, and exponential notation. # Be careful not to interfere with ranges-in-progress. numberToken: -> - return false unless match = NUMBER.exec @chunk + return 0 unless match = NUMBER.exec @chunk number = match[0] - return false if @tag() is '.' and number.charAt(0) is '.' - @i += number.length + return 0 if @tag() is '.' and number.charAt(0) is '.' @token 'NUMBER', number - true + number.length # Matches strings, including multi-line strings. Ensures that quotation marks # are balanced within the string's contents, and within nested interpolations. 
stringToken: -> switch @chunk.charAt 0 when "'" - return false unless match = SIMPLESTR.exec @chunk + return 0 unless match = SIMPLESTR.exec @chunk @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n' when '"' - return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']] + return 0 unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']] if 0 < string.indexOf '#{', 1 @interpolateString string.slice 1, -1 else @token 'STRING', @escapeLines string else - return false + return 0 @line += count string, '\n' - @i += string.length - true + string.length # Matches heredocs, adjusting indentation to the correct level, as heredocs # preserve whitespace, but ignore indentation to the left. heredocToken: -> - return false unless match = HEREDOC.exec @chunk + return 0 unless match = HEREDOC.exec @chunk heredoc = match[0] quote = heredoc.charAt 0 doc = @sanitizeHeredoc match[2], {quote, indent: null} @@ -169,49 +165,44 @@ exports.Lexer = class Lexer else @token 'STRING', @makeString doc, quote, yes @line += count heredoc, '\n' - @i += heredoc.length - true + heredoc.length # Matches and consumes comments. commentToken: -> - return false unless match = @chunk.match COMMENT + return 0 unless match = @chunk.match COMMENT [comment, here] = match @line += count comment, '\n' - @i += comment.length if here @token 'HERECOMMENT', @sanitizeHeredoc here, herecomment: true, indent: Array(@indent + 1).join(' ') @token 'TERMINATOR', '\n' - true + comment.length # Matches JavaScript interpolated directly into the source via backticks. jsToken: -> - return false unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk + return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk @token 'JS', (script = match[0]).slice 1, -1 - @i += script.length - true + script.length # Matches regular expression literals. Lexing regular expressions is difficult # to distinguish from division, so we borrow some basic heuristics from # JavaScript and Ruby. regexToken: -> - return false if @chunk.charAt(0) isnt '/' + return 0 if @chunk.charAt(0) isnt '/' return @heregexToken match if match = HEREGEX.exec @chunk - return false if @tag() in NOT_REGEX - return false unless match = REGEX.exec @chunk + return 0 if @tag() in NOT_REGEX + return 0 unless match = REGEX.exec @chunk [regex] = match @token 'REGEX', if regex is '//' then '/(?:)/' else regex - @i += regex.length - true + regex.length # Matches experimental, multiline and extended regular expression literals. heregexToken: (match) -> [heregex, body, flags] = match - @i += heregex.length if 0 > body.indexOf '#{' re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/') @token 'REGEX', "/#{ re or '(?:)' }/#{flags}" - return true + return heregex.length @token 'IDENTIFIER', 'RegExp' @tokens.push ['CALL_START', '('] tokens = [] @@ -228,7 +219,7 @@ exports.Lexer = class Lexer @tokens.push tokens... @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags @token ')', ')' - true + heregex.length # Matches newlines, indents, and outdents, and determines which is which. # If we can detect that the current line is continued onto the the next line, @@ -241,21 +232,21 @@ exports.Lexer = class Lexer # Keeps track of the level of indentation, because a single outdent token # can close multiple indents, so we need to know how far in we happen to be. 
lineToken: -> - return false unless match = MULTI_DENT.exec @chunk + return 0 unless match = MULTI_DENT.exec @chunk indent = match[0] @line += count indent, '\n' - @i += indent.length prev = last @tokens, 1 size = indent.length - 1 - indent.lastIndexOf '\n' nextCharacter = NEXT_CHARACTER.exec(@chunk)[1] noNewlines = (nextCharacter in ['.', ','] and not NEXT_ELLIPSIS.test(@chunk)) or @unfinished() if size - @indebt is @indent - return @suppressNewlines() if noNewlines - return @newlineToken indent - else if size > @indent + if noNewlines then @suppressNewlines() else @newlineToken() + return indent.length + if size > @indent if noNewlines @indebt = size - @indent - return @suppressNewlines() + @suppressNewlines() + return indent.length diff = size - @indent + @outdebt @token 'INDENT', diff @indents.push diff @@ -264,7 +255,7 @@ exports.Lexer = class Lexer @indebt = 0 @outdentToken @indent - size, noNewlines @indent = size - true + indent.length # Record an outdent token or multiple tokens, if we happen to be moving back # inwards past several recorded indents. @@ -286,27 +277,27 @@ exports.Lexer = class Lexer @token 'OUTDENT', dent @outdebt -= moveOut if dent @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines - true + this # Matches and consumes non-meaningful whitespace. Tag the previous token # as being "spaced", because there are some cases where it makes a difference. whitespaceToken: -> - return false unless (match = WHITESPACE.exec @chunk) or nline = @chunk.substring(0, 1) is '\n' + return 0 unless (match = WHITESPACE.exec @chunk) or + (nline = @chunk.charAt(0) is '\n') prev = last @tokens prev[if match then 'spaced' else 'newLine'] = true if prev - @i += match[0].length if match - !!match + if match then match[0].length else 0 # Generate a newline token. Consecutive newlines get merged together. - newlineToken: (newlines) -> + newlineToken: -> @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' - true + this # Use a `\` at a line-ending to suppress the newline. # The slash is removed here once its job is done. suppressNewlines: -> @tokens.pop() if @value() is '\\' - true + this # We treat all other single characters as a token. Eg.: `( ) , . !` # Multi-character operators are also literal tokens, so that Jison can assign @@ -319,23 +310,21 @@ exports.Lexer = class Lexer @tagParameters() if CODE.test value else value = @chunk.charAt 0 - @i += value.length - tag = value + tag = value prev = last @tokens if value is '=' and prev @assignmentError() if not prev[1].reserved and prev[1] in JS_FORBIDDEN if prev[1] in ['||', '&&'] prev[0] = 'COMPOUND_ASSIGN' prev[1] += '=' - return true + return 1 if value is ';' then tag = 'TERMINATOR' - else if value in LOGIC then tag = 'LOGIC' else if value in MATH then tag = 'MATH' else if value in COMPARE then tag = 'COMPARE' else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN' else if value in UNARY then tag = 'UNARY' else if value in SHIFT then tag = 'SHIFT' - else if value is '?' and prev?.spaced then tag = 'LOGIC' + else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC' else if prev and not prev.spaced if value is '(' and prev[0] in CALLABLE prev[0] = 'FUNC_EXIST' if prev[0] is '?' @@ -346,7 +335,7 @@ exports.Lexer = class Lexer when '?' then prev[0] = 'INDEX_SOAK' when '::' then prev[0] = 'INDEX_PROTO' @token tag, value - true + value.length # Token Manipulators # ------------------ @@ -361,7 +350,7 @@ exports.Lexer = class Lexer else if prev[1] is '.' and @value(1) isnt '.' if @tag(1) is '?' 
@tag 0, 'SOAK_ACCESS' - @tokens.splice(-2, 1) + @tokens.splice -2, 1 else @tag 0, 'PROPERTY_ACCESS' else @@ -385,14 +374,14 @@ exports.Lexer = class Lexer # definitions versus argument lists in function calls. Walk backwards, tagging # parameters specially in order to make things easier for the parser. tagParameters: -> - return if @tag() isnt ')' + return this if @tag() isnt ')' i = @tokens.length while tok = @tokens[--i] switch tok[0] when 'IDENTIFIER' then tok[0] = 'PARAM' when ')' then tok[0] = 'PARAM_END' when '(', 'CALL_START' then tok[0] = 'PARAM_START'; return true - true + this # Close up all remaining open blocks at the end of the file. closeIndentation: ->
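
For reference, a minimal CoffeeScript sketch of the contract this commit adopts: tokenizers no longer advance a shared `@i` cursor; each one returns the number of characters it consumed (0 when it does not match), and the driver loop adds that to its own local index. The class and matcher names below are illustrative only, not part of this patch or of the real lexer.

    # A toy lexer using the same dispatch pattern: matchers never touch the
    # cursor; they only report how much input they consumed.
    class TinyLexer
      tokenize: (code) ->
        @tokens = []
        i = 0
        while @chunk = code.slice i
          # Each matcher returns 0 when it does not apply, so `or` falls
          # through to the next one; the winner's length advances the cursor.
          i += @numberToken() or @wordToken() or @skipToken()
        @tokens

      numberToken: ->
        return 0 unless match = /^\d+/.exec @chunk
        @tokens.push ['NUMBER', match[0]]
        match[0].length

      wordToken: ->
        return 0 unless match = /^[A-Za-z]+/.exec @chunk
        @tokens.push ['WORD', match[0]]
        match[0].length

      # Fallback: consume one unrecognized character so the loop always advances.
      skipToken: -> 1

    # (new TinyLexer).tokenize 'foo 42' yields [['WORD', 'foo'], ['NUMBER', '42']]

Because every matcher either consumes at least one character or yields to the next one, the loop in `tokenize` always terminates; the real lexer gets the same guarantee from `literalToken` acting as the catch-all at the end of its dispatch chain.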