From a9e95fa43bdcf256ebacd03d143c869a7262dd32 Mon Sep 17 00:00:00 2001 From: satyr Date: Fri, 22 Oct 2010 14:48:26 +0900 Subject: [PATCH] lexer: simplified tokenizers' responsibility --- lib/lexer.js | 111 ++++++++++++++++++---------------------- src/lexer.coffee | 129 ++++++++++++++++++++++------------------------- 2 files changed, 109 insertions(+), 131 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index ba366926..2d149c90 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -14,11 +14,10 @@ return Lexer; })(); Lexer.prototype.tokenize = function(code, options) { - var o; + var i, o; code = code.replace(/\r/g, '').replace(TRAILING_SPACES, ''); o = options || {}; this.code = code; - this.i = 0; this.line = o.line || 0; this.indent = 0; this.indebt = 0; @@ -26,8 +25,9 @@ this.indents = []; this.tokens = []; this.seenFor = this.seenFrom = false; - while (this.chunk = code.slice(this.i)) { - this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); + i = 0; + while (this.chunk = code.slice(i)) { + i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken(); } this.closeIndentation(); if (o.rewrite === false) { @@ -38,24 +38,23 @@ Lexer.prototype.identifierToken = function() { var _ref2, colon, forcedIdentifier, id, input, match, tag; if (!(match = IDENTIFIER.exec(this.chunk))) { - return false; + return 0; } input = match[0], id = match[1], colon = match[2]; - this.i += input.length; if (id === 'all' && this.tag() === 'FOR') { this.token('ALL', id); - return true; + return 3; } if (id === 'from' && this.tag(1) === 'FOR') { this.seenFor = false; this.seenFrom = true; this.token('FROM', id); - return true; + return 4; } if (id === 'to' && this.seenFrom) { this.seenFrom = false; this.token('TO', id); - return true; + return 2; } forcedIdentifier = colon || this.tagAccessor(); tag = 'IDENTIFIER'; @@ -106,33 +105,32 @@ if (colon) { this.token(':', ':'); } - return true; + return input.length; }; Lexer.prototype.numberToken = function() { var match, number; if (!(match = NUMBER.exec(this.chunk))) { - return false; + return 0; } number = match[0]; if (this.tag() === '.' 
&& number.charAt(0) === '.') { - return false; + return 0; } - this.i += number.length; this.token('NUMBER', number); - return true; + return number.length; }; Lexer.prototype.stringToken = function() { var match, string; switch (this.chunk.charAt(0)) { case "'": if (!(match = SIMPLESTR.exec(this.chunk))) { - return false; + return 0; } this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n')); break; case '"': if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) { - return false; + return 0; } if (0 < string.indexOf('#{', 1)) { this.interpolateString(string.slice(1, -1)); @@ -141,16 +139,15 @@ } break; default: - return false; + return 0; } this.line += count(string, '\n'); - this.i += string.length; - return true; + return string.length; }; Lexer.prototype.heredocToken = function() { var doc, heredoc, match, quote; if (!(match = HEREDOC.exec(this.chunk))) { - return false; + return 0; } heredoc = match[0]; quote = heredoc.charAt(0); @@ -166,17 +163,15 @@ this.token('STRING', this.makeString(doc, quote, true)); } this.line += count(heredoc, '\n'); - this.i += heredoc.length; - return true; + return heredoc.length; }; Lexer.prototype.commentToken = function() { var comment, here, match; if (!(match = this.chunk.match(COMMENT))) { - return false; + return 0; } comment = match[0], here = match[1]; this.line += count(comment, '\n'); - this.i += comment.length; if (here) { this.token('HERECOMMENT', this.sanitizeHeredoc(here, { herecomment: true, @@ -184,44 +179,41 @@ })); this.token('TERMINATOR', '\n'); } - return true; + return comment.length; }; Lexer.prototype.jsToken = function() { var match, script; if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) { - return false; + return 0; } this.token('JS', (script = match[0]).slice(1, -1)); - this.i += script.length; - return true; + return script.length; }; Lexer.prototype.regexToken = function() { var _ref2, match, regex; if (this.chunk.charAt(0) !== '/') { - return false; + return 0; } if (match = HEREGEX.exec(this.chunk)) { return this.heregexToken(match); } if ((_ref2 = this.tag(), __indexOf.call(NOT_REGEX, _ref2) >= 0)) { - return false; + return 0; } if (!(match = REGEX.exec(this.chunk))) { - return false; + return 0; } regex = match[0]; this.token('REGEX', regex === '//' ? '/(?:)/' : regex); - this.i += regex.length; - return true; + return regex.length; }; Lexer.prototype.heregexToken = function(match) { var _i, _len, _ref2, _ref3, _this, body, flags, heregex, re, tag, tokens, value; heregex = match[0], body = match[1], flags = match[2]; - this.i += heregex.length; if (0 > body.indexOf('#{')) { re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/'); this.token('REGEX', "/" + (re || '(?:)') + "/" + flags); - return true; + return heregex.length; } this.token('IDENTIFIER', 'RegExp'); this.tokens.push(['CALL_START', '(']); @@ -252,29 +244,32 @@ this.tokens.push([',', ','], ['STRING', '"' + flags + '"']); } this.token(')', ')'); - return true; + return heregex.length; }; Lexer.prototype.lineToken = function() { var diff, indent, match, nextCharacter, noNewlines, prev, size; if (!(match = MULTI_DENT.exec(this.chunk))) { - return false; + return 0; } indent = match[0]; this.line += count(indent, '\n'); - this.i += indent.length; prev = last(this.tokens, 1); size = indent.length - 1 - indent.lastIndexOf('\n'); nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1]; noNewlines = ((nextCharacter === '.' 
|| nextCharacter === ',') && !NEXT_ELLIPSIS.test(this.chunk)) || this.unfinished(); if (size - this.indebt === this.indent) { if (noNewlines) { - return this.suppressNewlines(); + this.suppressNewlines(); + } else { + this.newlineToken(); } - return this.newlineToken(indent); - } else if (size > this.indent) { + return indent.length; + } + if (size > this.indent) { if (noNewlines) { this.indebt = size - this.indent; - return this.suppressNewlines(); + this.suppressNewlines(); + return indent.length; } diff = size - this.indent + this.outdebt; this.token('INDENT', diff); @@ -285,7 +280,7 @@ this.outdentToken(this.indent - size, noNewlines); } this.indent = size; - return true; + return indent.length; }; Lexer.prototype.outdentToken = function(moveOut, noNewlines, close) { var dent, len; @@ -312,33 +307,30 @@ if (!(this.tag() === 'TERMINATOR' || noNewlines)) { this.token('TERMINATOR', '\n'); } - return true; + return this; }; Lexer.prototype.whitespaceToken = function() { var match, nline, prev; - if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.substring(0, 1) === '\n'))) { - return false; + if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.charAt(0) === '\n'))) { + return 0; } prev = last(this.tokens); if (prev) { prev[match ? 'spaced' : 'newLine'] = true; } - if (match) { - this.i += match[0].length; - } - return !!match; + return match ? match[0].length : 0; }; - Lexer.prototype.newlineToken = function(newlines) { + Lexer.prototype.newlineToken = function() { if (this.tag() !== 'TERMINATOR') { this.token('TERMINATOR', '\n'); } - return true; + return this; }; Lexer.prototype.suppressNewlines = function() { if (this.value() === '\\') { this.tokens.pop(); } - return true; + return this; }; Lexer.prototype.literalToken = function() { var _ref2, _ref3, _ref4, _ref5, match, prev, tag, value; @@ -350,7 +342,6 @@ } else { value = this.chunk.charAt(0); } - this.i += value.length; tag = value; prev = last(this.tokens); if (value === '=' && prev) { @@ -360,13 +351,11 @@ if ((_ref3 = prev[1]) === '||' || _ref3 === '&&') { prev[0] = 'COMPOUND_ASSIGN'; prev[1] += '='; - return true; + return 1; } } if (value === ';') { tag = 'TERMINATOR'; - } else if (__indexOf.call(LOGIC, value) >= 0) { - tag = 'LOGIC'; } else if (__indexOf.call(MATH, value) >= 0) { tag = 'MATH'; } else if (__indexOf.call(COMPARE, value) >= 0) { @@ -377,7 +366,7 @@ tag = 'UNARY'; } else if (__indexOf.call(SHIFT, value) >= 0) { tag = 'SHIFT'; - } else if (value === '?' && ((prev != null) ? prev.spaced : undefined)) { + } else if (__indexOf.call(LOGIC, value) >= 0 || value === '?' && ((prev != null) ? prev.spaced : undefined)) { tag = 'LOGIC'; } else if (prev && !prev.spaced) { if (value === '(' && (_ref4 = prev[0], __indexOf.call(CALLABLE, _ref4) >= 0)) { @@ -398,7 +387,7 @@ } } this.token(tag, value); - return true; + return value.length; }; Lexer.prototype.tagAccessor = function() { var prev; @@ -444,7 +433,7 @@ Lexer.prototype.tagParameters = function() { var i, tok; if (this.tag() !== ')') { - return; + return this; } i = this.tokens.length; while (tok = this.tokens[--i]) { @@ -461,7 +450,7 @@ return true; } } - return true; + return this; }; Lexer.prototype.closeIndentation = function() { return this.outdentToken(this.indent); diff --git a/src/lexer.coffee b/src/lexer.coffee index 7e03e1f2..dc487c3c 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -26,9 +26,8 @@ exports.Lexer = class Lexer # (for interpolations). 
When the next token has been recorded, we move forward # within the code past the token, and begin again. # - # Each tokenizing method is responsible for incrementing `@i` by the number of - # characters it has consumed. `@i` can be thought of as our finger on the page - # of source. + # Each tokenizing method is responsible for returning the number of characters + # it has consumed. # # Before returning the token stream, run it through the [Rewriter](rewriter.html) # unless explicitly asked not to. @@ -36,7 +35,6 @@ exports.Lexer = class Lexer code = code.replace(/\r/g, '').replace TRAILING_SPACES, '' o = options or {} @code = code # The remainder of the source code. - @i = 0 # Current character position we're parsing. @line = o.line or 0 # The current line. @indent = 0 # The current indentation level. @indebt = 0 # The over-indentation at the current level. @@ -48,17 +46,18 @@ exports.Lexer = class Lexer # At every position, run through this list of attempted matches, # short-circuiting if any of them succeed. Their order determines precedence: # `@literalToken` is the fallback catch-all. - while @chunk = code.slice @i - @identifierToken() or - @commentToken() or - @whitespaceToken() or - @lineToken() or - @heredocToken() or - @stringToken() or - @numberToken() or - @regexToken() or - @jsToken() or - @literalToken() + i = 0 + while @chunk = code.slice i + i += @identifierToken() or + @commentToken() or + @whitespaceToken() or + @lineToken() or + @heredocToken() or + @stringToken() or + @numberToken() or + @regexToken() or + @jsToken() or + @literalToken() @closeIndentation() return @tokens if o.rewrite is off (new Rewriter).rewrite @tokens @@ -73,21 +72,20 @@ exports.Lexer = class Lexer # referenced as property names here, so you can still do `jQuery.is()` even # though `is` means `===` otherwise. identifierToken: -> - return false unless match = IDENTIFIER.exec @chunk + return 0 unless match = IDENTIFIER.exec @chunk [input, id, colon] = match - @i += input.length if id is 'all' and @tag() is 'FOR' @token 'ALL', id - return true + return 3 if id is 'from' and @tag(1) is 'FOR' @seenFor = no @seenFrom = yes @token 'FROM', id - return true + return 4 if id is 'to' and @seenFrom @seenFrom = no @token 'TO', id - return true + return 2 forcedIdentifier = colon or @tagAccessor() tag = 'IDENTIFIER' if id in JS_KEYWORDS or @@ -126,41 +124,39 @@ exports.Lexer = class Lexer tag = 'BOOL' @token tag, id @token ':', ':' if colon - true + input.length # Matches numbers, including decimals, hex, and exponential notation. # Be careful not to interfere with ranges-in-progress. numberToken: -> - return false unless match = NUMBER.exec @chunk + return 0 unless match = NUMBER.exec @chunk number = match[0] - return false if @tag() is '.' and number.charAt(0) is '.' - @i += number.length + return 0 if @tag() is '.' and number.charAt(0) is '.' @token 'NUMBER', number - true + number.length # Matches strings, including multi-line strings. Ensures that quotation marks # are balanced within the string's contents, and within nested interpolations. 
stringToken: -> switch @chunk.charAt 0 when "'" - return false unless match = SIMPLESTR.exec @chunk + return 0 unless match = SIMPLESTR.exec @chunk @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n' when '"' - return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']] + return 0 unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']] if 0 < string.indexOf '#{', 1 @interpolateString string.slice 1, -1 else @token 'STRING', @escapeLines string else - return false + return 0 @line += count string, '\n' - @i += string.length - true + string.length # Matches heredocs, adjusting indentation to the correct level, as heredocs # preserve whitespace, but ignore indentation to the left. heredocToken: -> - return false unless match = HEREDOC.exec @chunk + return 0 unless match = HEREDOC.exec @chunk heredoc = match[0] quote = heredoc.charAt 0 doc = @sanitizeHeredoc match[2], {quote, indent: null} @@ -169,49 +165,44 @@ exports.Lexer = class Lexer else @token 'STRING', @makeString doc, quote, yes @line += count heredoc, '\n' - @i += heredoc.length - true + heredoc.length # Matches and consumes comments. commentToken: -> - return false unless match = @chunk.match COMMENT + return 0 unless match = @chunk.match COMMENT [comment, here] = match @line += count comment, '\n' - @i += comment.length if here @token 'HERECOMMENT', @sanitizeHeredoc here, herecomment: true, indent: Array(@indent + 1).join(' ') @token 'TERMINATOR', '\n' - true + comment.length # Matches JavaScript interpolated directly into the source via backticks. jsToken: -> - return false unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk + return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk @token 'JS', (script = match[0]).slice 1, -1 - @i += script.length - true + script.length # Matches regular expression literals. Lexing regular expressions is difficult # to distinguish from division, so we borrow some basic heuristics from # JavaScript and Ruby. regexToken: -> - return false if @chunk.charAt(0) isnt '/' + return 0 if @chunk.charAt(0) isnt '/' return @heregexToken match if match = HEREGEX.exec @chunk - return false if @tag() in NOT_REGEX - return false unless match = REGEX.exec @chunk + return 0 if @tag() in NOT_REGEX + return 0 unless match = REGEX.exec @chunk [regex] = match @token 'REGEX', if regex is '//' then '/(?:)/' else regex - @i += regex.length - true + regex.length # Matches experimental, multiline and extended regular expression literals. heregexToken: (match) -> [heregex, body, flags] = match - @i += heregex.length if 0 > body.indexOf '#{' re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/') @token 'REGEX', "/#{ re or '(?:)' }/#{flags}" - return true + return heregex.length @token 'IDENTIFIER', 'RegExp' @tokens.push ['CALL_START', '('] tokens = [] @@ -228,7 +219,7 @@ exports.Lexer = class Lexer @tokens.push tokens... @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags @token ')', ')' - true + heregex.length # Matches newlines, indents, and outdents, and determines which is which. # If we can detect that the current line is continued onto the the next line, @@ -241,21 +232,21 @@ exports.Lexer = class Lexer # Keeps track of the level of indentation, because a single outdent token # can close multiple indents, so we need to know how far in we happen to be. 
lineToken: -> - return false unless match = MULTI_DENT.exec @chunk + return 0 unless match = MULTI_DENT.exec @chunk indent = match[0] @line += count indent, '\n' - @i += indent.length prev = last @tokens, 1 size = indent.length - 1 - indent.lastIndexOf '\n' nextCharacter = NEXT_CHARACTER.exec(@chunk)[1] noNewlines = (nextCharacter in ['.', ','] and not NEXT_ELLIPSIS.test(@chunk)) or @unfinished() if size - @indebt is @indent - return @suppressNewlines() if noNewlines - return @newlineToken indent - else if size > @indent + if noNewlines then @suppressNewlines() else @newlineToken() + return indent.length + if size > @indent if noNewlines @indebt = size - @indent - return @suppressNewlines() + @suppressNewlines() + return indent.length diff = size - @indent + @outdebt @token 'INDENT', diff @indents.push diff @@ -264,7 +255,7 @@ exports.Lexer = class Lexer @indebt = 0 @outdentToken @indent - size, noNewlines @indent = size - true + indent.length # Record an outdent token or multiple tokens, if we happen to be moving back # inwards past several recorded indents. @@ -286,27 +277,27 @@ exports.Lexer = class Lexer @token 'OUTDENT', dent @outdebt -= moveOut if dent @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines - true + this # Matches and consumes non-meaningful whitespace. Tag the previous token # as being "spaced", because there are some cases where it makes a difference. whitespaceToken: -> - return false unless (match = WHITESPACE.exec @chunk) or nline = @chunk.substring(0, 1) is '\n' + return 0 unless (match = WHITESPACE.exec @chunk) or + (nline = @chunk.charAt(0) is '\n') prev = last @tokens prev[if match then 'spaced' else 'newLine'] = true if prev - @i += match[0].length if match - !!match + if match then match[0].length else 0 # Generate a newline token. Consecutive newlines get merged together. - newlineToken: (newlines) -> + newlineToken: -> @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' - true + this # Use a `\` at a line-ending to suppress the newline. # The slash is removed here once its job is done. suppressNewlines: -> @tokens.pop() if @value() is '\\' - true + this # We treat all other single characters as a token. Eg.: `( ) , . !` # Multi-character operators are also literal tokens, so that Jison can assign @@ -319,23 +310,21 @@ exports.Lexer = class Lexer @tagParameters() if CODE.test value else value = @chunk.charAt 0 - @i += value.length - tag = value + tag = value prev = last @tokens if value is '=' and prev @assignmentError() if not prev[1].reserved and prev[1] in JS_FORBIDDEN if prev[1] in ['||', '&&'] prev[0] = 'COMPOUND_ASSIGN' prev[1] += '=' - return true + return 1 if value is ';' then tag = 'TERMINATOR' - else if value in LOGIC then tag = 'LOGIC' else if value in MATH then tag = 'MATH' else if value in COMPARE then tag = 'COMPARE' else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN' else if value in UNARY then tag = 'UNARY' else if value in SHIFT then tag = 'SHIFT' - else if value is '?' and prev?.spaced then tag = 'LOGIC' + else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC' else if prev and not prev.spaced if value is '(' and prev[0] in CALLABLE prev[0] = 'FUNC_EXIST' if prev[0] is '?' @@ -346,7 +335,7 @@ exports.Lexer = class Lexer when '?' then prev[0] = 'INDEX_SOAK' when '::' then prev[0] = 'INDEX_PROTO' @token tag, value - true + value.length # Token Manipulators # ------------------ @@ -361,7 +350,7 @@ exports.Lexer = class Lexer else if prev[1] is '.' and @value(1) isnt '.' if @tag(1) is '?' 
@tag 0, 'SOAK_ACCESS' - @tokens.splice(-2, 1) + @tokens.splice -2, 1 else @tag 0, 'PROPERTY_ACCESS' else @@ -385,14 +374,14 @@ exports.Lexer = class Lexer # definitions versus argument lists in function calls. Walk backwards, tagging # parameters specially in order to make things easier for the parser. tagParameters: -> - return if @tag() isnt ')' + return this if @tag() isnt ')' i = @tokens.length while tok = @tokens[--i] switch tok[0] when 'IDENTIFIER' then tok[0] = 'PARAM' when ')' then tok[0] = 'PARAM_END' when '(', 'CALL_START' then tok[0] = 'PARAM_START'; return true - true + this # Close up all remaining open blocks at the end of the file. closeIndentation: ->
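
For reference, a minimal CoffeeScript sketch of the contract this commit adopts: tokenizers no longer advance a shared `@i` cursor; each one returns the number of characters it consumed (0 when it does not match), and the driver loop adds that to its own local index. The class and matcher names below are illustrative only, not part of this patch or of the real lexer.

    # A toy lexer using the same dispatch pattern: matchers never touch the
    # cursor; they only report how much input they consumed.
    class TinyLexer
      tokenize: (code) ->
        @tokens = []
        i = 0
        while @chunk = code.slice i
          # Each matcher returns 0 when it does not apply, so `or` falls
          # through to the next one; the winner's length advances the cursor.
          i += @numberToken() or @wordToken() or @skipToken()
        @tokens

      numberToken: ->
        return 0 unless match = /^\d+/.exec @chunk
        @tokens.push ['NUMBER', match[0]]
        match[0].length

      wordToken: ->
        return 0 unless match = /^[A-Za-z]+/.exec @chunk
        @tokens.push ['WORD', match[0]]
        match[0].length

      # Fallback: consume one unrecognized character so the loop always advances.
      skipToken: -> 1

    # (new TinyLexer).tokenize 'foo 42' yields [['WORD', 'foo'], ['NUMBER', '42']]

Because every matcher either consumes at least one character or yields to the next one, the loop in `tokenize` always terminates; the real lexer gets the same guarantee from `literalToken` acting as the catch-all at the end of its dispatch chain.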