Revert "lexer: simplified tokenizers' responsibility"

This reverts commit a9e95fa43b.
This commit is contained in:
Jeremy Ashkenas
2010-10-22 08:13:40 -04:00
parent a9e95fa43b
commit 10442239f1
2 changed files with 131 additions and 109 deletions

View File

@@ -14,10 +14,11 @@
return Lexer;
})();
Lexer.prototype.tokenize = function(code, options) {
var i, o;
var o;
code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
o = options || {};
this.code = code;
this.i = 0;
this.line = o.line || 0;
this.indent = 0;
this.indebt = 0;
@@ -25,9 +26,8 @@
this.indents = [];
this.tokens = [];
this.seenFor = this.seenFrom = false;
i = 0;
while (this.chunk = code.slice(i)) {
i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
while (this.chunk = code.slice(this.i)) {
this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
}
this.closeIndentation();
if (o.rewrite === false) {
@@ -38,23 +38,24 @@
Lexer.prototype.identifierToken = function() {
var _ref2, colon, forcedIdentifier, id, input, match, tag;
if (!(match = IDENTIFIER.exec(this.chunk))) {
return 0;
return false;
}
input = match[0], id = match[1], colon = match[2];
this.i += input.length;
if (id === 'all' && this.tag() === 'FOR') {
this.token('ALL', id);
return 3;
return true;
}
if (id === 'from' && this.tag(1) === 'FOR') {
this.seenFor = false;
this.seenFrom = true;
this.token('FROM', id);
return 4;
return true;
}
if (id === 'to' && this.seenFrom) {
this.seenFrom = false;
this.token('TO', id);
return 2;
return true;
}
forcedIdentifier = colon || this.tagAccessor();
tag = 'IDENTIFIER';
@@ -105,32 +106,33 @@
if (colon) {
this.token(':', ':');
}
return input.length;
return true;
};
Lexer.prototype.numberToken = function() {
var match, number;
if (!(match = NUMBER.exec(this.chunk))) {
return 0;
return false;
}
number = match[0];
if (this.tag() === '.' && number.charAt(0) === '.') {
return 0;
return false;
}
this.i += number.length;
this.token('NUMBER', number);
return number.length;
return true;
};
Lexer.prototype.stringToken = function() {
var match, string;
switch (this.chunk.charAt(0)) {
case "'":
if (!(match = SIMPLESTR.exec(this.chunk))) {
return 0;
return false;
}
this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
break;
case '"':
if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) {
return 0;
return false;
}
if (0 < string.indexOf('#{', 1)) {
this.interpolateString(string.slice(1, -1));
@@ -139,15 +141,16 @@
}
break;
default:
return 0;
return false;
}
this.line += count(string, '\n');
return string.length;
this.i += string.length;
return true;
};
Lexer.prototype.heredocToken = function() {
var doc, heredoc, match, quote;
if (!(match = HEREDOC.exec(this.chunk))) {
return 0;
return false;
}
heredoc = match[0];
quote = heredoc.charAt(0);
@@ -163,15 +166,17 @@
this.token('STRING', this.makeString(doc, quote, true));
}
this.line += count(heredoc, '\n');
return heredoc.length;
this.i += heredoc.length;
return true;
};
Lexer.prototype.commentToken = function() {
var comment, here, match;
if (!(match = this.chunk.match(COMMENT))) {
return 0;
return false;
}
comment = match[0], here = match[1];
this.line += count(comment, '\n');
this.i += comment.length;
if (here) {
this.token('HERECOMMENT', this.sanitizeHeredoc(here, {
herecomment: true,
@@ -179,41 +184,44 @@
}));
this.token('TERMINATOR', '\n');
}
return comment.length;
return true;
};
Lexer.prototype.jsToken = function() {
var match, script;
if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) {
return 0;
return false;
}
this.token('JS', (script = match[0]).slice(1, -1));
return script.length;
this.i += script.length;
return true;
};
Lexer.prototype.regexToken = function() {
var _ref2, match, regex;
if (this.chunk.charAt(0) !== '/') {
return 0;
return false;
}
if (match = HEREGEX.exec(this.chunk)) {
return this.heregexToken(match);
}
if ((_ref2 = this.tag(), __indexOf.call(NOT_REGEX, _ref2) >= 0)) {
return 0;
return false;
}
if (!(match = REGEX.exec(this.chunk))) {
return 0;
return false;
}
regex = match[0];
this.token('REGEX', regex === '//' ? '/(?:)/' : regex);
return regex.length;
this.i += regex.length;
return true;
};
Lexer.prototype.heregexToken = function(match) {
var _i, _len, _ref2, _ref3, _this, body, flags, heregex, re, tag, tokens, value;
heregex = match[0], body = match[1], flags = match[2];
this.i += heregex.length;
if (0 > body.indexOf('#{')) {
re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/');
this.token('REGEX', "/" + (re || '(?:)') + "/" + flags);
return heregex.length;
return true;
}
this.token('IDENTIFIER', 'RegExp');
this.tokens.push(['CALL_START', '(']);
@@ -244,32 +252,29 @@
this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
}
this.token(')', ')');
return heregex.length;
return true;
};
Lexer.prototype.lineToken = function() {
var diff, indent, match, nextCharacter, noNewlines, prev, size;
if (!(match = MULTI_DENT.exec(this.chunk))) {
return 0;
return false;
}
indent = match[0];
this.line += count(indent, '\n');
this.i += indent.length;
prev = last(this.tokens, 1);
size = indent.length - 1 - indent.lastIndexOf('\n');
nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1];
noNewlines = ((nextCharacter === '.' || nextCharacter === ',') && !NEXT_ELLIPSIS.test(this.chunk)) || this.unfinished();
if (size - this.indebt === this.indent) {
if (noNewlines) {
this.suppressNewlines();
} else {
this.newlineToken();
return this.suppressNewlines();
}
return indent.length;
}
if (size > this.indent) {
return this.newlineToken(indent);
} else if (size > this.indent) {
if (noNewlines) {
this.indebt = size - this.indent;
this.suppressNewlines();
return indent.length;
return this.suppressNewlines();
}
diff = size - this.indent + this.outdebt;
this.token('INDENT', diff);
@@ -280,7 +285,7 @@
this.outdentToken(this.indent - size, noNewlines);
}
this.indent = size;
return indent.length;
return true;
};
Lexer.prototype.outdentToken = function(moveOut, noNewlines, close) {
var dent, len;
@@ -307,30 +312,33 @@
if (!(this.tag() === 'TERMINATOR' || noNewlines)) {
this.token('TERMINATOR', '\n');
}
return this;
return true;
};
Lexer.prototype.whitespaceToken = function() {
var match, nline, prev;
if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.charAt(0) === '\n'))) {
return 0;
if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.substring(0, 1) === '\n'))) {
return false;
}
prev = last(this.tokens);
if (prev) {
prev[match ? 'spaced' : 'newLine'] = true;
}
return match ? match[0].length : 0;
if (match) {
this.i += match[0].length;
}
return !!match;
};
Lexer.prototype.newlineToken = function() {
Lexer.prototype.newlineToken = function(newlines) {
if (this.tag() !== 'TERMINATOR') {
this.token('TERMINATOR', '\n');
}
return this;
return true;
};
Lexer.prototype.suppressNewlines = function() {
if (this.value() === '\\') {
this.tokens.pop();
}
return this;
return true;
};
Lexer.prototype.literalToken = function() {
var _ref2, _ref3, _ref4, _ref5, match, prev, tag, value;
@@ -342,6 +350,7 @@
} else {
value = this.chunk.charAt(0);
}
this.i += value.length;
tag = value;
prev = last(this.tokens);
if (value === '=' && prev) {
@@ -351,11 +360,13 @@
if ((_ref3 = prev[1]) === '||' || _ref3 === '&&') {
prev[0] = 'COMPOUND_ASSIGN';
prev[1] += '=';
return 1;
return true;
}
}
if (value === ';') {
tag = 'TERMINATOR';
} else if (__indexOf.call(LOGIC, value) >= 0) {
tag = 'LOGIC';
} else if (__indexOf.call(MATH, value) >= 0) {
tag = 'MATH';
} else if (__indexOf.call(COMPARE, value) >= 0) {
@@ -366,7 +377,7 @@
tag = 'UNARY';
} else if (__indexOf.call(SHIFT, value) >= 0) {
tag = 'SHIFT';
} else if (__indexOf.call(LOGIC, value) >= 0 || value === '?' && ((prev != null) ? prev.spaced : undefined)) {
} else if (value === '?' && ((prev != null) ? prev.spaced : undefined)) {
tag = 'LOGIC';
} else if (prev && !prev.spaced) {
if (value === '(' && (_ref4 = prev[0], __indexOf.call(CALLABLE, _ref4) >= 0)) {
@@ -387,7 +398,7 @@
}
}
this.token(tag, value);
return value.length;
return true;
};
Lexer.prototype.tagAccessor = function() {
var prev;
@@ -433,7 +444,7 @@
Lexer.prototype.tagParameters = function() {
var i, tok;
if (this.tag() !== ')') {
return this;
return;
}
i = this.tokens.length;
while (tok = this.tokens[--i]) {
@@ -450,7 +461,7 @@
return true;
}
}
return this;
return true;
};
Lexer.prototype.closeIndentation = function() {
return this.outdentToken(this.indent);

View File

@@ -26,8 +26,9 @@ exports.Lexer = class Lexer
# (for interpolations). When the next token has been recorded, we move forward
# within the code past the token, and begin again.
#
# Each tokenizing method is responsible for returning the number of characters
# it has consumed.
# Each tokenizing method is responsible for incrementing `@i` by the number of
# characters it has consumed. `@i` can be thought of as our finger on the page
# of source.
#
# Before returning the token stream, run it through the [Rewriter](rewriter.html)
# unless explicitly asked not to.
@@ -35,6 +36,7 @@ exports.Lexer = class Lexer
code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
o = options or {}
@code = code # The remainder of the source code.
@i = 0 # Current character position we're parsing.
@line = o.line or 0 # The current line.
@indent = 0 # The current indentation level.
@indebt = 0 # The over-indentation at the current level.
@@ -46,18 +48,17 @@ exports.Lexer = class Lexer
# At every position, run through this list of attempted matches,
# short-circuiting if any of them succeed. Their order determines precedence:
# `@literalToken` is the fallback catch-all.
i = 0
while @chunk = code.slice i
i += @identifierToken() or
@commentToken() or
@whitespaceToken() or
@lineToken() or
@heredocToken() or
@stringToken() or
@numberToken() or
@regexToken() or
@jsToken() or
@literalToken()
while @chunk = code.slice @i
@identifierToken() or
@commentToken() or
@whitespaceToken() or
@lineToken() or
@heredocToken() or
@stringToken() or
@numberToken() or
@regexToken() or
@jsToken() or
@literalToken()
@closeIndentation()
return @tokens if o.rewrite is off
(new Rewriter).rewrite @tokens
@@ -72,20 +73,21 @@ exports.Lexer = class Lexer
# referenced as property names here, so you can still do `jQuery.is()` even
# though `is` means `===` otherwise.
identifierToken: ->
return 0 unless match = IDENTIFIER.exec @chunk
return false unless match = IDENTIFIER.exec @chunk
[input, id, colon] = match
@i += input.length
if id is 'all' and @tag() is 'FOR'
@token 'ALL', id
return 3
return true
if id is 'from' and @tag(1) is 'FOR'
@seenFor = no
@seenFrom = yes
@token 'FROM', id
return 4
return true
if id is 'to' and @seenFrom
@seenFrom = no
@token 'TO', id
return 2
return true
forcedIdentifier = colon or @tagAccessor()
tag = 'IDENTIFIER'
if id in JS_KEYWORDS or
@@ -124,39 +126,41 @@ exports.Lexer = class Lexer
tag = 'BOOL'
@token tag, id
@token ':', ':' if colon
input.length
true
# Matches numbers, including decimals, hex, and exponential notation.
# Be careful not to interfere with ranges-in-progress.
numberToken: ->
return 0 unless match = NUMBER.exec @chunk
return false unless match = NUMBER.exec @chunk
number = match[0]
return 0 if @tag() is '.' and number.charAt(0) is '.'
return false if @tag() is '.' and number.charAt(0) is '.'
@i += number.length
@token 'NUMBER', number
number.length
true
# Matches strings, including multi-line strings. Ensures that quotation marks
# are balanced within the string's contents, and within nested interpolations.
stringToken: ->
switch @chunk.charAt 0
when "'"
return 0 unless match = SIMPLESTR.exec @chunk
return false unless match = SIMPLESTR.exec @chunk
@token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
when '"'
return 0 unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
if 0 < string.indexOf '#{', 1
@interpolateString string.slice 1, -1
else
@token 'STRING', @escapeLines string
else
return 0
return false
@line += count string, '\n'
string.length
@i += string.length
true
# Matches heredocs, adjusting indentation to the correct level, as heredocs
# preserve whitespace, but ignore indentation to the left.
heredocToken: ->
return 0 unless match = HEREDOC.exec @chunk
return false unless match = HEREDOC.exec @chunk
heredoc = match[0]
quote = heredoc.charAt 0
doc = @sanitizeHeredoc match[2], {quote, indent: null}
@@ -165,44 +169,49 @@ exports.Lexer = class Lexer
else
@token 'STRING', @makeString doc, quote, yes
@line += count heredoc, '\n'
heredoc.length
@i += heredoc.length
true
# Matches and consumes comments.
commentToken: ->
return 0 unless match = @chunk.match COMMENT
return false unless match = @chunk.match COMMENT
[comment, here] = match
@line += count comment, '\n'
@i += comment.length
if here
@token 'HERECOMMENT', @sanitizeHeredoc here,
herecomment: true, indent: Array(@indent + 1).join(' ')
@token 'TERMINATOR', '\n'
comment.length
true
# Matches JavaScript interpolated directly into the source via backticks.
jsToken: ->
return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
return false unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
@token 'JS', (script = match[0]).slice 1, -1
script.length
@i += script.length
true
# Matches regular expression literals. Lexing regular expressions is difficult
# to distinguish from division, so we borrow some basic heuristics from
# JavaScript and Ruby.
regexToken: ->
return 0 if @chunk.charAt(0) isnt '/'
return false if @chunk.charAt(0) isnt '/'
return @heregexToken match if match = HEREGEX.exec @chunk
return 0 if @tag() in NOT_REGEX
return 0 unless match = REGEX.exec @chunk
return false if @tag() in NOT_REGEX
return false unless match = REGEX.exec @chunk
[regex] = match
@token 'REGEX', if regex is '//' then '/(?:)/' else regex
regex.length
@i += regex.length
true
# Matches experimental, multiline and extended regular expression literals.
heregexToken: (match) ->
[heregex, body, flags] = match
@i += heregex.length
if 0 > body.indexOf '#{'
re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
@token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
return heregex.length
return true
@token 'IDENTIFIER', 'RegExp'
@tokens.push ['CALL_START', '(']
tokens = []
@@ -219,7 +228,7 @@ exports.Lexer = class Lexer
@tokens.push tokens...
@tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
@token ')', ')'
heregex.length
true
# Matches newlines, indents, and outdents, and determines which is which.
# If we can detect that the current line is continued onto the next line,
@@ -232,21 +241,21 @@ exports.Lexer = class Lexer
# Keeps track of the level of indentation, because a single outdent token
# can close multiple indents, so we need to know how far in we happen to be.
lineToken: ->
return 0 unless match = MULTI_DENT.exec @chunk
return false unless match = MULTI_DENT.exec @chunk
indent = match[0]
@line += count indent, '\n'
@i += indent.length
prev = last @tokens, 1
size = indent.length - 1 - indent.lastIndexOf '\n'
nextCharacter = NEXT_CHARACTER.exec(@chunk)[1]
noNewlines = (nextCharacter in ['.', ','] and not NEXT_ELLIPSIS.test(@chunk)) or @unfinished()
if size - @indebt is @indent
if noNewlines then @suppressNewlines() else @newlineToken()
return indent.length
if size > @indent
return @suppressNewlines() if noNewlines
return @newlineToken indent
else if size > @indent
if noNewlines
@indebt = size - @indent
@suppressNewlines()
return indent.length
return @suppressNewlines()
diff = size - @indent + @outdebt
@token 'INDENT', diff
@indents.push diff
@@ -255,7 +264,7 @@ exports.Lexer = class Lexer
@indebt = 0
@outdentToken @indent - size, noNewlines
@indent = size
indent.length
true
# Record an outdent token or multiple tokens, if we happen to be moving back
# inwards past several recorded indents.
@@ -277,27 +286,27 @@ exports.Lexer = class Lexer
@token 'OUTDENT', dent
@outdebt -= moveOut if dent
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
this
true
# Matches and consumes non-meaningful whitespace. Tag the previous token
# as being "spaced", because there are some cases where it makes a difference.
whitespaceToken: ->
return 0 unless (match = WHITESPACE.exec @chunk) or
(nline = @chunk.charAt(0) is '\n')
return false unless (match = WHITESPACE.exec @chunk) or nline = @chunk.substring(0, 1) is '\n'
prev = last @tokens
prev[if match then 'spaced' else 'newLine'] = true if prev
if match then match[0].length else 0
@i += match[0].length if match
!!match
# Generate a newline token. Consecutive newlines get merged together.
newlineToken: ->
newlineToken: (newlines) ->
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
this
true
# Use a `\` at a line-ending to suppress the newline.
# The slash is removed here once its job is done.
suppressNewlines: ->
@tokens.pop() if @value() is '\\'
this
true
# We treat all other single characters as a token. Eg.: `( ) , . !`
# Multi-character operators are also literal tokens, so that Jison can assign
@@ -310,21 +319,23 @@ exports.Lexer = class Lexer
@tagParameters() if CODE.test value
else
value = @chunk.charAt 0
tag = value
@i += value.length
tag = value
prev = last @tokens
if value is '=' and prev
@assignmentError() if not prev[1].reserved and prev[1] in JS_FORBIDDEN
if prev[1] in ['||', '&&']
prev[0] = 'COMPOUND_ASSIGN'
prev[1] += '='
return 1
return true
if value is ';' then tag = 'TERMINATOR'
else if value in LOGIC then tag = 'LOGIC'
else if value in MATH then tag = 'MATH'
else if value in COMPARE then tag = 'COMPARE'
else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
else if value in UNARY then tag = 'UNARY'
else if value in SHIFT then tag = 'SHIFT'
else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC'
else if value is '?' and prev?.spaced then tag = 'LOGIC'
else if prev and not prev.spaced
if value is '(' and prev[0] in CALLABLE
prev[0] = 'FUNC_EXIST' if prev[0] is '?'
@@ -335,7 +346,7 @@ exports.Lexer = class Lexer
when '?' then prev[0] = 'INDEX_SOAK'
when '::' then prev[0] = 'INDEX_PROTO'
@token tag, value
value.length
true
# Token Manipulators
# ------------------
@@ -350,7 +361,7 @@ exports.Lexer = class Lexer
else if prev[1] is '.' and @value(1) isnt '.'
if @tag(1) is '?'
@tag 0, 'SOAK_ACCESS'
@tokens.splice -2, 1
@tokens.splice(-2, 1)
else
@tag 0, 'PROPERTY_ACCESS'
else
@@ -374,14 +385,14 @@ exports.Lexer = class Lexer
# definitions versus argument lists in function calls. Walk backwards, tagging
# parameters specially in order to make things easier for the parser.
tagParameters: ->
return this if @tag() isnt ')'
return if @tag() isnt ')'
i = @tokens.length
while tok = @tokens[--i]
switch tok[0]
when 'IDENTIFIER' then tok[0] = 'PARAM'
when ')' then tok[0] = 'PARAM_END'
when '(', 'CALL_START' then tok[0] = 'PARAM_START'; return true
this
true
# Close up all remaining open blocks at the end of the file.
closeIndentation: ->