From 3e0c35bd0fd403ac7b09d5130bd777b58ffaf0fd Mon Sep 17 00:00:00 2001
From: satyr <murky.satyr@gmail.com>
Date: Sun, 26 Sep 2010 07:06:14 +0900
Subject: [PATCH] lexer: enabled multiline interpolations

---
 lib/lexer.js              | 238 ++++++++++++++++++++------------------
 src/lexer.coffee          |  97 +++++++++-------
 test/test_heredocs.coffee |   8 ++
 3 files changed, 184 insertions(+), 159 deletions(-)

diff --git a/lib/lexer.js b/lib/lexer.js
index 896aa9f7..c8892e96 100644
--- a/lib/lexer.js
+++ b/lib/lexer.js
@@ -23,7 +23,7 @@
       this.indents = [];
       this.tokens = [];
       while ((this.chunk = code.slice(this.i))) {
-        this.extractNextToken();
+        this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
       }
       this.closeIndentation();
       if (o.rewrite === false) {
@@ -31,14 +31,12 @@
       }
       return (new Rewriter()).rewrite(this.tokens);
     };
-    Lexer.prototype.extractNextToken = function() {
-      return this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
-    };
     Lexer.prototype.identifierToken = function() {
-      var closeIndex, forcedIdentifier, id, tag;
-      if (!(id = this.match(IDENTIFIER))) {
+      var closeIndex, forcedIdentifier, id, match, tag;
+      if (!(match = IDENTIFIER.exec(this.chunk))) {
         return false;
       }
+      id = match[0];
       this.i += id.length;
       if (id === 'all' && this.tag() === 'FOR') {
         this.token('ALL', id);
@@ -86,10 +84,11 @@
       return true;
     };
     Lexer.prototype.numberToken = function() {
-      var number;
-      if (!(number = this.match(NUMBER))) {
+      var match, number;
+      if (!(match = NUMBER.exec(this.chunk))) {
         return false;
       }
+      number = match[0];
       if (this.tag() === '.' && number.charAt(0) === '.') {
         return false;
       }
@@ -98,19 +97,19 @@
       return true;
     };
     Lexer.prototype.stringToken = function() {
-      var string;
+      var match, string;
       switch (this.chunk.charAt(0)) {
         case "'":
-          if (!(string = this.match(SIMPLESTR))) {
+          if (!(match = SIMPLESTR.exec(this.chunk))) {
             return false;
           }
-          this.token('STRING', string.replace(MULTILINER, '\\\n'));
+          this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
           break;
         case '"':
           if (!(string = this.balancedToken(['"', '"'], ['#{', '}']))) {
             return false;
           }
-          this.interpolateString(string.replace(MULTILINER, '\\\n'));
+          this.interpolateString(string);
           break;
         default:
           return false;
@@ -127,7 +126,8 @@
       heredoc = match[0];
       quote = heredoc.charAt(0);
       doc = this.sanitizeHeredoc(match[2], {
-        quote: quote
+        quote: quote,
+        indent: null
       });
       this.interpolateString(quote + doc + quote, {
         heredoc: true
@@ -156,11 +156,11 @@
       return true;
     };
     Lexer.prototype.jsToken = function() {
-      var script;
-      if (!(this.chunk.charAt(0) === '`' && (script = this.match(JSTOKEN)))) {
+      var match, script;
+      if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) {
         return false;
       }
-      this.token('JS', script.slice(1, -1));
+      this.token('JS', (script = match[0]).slice(1, -1));
       this.i += script.length;
       return true;
     };
@@ -205,16 +205,17 @@
       return this.balancedString(this.chunk, delimited);
     };
     Lexer.prototype.lineToken = function() {
-      var diff, indent, nextCharacter, noNewlines, prev, size;
-      if (!(indent = this.match(MULTI_DENT))) {
+      var diff, indent, match, nextCharacter, noNewlines, prev, size;
+      if (!(match = MULTI_DENT.exec(this.chunk))) {
         return false;
       }
+      indent = match[0];
       this.line += count(indent, '\n');
       this.i += indent.length;
       prev = this.prev(2);
       size = indent.length - 1 - indent.lastIndexOf('\n');
-      nextCharacter = this.match(NEXT_CHARACTER, 1);
-      noNewlines = nextCharacter === '.' || nextCharacter === ',' || this.unfinished();
+      nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1];
+      noNewlines = (('.' === nextCharacter || ',' === nextCharacter)) || this.unfinished();
       if (size - this.indebt === this.indent) {
         if (noNewlines) {
           return this.suppressNewlines();
@@ -265,15 +266,15 @@
       return true;
     };
     Lexer.prototype.whitespaceToken = function() {
-      var prev, space;
-      if (!(space = this.match(WHITESPACE))) {
+      var match, prev;
+      if (!(match = WHITESPACE.exec(this.chunk))) {
         return false;
       }
       prev = this.prev();
       if (prev) {
         prev.spaced = true;
       }
-      this.i += space.length;
+      this.i += match[0].length;
       return true;
     };
     Lexer.prototype.newlineToken = function(newlines) {
@@ -369,25 +370,32 @@
       return accessor ? 'accessor' : false;
     };
     Lexer.prototype.sanitizeHeredoc = function(doc, options) {
-      var _ref2, attempt, indent, match;
-      indent = options.indent;
-      if (options.herecomment && !include(doc, '\n')) {
+      var _ref2, attempt, herecomment, indent, match;
+      _ref2 = options;
+      indent = _ref2.indent;
+      herecomment = _ref2.herecomment;
+      if (herecomment && !include(doc, '\n')) {
         return doc;
       }
-      if (!(options.herecomment)) {
+      if (!(herecomment)) {
         while ((match = HEREDOC_INDENT.exec(doc))) {
-          attempt = (typeof (_ref2 = match[1]) !== "undefined" && _ref2 !== null) ? match[1] : match[2];
-          if (!(typeof indent !== "undefined" && indent !== null) || (0 < attempt.length) && (attempt.length < indent.length)) {
+          attempt = match[1];
+          if (indent === null || (0 < attempt.length) && (attempt.length < indent.length)) {
             indent = attempt;
           }
         }
       }
-      indent || (indent = '');
-      doc = doc.replace(new RegExp('^' + indent, 'gm'), '');
-      if (options.herecomment) {
+      if (indent) {
+        doc = doc.replace(new RegExp("\\n" + (indent), "g"), '\n');
+      }
+      if (herecomment) {
         return doc;
       }
-      return doc.replace(/^\n/, '').replace(MULTILINER, '\\n').replace(new RegExp(options.quote, 'g'), "\\" + (options.quote));
+      doc = doc.replace(/^\n/, '').replace(new RegExp("" + (options.quote), "g"), '\\$&');
+      if (options.quote === "'") {
+        doc = this.oldline(doc, true);
+      }
+      return doc;
     };
     Lexer.prototype.tagParameters = function() {
       var i, tok;
@@ -469,83 +477,84 @@
       return !i ? false : str.slice(0, i);
     };
     Lexer.prototype.interpolateString = function(str, options) {
-      var _len, _ref2, _ref3, end, escaped, expr, i, idx, inner, interpolated, lexer, nested, pi, quote, tag, tok, token, tokens, value;
+      var _len, _ref2, _ref3, end, escaped, expr, i, idx, inner, interpolated, lexer, nested, pi, push, quote, s, tag, tok, token, tokens, value;
       options || (options = {});
-      if (str.length < 3 || str.charAt(0) !== '"') {
+      quote = str.charAt(0);
+      if (quote !== '"' || str.length < 3) {
         return this.token('STRING', str);
-      } else {
-        lexer = new Lexer();
-        tokens = [];
-        quote = str.charAt(0);
-        _ref2 = [1, 1];
-        i = _ref2[0];
-        pi = _ref2[1];
-        end = str.length - 1;
-        while (i < end) {
-          if (str.charAt(i) === '\\') {
-            i += 1;
-          } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) {
-            if (pi < i) {
-              tokens.push(['STRING', quote + str.slice(pi, i) + quote]);
-            }
-            inner = expr.slice(2, -1);
-            if (inner.length) {
-              if (options.heredoc) {
-                inner = inner.replace(new RegExp('\\\\' + quote, 'g'), quote);
-              }
-              nested = lexer.tokenize("(" + (inner) + ")", {
-                line: this.line
-              });
-              _ref2 = nested;
-              for (idx = 0, _len = _ref2.length; idx < _len; idx++) {
-                tok = _ref2[idx];
-                if (tok[0] === 'CALL_END') {
-                  (tok[0] = ')');
-                }
-              }
-              nested.pop();
-              tokens.push(['TOKENS', nested]);
-            } else {
-              tokens.push(['STRING', quote + quote]);
-            }
-            i += expr.length - 1;
-            pi = i + 1;
-          }
-          i += 1;
-        }
-        if ((i > pi) && (pi < str.length - 1)) {
-          tokens.push(['STRING', quote + str.slice(pi, i) + quote]);
-        }
-        if (tokens[0][0] !== 'STRING') {
-          tokens.unshift(['STRING', '""']);
-        }
-        interpolated = tokens.length > 1;
-        if (interpolated) {
-          this.token('(', '(');
-        }
-        _ref2 = tokens;
-        for (i = 0, _len = _ref2.length; i < _len; i++) {
-          token = _ref2[i];
-          _ref3 = token;
-          tag = _ref3[0];
-          value = _ref3[1];
-          if (tag === 'TOKENS') {
-            this.tokens = this.tokens.concat(value);
-          } else if (tag === 'STRING' && options.escapeQuotes) {
-            escaped = value.slice(1, -1).replace(/"/g, '\\"');
-            this.token(tag, "\"" + (escaped) + "\"");
-          } else {
-            this.token(tag, value);
-          }
-          if (i < tokens.length - 1) {
-            this.token('+', '+');
-          }
-        }
-        if (interpolated) {
-          this.token(')', ')');
-        }
-        return tokens;
       }
+      lexer = new Lexer();
+      tokens = [];
+      i = (pi = 1);
+      end = str.length - 1;
+      while (i < end) {
+        if (str.charAt(i) === '\\') {
+          i += 1;
+        } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) {
+          if (pi < i) {
+            s = quote + this.oldline(str.slice(pi, i), options.heredoc) + quote;
+            tokens.push(['STRING', s]);
+          }
+          inner = expr.slice(2, -1).replace(/^\s+/, '');
+          if (inner.length) {
+            if (options.heredoc) {
+              inner = inner.replace(RegExp('\\\\' + quote, 'g'), quote);
+            }
+            nested = lexer.tokenize("(" + (inner) + ")", {
+              line: this.line
+            });
+            _ref2 = nested;
+            for (idx = 0, _len = _ref2.length; idx < _len; idx++) {
+              tok = _ref2[idx];
+              if (tok[0] === 'CALL_END') {
+                (tok[0] = ')');
+              }
+            }
+            nested.pop();
+            tokens.push(['TOKENS', nested]);
+          } else {
+            tokens.push(['STRING', quote + quote]);
+          }
+          i += expr.length - 1;
+          pi = i + 1;
+        }
+        i += 1;
+      }
+      if ((i > pi) && (pi < str.length - 1)) {
+        s = str.slice(pi, i).replace(MULTILINER, options.heredoc ? '\\n' : '');
+        tokens.push(['STRING', quote + s + quote]);
+      }
+      if (tokens[0][0] !== 'STRING') {
+        tokens.unshift(['STRING', '""']);
+      }
+      interpolated = tokens.length > 1;
+      if (interpolated) {
+        this.token('(', '(');
+      }
+      _ref2 = tokens;
+      push = _ref2.push;
+      _ref2 = tokens;
+      for (i = 0, _len = _ref2.length; i < _len; i++) {
+        token = _ref2[i];
+        _ref3 = token;
+        tag = _ref3[0];
+        value = _ref3[1];
+        if (tag === 'TOKENS') {
+          push.apply(this.tokens, value);
+        } else if (tag === 'STRING' && options.escapeQuotes) {
+          escaped = value.slice(1, -1).replace(/"/g, '\\"');
+          this.token(tag, "\"" + (escaped) + "\"");
+        } else {
+          this.token(tag, value);
+        }
+        if (i < tokens.length - 1) {
+          this.token('+', '+');
+        }
+      }
+      if (interpolated) {
+        this.token(')', ')');
+      }
+      return tokens;
     };
     Lexer.prototype.token = function(tag, value) {
       return this.tokens.push([tag, value, this.line]);
@@ -579,9 +588,10 @@
     };
     Lexer.prototype.unfinished = function() {
       var prev, value;
-      prev = this.prev(2);
-      value = this.value();
-      return value && NO_NEWLINE.test(value) && prev && prev[0] !== '.' && !CODE.test(value) && !ASSIGNED.test(this.chunk);
+      return (prev = this.prev(2)) && prev[0] !== '.' && (value = this.value()) && NO_NEWLINE.test(value) && !CODE.test(value) && !ASSIGNED.test(this.chunk);
+    };
+    Lexer.prototype.oldline = function(str, heredoc) {
+      return str.replace(MULTILINER, heredoc ? '\\n' : '');
     };
     return Lexer;
   })();
@@ -591,11 +601,11 @@
   RESERVED = ['case', 'default', 'do', 'function', 'var', 'void', 'with', 'const', 'let', 'enum', 'export', 'import', 'native', '__hasProp', '__extends', '__slice'];
   JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED);
   IDENTIFIER = /^[a-zA-Z_$][\w$]*/;
-  NUMBER = /^(?:0x[\da-f]+)|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i;
+  NUMBER = /^0x[\da-f]+|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i;
   HEREDOC = /^("""|''')([\s\S]*?)\n?[ \t]*\1/;
   OPERATOR = /^(?:-[-=>]?|\+[+=]?|[*&|\/%=<>^:!?]+)(?=([ \t]*))/;
   WHITESPACE = /^[ \t]+/;
-  COMMENT = /^###([^#][\s\S]*?)(?:###[ \t]*\n|(?:###)?$)|^(?:\s*#(?!##[^#])[^\n]*)+/;
+  COMMENT = /^###([^#][\s\S]*?)(?:###[ \t]*\n|(?:###)?$)|^(?:\s*#(?!##[^#]).*)+/;
   CODE = /^[-=]>/;
   MULTI_DENT = /^(?:\n[ \t]*)+/;
   SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/;
@@ -606,9 +616,9 @@
   REGEX_ESCAPE = /\\[^#]/g;
   MULTILINER = /\n/g;
   NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|not|delete|typeof|instanceof)$/;
-  HEREDOC_INDENT = /\n+([ \t]*)|^([ \t]+)/g;
+  HEREDOC_INDENT = /\n+([ \t]*)/g;
   ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/;
-  NEXT_CHARACTER = /^\s*(\S)/;
+  NEXT_CHARACTER = /^\s*(\S?)/;
   COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='];
   UNARY = ['UMINUS', 'UPLUS', '!', '!!', '~', 'TYPEOF', 'DELETE'];
   LOGIC = ['&', '|', '^', '&&', '||'];
diff --git a/src/lexer.coffee b/src/lexer.coffee
index 605a021f..a5d91c9f 100644
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -125,7 +125,7 @@ exports.Lexer = class Lexer
         @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
       when '"'
         return false unless string = @balancedToken ['"', '"'], ['#{', '}']
-        @interpolateString string.replace MULTILINER, '\\\n'
+        @interpolateString string
       else
         return false
     @line += count string, '\n'
@@ -339,9 +339,9 @@ exports.Lexer = class Lexer
         indent = attempt if indent is null or 0 < attempt.length < indent.length
     doc = doc.replace /\n#{ indent }/g, '\n' if indent
     return doc if herecomment
-    doc.replace(/^\n/, '')
-       .replace(MULTILINER, '\\n')
-       .replace(/#{ options.quote }/g, '\\$&')
+    doc = doc.replace(/^\n/, '').replace(/#{ options.quote }/g, '\\$&')
+    doc = @oldline doc, on if options.quote is "'"
+    doc
 
   # A source of ambiguity in our grammar used to be parameter lists in function
   # definitions versus argument lists in function calls. Walk backwards, tagging
@@ -406,7 +406,7 @@ exports.Lexer = class Lexer
     if not i then false else str[0...i]
 
   # Expand variables and expressions inside double-quoted strings using
-  # [ECMA Harmony's interpolation syntax](http://wiki.ecmascript.org/doku.php?id=strawman:string_interpolation)
+  # Ruby-like notation
   # for substitution of bare variables as well as arbitrary expressions.
   #
   #     "Hello #{name.capitalize()}."
@@ -415,48 +415,51 @@ exports.Lexer = class Lexer
   # new Lexer, tokenize the interpolated contents, and merge them into the
   # token stream.
   interpolateString: (str, options) ->
-    options or= {}
-    if str.length < 3 or str.charAt(0) isnt '"'
-      @token 'STRING', str
-    else
-      lexer   = new Lexer
-      tokens  = []
-      quote   = str.charAt 0
-      [i, pi] = [1, 1]
-      end = str.length - 1
-      while i < end
-        if str.charAt(i) is '\\'
-          i += 1
-        else if expr = @balancedString str[i..], [['#{', '}']]
-          tokens.push ['STRING', quote + str[pi...i] + quote] if pi < i
-          inner = expr.slice 2, -1
-          if inner.length
-            inner = inner.replace new RegExp('\\\\' + quote, 'g'), quote if options.heredoc
-            nested = lexer.tokenize "(#{inner})", line: @line
-            (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END'
-            nested.pop()
-            tokens.push ['TOKENS', nested]
-          else
-            tokens.push ['STRING', quote + quote]
-          i += expr.length - 1
-          pi = i + 1
+    {heredoc, escapeQuotes} = options or {}
+    quote = str.charAt 0
+    return @token 'STRING', str if quote isnt '"' or str.length < 3
+    lexer  = new Lexer
+    tokens = []
+    i = pi = 1
+    end = str.length - 1
+    while i < end
+      if str.charAt(i) is '\\'
         i += 1
-      tokens.push ['STRING', quote + str[pi...i] + quote] if i > pi < str.length - 1
-      tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING'
-      interpolated = tokens.length > 1
-      @token '(', '(' if interpolated
-      for token, i in tokens
-        [tag, value] = token
-        if tag is 'TOKENS'
-          @tokens = @tokens.concat value
-        else if tag is 'STRING' and options.escapeQuotes
-          escaped = value.slice(1, -1).replace(/"/g, '\\"')
-          @token tag, "\"#{escaped}\""
+      else if expr = @balancedString str[i..], [['#{', '}']]
+        if pi < i
+          s = quote + @oldline(str[pi...i], heredoc) + quote
+          tokens.push ['STRING', s]
+        inner = expr.slice(2, -1).replace /^[ \t]*\n/, '' # NOTE(review): lib/lexer.js in this patch strips /^\s+/ here instead; confirm which pattern is intended
+        if inner.length
+          inner = inner.replace RegExp('\\\\' + quote, 'g'), quote if heredoc
+          nested = lexer.tokenize "(#{inner})", line: @line
+          (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END'
+          nested.pop()
+          tokens.push ['TOKENS', nested]
         else
-          @token tag, value
-        @token '+', '+' if i < tokens.length - 1
-      @token ')', ')' if interpolated
-      tokens
+          tokens.push ['STRING', quote + quote]
+        i += expr.length - 1
+        pi = i + 1
+      i += 1
+    if i > pi < str.length - 1
+      s = str[pi...i].replace MULTILINER, if heredoc then '\\n' else ''
+      tokens.push ['STRING', quote + s + quote]
+    tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING'
+    interpolated = tokens.length > 1
+    @token '(', '(' if interpolated
+    {push} = tokens
+    for token, i in tokens
+      [tag, value] = token
+      if tag is 'TOKENS'
+        push.apply @tokens, value
+      else if tag is 'STRING' and escapeQuotes
+        escaped = value.slice(1, -1).replace(/"/g, '\\"')
+        @token tag, "\"#{escaped}\""
+      else
+        @token tag, value
+      @token '+', '+' if i < tokens.length - 1
+    @token ')', ')' if interpolated
+    tokens
 
   # Helpers
   # -------
@@ -487,6 +490,10 @@ exports.Lexer = class Lexer
     (value = @value()) and NO_NEWLINE.test(value) and not CODE.test(value) and
     not ASSIGNED.test(@chunk)
 
+  # Converts newlines for string literals: each becomes an escaped `\n` when `heredoc` is truthy and is removed otherwise.
+  oldline: (str, heredoc) ->
+    str.replace MULTILINER, if heredoc then '\\n' else ''
+
 # Constants
 # ---------
 
diff --git a/test/test_heredocs.coffee b/test/test_heredocs.coffee
index 7389fb09..5c727483 100644
--- a/test/test_heredocs.coffee
+++ b/test/test_heredocs.coffee
@@ -98,3 +98,11 @@ equal ''' line 0
 should not be relevant\n
   to the indent level
 '
+
+
+equal 'multiline nested interpolations work', """multiline #{
+  "nested #{(->
+    ok yes
+    "interpolations"
+  )()}"
+} work"""