first stub at heregex

2026-02-19 03:44:23 -05:00 · 2010-10-04 08:22:42 +09:00
parent ae55c70ac5
commit c605b3e232
3 changed files with 185 additions and 126 deletions
--- a/lib/lexer.js
+++ b/lib/lexer.js
@@ -1,13 +1,12 @@
 (function() {
-  var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_INTERPOLATION, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, UNARY, WHITESPACE, _ref, compact, count, include, last, starts;
+  var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_SPACES, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, TRAILING_SPACES, UNARY, WHITESPACE, _ref, compact, count, include, last, starts;
  var __slice = Array.prototype.slice;
  Rewriter = require('./rewriter').Rewriter;
  _ref = require('./helpers'), include = _ref.include, count = _ref.count, starts = _ref.starts, compact = _ref.compact, last = _ref.last;
  exports.Lexer = (function() {
    Lexer = function() {};
    Lexer.prototype.tokenize = function(code, options) {
      var o;
-      code = code.replace(/\r/g, '').replace(/\s+$/, '');
+      code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
      o = options || {};
      this.code = code;
      this.i = 0;
@@ -101,10 +100,14 @@
          this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
          break;
        case '"':
-          if (!(string = this.balancedToken(['"', '"'], ['#{', '}']))) {
+          if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) {
            return false;
          }
-          this.interpolateString(string);
+          if (~string.indexOf('#{')) {
            this.interpolateString(string);
          } else {
            this.token('STRING', this.escapeLines(string));
          }
          break;
        default:
          return false;
@@ -115,7 +118,7 @@
    };
    Lexer.prototype.heredocToken = function() {
      var doc, heredoc, match, quote;
-      if (!(match = this.chunk.match(HEREDOC))) {
+      if (!(match = HEREDOC.exec(this.chunk))) {
        return false;
      }
      heredoc = match[0];
@@ -124,12 +127,12 @@
        quote: quote,
        indent: null
      });
-      if (quote === '"') {
+      if (quote === '"' && ~doc.indexOf('#{')) {
        this.interpolateString(quote + doc + quote, {
          heredoc: true
        });
      } else {
-        this.token('STRING', quote + doc + quote);
+        this.token('STRING', quote + this.escapeLines(doc, true) + quote);
      }
      this.line += count(heredoc, '\n');
      this.i += heredoc.length;
@@ -162,8 +165,14 @@
      return true;
    };
    Lexer.prototype.regexToken = function() {
-      var _ref2, end, first, flags, regex, str;
+      var _ref2, end, first, flags, match, regex, str;
-      if (!(first = this.chunk.match(REGEX_START))) {
+      if (this.chunk.charAt(0) !== '/') {
        return false;
      }
      if (match = HEREGEX.exec(this.chunk)) {
        return this.heregexToken(match);
      }
      if (!(first = REGEX_START.exec(this.chunk))) {
        return false;
      }
      if (first[1] === ' ' && !('CALL_START' === (_ref2 = this.tag()) || '=' === _ref2)) {
@@ -172,34 +181,48 @@
      if (include(NOT_REGEX, this.tag())) {
        return false;
      }
-      if (!(regex = this.balancedToken(['/', '/']))) {
+      if (!(regex = this.balancedString(this.chunk, [['/', '/']]))) {
        return false;
      }
      if (!(end = this.chunk.slice(regex.length).match(REGEX_END))) {
        return false;
      }
      flags = end[0];
-      if (REGEX_INTERPOLATION.test(regex)) {
+      if (~regex.indexOf('#{')) {
        str = regex.slice(1, -1);
-        str = str.replace(REGEX_ESCAPE, '\\$&');
+        this.tokens.push(['IDENTIFIER', 'RegExp'], ['CALL_START', '(']);
        this.tokens.push(['(', '('], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']);
        this.interpolateString("\"" + (str) + "\"", {
-          escapeQuotes: true
+          regex: true
        });
        if (flags) {
          this.tokens.push([',', ','], ['STRING', ("\"" + (flags) + "\"")]);
        }
-        this.tokens.push([')', ')'], [')', ')']);
+        this.tokens.push(['CALL_END', ')']);
      } else {
        this.token('REGEX', regex + flags);
      }
      this.i += regex.length + flags.length;
      return true;
    };
-    Lexer.prototype.balancedToken = function() {
+    Lexer.prototype.heregexToken = function(match) {
-      var delimited;
+      var _ref2, body, flags, heregex;
-      delimited = __slice.call(arguments, 0);
+      _ref2 = match, heregex = _ref2[0], body = _ref2[1], flags = _ref2[2];
-      return this.balancedString(this.chunk, delimited);
+      this.i += heregex.length;
      if (!(~body.indexOf('#{'))) {
        this.token('REGEX', '/' + body.replace(HEREGEX_OMIT, '') + '/' + flags);
        return true;
      }
      this.token('IDENTIFIER', 'RegExp');
      this.tokens.push(['CALL_START', '(']);
      this.interpolateString("\"" + (body) + "\"", {
        regex: true,
        heregex: true
      });
      if (flags) {
        this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
      }
      this.tokens.push(['CALL_END', ')']);
      return true;
    };
    Lexer.prototype.lineToken = function() {
      var diff, indent, match, nextCharacter, noNewlines, prev, size;
@@ -450,7 +473,8 @@
                i += 1;
              }
              break;
-            } else if (starts(str, open, i)) {
+            }
            if (starts(str, open, i)) {
              levels.push(pair);
              i += open.length - 1;
              break;
@@ -471,74 +495,78 @@
      return !i ? false : str.slice(0, i);
    };
    Lexer.prototype.interpolateString = function(str, options) {
-      var _len, _ref2, end, escapeQuotes, escaped, expr, heredoc, i, idx, inner, interpolated, lexer, nested, pi, push, quote, s, tag, tok, token, tokens, value;
+      var _i, _len, _ref2, char, expr, heredoc, i, inner, interpolated, lexer, nested, pi, push, regex, s, tag, tok, tokens, value;
-      _ref2 = options || {}, heredoc = _ref2.heredoc, escapeQuotes = _ref2.escapeQuotes;
+      if (str.length < 5) {
-      quote = str.charAt(0);
+        return this.token('STRING', this.escapeLines(str, heredoc));
      if (quote !== '"' || str.length < 3) {
        return this.token('STRING', str);
      }
      _ref2 = options || (options = {}), heredoc = _ref2.heredoc, regex = _ref2.regex;
      lexer = new Lexer;
      tokens = [];
-      i = (pi = 1);
+      pi = 1;
-      end = str.length - 1;
+      i = 0;
-      while (i < end) {
+      while (char = str.charAt(i += 1)) {
-        if (str.charAt(i) === '\\') {
+        if (char === '\\') {
          i += 1;
-        } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) {
+          continue;
          if (pi < i) {
            s = quote + this.escapeLines(str.slice(pi, i), heredoc) + quote;
            tokens.push(['STRING', s]);
          }
          inner = expr.slice(2, -1).replace(/^[ \t]*\n/, '');
          if (inner.length) {
            if (heredoc) {
              inner = inner.replace(RegExp('\\\\' + quote, 'g'), quote);
            }
            nested = lexer.tokenize("(" + (inner) + ")", {
              line: this.line
            });
            for (idx = 0, _len = nested.length; idx < _len; idx++) {
              tok = nested[idx];
              if (tok[0] === 'CALL_END') {
                (tok[0] = ')');
              }
            }
            nested.pop();
            tokens.push(['TOKENS', nested]);
          } else {
            tokens.push(['STRING', quote + quote]);
          }
          i += expr.length - 1;
          pi = i + 1;
        }
-        i += 1;
+        if (!(char === '#' && str.charAt(i + 1) === '{' && (expr = this.balancedString(str.slice(i + 1), [['{', '}']])))) {
          continue;
        }
        if (pi < i) {
          tokens.push(['STRING', '"' + this.escapeLines(str.slice(pi, i), heredoc) + '"']);
        }
        inner = expr.slice(1, -1).replace(LEADING_SPACES, '').replace(TRAILING_SPACES, '');
        if (inner.length) {
          if (heredoc) {
            inner = inner.replace(/\\\"/g, '"');
          }
          nested = lexer.tokenize("(" + (inner) + ")", {
            line: this.line
          });
          for (_i = 0, _len = nested.length; _i < _len; _i++) {
            tok = nested[_i];
            if (tok[0] === 'CALL_END') {
              (tok[0] = ')');
            }
          }
          nested.pop();
          tokens.push(['TOKENS', nested]);
        } else {
          tokens.push(['STRING', '""']);
        }
        i += expr.length;
        pi = i + 1;
      }
      if ((i > pi) && (pi < str.length - 1)) {
-        s = str.slice(pi, i).replace(MULTILINER, heredoc ? '\\n' : '');
+        s = this.escapeLines(str.slice(pi, -1), heredoc);
-        tokens.push(['STRING', quote + s + quote]);
+        tokens.push(['STRING', '"' + s + '"']);
      }
      if (tokens[0][0] !== 'STRING') {
        tokens.unshift(['STRING', '""']);
      }
-      interpolated = tokens.length > 1;
+      interpolated = !regex && tokens.length > 1;
      if (interpolated) {
        this.token('(', '(');
      }
      push = tokens.push;
      for (i = 0, _len = tokens.length; i < _len; i++) {
-        token = tokens[i];
+        _ref2 = tokens[i], tag = _ref2[0], value = _ref2[1];
-        _ref2 = token, tag = _ref2[0], value = _ref2[1];
+        if (i) {
        if (tag === 'TOKENS') {
          push.apply(this.tokens, value);
        } else if (tag === 'STRING' && escapeQuotes) {
          escaped = value.slice(1, -1).replace(/"/g, '\\"');
          this.token(tag, "\"" + (escaped) + "\"");
        } else {
          this.token(tag, value);
        }
        if (i < tokens.length - 1) {
          this.token('+', '+');
        }
        if (tag === 'TOKENS') {
          push.apply(this.tokens, value);
          continue;
        }
        if (regex) {
          value = value.slice(1, -1);
          value = value.replace(/[\\\"]/g, '\\$&');
          if (options.heregex) {
            value = value.replace(HEREGEX_OMIT, '');
          }
          value = '"' + value + '"';
        }
        this.token(tag, value);
      }
      if (interpolated) {
        this.token(')', ')');
@@ -587,14 +615,17 @@
  SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/;
  JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/;
  REGEX_START = /^\/([^\/])/;
  REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/;
  REGEX_END = /^[imgy]{0,4}(?![a-zA-Z])/;
  REGEX_ESCAPE = /\\[^#]/g;
  HEREGEX = /^\/{3}([\s\S]+?)\/{3}([imgy]{0,4})(?![A-Za-z])/;
  HEREGEX_OMIT = /\s+(?:#.*)?/g;
  MULTILINER = /\n/g;
  NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|n(?:ot|ew)|delete|typeof|instanceof)$/;
  HEREDOC_INDENT = /\n+([ \t]*)/g;
  ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/;
  NEXT_CHARACTER = /^\s*(\S?)/;
  LEADING_SPACES = /^\s+/;
  TRAILING_SPACES = /\s+$/;
  COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='];
  UNARY = ['UMINUS', 'UPLUS', '!', '!!', '~', 'NEW', 'TYPEOF', 'DELETE'];
  LOGIC = ['&', '|', '^', '&&', '||'];
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -33,7 +33,7 @@ exports.Lexer = class Lexer
  # Before returning the token stream, run it through the [Rewriter](rewriter.html)
  # unless explicitly asked not to.
  tokenize: (code, options) ->
-    code     = code.replace(/\r/g, '').replace /\s+$/, ''
+    code     = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
    o        = options or {}
    @code    = code         # The remainder of the source code.
    @i       = 0            # Current character position we're parsing.
@@ -124,8 +124,11 @@ exports.Lexer = class Lexer
        return false unless match = SIMPLESTR.exec @chunk
        @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
      when '"'
-        return false unless string = @balancedToken ['"', '"'], ['#{', '}']
+        return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
-        @interpolateString string
+        if ~string.indexOf '#{'
          @interpolateString string
        else
          @token 'STRING', @escapeLines string
      else
        return false
    @line += count string, '\n'
@@ -135,14 +138,14 @@ exports.Lexer = class Lexer
  # Matches heredocs, adjusting indentation to the correct level, as heredocs
  # preserve whitespace, but ignore indentation to the left.
  heredocToken: ->
-    return false unless match = @chunk.match HEREDOC
+    return false unless match = HEREDOC.exec @chunk
    heredoc = match[0]
    quote = heredoc.charAt 0
    doc = @sanitizeHeredoc match[2], {quote, indent: null}
-    if quote is '"'
+    if quote is '"' and ~doc.indexOf '#{'
      @interpolateString quote + doc + quote, heredoc: yes
    else
-      @token 'STRING', quote + doc + quote
+      @token 'STRING', quote + @escapeLines(doc, yes) + quote
    @line += count heredoc, '\n'
    @i += heredoc.length
    true
@@ -168,31 +171,41 @@ exports.Lexer = class Lexer
  # Matches regular expression literals. Lexing regular expressions is difficult
  # to distinguish from division, so we borrow some basic heuristics from
-  # JavaScript and Ruby, borrow slash balancing from `@balancedToken`, and
+  # JavaScript and Ruby, borrow slash balancing from `@balancedString`, and
  # borrow interpolation from `@interpolateString`.
  regexToken: ->
-    return false unless first = @chunk.match REGEX_START
+    return false if @chunk.charAt(0) isnt '/'
    return @heregexToken match if match = HEREGEX.exec @chunk
    return false unless first = REGEX_START.exec @chunk
    return false if first[1] is ' ' and @tag() not in ['CALL_START', '=']
    return false if include NOT_REGEX, @tag()
-    return false unless regex = @balancedToken ['/', '/']
+    return false unless regex = @balancedString @chunk, [['/', '/']]
    return false unless end = @chunk[regex.length..].match REGEX_END
    flags = end[0]
-    if REGEX_INTERPOLATION.test regex
+    if ~regex.indexOf '#{'
      str = regex.slice 1, -1
-      str = str.replace REGEX_ESCAPE, '\\$&'
+      @tokens.push ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']
-      @tokens.push ['(', '('], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']
+      @interpolateString "\"#{str}\"", regex: yes
      @interpolateString "\"#{str}\"", escapeQuotes: yes
      @tokens.push [',', ','], ['STRING', "\"#{flags}\""] if flags
-      @tokens.push [')', ')'], [')', ')']
+      @tokens.push ['CALL_END', ')']
    else
      @token 'REGEX', regex + flags
    @i += regex.length + flags.length
    true
-  # Matches a token in which the passed delimiter pairs must be correctly
+  # Matches experimental, multiline and extended regular expression literals.
-  # balanced (ie. strings, JS literals).
+  heregexToken: (match) ->
-  balancedToken: (delimited...) ->
+    [heregex, body, flags] = match
-    @balancedString @chunk, delimited
+    @i += heregex.length
    unless ~body.indexOf '#{'
      @token 'REGEX', '/' + body.replace(HEREGEX_OMIT, '') + '/' + flags
      return true
    @token 'IDENTIFIER', 'RegExp'
    @tokens.push ['CALL_START', '(']
    @interpolateString "\"#{body}\"", regex: yes, heregex: yes
    @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
    @tokens.push ['CALL_END', ')']
    true
  # Matches newlines, indents, and outdents, and determines which is which.
  # If we can detect that the current line is continued onto the next line,
@@ -399,7 +412,7 @@ exports.Lexer = class Lexer
            i += close.length - 1
            i += 1 unless levels.length
            break
-          else if starts str, open, i
+          if starts str, open, i
            levels.push(pair)
            i += open.length - 1
            break
@@ -419,49 +432,50 @@ exports.Lexer = class Lexer
  # new Lexer, tokenize the interpolated contents, and merge them into the
  # token stream.
  interpolateString: (str, options) ->
-    {heredoc, escapeQuotes} = options or {}
+    return @token 'STRING', @escapeLines(str, heredoc) if str.length < 5  # "#{}"
-    quote = str.charAt 0
+    {heredoc, regex} = options or= {}
    return @token 'STRING', str if quote isnt '"' or str.length < 3
    lexer  = new Lexer
    tokens = []
-    i = pi = 1
+    pi = 1
-    end = str.length - 1
+    i  = 0
-    while i < end
+    while char = str.charAt i += 1
-      if str.charAt(i) is '\\'
+      if char is '\\'
        i += 1
-      else if expr = @balancedString str[i..], [['#{', '}']]
+        continue
-        if pi < i
+      unless char is '#' and str.charAt(i+1) is '{' and
-          s = quote + @escapeLines(str[pi...i], heredoc) + quote
+             (expr = @balancedString str[i+1..], [['{', '}']])
-          tokens.push ['STRING', s]
+        continue
-        inner = expr.slice(2, -1).replace /^[ \t]*\n/, ''
+      if pi < i
-        if inner.length
+        tokens.push ['STRING', '"' + @escapeLines(str[pi...i], heredoc) + '"']
-          inner = inner.replace RegExp('\\\\' + quote, 'g'), quote if heredoc
+      inner = expr.slice(1, -1).replace(LEADING_SPACES, '').replace(TRAILING_SPACES, '')
-          nested = lexer.tokenize "(#{inner})", line: @line
+      if inner.length
-          (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END'
+        inner = inner.replace /\\\"/g, '"' if heredoc
-          nested.pop()
+        nested = lexer.tokenize "(#{inner})", line: @line
-          tokens.push ['TOKENS', nested]
+        (tok[0] = ')') for tok in nested when tok[0] is 'CALL_END'
-        else
+        nested.pop()
-          tokens.push ['STRING', quote + quote]
+        tokens.push ['TOKENS', nested]
-        i += expr.length - 1
+      else
-        pi = i + 1
+        tokens.push ['STRING', '""']
-      i += 1
+      i += expr.length
      pi = i + 1
    if i > pi < str.length - 1
-      s = str[pi...i].replace MULTILINER, if heredoc then '\\n' else ''
+      s = @escapeLines str.slice(pi, -1), heredoc
-      tokens.push ['STRING', quote + s + quote]
+      tokens.push ['STRING', '"' + s + '"']
    tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING'
-    interpolated = tokens.length > 1
+    interpolated = not regex and tokens.length > 1
    @token '(', '(' if interpolated
    {push} = tokens
-    for token, i in tokens
+    for [tag, value], i in tokens
-      [tag, value] = token
+      @token '+', '+' if i
      if tag is 'TOKENS'
        push.apply @tokens, value
-      else if tag is 'STRING' and escapeQuotes
+        continue
-        escaped = value.slice(1, -1).replace(/"/g, '\\"')
+      if regex
-        @token tag, "\"#{escaped}\""
+        value = value.slice 1, -1
-      else
+        value = value.replace /[\\\"]/g, '\\$&'
-        @token tag, value
+        value = value.replace HEREGEX_OMIT, '' if options.heregex
-      @token '+', '+' if i < tokens.length - 1
+        value = '"' + value + '"'
      @token tag, value
    @token ')', ')' if interpolated
    tokens
@@ -544,16 +558,20 @@ JSTOKEN    = /^`[^\\`]*(?:\\.[^\\`]*)*`/
 # Regex-matching-regexes.
 REGEX_START         = /^\/([^\/])/
 REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/
 REGEX_END           = /^[imgy]{0,4}(?![a-zA-Z])/
 REGEX_ESCAPE        = /\\[^#]/g
 HEREGEX      = /^\/{3}([\s\S]+?)\/{3}([imgy]{0,4})(?![A-Za-z])/
 HEREGEX_OMIT = /\s+(?:#.*)?/g
 # Token cleaning regexes.
 MULTILINER      = /\n/g
 NO_NEWLINE      = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|n(?:ot|ew)|delete|typeof|instanceof)$/
 HEREDOC_INDENT  = /\n+([ \t]*)/g
 ASSIGNED        = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/
 NEXT_CHARACTER  = /^\s*(\S?)/
 LEADING_SPACES  = /^\s+/
 TRAILING_SPACES = /\s+$/
 # Compound assignment tokens.
 COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|=']
--- a/test/test_regexps.coffee
+++ b/test/test_regexps.coffee
@@ -26,3 +26,13 @@ regexp = / /
 ok ' '.match regexp
 ok (obj.width()/id - obj.height()/id) is -5
 eq /^I'm\s+Heregex?/gim + '', ///
  ^ I'm \s+ Heregex? # or not
 ///gim + ''
 eq '\\\\#{}', ///
 #{
   "#{ '\\' }"
 }
 \#{}
 ///.source