first stub at heregex

2026-02-19 03:44:23 -05:00 · 2010-10-04 08:22:42 +09:00
parent ae55c70ac5
commit c605b3e232
3 changed files with 185 additions and 126 deletions
--- a/lib/lexer.js
+++ b/lib/lexer.js
@@ -1,13 +1,12 @@
 (function() {
-  var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_INTERPOLATION, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, UNARY, WHITESPACE, _ref, compact, count, include, last, starts;
-  var __slice = Array.prototype.slice;
+  var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_SPACES, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, TRAILING_SPACES, UNARY, WHITESPACE, _ref, compact, count, include, last, starts;
  Rewriter = require('./rewriter').Rewriter;
  _ref = require('./helpers'), include = _ref.include, count = _ref.count, starts = _ref.starts, compact = _ref.compact, last = _ref.last;
  exports.Lexer = (function() {
    Lexer = function() {};
    Lexer.prototype.tokenize = function(code, options) {
      var o;
-      code = code.replace(/\r/g, '').replace(/\s+$/, '');
+      code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
      o = options || {};
      this.code = code;
      this.i = 0;
@@ -101,10 +100,14 @@
          this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
          break;
        case '"':
-          if (!(string = this.balancedToken(['"', '"'], ['#{', '}']))) {
+          if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) {
            return false;
          }
-          this.interpolateString(string);
+          if (~string.indexOf('#{')) {
+            this.interpolateString(string);
+          } else {
+            this.token('STRING', this.escapeLines(string));
+          }
          break;
        default:
          return false;
@@ -115,7 +118,7 @@
    };
    Lexer.prototype.heredocToken = function() {
      var doc, heredoc, match, quote;
-      if (!(match = this.chunk.match(HEREDOC))) {
+      if (!(match = HEREDOC.exec(this.chunk))) {
        return false;
      }
      heredoc = match[0];
@@ -124,12 +127,12 @@
        quote: quote,
        indent: null
      });
-      if (quote === '"') {
+      if (quote === '"' && ~doc.indexOf('#{')) {
        this.interpolateString(quote + doc + quote, {
          heredoc: true
        });
      } else {
-        this.token('STRING', quote + doc + quote);
+        this.token('STRING', quote + this.escapeLines(doc, true) + quote);
      }
      this.line += count(heredoc, '\n');
      this.i += heredoc.length;
@@ -162,8 +165,14 @@
      return true;
    };
    Lexer.prototype.regexToken = function() {
-      var _ref2, end, first, flags, regex, str;
-      if (!(first = this.chunk.match(REGEX_START))) {
+      var _ref2, end, first, flags, match, regex, str;
+      if (this.chunk.charAt(0) !== '/') {
+        return false;
+      }
+      if (match = HEREGEX.exec(this.chunk)) {
+        return this.heregexToken(match);
+      }
+      if (!(first = REGEX_START.exec(this.chunk))) {
        return false;
      }
      if (first[1] === ' ' && !('CALL_START' === (_ref2 = this.tag()) || '=' === _ref2)) {
@@ -172,34 +181,48 @@
      if (include(NOT_REGEX, this.tag())) {
        return false;
      }
-      if (!(regex = this.balancedToken(['/', '/']))) {
+      if (!(regex = this.balancedString(this.chunk, [['/', '/']]))) {
        return false;
      }
      if (!(end = this.chunk.slice(regex.length).match(REGEX_END))) {
        return false;
      }
      flags = end[0];
-      if (REGEX_INTERPOLATION.test(regex)) {
+      if (~regex.indexOf('#{')) {
        str = regex.slice(1, -1);
-        str = str.replace(REGEX_ESCAPE, '\\$&');
-        this.tokens.push(['(', '('], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']);
+        this.tokens.push(['IDENTIFIER', 'RegExp'], ['CALL_START', '(']);
        this.interpolateString("\"" + (str) + "\"", {
-          escapeQuotes: true
+          regex: true
        });
        if (flags) {
          this.tokens.push([',', ','], ['STRING', ("\"" + (flags) + "\"")]);
        }
-        this.tokens.push([')', ')'], [')', ')']);
+        this.tokens.push(['CALL_END', ')']);
      } else {
        this.token('REGEX', regex + flags);
      }
      this.i += regex.length + flags.length;
      return true;
    };
-    Lexer.prototype.balancedToken = function() {
-      var delimited;
-      delimited = __slice.call(arguments, 0);
-      return this.balancedString(this.chunk, delimited);
+    Lexer.prototype.heregexToken = function(match) {
+      var _ref2, body, flags, heregex;
+      _ref2 = match, heregex = _ref2[0], body = _ref2[1], flags = _ref2[2];
+      this.i += heregex.length;
+      if (!(~body.indexOf('#{'))) {
+        this.token('REGEX', '/' + body.replace(HEREGEX_OMIT, '') + '/' + flags);
+        return true;
+      }
+      this.token('IDENTIFIER', 'RegExp');
+      this.tokens.push(['CALL_START', '(']);
+      this.interpolateString("\"" + (body) + "\"", {
+        regex: true,
+        heregex: true
+      });
+      if (flags) {
+        this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
+      }
+      this.tokens.push(['CALL_END', ')']);
+      return true;
    };
    Lexer.prototype.lineToken = function() {
      var diff, indent, match, nextCharacter, noNewlines, prev, size;
@@ -450,7 +473,8 @@
                i += 1;
              }
              break;
-            } else if (starts(str, open, i)) {
+            }
+            if (starts(str, open, i)) {
              levels.push(pair);
              i += open.length - 1;
              break;
@@ -471,74 +495,78 @@
      return !i ? false : str.slice(0, i);
    };
    Lexer.prototype.interpolateString = function(str, options) {
-      var _len, _ref2, end, escapeQuotes, escaped, expr, heredoc, i, idx, inner, interpolated, lexer, nested, pi, push, quote, s, tag, tok, token, tokens, value;
-      _ref2 = options || {}, heredoc = _ref2.heredoc, escapeQuotes = _ref2.escapeQuotes;
-      quote = str.charAt(0);
-      if (quote !== '"' || str.length < 3) {
-        return this.token('STRING', str);
+      var _i, _len, _ref2, char, expr, heredoc, i, inner, interpolated, lexer, nested, pi, push, regex, s, tag, tok, tokens, value;
+      if (str.length < 5) {
+        return this.token('STRING', this.escapeLines(str, heredoc));
      }
+      _ref2 = options || (options = {}), heredoc = _ref2.heredoc, regex = _ref2.regex;
      lexer = new Lexer;
      tokens = [];
-      i = (pi = 1);
-      end = str.length - 1;
-      while (i < end) {
-        if (str.charAt(i) === '\\') {
+      pi = 1;
+      i = 0;
+      while (char = str.charAt(i += 1)) {
+        if (char === '\\') {
          i += 1;
-        } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) {
-          if (pi < i) {
-            s = quote + this.escapeLines(str.slice(pi, i), heredoc) + quote;
-            tokens.push(['STRING', s]);
-          }
-          inner = expr.slice(2, -1).replace(/^[ \t]*\n/, '');
-          if (inner.length) {
-            if (heredoc) {
-              inner = inner.replace(RegExp('\\\\' + quote, 'g'), quote);
-            }
-            nested = lexer.tokenize("(" + (inner) + ")", {
-              line: this.line
-            });
-            for (idx = 0, _len = nested.length; idx < _len; idx++) {
-              tok = nested[idx];
-              if (tok[0] === 'CALL_END') {
-                (tok[0] = ')');
-              }
-            }
-            nested.pop();
-            tokens.push(['TOKENS', nested]);
-          } else {
-            tokens.push(['STRING', quote + quote]);
-          }
-          i += expr.length - 1;
-          pi = i + 1;
+          continue;
        }
-        i += 1;
+        if (!(char === '#' && str.charAt(i + 1) === '{' && (expr = this.balancedString(str.slice(i + 1), [['{', '}']])))) {
+          continue;
+        }
+        if (pi < i) {
+          tokens.push(['STRING', '"' + this.escapeLines(str.slice(pi, i), heredoc) + '"']);
+        }
+        inner = expr.slice(1, -1).replace(LEADING_SPACES, '').replace(TRAILING_SPACES, '');
+        if (inner.length) {
+          if (heredoc) {
+            inner = inner.replace(/\\\"/g, '"');
+          }
+          nested = lexer.tokenize("(" + (inner) + ")", {
+            line: this.line
+          });
+          for (_i = 0, _len = nested.length; _i < _len; _i++) {
+            tok = nested[_i];
+            if (tok[0] === 'CALL_END') {
+              (tok[0] = ')');
+            }
+          }
+          nested.pop();
+          tokens.push(['TOKENS', nested]);
+        } else {
+          tokens.push(['STRING', '""']);
+        }
+        i += expr.length;
+        pi = i + 1;
      }
      if ((i > pi) && (pi < str.length - 1)) {
-        s = str.slice(pi, i).replace(MULTILINER, heredoc ? '\\n' : '');
-        tokens.push(['STRING', quote + s + quote]);
+        s = this.escapeLines(str.slice(pi, -1), heredoc);
+        tokens.push(['STRING', '"' + s + '"']);
      }
      if (tokens[0][0] !== 'STRING') {
        tokens.unshift(['STRING', '""']);
      }
-      interpolated = tokens.length > 1;
+      interpolated = !regex && tokens.length > 1;
      if (interpolated) {
        this.token('(', '(');
      }
      push = tokens.push;
      for (i = 0, _len = tokens.length; i < _len; i++) {
-        token = tokens[i];
-        _ref2 = token, tag = _ref2[0], value = _ref2[1];
-        if (tag === 'TOKENS') {
-          push.apply(this.tokens, value);
-        } else if (tag === 'STRING' && escapeQuotes) {
-          escaped = value.slice(1, -1).replace(/"/g, '\\"');
-          this.token(tag, "\"" + (escaped) + "\"");
-        } else {
-          this.token(tag, value);
-        }
-        if (i < tokens.length - 1) {
+        _ref2 = tokens[i], tag = _ref2[0], value = _ref2[1];
+        if (i) {
          this.token('+', '+');
        }
+        if (tag === 'TOKENS') {
+          push.apply(this.tokens, value);
+          continue;
+        }
+        if (regex) {
+          value = value.slice(1, -1);
+          value = value.replace(/[\\\"]/g, '\\$&');
+          if (options.heregex) {
+            value = value.replace(HEREGEX_OMIT, '');
+          }
+          value = '"' + value + '"';
+        }
+        this.token(tag, value);
      }
      if (interpolated) {
        this.token(')', ')');
@@ -587,14 +615,17 @@
  SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/;
  JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/;
  REGEX_START = /^\/([^\/])/;
-  REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/;
  REGEX_END = /^[imgy]{0,4}(?![a-zA-Z])/;
  REGEX_ESCAPE = /\\[^#]/g;
+  HEREGEX = /^\/{3}([\s\S]+?)\/{3}([imgy]{0,4})(?![A-Za-z])/;
+  HEREGEX_OMIT = /\s+(?:#.*)?/g;
  MULTILINER = /\n/g;
  NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|n(?:ot|ew)|delete|typeof|instanceof)$/;
  HEREDOC_INDENT = /\n+([ \t]*)/g;
  ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/;
  NEXT_CHARACTER = /^\s*(\S?)/;
+  LEADING_SPACES = /^\s+/;
+  TRAILING_SPACES = /\s+$/;
  COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='];
  UNARY = ['UMINUS', 'UPLUS', '!', '!!', '~', 'NEW', 'TYPEOF', 'DELETE'];
  LOGIC = ['&', '|', '^', '&&', '||'];