lexer: optimized regexes

Author: satyr
Date:   2010-09-23 13:41:53 +09:00
Parent: a04e17c4ea
Commit: 20dae3758a


@@ -33,7 +33,7 @@ exports.Lexer = class Lexer
# Before returning the token stream, run it through the [Rewriter](rewriter.html)
# unless explicitly asked not to.
tokenize: (code, options) ->
code = code.replace /(\r|\s+$)/g, ''
code = code.replace(/\r/g, '').replace /\s+$/, ''
o = options or {}
@code = code # The remainder of the source code.
@i = 0 # Current character position we're parsing.
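# Illustrative check (not part of the commit) of what the new two-step
# normalization does, assuming standard JavaScript String::replace semantics:
# carriage returns are stripped everywhere, trailing whitespace only at the end.
raw   = "line1\r\nline2\r\n  \t "
clean = raw.replace(/\r/g, '').replace /\s+$/, ''
console.log JSON.stringify clean   # "line1\nline2"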
@@ -75,7 +75,7 @@ exports.Lexer = class Lexer
# referenced as property names here, so you can still do `jQuery.is()` even
# though `is` means `===` otherwise.
identifierToken: ->
return false unless id = @match IDENTIFIER, 1
return false unless id = @match IDENTIFIER
@i += id.length
forcedIdentifier = @tagAccessor() or @match ASSIGNED, 1
tag = 'IDENTIFIER'
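# Why the capture-group index passed to @match could go away (sample inputs
# are made up): the rewritten IDENTIFIER regex has no groups, so the whole
# match is the identifier itself.
IDENTIFIER = /^[a-zA-Z_$][\w$]*/
console.log IDENTIFIER.exec('jQuery.is()')[0]   # 'jQuery'
console.log IDENTIFIER.exec '9lives'            # null: identifiers can't start with a digit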
@@ -104,7 +104,7 @@ exports.Lexer = class Lexer
# Matches numbers, including decimals, hex, and exponential notation.
# Be careful not to interfere with ranges-in-progress.
numberToken: ->
return false unless number = @match NUMBER, 1
return false unless number = @match NUMBER
return false if @tag() is '.' and starts number, '.'
@i += number.length
@token 'NUMBER', number
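# Quick sanity check (mine, not from the test suite) of the reworked NUMBER
# regex: hex, decimal, leading-dot, and exponent forms all still match whole.
NUMBER = /^(?:0x[\da-f]+)|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i
console.log NUMBER.exec(s)[0] for s in ['0xFF', '10.5', '.25', '1.5e-3']
# 0xFF, 10.5, .25, 1.5e-3 (one per line)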
@@ -117,7 +117,7 @@ exports.Lexer = class Lexer
return false unless string =
@balancedToken(['"', '"'], ['#{', '}']) or
@balancedToken ["'", "'"]
@interpolateString string.replace /\n/g, '\\\n'
@interpolateString string.replace MULTILINER, '\\\n'
@line += count string, "\n"
@i += string.length
true
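# MULTILINER is just the old inline /\n/g given a name; as a sketch (sample
# string is mine), it turns real newlines into backslash-newline continuations.
MULTILINER = /\n/g
console.log 'one\ntwo'.replace MULTILINER, '\\\n'
# prints:
# one\
# two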
@@ -126,20 +126,22 @@ exports.Lexer = class Lexer
# preserve whitespace, but ignore indentation to the left.
heredocToken: ->
return false unless match = @chunk.match HEREDOC
quote = match[1].substr 0, 1
doc = @sanitizeHeredoc match[2] or match[4] or '', {quote}
heredoc = match[0]
quote = heredoc.charAt 0
doc = @sanitizeHeredoc match[2], {quote}
@interpolateString quote + doc + quote, heredoc: yes
@line += count match[1], "\n"
@i += match[1].length
@line += count heredoc, '\n'
@i += heredoc.length
true
# Matches and consumes comments.
commentToken: ->
return false unless match = @chunk.match(COMMENT)
@line += count match[1], "\n"
@i += match[1].length
if match[2]
@token 'HERECOMMENT', @sanitizeHeredoc match[2],
return false unless match = @chunk.match COMMENT
[comment, here] = match
@line += count comment, '\n'
@i += comment.length
if here
@token 'HERECOMMENT', @sanitizeHeredoc here,
herecomment: true, indent: Array(@indent + 1).join(' ')
@token 'TERMINATOR', '\n'
true
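# Sketch of how the simplified HEREDOC regex works (example input is mine):
# the back-reference \1 lets one pattern serve both """ and ''' heredocs,
# and the body always lands in capture group 2, which is why match[2] suffices above.
HEREDOC = /^("""|''')([\s\S]*?)\n?[ \t]*\1/
console.log HEREDOC.exec('"""\n  hello\n"""')[2]   # => '\n  hello'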
@@ -148,7 +150,7 @@ exports.Lexer = class Lexer
jsToken: ->
return false unless starts @chunk, '`'
return false unless script = @balancedToken ['`', '`']
@token 'JS', script.replace JS_CLEANER, ''
@token 'JS', script.slice 1, -1
@i += script.length
true
@@ -161,18 +163,18 @@ exports.Lexer = class Lexer
return false if first[1] is ' ' and @tag() not in ['CALL_START', '=']
return false if include NOT_REGEX, @tag()
return false unless regex = @balancedToken ['/', '/']
return false unless end = @chunk.substr(regex.length).match REGEX_END
regex += flags = end[2] if end[2]
if regex.match REGEX_INTERPOLATION
str = regex.substring(1).split('/')[0]
str = str.replace REGEX_ESCAPE, (escaped) -> '\\' + escaped
@tokens = @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
return false unless end = @chunk[regex.length..].match REGEX_END
flags = end[0]
if REGEX_INTERPOLATION.test regex
str = regex.slice 1, -1
str = str.replace REGEX_ESCAPE, '\\$&'
@tokens.push ['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']
@interpolateString "\"#{str}\"", escapeQuotes: yes
@tokens.splice @tokens.length, 0, [',', ','], ['STRING', "\"#{flags}\""] if flags
@tokens.splice @tokens.length, 0, [')', ')'], [')', ')']
@tokens.push [',', ','], ['STRING', "\"#{flags}\""] if flags
@tokens.push [')', ')'], [')', ')']
else
@token 'REGEX', regex
@i += regex.length
@token 'REGEX', regex + flags
@i += regex.length + flags.length
true
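# Two conveniences the rewrite leans on (illustrative values, not from the diff):
# '$&' in a replacement string stands for the whole match, and the new
# REGEX_END always matches, so end[0] is '' when there are no flags.
REGEX_ESCAPE = /\\[^#]/g
REGEX_END    = /^[imgy]{0,4}(?![a-zA-Z])/
console.log '\\d+\\.\\d*'.replace REGEX_ESCAPE, '\\$&'   # \\d+\\.\\d*
console.log REGEX_END.exec('gi ...')[0]                  # 'gi'
console.log REGEX_END.exec(' rest')[0]                   # '' (no flags)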
# Matches a token in which the passed delimiter pairs must be correctly
@@ -191,11 +193,11 @@ exports.Lexer = class Lexer
# Keeps track of the level of indentation, because a single outdent token
# can close multiple indents, so we need to know how far in we happen to be.
lineToken: ->
return false unless indent = @match MULTI_DENT, 1
@line += count indent, "\n"
return false unless indent = @match MULTI_DENT
@line += count indent, '\n'
@i += indent.length
prev = @prev(2)
size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length
prev = @prev 2
size = indent.length - 1 - indent.lastIndexOf '\n'
nextCharacter = @match NEXT_CHARACTER, 1
noNewlines = nextCharacter is '.' or nextCharacter is ',' or @unfinished()
if size - @indebt is @indent
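# The LAST_DENTS/LAST_DENT double scan reduces to plain arithmetic: the new
# size is the run of spaces/tabs after the final newline (sample value is mine).
indent = '\n\n    '
size = indent.length - 1 - indent.lastIndexOf '\n'
console.log size   # 4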
@@ -235,13 +237,13 @@ exports.Lexer = class Lexer
@outdebt = 0
@token 'OUTDENT', dent
@outdebt -= moveOut if dent
@token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' or noNewlines
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
true
# Matches and consumes non-meaningful whitespace. Tag the previous token
# as being "spaced", because there are some cases where it makes a difference.
whitespaceToken: ->
return false unless space = @match WHITESPACE, 1
return false unless space = @match WHITESPACE
prev = @prev()
prev.spaced = true if prev
@i += space.length
@@ -264,11 +266,11 @@ exports.Lexer = class Lexer
here. `;` and newlines are both treated as a `TERMINATOR`; we distinguish
# parentheses that indicate a method call from regular parentheses, and so on.
literalToken: ->
match = @chunk.match OPERATOR
value = match and match[1]
space = match and match[2]
@tagParameters() if value and value.match CODE
value or= @chunk.substr 0, 1
if match = @chunk.match OPERATOR
[value, space] = match
@tagParameters() if CODE.test value
else
value = @chunk.charAt 0
@i += value.length
spaced = (prev = @prev()) and prev.spaced
tag = value
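# Why [value, space] = match works (made-up input): the trailing whitespace
# is now captured inside a lookahead, so it is reported without being consumed.
OPERATOR = /^(?:-[-=>]?|\+[+=]?|[*&|\/%=<>^:!?]+)(?=([ \t]*))/
[value, space] = '->  body'.match OPERATOR
console.log [value, space]   # [ '->', '  ' ]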
@@ -321,8 +323,8 @@ exports.Lexer = class Lexer
indent = options.indent
return doc if options.herecomment and not include doc, '\n'
unless options.herecomment
while (match = HEREDOC_INDENT.exec(doc)) isnt null
attempt = if match[2]? then match[2] else match[3]
while (match = HEREDOC_INDENT.exec doc)
attempt = if match[1]? then match[1] else match[2]
indent = attempt if not indent? or 0 < attempt.length < indent.length
indent or= ''
doc = doc.replace(new RegExp("^" + indent, 'gm'), '')
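# With the outer group gone from HEREDOC_INDENT, the indentation candidates
# shift from groups 2/3 to 1/2 (sample heredoc body is made up):
HEREDOC_INDENT = /\n+([ \t]*)|^([ \t]+)/g
doc = '  first\n    second'
while match = HEREDOC_INDENT.exec doc
  console.log match[1] ? match[2]
# prints '  ' (two spaces) then '    ' (four)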
@@ -519,29 +521,26 @@ RESERVED = [
JS_FORBIDDEN = JS_KEYWORDS.concat RESERVED
# Token matching regexes.
IDENTIFIER = /^([a-zA-Z\$_](\w|\$)*)/
NUMBER = /^(((\b0(x|X)[0-9a-fA-F]+)|((\b[0-9]+(\.[0-9]+)?|\.[0-9]+)(e[+\-]?[0-9]+)?)))\b/i
HEREDOC = /^("{6}|'{6}|"{3}([\s\S]*?)\n?([ \t]*)"{3}|'{3}([\s\S]*?)\n?([ \t]*)'{3})/
OPERATOR = /^(-[\-=>]?|\+[+=]?|[*&|\/%=<>^:!?]+)([ \t]*)/
WHITESPACE = /^([ \t]+)/
COMMENT = /^(###([^#][\s\S]*?)(###[ \t]*\n|(###)?$)|(\s*#(?!##[^#])[^\n]*)+)/
CODE = /^((-|=)>)/
MULTI_DENT = /^((\n([ \t]*))+)(\.)?/
LAST_DENTS = /\n([ \t]*)/g
LAST_DENT = /\n([ \t]*)/
IDENTIFIER = /^[a-zA-Z_$][\w$]*/
NUMBER = /^(?:0x[\da-f]+)|^(?:\d+(\.\d+)?|\.\d+)(?:e[+-]?\d+)?/i
HEREDOC = /^("""|''')([\s\S]*?)\n?[ \t]*\1/
OPERATOR = /^(?:-[-=>]?|\+[+=]?|[*&|\/%=<>^:!?]+)(?=([ \t]*))/
WHITESPACE = /^[ \t]+/
COMMENT = /^###([^#][\s\S]*?)(?:###[ \t]*\n|(?:###)?$)|^(?:\s*#(?!##[^#])[^\n]*)+/
CODE = /^[-=]>/
MULTI_DENT = /^(?:\n[ \t]*)+/
# Regex-matching-regexes.
REGEX_START = /^\/([^\/])/
REGEX_INTERPOLATION = /([^\\]#\{.*[^\\]\})/
REGEX_END = /^(([imgy]{1,4})\b|\W|$)/
REGEX_ESCAPE = /\\[^\$]/g
REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/
REGEX_END = /^[imgy]{0,4}(?![a-zA-Z])/
REGEX_ESCAPE = /\\[^#]/g
# Token cleaning regexes.
JS_CLEANER = /(^`|`$)/g
MULTILINER = /\n/g
NO_NEWLINE = /^([+\*&|\/\-%=<>!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/
HEREDOC_INDENT = /(\n+([ \t]*)|^([ \t]+))/g
ASSIGNED = /^\s*(([a-zA-Z\$_@]\w*|["'][^\r\n]+?["']|\d+)[ \t]*?[:=][^:=])/
NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|not|delete|typeof|instanceof)$/
HEREDOC_INDENT = /\n+([ \t]*)|^([ \t]+)/g
ASSIGNED = /^\s*((?:[a-zA-Z$_@]\w*|["'][^\n]+?["']|\d+)[ \t]*?[:=][^:=])/
NEXT_CHARACTER = /^\s*(\S)/
# Compound assignment tokens.