From c605b3e232e4479fbb346a506d5dc8bcfd31fdf5 Mon Sep 17 00:00:00 2001 From: satyr Date: Mon, 4 Oct 2010 08:22:42 +0900 Subject: [PATCH] first stub at heregex --- lib/lexer.js | 175 +++++++++++++++++++++++---------------- src/lexer.coffee | 126 ++++++++++++++++------------ test/test_regexps.coffee | 10 +++ 3 files changed, 185 insertions(+), 126 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index e5813690..6cf74b84 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -1,13 +1,12 @@ (function() { - var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_INTERPOLATION, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, UNARY, WHITESPACE, _ref, compact, count, include, last, starts; - var __slice = Array.prototype.slice; + var ASSIGNED, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, CONVERSIONS, HEREDOC, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_SPACES, LINE_BREAK, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NEXT_CHARACTER, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX_END, REGEX_ESCAPE, REGEX_START, RESERVED, Rewriter, SHIFT, SIMPLESTR, TRAILING_SPACES, UNARY, WHITESPACE, _ref, compact, count, include, last, starts; Rewriter = require('./rewriter').Rewriter; _ref = require('./helpers'), include = _ref.include, count = _ref.count, starts = _ref.starts, compact = _ref.compact, last = _ref.last; exports.Lexer = (function() { Lexer = function() {}; Lexer.prototype.tokenize = function(code, options) { var o; - code = code.replace(/\r/g, '').replace(/\s+$/, ''); + code = code.replace(/\r/g, '').replace(TRAILING_SPACES, ''); o = options || {}; this.code = code; this.i = 0; @@ -101,10 +100,14 @@ this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n')); break; case '"': - if (!(string = this.balancedToken(['"', '"'], ['#{', '}']))) { + if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) { return false; } - this.interpolateString(string); + if (~string.indexOf('#{')) { + this.interpolateString(string); + } else { + this.token('STRING', this.escapeLines(string)); + } break; default: return false; @@ -115,7 +118,7 @@ }; Lexer.prototype.heredocToken = function() { var doc, heredoc, match, quote; - if (!(match = this.chunk.match(HEREDOC))) { + if (!(match = HEREDOC.exec(this.chunk))) { return false; } heredoc = match[0]; @@ -124,12 +127,12 @@ quote: quote, indent: null }); - if (quote === '"') { + if (quote === '"' && ~doc.indexOf('#{')) { this.interpolateString(quote + doc + quote, { heredoc: true }); } else { - this.token('STRING', quote + doc + quote); + this.token('STRING', quote + this.escapeLines(doc, true) + quote); } this.line += count(heredoc, '\n'); this.i += heredoc.length; @@ -162,8 +165,14 @@ return true; }; Lexer.prototype.regexToken = function() { - var _ref2, end, first, flags, regex, str; - if (!(first = this.chunk.match(REGEX_START))) { + var _ref2, end, first, flags, match, regex, str; + if (this.chunk.charAt(0) !== '/') { + return false; + } + if (match = HEREGEX.exec(this.chunk)) { + return this.heregexToken(match); + } + if (!(first = REGEX_START.exec(this.chunk))) { return false; } if (first[1] === ' ' && !('CALL_START' === (_ref2 = this.tag()) || '=' === _ref2)) { @@ -172,34 +181,48 @@ 
if (include(NOT_REGEX, this.tag())) { return false; } - if (!(regex = this.balancedToken(['/', '/']))) { + if (!(regex = this.balancedString(this.chunk, [['/', '/']]))) { return false; } if (!(end = this.chunk.slice(regex.length).match(REGEX_END))) { return false; } flags = end[0]; - if (REGEX_INTERPOLATION.test(regex)) { + if (~regex.indexOf('#{')) { str = regex.slice(1, -1); - str = str.replace(REGEX_ESCAPE, '\\$&'); - this.tokens.push(['(', '('], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']); + this.tokens.push(['IDENTIFIER', 'RegExp'], ['CALL_START', '(']); this.interpolateString("\"" + (str) + "\"", { - escapeQuotes: true + regex: true }); if (flags) { this.tokens.push([',', ','], ['STRING', ("\"" + (flags) + "\"")]); } - this.tokens.push([')', ')'], [')', ')']); + this.tokens.push(['CALL_END', ')']); } else { this.token('REGEX', regex + flags); } this.i += regex.length + flags.length; return true; }; - Lexer.prototype.balancedToken = function() { - var delimited; - delimited = __slice.call(arguments, 0); - return this.balancedString(this.chunk, delimited); + Lexer.prototype.heregexToken = function(match) { + var _ref2, body, flags, heregex; + _ref2 = match, heregex = _ref2[0], body = _ref2[1], flags = _ref2[2]; + this.i += heregex.length; + if (!(~body.indexOf('#{'))) { + this.token('REGEX', '/' + body.replace(HEREGEX_OMIT, '') + '/' + flags); + return true; + } + this.token('IDENTIFIER', 'RegExp'); + this.tokens.push(['CALL_START', '(']); + this.interpolateString("\"" + (body) + "\"", { + regex: true, + heregex: true + }); + if (flags) { + this.tokens.push([',', ','], ['STRING', '"' + flags + '"']); + } + this.tokens.push(['CALL_END', ')']); + return true; }; Lexer.prototype.lineToken = function() { var diff, indent, match, nextCharacter, noNewlines, prev, size; @@ -450,7 +473,8 @@ i += 1; } break; - } else if (starts(str, open, i)) { + } + if (starts(str, open, i)) { levels.push(pair); i += open.length - 1; break; @@ -471,74 +495,78 @@ return !i ? 
false : str.slice(0, i); }; Lexer.prototype.interpolateString = function(str, options) { - var _len, _ref2, end, escapeQuotes, escaped, expr, heredoc, i, idx, inner, interpolated, lexer, nested, pi, push, quote, s, tag, tok, token, tokens, value; - _ref2 = options || {}, heredoc = _ref2.heredoc, escapeQuotes = _ref2.escapeQuotes; - quote = str.charAt(0); - if (quote !== '"' || str.length < 3) { - return this.token('STRING', str); + var _i, _len, _ref2, char, expr, heredoc, i, inner, interpolated, lexer, nested, pi, push, regex, s, tag, tok, tokens, value; + if (str.length < 5) { + return this.token('STRING', this.escapeLines(str, heredoc)); } + _ref2 = options || (options = {}), heredoc = _ref2.heredoc, regex = _ref2.regex; lexer = new Lexer; tokens = []; - i = (pi = 1); - end = str.length - 1; - while (i < end) { - if (str.charAt(i) === '\\') { + pi = 1; + i = 0; + while (char = str.charAt(i += 1)) { + if (char === '\\') { i += 1; - } else if (expr = this.balancedString(str.slice(i), [['#{', '}']])) { - if (pi < i) { - s = quote + this.escapeLines(str.slice(pi, i), heredoc) + quote; - tokens.push(['STRING', s]); - } - inner = expr.slice(2, -1).replace(/^[ \t]*\n/, ''); - if (inner.length) { - if (heredoc) { - inner = inner.replace(RegExp('\\\\' + quote, 'g'), quote); - } - nested = lexer.tokenize("(" + (inner) + ")", { - line: this.line - }); - for (idx = 0, _len = nested.length; idx < _len; idx++) { - tok = nested[idx]; - if (tok[0] === 'CALL_END') { - (tok[0] = ')'); - } - } - nested.pop(); - tokens.push(['TOKENS', nested]); - } else { - tokens.push(['STRING', quote + quote]); - } - i += expr.length - 1; - pi = i + 1; + continue; } - i += 1; + if (!(char === '#' && str.charAt(i + 1) === '{' && (expr = this.balancedString(str.slice(i + 1), [['{', '}']])))) { + continue; + } + if (pi < i) { + tokens.push(['STRING', '"' + this.escapeLines(str.slice(pi, i), heredoc) + '"']); + } + inner = expr.slice(1, -1).replace(LEADING_SPACES, '').replace(TRAILING_SPACES, ''); + if (inner.length) { + if (heredoc) { + inner = inner.replace(/\\\"/g, '"'); + } + nested = lexer.tokenize("(" + (inner) + ")", { + line: this.line + }); + for (_i = 0, _len = nested.length; _i < _len; _i++) { + tok = nested[_i]; + if (tok[0] === 'CALL_END') { + (tok[0] = ')'); + } + } + nested.pop(); + tokens.push(['TOKENS', nested]); + } else { + tokens.push(['STRING', '""']); + } + i += expr.length; + pi = i + 1; } if ((i > pi) && (pi < str.length - 1)) { - s = str.slice(pi, i).replace(MULTILINER, heredoc ? 
'\\n' : ''); - tokens.push(['STRING', quote + s + quote]); + s = this.escapeLines(str.slice(pi, -1), heredoc); + tokens.push(['STRING', '"' + s + '"']); } if (tokens[0][0] !== 'STRING') { tokens.unshift(['STRING', '""']); } - interpolated = tokens.length > 1; + interpolated = !regex && tokens.length > 1; if (interpolated) { this.token('(', '('); } push = tokens.push; for (i = 0, _len = tokens.length; i < _len; i++) { - token = tokens[i]; - _ref2 = token, tag = _ref2[0], value = _ref2[1]; - if (tag === 'TOKENS') { - push.apply(this.tokens, value); - } else if (tag === 'STRING' && escapeQuotes) { - escaped = value.slice(1, -1).replace(/"/g, '\\"'); - this.token(tag, "\"" + (escaped) + "\""); - } else { - this.token(tag, value); - } - if (i < tokens.length - 1) { + _ref2 = tokens[i], tag = _ref2[0], value = _ref2[1]; + if (i) { this.token('+', '+'); } + if (tag === 'TOKENS') { + push.apply(this.tokens, value); + continue; + } + if (regex) { + value = value.slice(1, -1); + value = value.replace(/[\\\"]/g, '\\$&'); + if (options.heregex) { + value = value.replace(HEREGEX_OMIT, ''); + } + value = '"' + value + '"'; + } + this.token(tag, value); } if (interpolated) { this.token(')', ')'); @@ -587,14 +615,17 @@ SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/; JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/; REGEX_START = /^\/([^\/])/; - REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/; REGEX_END = /^[imgy]{0,4}(?![a-zA-Z])/; REGEX_ESCAPE = /\\[^#]/g; + HEREGEX = /^\/{3}([\s\S]+?)\/{3}([imgy]{0,4})(?![A-Za-z])/; + HEREGEX_OMIT = /\s+(?:#.*)?/g; MULTILINER = /\n/g; NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|n(?:ot|ew)|delete|typeof|instanceof)$/; HEREDOC_INDENT = /\n+([ \t]*)/g; ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/; NEXT_CHARACTER = /^\s*(\S?)/; + LEADING_SPACES = /^\s+/; + TRAILING_SPACES = /\s+$/; COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|=']; UNARY = ['UMINUS', 'UPLUS', '!', '!!', '~', 'NEW', 'TYPEOF', 'DELETE']; LOGIC = ['&', '|', '^', '&&', '||']; diff --git a/src/lexer.coffee b/src/lexer.coffee index 4d9c8bd9..e0dc758a 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -33,7 +33,7 @@ exports.Lexer = class Lexer # Before returning the token stream, run it through the [Rewriter](rewriter.html) # unless explicitly asked not to. tokenize: (code, options) -> - code = code.replace(/\r/g, '').replace /\s+$/, '' + code = code.replace(/\r/g, '').replace TRAILING_SPACES, '' o = options or {} @code = code # The remainder of the source code. @i = 0 # Current character position we're parsing. @@ -124,8 +124,11 @@ exports.Lexer = class Lexer return false unless match = SIMPLESTR.exec @chunk @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n' when '"' - return false unless string = @balancedToken ['"', '"'], ['#{', '}'] - @interpolateString string + return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']] + if ~string.indexOf '#{' + @interpolateString string + else + @token 'STRING', @escapeLines string else return false @line += count string, '\n' @@ -135,14 +138,14 @@ exports.Lexer = class Lexer # Matches heredocs, adjusting indentation to the correct level, as heredocs # preserve whitespace, but ignore indentation to the left. 
heredocToken: -> - return false unless match = @chunk.match HEREDOC + return false unless match = HEREDOC.exec @chunk heredoc = match[0] quote = heredoc.charAt 0 doc = @sanitizeHeredoc match[2], {quote, indent: null} - if quote is '"' + if quote is '"' and ~doc.indexOf '#{' @interpolateString quote + doc + quote, heredoc: yes else - @token 'STRING', quote + doc + quote + @token 'STRING', quote + @escapeLines(doc, yes) + quote @line += count heredoc, '\n' @i += heredoc.length true @@ -168,31 +171,41 @@ exports.Lexer = class Lexer # Matches regular expression literals. Lexing regular expressions is difficult # to distinguish from division, so we borrow some basic heuristics from - # JavaScript and Ruby, borrow slash balancing from `@balancedToken`, and + # JavaScript and Ruby, borrow slash balancing from `@balancedString`, and # borrow interpolation from `@interpolateString`. regexToken: -> - return false unless first = @chunk.match REGEX_START + return false if @chunk.charAt(0) isnt '/' + return @heregexToken match if match = HEREGEX.exec @chunk + return false unless first = REGEX_START.exec @chunk return false if first[1] is ' ' and @tag() not in ['CALL_START', '='] return false if include NOT_REGEX, @tag() - return false unless regex = @balancedToken ['/', '/'] + return false unless regex = @balancedString @chunk, [['/', '/']] return false unless end = @chunk[regex.length..].match REGEX_END flags = end[0] - if REGEX_INTERPOLATION.test regex + if ~regex.indexOf '#{' str = regex.slice 1, -1 - str = str.replace REGEX_ESCAPE, '\\$&' - @tokens.push ['(', '('], ['IDENTIFIER', 'RegExp'], ['CALL_START', '('] - @interpolateString "\"#{str}\"", escapeQuotes: yes + @tokens.push ['IDENTIFIER', 'RegExp'], ['CALL_START', '('] + @interpolateString "\"#{str}\"", regex: yes @tokens.push [',', ','], ['STRING', "\"#{flags}\""] if flags - @tokens.push [')', ')'], [')', ')'] + @tokens.push ['CALL_END', ')'] else @token 'REGEX', regex + flags @i += regex.length + flags.length true - # Matches a token in which the passed delimiter pairs must be correctly - # balanced (ie. strings, JS literals). - balancedToken: (delimited...) -> - @balancedString @chunk, delimited + # Matches experimental, multiline and extended regular expression literals. + heregexToken: (match) -> + [heregex, body, flags] = match + @i += heregex.length + unless ~body.indexOf '#{' + @token 'REGEX', '/' + body.replace(HEREGEX_OMIT, '') + '/' + flags + return true + @token 'IDENTIFIER', 'RegExp' + @tokens.push ['CALL_START', '('] + @interpolateString "\"#{body}\"", regex: yes, heregex: yes + @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags + @tokens.push ['CALL_END', ')'] + true # Matches newlines, indents, and outdents, and determines which is which. # If we can detect that the current line is continued onto the the next line, @@ -399,7 +412,7 @@ exports.Lexer = class Lexer i += close.length - 1 i += 1 unless levels.length break - else if starts str, open, i + if starts str, open, i levels.push(pair) i += open.length - 1 break @@ -419,49 +432,50 @@ exports.Lexer = class Lexer # new Lexer, tokenize the interpolated contents, and merge them into the # token stream. 
interpolateString: (str, options) -> - {heredoc, escapeQuotes} = options or {} - quote = str.charAt 0 - return @token 'STRING', str if quote isnt '"' or str.length < 3 + return @token 'STRING', @escapeLines(str, heredoc) if str.length < 5 # "#{}" + {heredoc, regex} = options or= {} lexer = new Lexer tokens = [] - i = pi = 1 - end = str.length - 1 - while i < end - if str.charAt(i) is '\\' + pi = 1 + i = 0 + while char = str.charAt i += 1 + if char is '\\' i += 1 - else if expr = @balancedString str[i..], [['#{', '}']] - if pi < i - s = quote + @escapeLines(str[pi...i], heredoc) + quote - tokens.push ['STRING', s] - inner = expr.slice(2, -1).replace /^[ \t]*\n/, '' - if inner.length - inner = inner.replace RegExp('\\\\' + quote, 'g'), quote if heredoc - nested = lexer.tokenize "(#{inner})", line: @line - (tok[0] = ')') for tok, idx in nested when tok[0] is 'CALL_END' - nested.pop() - tokens.push ['TOKENS', nested] - else - tokens.push ['STRING', quote + quote] - i += expr.length - 1 - pi = i + 1 - i += 1 + continue + unless char is '#' and str.charAt(i+1) is '{' and + (expr = @balancedString str[i+1..], [['{', '}']]) + continue + if pi < i + tokens.push ['STRING', '"' + @escapeLines(str[pi...i], heredoc) + '"'] + inner = expr.slice(1, -1).replace(LEADING_SPACES, '').replace(TRAILING_SPACES, '') + if inner.length + inner = inner.replace /\\\"/g, '"' if heredoc + nested = lexer.tokenize "(#{inner})", line: @line + (tok[0] = ')') for tok in nested when tok[0] is 'CALL_END' + nested.pop() + tokens.push ['TOKENS', nested] + else + tokens.push ['STRING', '""'] + i += expr.length + pi = i + 1 if i > pi < str.length - 1 - s = str[pi...i].replace MULTILINER, if heredoc then '\\n' else '' - tokens.push ['STRING', quote + s + quote] + s = @escapeLines str.slice(pi, -1), heredoc + tokens.push ['STRING', '"' + s + '"'] tokens.unshift ['STRING', '""'] unless tokens[0][0] is 'STRING' - interpolated = tokens.length > 1 + interpolated = not regex and tokens.length > 1 @token '(', '(' if interpolated {push} = tokens - for token, i in tokens - [tag, value] = token + for [tag, value], i in tokens + @token '+', '+' if i if tag is 'TOKENS' push.apply @tokens, value - else if tag is 'STRING' and escapeQuotes - escaped = value.slice(1, -1).replace(/"/g, '\\"') - @token tag, "\"#{escaped}\"" - else - @token tag, value - @token '+', '+' if i < tokens.length - 1 + continue + if regex + value = value.slice 1, -1 + value = value.replace /[\\\"]/g, '\\$&' + value = value.replace HEREGEX_OMIT, '' if options.heregex + value = '"' + value + '"' + @token tag, value @token ')', ')' if interpolated tokens @@ -544,16 +558,20 @@ JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/ # Regex-matching-regexes. REGEX_START = /^\/([^\/])/ -REGEX_INTERPOLATION = /[^\\]#\{.*[^\\]\}/ REGEX_END = /^[imgy]{0,4}(?![a-zA-Z])/ REGEX_ESCAPE = /\\[^#]/g +HEREGEX = /^\/{3}([\s\S]+?)\/{3}([imgy]{0,4})(?![A-Za-z])/ +HEREGEX_OMIT = /\s+(?:#.*)?/g + # Token cleaning regexes. MULTILINER = /\n/g NO_NEWLINE = /^(?:[-+*&|\/%=<>!.\\][<>=&|]*|and|or|is(?:nt)?|n(?:ot|ew)|delete|typeof|instanceof)$/ HEREDOC_INDENT = /\n+([ \t]*)/g ASSIGNED = /^\s*@?[$A-Za-z_][$\w]*[ \t]*?[:=][^:=>]/ NEXT_CHARACTER = /^\s*(\S?)/ +LEADING_SPACES = /^\s+/ +TRAILING_SPACES = /\s+$/ # Compound assignment tokens. 
COMPOUND_ASSIGN = ['-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='] diff --git a/test/test_regexps.coffee b/test/test_regexps.coffee index 4b63f67a..77a1466e 100644 --- a/test/test_regexps.coffee +++ b/test/test_regexps.coffee @@ -26,3 +26,13 @@ regexp = / / ok ' '.match regexp ok (obj.width()/id - obj.height()/id) is -5 + +eq /^I'm\s+Heregex?/gim + '', /// + ^ I'm \s+ Heregex? # or not +///gim + '' +eq '\\\\#{}', /// + #{ + "#{ '\\' }" + } + \#{} +///.source
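
Usage sketch (illustrative only, not part of the patch): judging from the new HEREGEX / HEREGEX_OMIT patterns, heregexToken, and the tests added above, the /// ... /// form should behave roughly as below. NUMBER, prefix, and dyn are hypothetical names introduced for the example, in the same style as the eq/ok assertions in test_regexps.coffee.

# Whitespace runs and trailing # comments inside /// ... /// are stripped by
# HEREGEX_OMIT, so this multiline heregex should lex to the same REGEX token
# as the single-line literal it is compared against.
NUMBER = ///
  ^ 0x[\da-f]+ |              # hex
  ^ \d*\.?\d+ (?:e[+-]?\d+)?  # decimal
///i
eq NUMBER + '', /^0x[\da-f]+|^\d*\.?\d+(?:e[+-]?\d+)?/i + ''

# A body containing #{...} is instead emitted as a RegExp(...) call
# (roughly RegExp("" + prefix + "\\d+", "g") in the compiled JavaScript),
# mirroring how interpolated string literals are handled.
prefix = 'id-'
dyn    = ///#{prefix}\d+///g
ok dyn.test 'id-42'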