Revert "lexer: simplified tokenizers' responsibility"

This reverts commit a9e95fa43b.
This commit is contained in:
Jeremy Ashkenas
2010-10-22 08:13:40 -04:00
parent a9e95fa43b
commit 10442239f1
2 changed files with 131 additions and 109 deletions

View File

@@ -14,10 +14,11 @@
return Lexer;
})();
Lexer.prototype.tokenize = function(code, options) {
var i, o;
var o;
code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
o = options || {};
this.code = code;
this.i = 0;
this.line = o.line || 0;
this.indent = 0;
this.indebt = 0;
@@ -25,9 +26,8 @@
this.indents = [];
this.tokens = [];
this.seenFor = this.seenFrom = false;
i = 0;
while (this.chunk = code.slice(i)) {
i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
while (this.chunk = code.slice(this.i)) {
this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
}
this.closeIndentation();
if (o.rewrite === false) {
@@ -38,23 +38,24 @@
Lexer.prototype.identifierToken = function() {
var _ref2, colon, forcedIdentifier, id, input, match, tag;
if (!(match = IDENTIFIER.exec(this.chunk))) {
return 0;
return false;
}
input = match[0], id = match[1], colon = match[2];
this.i += input.length;
if (id === 'all' && this.tag() === 'FOR') {
this.token('ALL', id);
return 3;
return true;
}
if (id === 'from' && this.tag(1) === 'FOR') {
this.seenFor = false;
this.seenFrom = true;
this.token('FROM', id);
return 4;
return true;
}
if (id === 'to' && this.seenFrom) {
this.seenFrom = false;
this.token('TO', id);
return 2;
return true;
}
forcedIdentifier = colon || this.tagAccessor();
tag = 'IDENTIFIER';
@@ -105,32 +106,33 @@
if (colon) {
this.token(':', ':');
}
return input.length;
return true;
};
Lexer.prototype.numberToken = function() {
var match, number;
if (!(match = NUMBER.exec(this.chunk))) {
return 0;
return false;
}
number = match[0];
if (this.tag() === '.' && number.charAt(0) === '.') {
return 0;
return false;
}
this.i += number.length;
this.token('NUMBER', number);
return number.length;
return true;
};
Lexer.prototype.stringToken = function() {
var match, string;
switch (this.chunk.charAt(0)) {
case "'":
if (!(match = SIMPLESTR.exec(this.chunk))) {
return 0;
return false;
}
this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
break;
case '"':
if (!(string = this.balancedString(this.chunk, [['"', '"'], ['#{', '}']]))) {
return 0;
return false;
}
if (0 < string.indexOf('#{', 1)) {
this.interpolateString(string.slice(1, -1));
@@ -139,15 +141,16 @@
}
break;
default:
return 0;
return false;
}
this.line += count(string, '\n');
return string.length;
this.i += string.length;
return true;
};
Lexer.prototype.heredocToken = function() {
var doc, heredoc, match, quote;
if (!(match = HEREDOC.exec(this.chunk))) {
return 0;
return false;
}
heredoc = match[0];
quote = heredoc.charAt(0);
@@ -163,15 +166,17 @@
this.token('STRING', this.makeString(doc, quote, true));
}
this.line += count(heredoc, '\n');
return heredoc.length;
this.i += heredoc.length;
return true;
};
Lexer.prototype.commentToken = function() {
var comment, here, match;
if (!(match = this.chunk.match(COMMENT))) {
return 0;
return false;
}
comment = match[0], here = match[1];
this.line += count(comment, '\n');
this.i += comment.length;
if (here) {
this.token('HERECOMMENT', this.sanitizeHeredoc(here, {
herecomment: true,
@@ -179,41 +184,44 @@
}));
this.token('TERMINATOR', '\n');
}
return comment.length;
return true;
};
Lexer.prototype.jsToken = function() {
var match, script;
if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) {
return 0;
return false;
}
this.token('JS', (script = match[0]).slice(1, -1));
return script.length;
this.i += script.length;
return true;
};
Lexer.prototype.regexToken = function() {
var _ref2, match, regex;
if (this.chunk.charAt(0) !== '/') {
return 0;
return false;
}
if (match = HEREGEX.exec(this.chunk)) {
return this.heregexToken(match);
}
if ((_ref2 = this.tag(), __indexOf.call(NOT_REGEX, _ref2) >= 0)) {
return 0;
return false;
}
if (!(match = REGEX.exec(this.chunk))) {
return 0;
return false;
}
regex = match[0];
this.token('REGEX', regex === '//' ? '/(?:)/' : regex);
return regex.length;
this.i += regex.length;
return true;
};
Lexer.prototype.heregexToken = function(match) {
var _i, _len, _ref2, _ref3, _this, body, flags, heregex, re, tag, tokens, value;
heregex = match[0], body = match[1], flags = match[2];
this.i += heregex.length;
if (0 > body.indexOf('#{')) {
re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/');
this.token('REGEX', "/" + (re || '(?:)') + "/" + flags);
return heregex.length;
return true;
}
this.token('IDENTIFIER', 'RegExp');
this.tokens.push(['CALL_START', '(']);
@@ -244,32 +252,29 @@
this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
}
this.token(')', ')');
return heregex.length;
return true;
};
Lexer.prototype.lineToken = function() {
var diff, indent, match, nextCharacter, noNewlines, prev, size;
if (!(match = MULTI_DENT.exec(this.chunk))) {
return 0;
return false;
}
indent = match[0];
this.line += count(indent, '\n');
this.i += indent.length;
prev = last(this.tokens, 1);
size = indent.length - 1 - indent.lastIndexOf('\n');
nextCharacter = NEXT_CHARACTER.exec(this.chunk)[1];
noNewlines = ((nextCharacter === '.' || nextCharacter === ',') && !NEXT_ELLIPSIS.test(this.chunk)) || this.unfinished();
if (size - this.indebt === this.indent) {
if (noNewlines) {
this.suppressNewlines();
} else {
this.newlineToken();
return this.suppressNewlines();
}
return indent.length;
}
if (size > this.indent) {
return this.newlineToken(indent);
} else if (size > this.indent) {
if (noNewlines) {
this.indebt = size - this.indent;
this.suppressNewlines();
return indent.length;
return this.suppressNewlines();
}
diff = size - this.indent + this.outdebt;
this.token('INDENT', diff);
@@ -280,7 +285,7 @@
this.outdentToken(this.indent - size, noNewlines);
}
this.indent = size;
return indent.length;
return true;
};
Lexer.prototype.outdentToken = function(moveOut, noNewlines, close) {
var dent, len;
@@ -307,30 +312,33 @@
if (!(this.tag() === 'TERMINATOR' || noNewlines)) {
this.token('TERMINATOR', '\n');
}
return this;
return true;
};
Lexer.prototype.whitespaceToken = function() {
var match, nline, prev;
if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.charAt(0) === '\n'))) {
return 0;
if (!((match = WHITESPACE.exec(this.chunk)) || (nline = this.chunk.substring(0, 1) === '\n'))) {
return false;
}
prev = last(this.tokens);
if (prev) {
prev[match ? 'spaced' : 'newLine'] = true;
}
return match ? match[0].length : 0;
if (match) {
this.i += match[0].length;
}
return !!match;
};
Lexer.prototype.newlineToken = function() {
Lexer.prototype.newlineToken = function(newlines) {
if (this.tag() !== 'TERMINATOR') {
this.token('TERMINATOR', '\n');
}
return this;
return true;
};
Lexer.prototype.suppressNewlines = function() {
if (this.value() === '\\') {
this.tokens.pop();
}
return this;
return true;
};
Lexer.prototype.literalToken = function() {
var _ref2, _ref3, _ref4, _ref5, match, prev, tag, value;
@@ -342,6 +350,7 @@
} else {
value = this.chunk.charAt(0);
}
this.i += value.length;
tag = value;
prev = last(this.tokens);
if (value === '=' && prev) {
@@ -351,11 +360,13 @@
if ((_ref3 = prev[1]) === '||' || _ref3 === '&&') {
prev[0] = 'COMPOUND_ASSIGN';
prev[1] += '=';
return 1;
return true;
}
}
if (value === ';') {
tag = 'TERMINATOR';
} else if (__indexOf.call(LOGIC, value) >= 0) {
tag = 'LOGIC';
} else if (__indexOf.call(MATH, value) >= 0) {
tag = 'MATH';
} else if (__indexOf.call(COMPARE, value) >= 0) {
@@ -366,7 +377,7 @@
tag = 'UNARY';
} else if (__indexOf.call(SHIFT, value) >= 0) {
tag = 'SHIFT';
} else if (__indexOf.call(LOGIC, value) >= 0 || value === '?' && ((prev != null) ? prev.spaced : undefined)) {
} else if (value === '?' && ((prev != null) ? prev.spaced : undefined)) {
tag = 'LOGIC';
} else if (prev && !prev.spaced) {
if (value === '(' && (_ref4 = prev[0], __indexOf.call(CALLABLE, _ref4) >= 0)) {
@@ -387,7 +398,7 @@
}
}
this.token(tag, value);
return value.length;
return true;
};
Lexer.prototype.tagAccessor = function() {
var prev;
@@ -433,7 +444,7 @@
Lexer.prototype.tagParameters = function() {
var i, tok;
if (this.tag() !== ')') {
return this;
return;
}
i = this.tokens.length;
while (tok = this.tokens[--i]) {
@@ -450,7 +461,7 @@
return true;
}
}
return this;
return true;
};
Lexer.prototype.closeIndentation = function() {
return this.outdentToken(this.indent);

View File

@@ -26,8 +26,9 @@ exports.Lexer = class Lexer
# (for interpolations). When the next token has been recorded, we move forward
# within the code past the token, and begin again.
#
# Each tokenizing method is responsible for returning the number of characters
# it has consumed.
# Each tokenizing method is responsible for incrementing `@i` by the number of
# characters it has consumed. `@i` can be thought of as our finger on the page
# of source.
#
# Before returning the token stream, run it through the [Rewriter](rewriter.html)
# unless explicitly asked not to.
@@ -35,6 +36,7 @@ exports.Lexer = class Lexer
code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
o = options or {}
@code = code # The remainder of the source code.
@i = 0 # Current character position we're parsing.
@line = o.line or 0 # The current line.
@indent = 0 # The current indentation level.
@indebt = 0 # The over-indentation at the current level.
@@ -46,18 +48,17 @@ exports.Lexer = class Lexer
# At every position, run through this list of attempted matches,
# short-circuiting if any of them succeed. Their order determines precedence:
# `@literalToken` is the fallback catch-all.
i = 0
while @chunk = code.slice i
i += @identifierToken() or
@commentToken() or
@whitespaceToken() or
@lineToken() or
@heredocToken() or
@stringToken() or
@numberToken() or
@regexToken() or
@jsToken() or
@literalToken()
while @chunk = code.slice @i
@identifierToken() or
@commentToken() or
@whitespaceToken() or
@lineToken() or
@heredocToken() or
@stringToken() or
@numberToken() or
@regexToken() or
@jsToken() or
@literalToken()
@closeIndentation()
return @tokens if o.rewrite is off
(new Rewriter).rewrite @tokens
@@ -72,20 +73,21 @@ exports.Lexer = class Lexer
# referenced as property names here, so you can still do `jQuery.is()` even
# though `is` means `===` otherwise.
identifierToken: ->
return 0 unless match = IDENTIFIER.exec @chunk
return false unless match = IDENTIFIER.exec @chunk
[input, id, colon] = match
@i += input.length
if id is 'all' and @tag() is 'FOR'
@token 'ALL', id
return 3
return true
if id is 'from' and @tag(1) is 'FOR'
@seenFor = no
@seenFrom = yes
@token 'FROM', id
return 4
return true
if id is 'to' and @seenFrom
@seenFrom = no
@token 'TO', id
return 2
return true
forcedIdentifier = colon or @tagAccessor()
tag = 'IDENTIFIER'
if id in JS_KEYWORDS or
@@ -124,39 +126,41 @@ exports.Lexer = class Lexer
tag = 'BOOL'
@token tag, id
@token ':', ':' if colon
input.length
true
# Matches numbers, including decimals, hex, and exponential notation.
# Be careful not to interfere with ranges-in-progress.
numberToken: ->
return 0 unless match = NUMBER.exec @chunk
return false unless match = NUMBER.exec @chunk
number = match[0]
return 0 if @tag() is '.' and number.charAt(0) is '.'
return false if @tag() is '.' and number.charAt(0) is '.'
@i += number.length
@token 'NUMBER', number
number.length
true
# Matches strings, including multi-line strings. Ensures that quotation marks
# are balanced within the string's contents, and within nested interpolations.
stringToken: ->
switch @chunk.charAt 0
when "'"
return 0 unless match = SIMPLESTR.exec @chunk
return false unless match = SIMPLESTR.exec @chunk
@token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
when '"'
return 0 unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
return false unless string = @balancedString @chunk, [['"', '"'], ['#{', '}']]
if 0 < string.indexOf '#{', 1
@interpolateString string.slice 1, -1
else
@token 'STRING', @escapeLines string
else
return 0
return false
@line += count string, '\n'
string.length
@i += string.length
true
# Matches heredocs, adjusting indentation to the correct level, as heredocs
# preserve whitespace, but ignore indentation to the left.
heredocToken: ->
return 0 unless match = HEREDOC.exec @chunk
return false unless match = HEREDOC.exec @chunk
heredoc = match[0]
quote = heredoc.charAt 0
doc = @sanitizeHeredoc match[2], {quote, indent: null}
@@ -165,44 +169,49 @@ exports.Lexer = class Lexer
else
@token 'STRING', @makeString doc, quote, yes
@line += count heredoc, '\n'
heredoc.length
@i += heredoc.length
true
# Matches and consumes comments.
commentToken: ->
return 0 unless match = @chunk.match COMMENT
return false unless match = @chunk.match COMMENT
[comment, here] = match
@line += count comment, '\n'
@i += comment.length
if here
@token 'HERECOMMENT', @sanitizeHeredoc here,
herecomment: true, indent: Array(@indent + 1).join(' ')
@token 'TERMINATOR', '\n'
comment.length
true
# Matches JavaScript interpolated directly into the source via backticks.
jsToken: ->
return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
return false unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
@token 'JS', (script = match[0]).slice 1, -1
script.length
@i += script.length
true
# Matches regular expression literals. Lexing regular expressions is difficult
# to distinguish from division, so we borrow some basic heuristics from
# JavaScript and Ruby.
regexToken: ->
return 0 if @chunk.charAt(0) isnt '/'
return false if @chunk.charAt(0) isnt '/'
return @heregexToken match if match = HEREGEX.exec @chunk
return 0 if @tag() in NOT_REGEX
return 0 unless match = REGEX.exec @chunk
return false if @tag() in NOT_REGEX
return false unless match = REGEX.exec @chunk
[regex] = match
@token 'REGEX', if regex is '//' then '/(?:)/' else regex
regex.length
@i += regex.length
true
# Matches experimental, multiline and extended regular expression literals.
heregexToken: (match) ->
[heregex, body, flags] = match
@i += heregex.length
if 0 > body.indexOf '#{'
re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
@token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
return heregex.length
return true
@token 'IDENTIFIER', 'RegExp'
@tokens.push ['CALL_START', '(']
tokens = []
@@ -219,7 +228,7 @@ exports.Lexer = class Lexer
@tokens.push tokens...
@tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
@token ')', ')'
heregex.length
true
# Matches newlines, indents, and outdents, and determines which is which.
# If we can detect that the current line is continued onto the next line,
@@ -232,21 +241,21 @@ exports.Lexer = class Lexer
# Keeps track of the level of indentation, because a single outdent token
# can close multiple indents, so we need to know how far in we happen to be.
lineToken: ->
return 0 unless match = MULTI_DENT.exec @chunk
return false unless match = MULTI_DENT.exec @chunk
indent = match[0]
@line += count indent, '\n'
@i += indent.length
prev = last @tokens, 1
size = indent.length - 1 - indent.lastIndexOf '\n'
nextCharacter = NEXT_CHARACTER.exec(@chunk)[1]
noNewlines = (nextCharacter in ['.', ','] and not NEXT_ELLIPSIS.test(@chunk)) or @unfinished()
if size - @indebt is @indent
if noNewlines then @suppressNewlines() else @newlineToken()
return indent.length
if size > @indent
return @suppressNewlines() if noNewlines
return @newlineToken indent
else if size > @indent
if noNewlines
@indebt = size - @indent
@suppressNewlines()
return indent.length
return @suppressNewlines()
diff = size - @indent + @outdebt
@token 'INDENT', diff
@indents.push diff
@@ -255,7 +264,7 @@ exports.Lexer = class Lexer
@indebt = 0
@outdentToken @indent - size, noNewlines
@indent = size
indent.length
true
# Record an outdent token or multiple tokens, if we happen to be moving back
# inwards past several recorded indents.
@@ -277,27 +286,27 @@ exports.Lexer = class Lexer
@token 'OUTDENT', dent
@outdebt -= moveOut if dent
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
this
true
# Matches and consumes non-meaningful whitespace. Tag the previous token
# as being "spaced", because there are some cases where it makes a difference.
whitespaceToken: ->
return 0 unless (match = WHITESPACE.exec @chunk) or
(nline = @chunk.charAt(0) is '\n')
return false unless (match = WHITESPACE.exec @chunk) or nline = @chunk.substring(0, 1) is '\n'
prev = last @tokens
prev[if match then 'spaced' else 'newLine'] = true if prev
if match then match[0].length else 0
@i += match[0].length if match
!!match
# Generate a newline token. Consecutive newlines get merged together.
newlineToken: ->
newlineToken: (newlines) ->
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
this
true
# Use a `\` at a line-ending to suppress the newline.
# The slash is removed here once its job is done.
suppressNewlines: ->
@tokens.pop() if @value() is '\\'
this
true
# We treat all other single characters as a token. Eg.: `( ) , . !`
# Multi-character operators are also literal tokens, so that Jison can assign
@@ -310,21 +319,23 @@ exports.Lexer = class Lexer
@tagParameters() if CODE.test value
else
value = @chunk.charAt 0
tag = value
@i += value.length
tag = value
prev = last @tokens
if value is '=' and prev
@assignmentError() if not prev[1].reserved and prev[1] in JS_FORBIDDEN
if prev[1] in ['||', '&&']
prev[0] = 'COMPOUND_ASSIGN'
prev[1] += '='
return 1
return true
if value is ';' then tag = 'TERMINATOR'
else if value in LOGIC then tag = 'LOGIC'
else if value in MATH then tag = 'MATH'
else if value in COMPARE then tag = 'COMPARE'
else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
else if value in UNARY then tag = 'UNARY'
else if value in SHIFT then tag = 'SHIFT'
else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC'
else if value is '?' and prev?.spaced then tag = 'LOGIC'
else if prev and not prev.spaced
if value is '(' and prev[0] in CALLABLE
prev[0] = 'FUNC_EXIST' if prev[0] is '?'
@@ -335,7 +346,7 @@ exports.Lexer = class Lexer
when '?' then prev[0] = 'INDEX_SOAK'
when '::' then prev[0] = 'INDEX_PROTO'
@token tag, value
value.length
true
# Token Manipulators
# ------------------
@@ -350,7 +361,7 @@ exports.Lexer = class Lexer
else if prev[1] is '.' and @value(1) isnt '.'
if @tag(1) is '?'
@tag 0, 'SOAK_ACCESS'
@tokens.splice -2, 1
@tokens.splice(-2, 1)
else
@tag 0, 'PROPERTY_ACCESS'
else
@@ -374,14 +385,14 @@ exports.Lexer = class Lexer
# definitions versus argument lists in function calls. Walk backwards, tagging
# parameters specially in order to make things easier for the parser.
tagParameters: ->
return this if @tag() isnt ')'
return if @tag() isnt ')'
i = @tokens.length
while tok = @tokens[--i]
switch tok[0]
when 'IDENTIFIER' then tok[0] = 'PARAM'
when ')' then tok[0] = 'PARAM_END'
when '(', 'CALL_START' then tok[0] = 'PARAM_START'; return true
this
true
# Close up all remaining open blocks at the end of the file.
closeIndentation: ->