Files
meteor/packages/jsparse/parser.js
David Greenspan fb183e1e5c a few name changes
2012-09-11 16:49:47 -07:00

605 lines
22 KiB
JavaScript

///// JAVASCRIPT PARSER
// XXX unit tests
// What we don't have from ECMA-262 5.1:
// - object literal trailing comma
// - object literal get/set
var parse = function (tokenizer) {
// Zero-width parser used by restricted productions (postfix `++`/`--`,
// `return`, `continue`, `break`, `throw`): it matches with an empty
// result unless the tokenizer reports `isLineTerminatorHere`, in which
// case it fails with null.
var noLineTerminatorHere = describe(
  'noLineTerminator',
  function (t) {
    if (t.isLineTerminatorHere)
      return null;
    return [];
  });
// Wraps `parser` so that its result array is post-processed:
//  - a failed parse (falsy result) stays null;
//  - a one-item array collapses to that single item;
//  - any other array becomes a node named `name`.
// The wrapping is done with `revalue`, so this works on parsers.
var nameIfMultipart = function (name, parser) {
  var collapse = function (parts) {
    if (parts) {
      if (parts.length === 1)
        return parts[0];
      return named(name, parts);
    }
    return null;
  };
  return revalue(parser, collapse);
};
// These "pointers" allow grammar circularity, i.e. accessing
// later parsers from earlier ones. The real parser (`expressionFunc`,
// defined further down) is only looked up when the pointer is run.
var expressionPtrFunc = function (noIn) {
  var forward = function (t) {
    var target = expressionFunc(noIn);
    return target(t);
  };
  return describe("expression", forward);
};
// Pointer to the ordinary expression parser (noIn = false).
var expressionPtr = expressionPtrFunc(false);
// Forward reference to `assignmentExpressionFunc`, which is defined
// later in this function; the lookup happens at parse time, so the
// pointer can be used while building earlier parsers.
var assignmentExpressionPtrFunc = function (noIn) {
  var forward = function (t) {
    var target = assignmentExpressionFunc(noIn);
    return target(t);
  };
  return describe("expression", forward);
};
// Pointer to the ordinary assignment-expression parser (noIn = false).
var assignmentExpressionPtr = assignmentExpressionPtrFunc(false);
// Forward reference to `functionBody` (defined near the end of this
// function); resolved only when the pointer is run.
var functionBodyPtr = describe(
  "functionBody",
  function (t) {
    var body = functionBody(t);
    return body;
  });
// Forward reference to `statement` (defined after all the individual
// statement parsers); resolved only when the pointer is run.
var statementPtr = describe(
  "statement",
  function (t) {
    var stmnt = statement(t);
    return stmnt;
  });
// Array literal, including elisions: `[a, b]`, `[,,a]`, `[1,2,3,,,,,,]`.
// Produces a node named 'array'.
var arrayLiteral = (function () {
  var open = token('[');
  // any commas directly after the `[`
  var leadingElisions = unpack(opt(list(token(','))));
  // Count a peeked-at ']' as an expression to support elisions at
  // the end, e.g. `[1,2,3,,,,,,]`. Because it's unpacked, the
  // look-ahead won't show up in the parse tree.
  var elementOrEnd = describe(
    'expression',
    or(assignmentExpressionPtr,
       unpack(lookAheadToken(']'))));
  // The list separator is one or more commas, to support elision.
  var separator = unpack(list(token(',')));
  var elements = unpack(
    opt(list(elementOrEnd, separator),
        lookAheadToken(']')));
  var close = token(']');
  return named('array', seq(open, leadingElisions, elements, close));
})();
// A property key inside an object literal: an identifier, a number
// or a string token, each wrapped in its own node name.
var propertyName = (function () {
  var idName = named('idPropName', tokenClass('IDENTIFIER'));
  var numName = named('numPropName', tokenClass('NUMBER'));
  var strName = named('strPropName', tokenClass('STRING'));
  return describe('propertyName', or(idName, numName, strName));
})();
// One `key: value` entry; produces a node named 'prop'.
var nameColonValue = (function () {
  var entry = seq(propertyName, token(':'), assignmentExpressionPtr);
  return describe('name:value', named('prop', entry));
})();
// Object literal: `{` comma-separated entries `}`. The entry list is
// optional, with a look-ahead for `}` as the alternative. Produces a
// node named 'object'. (No trailing comma and no get/set, as noted
// at the top of the file.)
var objectLiteral = (function () {
  var open = token('{');
  var entries = unpack(
    opt(list(nameColonValue, token(',')),
        lookAheadToken('}')));
  var close = token('}');
  return named('object', seq(open, entries, close));
})();
// Builds the token sequence shared by function declarations and
// function expressions:
//   function [name] ( params ) { body }
// When `nameRequired` is falsy the name may be missing, in which case
// a look-ahead for `(` is revalued to a 'nil' placeholder node.
// Not memoized; only call at construction time.
var functionFunc = function (nameRequired) {
  var keyword = token('function');
  var name;
  if (nameRequired) {
    name = tokenClass('IDENTIFIER');
  } else {
    name = or(tokenClass('IDENTIFIER'),
              revalue(lookAheadToken('('), named('nil', [])));
  }
  var openParen = token('(');
  var params = unpack(
    opt(list(tokenClass('IDENTIFIER'), token(',')),
        lookAheadToken(')')));
  var closeParen = token(')');
  var openBrace = token('{');
  var body = unpack(functionBodyPtr);
  var closeBrace = token('}');
  return seq(keyword, name, openParen, params, closeParen,
             openBrace, body, closeBrace);
};
// Function expression: the function sequence with an optional name,
// wrapped in a node named 'functionExpr'.
var functionExpression = named(
  'functionExpr',
  functionFunc(false));
// The "core" of an expression: `this`, an identifier, a literal, a
// parenthesized expression, an array or object literal, or a function
// expression. Alternatives are tried in the order listed.
var primaryOrFunctionExpression = (function () {
  var thisExpr = named('this', token('this'));
  var identifierExpr = named('identifier', tokenClass('IDENTIFIER'));
  var numberLit = named('number', tokenClass('NUMBER'));
  var booleanLit = named('boolean', tokenClass('BOOLEAN'));
  var nullLit = named('null', tokenClass('NULL'));
  var regexLit = named('regex', tokenClass('REGEX'));
  var stringLit = named('string', tokenClass('STRING'));
  var parenthesized = named(
    'parens',
    seq(token('('), expressionPtr, token(')')));
  return describe(
    'expression',
    or(thisExpr, identifierExpr, numberLit, booleanLit, nullLit,
       regexLit, stringLit, parenthesized,
       arrayLiteral, objectLiteral, functionExpression));
})();
// `.name` suffix of a member access.
var dotEnding = seq(
  token('.'),
  tokenClass('IDENTIFIER'));
// `[expression]` suffix of a computed member access.
var bracketEnding = seq(
  token('['),
  expressionPtr,
  token(']'));
// `(arg, arg, ...)` suffix of a call; the argument list is optional,
// with a look-ahead for `)` as the alternative.
var callArgs = (function () {
  var open = token('(');
  var args = unpack(
    opt(list(assignmentExpressionPtr, token(',')),
        lookAheadToken(')')));
  var close = token(')');
  return seq(open, args, close);
})();
// The `new` keyword, consumed one at a time by lhsExpression.
var newKeyword = token('new');
// This is a completely equivalent refactor of the spec's production
// for a LeftHandSideExpression.
//
// An lhsExpression is basically an expression that can serve as the
// left-hand-side of an assignment, though function calls and "new"
// invocations are included because they have the same precedence.
// Actually, the spec technically allows a function call to "return"
// a valid l-value, as in `foo(bar) = baz`, though no built-in or
// user-specifiable call has this property (it would have to be
// defined by a browser or other "host").
//
// Returns null when nothing matches and no `new` was consumed.
// Every node returned gets `lhs = true` set on it, for the benefit
// of assignmentExpression.
var lhsExpression = describe(
  'expression',
  function (t) {
    // Collect all leading `new` keywords; we don't know yet which of
    // them will be paired with an argument list.
    var pendingNews = [];
    for (var kw = newKeyword(t); kw; kw = newKeyword(t))
      pendingNews.push(kw);

    // Read the primaryOrFunctionExpression at the core of this
    // expression. Once a `new` has been consumed we are committed:
    // the core must match or it's an error.
    var node = runMaybeRequired(
      pendingNews.length, primaryOrFunctionExpression,
      t, pendingNews[pendingNews.length - 1]);
    if (! node)
      return null;

    // Apply each `.foo`, `[...]` or `(...)` add-on as it appears.
    // A call consumes one pending `new` (innermost first) if there
    // is one, becoming a 'newcall' instead of a 'call'.
    for (;;) {
      var ending = dotEnding(t);
      if (ending) {
        node = named('dot', [node].concat(ending));
        continue;
      }
      ending = bracketEnding(t);
      if (ending) {
        node = named('bracket', [node].concat(ending));
        continue;
      }
      ending = callArgs(t);
      if (! ending)
        break;
      node = pendingNews.length ?
        named('newcall', [pendingNews.pop(), node].concat(ending)) :
        named('call', [node].concat(ending));
    }

    // Leftover `new` keywords have no argument list; this is how
    // paren-less constructions (`new Date`) and things like
    // `new new foo().bar` are parsed.
    while (pendingNews.length)
      node = named('new', [pendingNews.pop(), node]);

    node.lhs = true;
    return node;
  });
// The postfix operators, as a consuming parser and as a look-ahead.
var postfixToken = token('++ --');
var postfixLookahead = lookAheadToken('++ --');
// An lhsExpression optionally followed by `++` or `--`. The operator
// is only attempted when noLineTerminatorHere succeeds. A bare
// lhsExpression is returned as-is; with an operator the result is a
// node named 'postfix' (see nameIfMultipart).
var postfixExpression = (function () {
  var operatorOnSameLine = lookAhead(
    noLineTerminatorHere,
    lookAhead(postfixLookahead, postfixToken));
  var operandThenOperator = seq(
    lhsExpression,
    unpack(opt(operatorOnSameLine)));
  return describe(
    'expression',
    nameIfMultipart('postfix', operandThenOperator));
})();
// Zero or more prefix operators. The code below relies on this
// yielding an array (it reads `.length`) even when none are present.
var unaryList = opt(
  list(
    or(token('delete void typeof'),
       preSlashToken('++ -- + - ~ !', false))));
// A postfixExpression preceded by any number of prefix operators.
// Each operator wraps the operand in a node named 'unary', innermost
// operator first, so `! - x` becomes unary(!, unary(-, x)).
var unaryExpression = describe(
  'expression',
  function (t) {
    var prefixes = unaryList(t);
    // Once a prefix operator has been consumed we are committed and
    // have to match an expression or error.
    var operand = runMaybeRequired(
      prefixes.length, postfixExpression,
      t, prefixes[prefixes.length - 1]);
    if (! operand)
      return null;
    while (prefixes.length > 0) {
      var operator = prefixes.pop();
      operand = named('unary', [operator, operand]);
    }
    return operand;
  });
// Memoizes a function of one boolean flag.
//
// `func` is called at most once for a truthy flag and at most once
// for a falsy flag (always with a real `true` / `false`), and its
// result is reused afterwards. An explicit "computed" marker is kept
// per slot so that falsy results (null, 0, '') are cached too,
// instead of re-running `func` on every call. If `func` throws,
// nothing is cached and the next call tries again.
var memoizeBooleanFunc = function (func) {
  var computed = [false, false];
  var results = [undefined, undefined];
  return function (flag) {
    var slot = flag ? 1 : 0;
    if (! computed[slot]) {
      results[slot] = func(slot === 1);
      computed[slot] = true;
    }
    return results[slot];
  };
};
// Binary operator expressions; actually this is the spec's
// LogicalORExpression. With `noIn` set, the `in` operator is left out
// of the relational level.
var binaryExpressionFunc = memoizeBooleanFunc(
  function (noIn) {
    // operator levels, from high to low precedence
    var levels = [];
    levels.push(token('* / %'));
    levels.push(token('+ -'));
    levels.push(token('<< >> >>>'));
    levels.push(or(token('< > <= >='),
                   token(noIn ? 'instanceof' : 'instanceof in')));
    levels.push(token('== != === !=='));
    levels.push(token('&'));
    levels.push(token('^'));
    levels.push(token('|'));
    levels.push(token('&&'));
    levels.push(token('||'));
    return describe(
      'expression',
      binaryLeft(unaryExpression, levels));
  });
// The ordinary variant, where `in` is allowed.
var binaryExpression = binaryExpressionFunc(false);
// Conditional (`a ? b : c`) expressions. A bare binary expression is
// returned as-is; with the `? :` part present the result is a node
// named 'ternary' (see nameIfMultipart). The middle operand always
// allows `in`; the last operand honours `noIn`.
var conditionalExpressionFunc = memoizeBooleanFunc(
  function (noIn) {
    var condition = binaryExpressionFunc(noIn);
    var branches = seq(
      token('?'),
      assignmentExpressionPtrFunc(false),
      token(':'),
      assignmentExpressionPtrFunc(noIn));
    var whole = seq(condition, unpack(opt(branches)));
    return describe(
      'expression',
      nameIfMultipart('ternary', whole));
  });
// The ordinary variant, where `in` is allowed.
var conditionalExpression = conditionalExpressionFunc(false);
// The simple and compound assignment operators.
var assignOp = token('= *= /= %= += -= <<= >>= >>>= &= ^= |=');
// Assignment expressions:
//   ConditionalExpression
//   LeftHandSideExpression AssignmentOperator AssignmentExpression
//
// Returns null if no conditional expression starts here. Otherwise
// returns either the lone operand or nested nodes named 'assignment'
// of the form [lhs, op, rhs]. Once an operator has been consumed, the
// operand after it is required (runRequired).
var assignmentExpressionFunc = memoizeBooleanFunc(
  function (noIn) {
    return describe(
      'expression',
      function (t) {
        var operand = conditionalExpressionFunc(noIn)(t);
        if (! operand)
          return null;
        // Assignment is right-associative.
        // Plan of attack: make a list of all the parts
        // [expression, op, expression, op, ... expression]
        // and then fold them up at the end.
        var parts = [operand];
        var op;
        // Only an operand marked `lhs` (by lhsExpression) may be
        // followed by an assignment operator. The check applies to the
        // MOST RECENT operand, so in `a = b + c = d` the second `=`
        // is not consumed here, because `b + c` is not an lhs.
        while (operand.lhs && (op = assignOp(t))) {
          operand = runRequired(conditionalExpressionFunc(noIn), t, op);
          parts.push(op, operand);
        }
        // Fold from the right: a = b = c  ->  (a = (b = c)).
        var result = parts.pop();
        while (parts.length) {
          op = parts.pop();
          var lhs = parts.pop();
          result = named('assignment', [lhs, op, result]);
        }
        return result;
      });
  });
// The ordinary variant, where `in` is allowed.
var assignmentExpression = assignmentExpressionFunc(false);
// A full expression: assignment expressions separated by commas.
// A single operand is returned as-is; several become a node named
// 'comma' (see nameIfMultipart).
var expressionFunc = memoizeBooleanFunc(
  function (noIn) {
    var operands = list(assignmentExpressionFunc(noIn), token(','));
    return describe(
      'expression',
      nameIfMultipart('comma', operands));
  });
// The ordinary variant, where `in` is allowed.
var expression = expressionFunc(false);
// STATEMENTS

// One or more statements in a row.
var statements = list(statementPtr);
// Implements JavaScript's semicolon "insertion" rules: either a real
// `;` token, or a virtual one (a node named ';') when the next token
// is `}`, the input is at EOF, or the tokenizer reports
// `isLineTerminatorHere`.
var maybeSemicolon = (function () {
  var realSemicolon = token(';');
  var afterLineTerminator = function (t) {
    if (t.isLineTerminatorHere)
      return [];
    return null;
  };
  var insertionPoint = or(
    lookAheadToken('}'),
    lookAheadTokenClass('EOF'),
    afterLineTerminator);
  var virtualSemicolon = revalue(insertionPoint, named(';', []));
  return describe('semicolon', or(realSemicolon, virtualSemicolon));
})();
// An expression used as a statement, producing a node named
// 'expressionStmnt'. A negative look-ahead first rules out input that
// starts with `{` or `function`.
var expressionStatement = (function () {
  var blockOrFunctionStart = or(
    lookAheadToken('{'),
    lookAheadToken('function'));
  // Allow the presence of a colon to terminate the statement legally,
  // for the benefit of expressionOrLabelStatement. Basically assume
  // an implicit semicolon. This is safe because a colon can never
  // legally follow a semicolon anyway.
  var terminator = describe(
    'semicolon',
    or(maybeSemicolon, lookAheadToken(':')));
  return named(
    'expressionStmnt',
    negLookAhead(blockOrFunctionStart, seq(expression, terminator)));
})();
// It's hard to parse statement labels, as in `foo: x = 1`, because
// we can't tell from the first token whether we are looking at an
// expression statement or a label statement. To work around this,
// expressionOrLabelStatement parses the expression and then rewrites
// the result if it is an identifier followed by a colon.

// `: statement`, the part of a labelled statement after the label.
var labelColonAndStatement = seq(token(':'), statementPtr);
// Fails when the next token is a colon; described as 'semicolon' so
// that the resulting error talks about a missing semicolon.
var noColon = describe(
  'semicolon',
  negLookAhead(lookAheadToken(':')));
// Returns null if no expression statement starts here; otherwise the
// expression statement itself, or a node named 'labelStmnt' built from
// the label token followed by `:` and the labelled statement.
var expressionOrLabelStatement = function (t) {
  var stmnt = expressionStatement(t);
  if (! stmnt)
    return null;

  var expr = stmnt[1];
  var terminator = stmnt[2];
  // Only a lone identifier whose terminator is an array (rather than
  // a real `;` token) is a candidate label.
  var mayBeLabel = expr[0] === 'identifier' && isArray(terminator);

  if (mayBeLabel) {
    var labelRest = labelColonAndStatement(t);
    return labelRest ?
      named('labelStmnt', [expr[1]].concat(labelRest)) :
      stmnt;
  }

  // For better error messages, for example in `1+1:`, if there is
  // a colon at the end of the expression, fail now and say
  // "Expected semicolon" instead of failing later saying
  // "Expected statement" at the colon.
  runRequired(noColon, t);
  return stmnt;
};
// A lone `;`. This needs a real semicolon token, not maybeSemicolon.
var emptyStatement = named(
  'emptyStmnt',
  token(';'));
// `{ statements }`; the statement list is optional, with a look-ahead
// for `}` as the alternative. Produces a node named 'blockStmnt'.
var blockStatement = (function () {
  var open = token('{');
  var body = unpack(opt(statements, lookAheadToken('}')));
  var close = token('}');
  return named('blockStmnt', seq(open, body, close));
})();
// One declarator of a `var` statement: an identifier with an optional
// `= initializer`. Produces a node named 'varDecl'. `noIn` is passed
// on to the initializer's expression parser.
var varDeclFunc = memoizeBooleanFunc(function (noIn) {
  var name = tokenClass('IDENTIFIER');
  var initializer = seq(token('='), assignmentExpressionFunc(noIn));
  return named('varDecl', seq(name, unpack(opt(initializer))));
});
// The ordinary variant, where `in` is allowed in initializers.
var varDecl = varDeclFunc(false);
// `var a, b = 1;` -- the keyword, comma-separated declarators and a
// (possibly virtual) semicolon. Produces a node named 'varStmnt'.
var variableStatement = (function () {
  var keyword = token('var');
  var declarators = unpack(list(varDecl, token(',')));
  return named('varStmnt', seq(keyword, declarators, maybeSemicolon));
})();
// A close paren that may be followed by a statement beginning with a
// regex literal.
var closeParenBeforeStatement = preSlashToken(')', false);
// `if (expr) statement [else statement]`; produces a node named
// 'ifStmnt'.
var ifStatement = (function () {
  var ifKeyword = token('if');
  var openParen = token('(');
  var elseClause = unpack(opt(seq(token('else'), statementPtr)));
  return named(
    'ifStmnt',
    seq(ifKeyword, openParen, expression,
        closeParenBeforeStatement, statementPtr,
        elseClause));
})();
// The `; [test] ; [update]` tail of a classic for-loop header. It is
// guarded by a look-ahead for `;`. A missing test or update clause is
// represented by a 'nil' node, recognised by peeking at the `;` or
// `)` that would follow it.
var secondThirdClauses = (function () {
  var startsWithSemicolon = lookAheadToken(';');
  var firstSemicolon = token(';');
  var testClause = opt(
    expressionPtr,
    revalue(lookAheadToken(';'), named('nil', [])));
  var secondSemicolon = token(';');
  var updateClause = opt(
    expressionPtr,
    revalue(lookAheadToken(')'), named('nil', [])));
  return describe(
    'semicolon',
    lookAhead(startsWithSemicolon,
              seq(firstSemicolon, testClause,
                  secondSemicolon, updateClause)));
})();
// The `in expr` tail of a for-in header.
var inExpr = seq(
  token('in'),
  expression);
// The same sequence, but described as 'semicolon' so that a failure
// is reported as an expected semicolon (see forSpec).
var inExprExpectingSemi = describe(
  'semicolon',
  seq(token('in'), expression));
// The part of a for-loop between the parens. It is parsed as a node
// named 'forSpec' and then renamed according to how many items it
// contains (see renameByLength below).
var forSpec = (function () {
  // --- `for (var ...` ---------------------------------------------
  var varKeyword = token('var');
  var firstDecl = varDeclFunc(true);
  // after the first declarator: either `in expr` ...
  var inTail = unpack(inExpr);
  // ... or optional further `, decl` items and then the two
  // remaining clauses.
  var moreDecls = unpack(
    opt(seq(token(','),
            unpack(list(varDeclFunc(true), token(',')))),
        lookAheadToken(';')));
  var classicTail = unpack(seq(moreDecls, unpack(secondThirdClauses)));
  var varForm = seq(
    varKeyword,
    firstDecl,
    describe('commaOrIn', or(inTail, classicTail)));

  // --- `for (; ...` -----------------------------------------------
  // Get the case where the first clause is empty out of the way.
  // The look-ahead's value is replaced by the 'nil' placeholder for
  // the missing expression.
  var emptyFirstForm = seq(
    revalue(lookAheadToken(';'), named('nil', [])),
    unpack(secondThirdClauses));

  // --- `for (expr ...` --------------------------------------------
  // Custom parser for the non-var case, because we have to read the
  // first expression before we know if there's an "in".
  var exprForm = function (t) {
    var firstExpr = expressionFunc(true)(t);
    if (! firstExpr)
      return null;
    var tail = secondThirdClauses(t);
    if (tail)
      return [firstExpr].concat(tail);
    // We need a left-hand-side expression for a `for (x in y)` loop.
    if (! firstExpr.lhs)
      throw parseError(t, secondThirdClauses);
    // If we don't see 'in' at this point, it's probably a missing
    // semicolon.
    return [firstExpr].concat(runRequired(inExprExpectingSemi, t));
  };

  // There are four kinds of for-loop, and we call the part between
  // the parens one of forSpec, forVarSpec, forInSpec, and
  // forVarInSpec. Counting items after the node name: forIn and
  // forVarIn always have 3 and 4 respectively, for has 5 (the
  // optional expressions are present as nils), and forVar has 6 or
  // more, because `for(var x;;);` produces
  // [`var` `x` `;` nil `;` nil]. `clauses.length` also counts the
  // node name in slot 0, hence the values 4, 5 and 7 below.
  var renameByLength = function (clauses) {
    if (! clauses)
      return null;
    switch (clauses.length) {
    case 4:
      clauses[0] = 'forInSpec';
      break;
    case 5:
      clauses[0] = 'forVarInSpec';
      break;
    default:
      if (clauses.length >= 7)
        clauses[0] = 'forVarSpec';
    }
    return clauses;
  };

  return revalue(
    named('forSpec', or(varForm, emptyFirstForm, exprForm)),
    renameByLength);
})();
// The three loop statements, tried in the order do / while / for.
var iterationStatement = (function () {
  // `do statement while (expr)` plus a (possibly virtual) semicolon
  var doLoop = named(
    'doStmnt',
    seq(token('do'), statementPtr, token('while'),
        token('('), expression, token(')'),
        maybeSemicolon));
  // `while (expr) statement`
  var whileLoop = named(
    'whileStmnt',
    seq(token('while'), token('('), expression,
        closeParenBeforeStatement, statementPtr));
  // `for (forSpec) statement`; the semicolons inside the header must
  // be real, not maybeSemicolons.
  var forLoop = named(
    'forStmnt',
    seq(token('for'), token('('), forSpec,
        closeParenBeforeStatement, statementPtr));
  return or(doLoop, whileLoop, forLoop);
})();
// Operand of `return` / `continue` / `break`: `parser` is only
// attempted when noLineTerminatorHere succeeds; the alternative is
// `constant` applied to a 'nil' placeholder node (presumably a parser
// that always yields that node -- confirm against parserlib).
var sameLineOrNil = function (parser) {
  return or(lookAhead(noLineTerminatorHere, parser),
            constant(named('nil', [])));
};
// `return [expression]` plus a (possibly virtual) semicolon.
var returnStatement = named(
  'returnStmnt',
  seq(token('return'),
      sameLineOrNil(expression),
      maybeSemicolon));
// `continue [label]` plus a (possibly virtual) semicolon.
var continueStatement = named(
  'continueStmnt',
  seq(token('continue'),
      sameLineOrNil(tokenClass('IDENTIFIER')),
      maybeSemicolon));
// `break [label]` plus a (possibly virtual) semicolon.
var breakStatement = named(
  'breakStmnt',
  seq(token('break'),
      sameLineOrNil(tokenClass('IDENTIFIER')),
      maybeSemicolon));
// `throw expression` plus a (possibly virtual) semicolon. Unlike
// `return`, there is no 'nil' alternative for the operand, which is
// guarded by noLineTerminatorHere.
var throwStatement = (function () {
  var keyword = token('throw');
  var operand = lookAhead(noLineTerminatorHere, expression);
  return named('throwStmnt', seq(keyword, operand, maybeSemicolon));
})();
// `with (expression) statement`; produces a node named 'withStmnt'.
var withStatement = (function () {
  var keyword = token('with');
  var openParen = token('(');
  return named(
    'withStmnt',
    seq(keyword, openParen, expression,
        closeParenBeforeStatement, statementPtr));
})();
// `case expr:` followed by optional statements; the alternative to
// the statement list is a look-ahead for `}`, `case` or `default`.
var switchCase = (function () {
  var keyword = token('case');
  var colon = token(':');
  var body = unpack(
    opt(statements,
        or(lookAheadToken('}'),
           lookAheadToken('case default'))));
  return named('case', seq(keyword, expression, colon, body));
})();
// `default:` followed by optional statements; the alternative to the
// statement list is a look-ahead for `}` or `case`.
var switchDefault = (function () {
  var keyword = token('default');
  var colon = token(':');
  var body = unpack(
    opt(statements,
        or(lookAheadToken('}'),
           lookAheadToken('case'))));
  return named('default', seq(keyword, colon, body));
})();
// `switch (expr) { cases [default cases] }`: optional leading case
// clauses, then optionally one default clause followed by more case
// clauses. Produces a node named 'switchStmnt'.
var switchStatement = (function () {
  var keyword = token('switch');
  var openParen = token('(');
  var closeParen = token(')');
  var openBrace = token('{');
  var leadingCases = unpack(
    opt(list(switchCase),
        or(lookAheadToken('}'),
           lookAheadToken('default'))));
  var defaultAndRest = unpack(
    opt(seq(switchDefault,
            unpack(opt(list(switchCase))))));
  var closeBrace = token('}');
  return named(
    'switchStmnt',
    seq(keyword, openParen, expression, closeParen,
        openBrace, leadingCases, defaultAndRest, closeBrace));
})();
// The `catch (id) block` and/or `finally block` part of a try
// statement. It is guarded by a look-ahead for `catch` or `finally`;
// inside, each clause has a `constant` 'nil' node as its alternative,
// so the result always has one entry per clause.
var catchFinally = (function () {
  var guard = lookAheadToken('catch finally');
  var catchClause = named(
    'catch',
    seq(token('catch'), token('('), tokenClass('IDENTIFIER'),
        token(')'), blockStatement));
  var catchOrNil = or(catchClause, constant(named('nil', [])));
  var finallyClause = named(
    'finally',
    seq(token('finally'), blockStatement));
  var finallyOrNil = or(finallyClause, constant(named('nil', [])));
  return describe(
    'catchOrFinally',
    lookAhead(guard, seq(catchOrNil, finallyOrNil)));
})();
// `try block` followed by the catch/finally part; produces a node
// named 'tryStmnt'.
var tryStatement = (function () {
  var keyword = token('try');
  var handlers = unpack(catchFinally);
  return named('tryStmnt', seq(keyword, blockStatement, handlers));
})();
// `debugger` plus a (possibly virtual) semicolon.
var debuggerStatement = named(
  'debuggerStmnt',
  seq(token('debugger'), maybeSemicolon));
// Any statement. The alternatives are tried in the order listed,
// starting with expression/label statements.
var statement = (function () {
  var anyStatement = or(
    expressionOrLabelStatement,
    emptyStatement,
    blockStatement,
    variableStatement,
    ifStatement,
    iterationStatement,
    returnStatement,
    continueStatement,
    breakStatement,
    withStatement,
    switchStatement,
    throwStatement,
    tryStatement,
    debuggerStatement);
  return describe('statement', anyStatement);
})();
// PROGRAM

// Function declaration: the function sequence with a required name,
// wrapped in a node named 'functionDecl'.
var functionDecl = named(
  'functionDecl',
  functionFunc(true));
// A source element is a statement or a function declaration.
var sourceElement = or(statement, functionDecl);
var sourceElements = list(sourceElement);
// The contents of a function body: optional source elements, with a
// look-ahead for the closing `}` as the alternative. This is what
// functionBodyPtr forwards to.
var functionBody = (function () {
  var elements = opt(sourceElements, lookAheadToken('}'));
  return describe('functionBody', elements);
})();
// A whole program: optional source elements followed by EOF.
// Produces a node named 'program'.
var program = (function () {
  var elements = unpack(opt(sourceElements));
  // Runs on the result of the EOF look-ahead: on a match it eats the
  // ending "EOF" so that our position is updated, and contributes
  // nothing to the parse tree.
  var consumeEof = function (v, t) {
    if (v) {
      t.consume();
      return unpack([]);
    }
    return null;
  };
  // We rely on the fact that opt(sourceElements) will never fail, and
  // non-first arguments to seq are required to succeed -- meaning
  // this parser will never fail without throwing a parse error.
  var endOfInput = describe(
    'statement',
    revalue(lookAheadTokenClass("EOF"), consumeEof));
  return named('program', seq(elements, endOfInput));
})();
return program(tokenizer);
};