Parser object

This commit is contained in:
David Greenspan
2012-09-11 19:36:47 -07:00
parent a6bd5747b2
commit 132b4897af
2 changed files with 360 additions and 346 deletions

View File

@@ -2,90 +2,119 @@
// XXX unit tests
// XXX remove unnecessary ParseNode.NILs in lookaheads
// XXX SeqParser
// XXX find all revalues, see if constant ones are necessary.
// API may be confusing if constant affects only non-null.
// What we don't have from ECMA-262 5.1:
// - object literal trailing comma
// - object literal get/set
var parse = function (tokenizer) {
var noLineTerminatorHere = describe(
var noLineTerminatorHere = new Parser(
'noLineTerminator', function (t) {
return t.isLineTerminatorHere ? null : [];
});
// Variant of token() for tokens after which the lexer's usual guess
// about the next '/' (division operator vs. start of a regex literal)
// has to be overridden with `divisionNotRegex`.
var preSlashToken = function (text, divisionNotRegex) {
  var inner = token(text);
  // Runs the wrapped token parser with lexer.divisionPermitted forced
  // to `divisionNotRegex`.  The forced value is kept only when the
  // token matches; on a miss (or a throw) the previous value is put
  // back before control leaves this function.
  var run = function (t) {
    var savedFlag = t.lexer.divisionPermitted;
    var matched;
    try {
      t.lexer.divisionPermitted = divisionNotRegex;
      matched = inner.parse(t);
    } finally {
      if (! matched)
        t.lexer.divisionPermitted = savedFlag;
    }
    return matched;
  };
  return new Parser(inner.expecting, run);
};
// Function that takes one-item arrays to their single item and names other
// arrays with `name`. Works on parsers too.
var nameIfMultipart = function (name, parser) {
var nodeIfMultipart = function (name, arrayParser) {
return revalue(
parser,
arrayParser,
function (parts) {
if (! parts)
return null;
return (parts.length === 1) ?
parts[0] : named(name, parts);
parts[0] : new ParseNode(name, parts);
});
};
// These "pointers" allow grammar circularity, i.e. accessing
// later parsers from earlier ones.
var expressionPtrFunc = function (noIn) {
return describe(
return new Parser(
"expression",
function (t) {
return expressionFunc(noIn)(t);
return expressionFunc(noIn).parse(t);
});
};
var expressionPtr = expressionPtrFunc(false);
var assignmentExpressionPtrFunc = function (noIn) {
return describe(
return new Parser(
"expression",
function (t) {
return assignmentExpressionFunc(noIn)(t);
return assignmentExpressionFunc(noIn).parse(t);
});
};
var assignmentExpressionPtr = assignmentExpressionPtrFunc(false);
var functionBodyPtr = describe(
var functionBodyPtr = new Parser(
"functionBody", function (t) {
return functionBody(t);
return functionBody.parse(t);
});
var statementPtr = describe(
var statementPtr = new Parser(
"statement", function (t) {
return statement(t);
return statement.parse(t);
});
var arrayLiteral =
named('array',
seq(token('['),
unpack(opt(list(token(',')))),
unpack(
opt(
list(
describe(
'expression',
or(assignmentExpressionPtr,
// count a peeked-at ']' as an expression
// to support elisions at end, e.g.
// `[1,2,3,,,,,,]`. Because it's unpacked,
// the look-ahead won't show up in the
// parse tree.
unpack(lookAheadToken(']')))),
// list separator is one or more commas
// to support elision
unpack(list(token(',')))),
lookAheadToken(']'))),
token(']')));
node('array',
seq(token('['),
unpack(opt(list(token(',')))),
unpack(
opt(
list(
expecting(
'expression',
or(assignmentExpressionPtr,
// count a peeked-at ']' as an expression
// to support elisions at end, e.g.
// `[1,2,3,,,,,,]`. Because it's unpacked,
// the look-ahead won't show up in the
// parse tree.
unpack(lookAheadToken(']')))),
// list separator is one or more commas
// to support elision
unpack(list(token(',')))),
lookAheadToken(']'))),
token(']')));
var propertyName = describe('propertyName', or(
named('idPropName', seq(tokenClass('IDENTIFIER'))),
named('numPropName', seq(tokenClass('NUMBER'))),
named('strPropName', seq(tokenClass('STRING')))));
var nameColonValue = describe(
var propertyName = expecting('propertyName', or(
node('idPropName', seq(tokenClass('IDENTIFIER'))),
node('numPropName', seq(tokenClass('NUMBER'))),
node('strPropName', seq(tokenClass('STRING')))));
var nameColonValue = expecting(
'name:value',
named('prop', seq(propertyName, token(':'), assignmentExpressionPtr)));
node('prop', seq(propertyName, token(':'), assignmentExpressionPtr)));
var objectLiteral =
named('object',
node('object',
seq(token('{'),
unpack(opt(list(nameColonValue,
token(',')), lookAheadToken('}'))),
@@ -96,7 +125,7 @@ var parse = function (tokenizer) {
return seq(token('function'),
(nameRequired ? tokenClass('IDENTIFIER') :
or(tokenClass('IDENTIFIER'),
revalue(lookAheadToken('('), named('nil', [])))),
revalue(lookAheadToken('('), ParseNode.NIL))),
token('('),
unpack(opt(list(tokenClass('IDENTIFIER'), token(',')),
lookAheadToken(')'))),
@@ -105,23 +134,23 @@ var parse = function (tokenizer) {
unpack(functionBodyPtr),
token('}'));
};
var functionExpression = named('functionExpr',
var functionExpression = node('functionExpr',
functionFunc(false));
var primaryOrFunctionExpression =
describe('expression',
or(named('this', seq(token('this'))),
named('identifier', seq(tokenClass('IDENTIFIER'))),
named('number', seq(tokenClass('NUMBER'))),
named('boolean', seq(tokenClass('BOOLEAN'))),
named('null', seq(tokenClass('NULL'))),
named('regex', seq(tokenClass('REGEX'))),
named('string', seq(tokenClass('STRING'))),
named('parens',
expecting('expression',
or(node('this', seq(token('this'))),
node('identifier', seq(tokenClass('IDENTIFIER'))),
node('number', seq(tokenClass('NUMBER'))),
node('boolean', seq(tokenClass('BOOLEAN'))),
node('null', seq(tokenClass('NULL'))),
node('regex', seq(tokenClass('REGEX'))),
node('string', seq(tokenClass('STRING'))),
node('parens',
seq(token('('), expressionPtr, token(')'))),
arrayLiteral,
objectLiteral,
functionExpression));
arrayLiteral,
objectLiteral,
functionExpression));
var dotEnding = seq(token('.'), tokenClass('IDENTIFIER'));
var bracketEnding = seq(token('['), expressionPtr, token(']'));
@@ -142,14 +171,14 @@ var parse = function (tokenizer) {
// call to "return" a valid l-value, as in `foo(bar) = baz`,
// though no built-in or user-specifiable call has this property
// (it would have to be defined by a browser or other "host").
var lhsExpression = describe(
var lhsExpression = new Parser(
'expression',
function (t) {
// Accumulate all initial "new" keywords, not yet knowing
// if they have a corresponding argument list later.
var news = [];
var n;
while ((n = newKeyword(t)))
while ((n = newKeyword.parse(t)))
news.push(n);
// Read the primaryOrFunctionExpression that will be the "core"
@@ -158,8 +187,8 @@ var parse = function (tokenizer) {
// and .foo add-ons.
// if we have 'new' keywords, we are committed and must
// match an expression or error.
var result = runMaybeRequired(
news.length, primaryOrFunctionExpression, t);
var result = primaryOrFunctionExpression.parse(
t, {required: news.length});
if (! result)
return null;
@@ -170,15 +199,15 @@ var parse = function (tokenizer) {
var done = false;
while (! done) {
var r;
if ((r = dotEnding(t))) {
result = named('dot', [result].concat(r));
} else if ((r = bracketEnding(t))) {
result = named('bracket', [result].concat(r));
} else if ((r = callArgs(t))) {
if ((r = dotEnding.parse(t))) {
result = new ParseNode('dot', [result].concat(r));
} else if ((r = bracketEnding.parse(t))) {
result = new ParseNode('bracket', [result].concat(r));
} else if ((r = callArgs.parse(t))) {
if (news.length)
result = named('newcall', [news.pop(), result].concat(r));
result = new ParseNode('newcall', [news.pop(), result].concat(r));
else
result = named('call', [result].concat(r));
result = new ParseNode('call', [result].concat(r));
} else {
done = true;
}
@@ -188,7 +217,7 @@ var parse = function (tokenizer) {
// paren-less constructions (`new Date`) are parsed. We've
// already handled `new foo().bar()`, now handle `new new foo().bar`.
while (news.length)
result = named('new', [news.pop(), result]);
result = new ParseNode('new', [news.pop(), result]);
// mark any LeftHandSideExpression, for the benefit of
// assignmentExpression
@@ -199,9 +228,9 @@ var parse = function (tokenizer) {
var postfixToken = token('++ --');
var postfixLookahead = lookAheadToken('++ --');
var postfixExpression = describe(
var postfixExpression = expecting(
'expression',
nameIfMultipart(
nodeIfMultipart(
'postfix',
seq(lhsExpression,
unpack(opt(lookAhead(noLineTerminatorHere,
@@ -209,18 +238,19 @@ var parse = function (tokenizer) {
postfixToken)))))));
var unaryList = opt(list(or(token('delete void typeof'),
preSlashToken('++ -- + - ~ !', false))));
var unaryExpression = describe(
var unaryExpression = new Parser(
'expression',
function (t) {
var unaries = unaryList(t);
var unaries = unaryList.parse(t);
// if we have unaries, we are committed and
// have to match an expression or error.
var result = runMaybeRequired(unaries.length, postfixExpression, t);
var result = postfixExpression.parse(
t, {required: unaries.length});
if (! result)
return null;
while (unaries.length)
result = named('unary', [unaries.pop(), result]);
result = new ParseNode('unary', [unaries.pop(), result]);
return result;
});
@@ -250,7 +280,7 @@ var parse = function (tokenizer) {
token('|'),
token('&&'),
token('||')];
return describe(
return expecting(
'expression',
binaryLeft(unaryExpression, binaryOps));
});
@@ -258,9 +288,9 @@ var parse = function (tokenizer) {
var conditionalExpressionFunc = memoizeBooleanFunc(
function (noIn) {
return describe(
return expecting(
'expression',
nameIfMultipart(
nodeIfMultipart(
'ternary',
seq(binaryExpressionFunc(noIn), unpack(opt(seq(
token('?'),
@@ -273,10 +303,10 @@ var parse = function (tokenizer) {
var assignmentExpressionFunc = memoizeBooleanFunc(
function (noIn) {
return describe(
return new Parser(
'expression',
function (t) {
var r = conditionalExpressionFunc(noIn)(t);
var r = conditionalExpressionFunc(noIn).parse(t);
if (! r)
return null;
@@ -286,15 +316,16 @@ var parse = function (tokenizer) {
// and then fold them up at the end.
var parts = [r];
var op;
while (r.lhs && (op = assignOp(t)))
while (r.lhs && (op = assignOp.parse(t)))
parts.push(op,
runRequired(conditionalExpressionFunc(noIn), t));
conditionalExpressionFunc(noIn).parse(
t, {required: true}));
var result = parts.pop();
while (parts.length) {
op = parts.pop();
var lhs = parts.pop();
result = named('assignment', [lhs, op, result]);
result = new ParseNode('assignment', [lhs, op, result]);
}
return result;
});
@@ -303,9 +334,9 @@ var parse = function (tokenizer) {
var expressionFunc = memoizeBooleanFunc(
function (noIn) {
return describe(
return expecting(
'expression',
nameIfMultipart(
nodeIfMultipart(
'comma',
list(assignmentExpressionFunc(noIn), token(','))));
});
@@ -316,31 +347,32 @@ var parse = function (tokenizer) {
var statements = list(statementPtr);
// implements JavaScript's semicolon "insertion" rules
var maybeSemicolon = describe(
var maybeSemicolon = expecting(
'semicolon',
or(token(';'),
revalue(
or(
lookAheadToken('}'),
lookAheadTokenClass('EOF'),
function (t) {
return t.isLineTerminatorHere ? [] : null;
}), named(';', []))));
new Parser(null,
function (t) {
return t.isLineTerminatorHere ? [] : null;
})), new ParseNode(';', []))));
var expressionStatement = named(
var expressionStatement = node(
'expressionStmnt',
negLookAhead(
or(lookAheadToken('{'), lookAheadToken('function')),
seq(expression,
describe('semicolon',
or(maybeSemicolon,
// allow presence of colon to terminate
// statement legally, for the benefit of
// expressionOrLabelStatement. Basically assume
// an implicit semicolon. This
// is safe because a colon can never legally
// follow a semicolon anyway.
revalue(lookAheadToken(':'), named(';', [])))))));
expecting('semicolon',
or(maybeSemicolon,
// allow presence of colon to terminate
// statement legally, for the benefit of
// expressionOrLabelStatement. Basically assume
// an implicit semicolon. This
// is safe because a colon can never legally
// follow a semicolon anyway.
revalue(lookAheadToken(':'), new ParseNode(';', [])))))));
// it's hard to parse statement labels, as in
// `foo: x = 1`, because we can't tell from the
@@ -350,44 +382,46 @@ var parse = function (tokenizer) {
// then rewrites the result if it is an identifier
// followed by a colon.
var labelColonAndStatement = seq(token(':'), statementPtr);
var noColon = describe(
var noColon = expecting(
'semicolon',
negLookAhead(lookAheadToken(':')));
var expressionOrLabelStatement = function (t) {
var exprStmnt = expressionStatement(t);
if (! exprStmnt)
return null;
var expressionOrLabelStatement = new Parser(
null,
function (t) {
var exprStmnt = expressionStatement.parse(t);
if (! exprStmnt)
return null;
var expr = exprStmnt.children[0];
var maybeSemi = exprStmnt.children[1];
if (expr.name !== 'identifier' ||
! (maybeSemi instanceof ParseNode)) {
// We either have a non-identifier expression or a present
// semicolon. This is not a label.
//
// Fail now if we are looking at a colon, causing an
// error message on input like `1+1:` of the same kind
// you'd get without statement label parsing.
runRequired(noColon, t);
return exprStmnt;
}
var expr = exprStmnt.children[0];
var maybeSemi = exprStmnt.children[1];
if (expr.name !== 'identifier' ||
! (maybeSemi instanceof ParseNode)) {
// We either have a non-identifier expression or a present
// semicolon. This is not a label.
//
// Fail now if we are looking at a colon, causing an
// error message on input like `1+1:` of the same kind
// you'd get without statement label parsing.
noColon.parse(t, {required: true});
return exprStmnt;
}
var rest = labelColonAndStatement(t);
if (! rest)
return exprStmnt;
var rest = labelColonAndStatement.parse(t);
if (! rest)
return exprStmnt;
return named('labelStmnt',
[expr.children[0]].concat(rest));
};
return new ParseNode('labelStmnt',
[expr.children[0]].concat(rest));
});
var emptyStatement = named('emptyStmnt', seq(token(';'))); // not maybeSemicolon
var emptyStatement = node('emptyStmnt', seq(token(';'))); // not maybeSemicolon
var blockStatement = describe('block', named('blockStmnt', seq(
var blockStatement = expecting('block', node('blockStmnt', seq(
token('{'), unpack(opt(statements, lookAheadToken('}'))),
token('}'))));
var varDeclFunc = memoizeBooleanFunc(function (noIn) {
return named(
return node(
'varDecl',
seq(tokenClass('IDENTIFIER'),
unpack(opt(seq(token('='),
@@ -395,7 +429,7 @@ var parse = function (tokenizer) {
});
var varDecl = varDeclFunc(false);
var variableStatement = named(
var variableStatement = node(
'varStmnt',
seq(token('var'), unpack(list(varDecl, token(','))),
maybeSemicolon));
@@ -404,28 +438,28 @@ var parse = function (tokenizer) {
// beginning with a regex literal.
var closeParenBeforeStatement = preSlashToken(')', false);
var ifStatement = named(
var ifStatement = node(
'ifStmnt',
seq(token('if'), token('('), expression,
closeParenBeforeStatement, statementPtr,
unpack(opt(seq(token('else'), statementPtr)))));
var secondThirdClauses = describe(
var secondThirdClauses = expecting(
'semicolon',
lookAhead(lookAheadToken(';'),
seq(
describe('semicolon', token(';')),
opt(expressionPtr, revalue(lookAheadToken(';'), named('nil', []))),
describe('semicolon', token(';')),
opt(expressionPtr, revalue(lookAheadToken(')'), named('nil', []))))));
expecting('semicolon', token(';')),
opt(expressionPtr, revalue(lookAheadToken(';'), ParseNode.NIL)),
expecting('semicolon', token(';')),
opt(expressionPtr, revalue(lookAheadToken(')'), ParseNode.NIL)))));
var inExpr = seq(token('in'), expression);
var inExprExpectingSemi = describe('semicolon',
seq(token('in'), expression));
var forSpec = revalue(named(
var inExprExpectingSemi = expecting('semicolon',
seq(token('in'), expression));
var forSpec = revalue(node(
'forSpec',
or(seq(token('var'),
varDeclFunc(true),
describe(
expecting(
'commaOrIn',
or(unpack(inExpr),
unpack(seq(
@@ -437,76 +471,78 @@ var parse = function (tokenizer) {
// get the case where the first clause is empty out of the way.
// the lookAhead's return value is the empty placeholder for the
// missing expression.
seq(revalue(lookAheadToken(';'), named('nil', [])), unpack(secondThirdClauses)),
seq(revalue(lookAheadToken(';'), ParseNode.NIL), unpack(secondThirdClauses)),
// custom parser for the non-var case because we have to
// read the first expression before we know if there's
// an "in".
function (t) {
var firstExpr = expressionFunc(true)(t);
if (! firstExpr)
return null;
var rest = secondThirdClauses(t);
if (! rest) {
// we need a left-hand-side expression for a
// `for (x in y)` loop.
if (! firstExpr.lhs)
throw parseError(t, secondThirdClauses);
// if we don't see 'in' at this point, it's probably
// a missing semicolon
rest = runRequired(inExprExpectingSemi, t);
}
new Parser(
null,
function (t) {
var firstExpr = expressionFunc(true).parse(t);
if (! firstExpr)
return null;
var rest = secondThirdClauses.parse(t);
if (! rest) {
// we need a left-hand-side expression for a
// `for (x in y)` loop.
if (! firstExpr.lhs)
throw parseError(t, secondThirdClauses);
// if we don't see 'in' at this point, it's probably
// a missing semicolon
rest = inExprExpectingSemi.parse(t, {required: true});
}
return [firstExpr].concat(rest);
})),
function (clauses) {
// There are four kinds of for-loop, and we call the
// part between the parens one of forSpec, forVarSpec,
// forInSpec, and forVarInSpec. Having parsed it
// already, we rewrite the node name based on how
// many items came out. forIn and forVarIn always
// have 3 and 4 items respectively. for has 5
// (the optional expressions are present as nils).
// forVar has 6 or more, because `for(var x;;);`
// produces [`var` `x` `;` nil `;` nil].
if (! clauses)
return null;
var numChildren = clauses.children.length;
if (numChildren === 3)
return new ParseNode('forInSpec', clauses.children);
else if (numChildren === 4)
return new ParseNode('forVarInSpec', clauses.children);
else if (numChildren >= 6)
return new ParseNode('forVarSpec', clauses.children);
return clauses;
});
return [firstExpr].concat(rest);
}))),
function (clauses) {
// There are four kinds of for-loop, and we call the
// part between the parens one of forSpec, forVarSpec,
// forInSpec, and forVarInSpec. Having parsed it
// already, we rewrite the node name based on how
// many items came out. forIn and forVarIn always
// have 3 and 4 items respectively. for has 5
// (the optional expressions are present as nils).
// forVar has 6 or more, because `for(var x;;);`
// produces [`var` `x` `;` nil `;` nil].
if (! clauses)
return null;
var numChildren = clauses.children.length;
if (numChildren === 3)
return new ParseNode('forInSpec', clauses.children);
else if (numChildren === 4)
return new ParseNode('forVarInSpec', clauses.children);
else if (numChildren >= 6)
return new ParseNode('forVarSpec', clauses.children);
return clauses;
});
var iterationStatement = or(
named('doStmnt', seq(token('do'), statementPtr, token('while'),
node('doStmnt', seq(token('do'), statementPtr, token('while'),
token('('), expression, token(')'),
maybeSemicolon)),
named('whileStmnt', seq(token('while'), token('('), expression,
node('whileStmnt', seq(token('while'), token('('), expression,
closeParenBeforeStatement, statementPtr)),
// semicolons must be real, not maybeSemicolons
named('forStmnt', seq(
node('forStmnt', seq(
token('for'), token('('), forSpec, closeParenBeforeStatement,
statementPtr)));
var returnStatement = named(
var returnStatement = node(
'returnStmnt',
seq(token('return'), or(
lookAhead(noLineTerminatorHere, expression), constant(named('nil', []))),
lookAhead(noLineTerminatorHere, expression), constant(ParseNode.NIL)),
maybeSemicolon));
var continueStatement = named(
var continueStatement = node(
'continueStmnt',
seq(token('continue'), or(
lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(named('nil', []))),
lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(ParseNode.NIL)),
maybeSemicolon));
var breakStatement = named(
var breakStatement = node(
'breakStmnt',
seq(token('break'), or(
lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(named('nil', []))),
lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(ParseNode.NIL)),
maybeSemicolon));
var throwStatement = named(
var throwStatement = node(
'throwStmnt',
seq(token('throw'),
lookAhead(revalue(noLineTerminatorHere,
@@ -519,23 +555,23 @@ var parse = function (tokenizer) {
}), expression),
maybeSemicolon));
var withStatement = named(
var withStatement = node(
'withStmnt',
seq(token('with'), token('('), expression, closeParenBeforeStatement,
statementPtr));
var switchCase = named(
var switchCase = node(
'case',
seq(token('case'), expression, token(':'),
unpack(opt(statements, or(lookAheadToken('}'),
lookAheadToken('case default'))))));
var switchDefault = named(
var switchDefault = node(
'default',
seq(token('default'), token(':'),
unpack(opt(statements, or(lookAheadToken('}'),
lookAheadToken('case'))))));
var switchStatement = named(
var switchStatement = node(
'switchStmnt',
seq(token('switch'), token('('), expression, token(')'),
token('{'), unpack(opt(list(switchCase),
@@ -545,70 +581,70 @@ var parse = function (tokenizer) {
unpack(opt(list(switchCase)))))),
token('}')));
var catchFinally = describe(
var catchFinally = expecting(
'catch',
lookAhead(lookAheadToken('catch finally'),
seq(
or(named(
or(node(
'catch',
seq(token('catch'), token('('), tokenClass('IDENTIFIER'),
token(')'), blockStatement)),
constant(named('nil', []))),
or(named(
constant(ParseNode.NIL)),
or(node(
'finally',
seq(token('finally'), blockStatement)),
constant(named('nil', []))))));
var tryStatement = named(
constant(ParseNode.NIL)))));
var tryStatement = node(
'tryStmnt',
seq(token('try'), blockStatement, unpack(catchFinally)));
var debuggerStatement = named(
var debuggerStatement = node(
'debuggerStmnt', seq(token('debugger'), maybeSemicolon));
var statement = describe('statement',
or(expressionOrLabelStatement,
emptyStatement,
blockStatement,
variableStatement,
ifStatement,
iterationStatement,
returnStatement,
continueStatement,
breakStatement,
withStatement,
switchStatement,
throwStatement,
tryStatement,
debuggerStatement));
var statement = expecting('statement',
or(expressionOrLabelStatement,
emptyStatement,
blockStatement,
variableStatement,
ifStatement,
iterationStatement,
returnStatement,
continueStatement,
breakStatement,
withStatement,
switchStatement,
throwStatement,
tryStatement,
debuggerStatement));
// PROGRAM
var functionDecl = named('functionDecl',
var functionDecl = node('functionDecl',
functionFunc(true));
var sourceElement = or(statement, functionDecl);
var sourceElements = list(sourceElement);
var functionBody = describe('functionBody',
opt(sourceElements,
lookAheadToken('}')));
var functionBody = expecting('functionBody',
opt(sourceElements,
lookAheadToken('}')));
var program = named('program',
var program = node('program',
seq(unpack(opt(sourceElements)),
// we rely on the fact that opt(sourceElements)
// will never fail, and non-first arguments
// to seq are required to succeed -- meaning
// this parser will never fail without throwing
// a parse error.
describe('statement',
revalue(lookAheadTokenClass("EOF"),
function (v, t) {
if (! v)
return null;
// eat the ending "EOF" so that
// our position is updated
t.consume();
return unpack([]);
}))));
expecting('statement',
revalue(lookAheadTokenClass("EOF"),
function (v, t) {
if (! v)
return null;
// eat the ending "EOF" so that
// our position is updated
t.consume();
return unpack([]);
}))));
return program(tokenizer);
return program.parse(tokenizer);
};

View File

@@ -1,7 +1,6 @@
///// TOKENIZER AND PARSER COMBINATORS
// XXX make Parser object with parse method?
// XXX rework describe, call "expecting"?
// XXX track line/col position, for errors and maybe token info
// XXX unit tests
@@ -17,6 +16,26 @@ var ParseNode = function (name, children) {
throw new Error("Expected array in new ParseNode(" + name + ", ...)");
};
ParseNode.NIL = new ParseNode('nil', []);
// A Parser pairs a description of what it is looking for
// (`expecting`) with the function that does the actual matching.
var Parser = function (expecting, runFunc) {
  this._run = runFunc;
  this.expecting = expecting;
};
_.extend(Parser.prototype, {
  // Runs the parser against tokenizer `t` and returns whatever the
  // run function returned.  When `options.required` is truthy, a
  // falsy result is turned into a thrown parse error instead.
  parse: function (t, options) {
    var outcome = this._run(t);
    var mustMatch = options && options.required;
    if (mustMatch && ! outcome)
      throw parseError(t, this);
    return outcome;
  }
});
Tokenizer = function (codeOrLexer) {
// XXX rethink codeOrLexer later
this.lexer = (codeOrLexer instanceof Lexer ? codeOrLexer :
@@ -66,16 +85,17 @@ _.extend(Tokenizer.prototype, {
// A parser that consume()s has to succeed.
// Similarly, a parser that fails can't have consumed.
// mutates the parser; don't describe an existing parser.
var describe = function (description, parser) {
parser.description = description;
// mutates the parser
var expecting = function (expecting, parser) {
parser.expecting = expecting;
return parser;
};
// Call this as `throw parseError(...)`.
// `expectedParser` is a parser; `found` is presumably a string.
var parseError = function (t, expected, found) {
var str = (expected.description ? "Expected " + expected.description :
var parseError = function (t, expectedParser, found) {
var str = (expectedParser.expecting ? "Expected " +
expectedParser.expecting :
// all parsers that might error should have descriptions,
// but just in case:
"Unexpected token");
@@ -89,14 +109,14 @@ var parseError = function (t, expected, found) {
///// TERMINAL PARSER CONSTRUCTORS
var _tokenClassImpl = function (type, text, dontConsume) {
var _tokenClassImpl = function (type, text, onlyLook) {
var textSet = (text ? makeSet(text.split(' ')) : null);
var description = (text ? text.split(' ').join(', ') : type);
return describe(
description,
var expecting = (text ? text.split(' ').join(', ') : type);
return new Parser(
expecting,
function (t) {
if (t.peekType == type && (!text || textSet[t.peekText])) {
if (dontConsume)
if (onlyLook)
return [];
var ret = {text: t.peekText, pos: t.pos};
t.consume();
@@ -106,10 +126,10 @@ var _tokenClassImpl = function (type, text, dontConsume) {
});
};
var _tokenImpl = function (text, dontConsume) {
var _tokenImpl = function (text, onlyLook) {
if (/\w/.test(text))
return _tokenClassImpl('KEYWORD', text, dontConsume);
return _tokenClassImpl('PUNCTUATION', text, dontConsume);
return _tokenClassImpl('KEYWORD', text, onlyLook);
return _tokenClassImpl('PUNCTUATION', text, onlyLook);
};
var tokenClass = function (type, text) {
@@ -122,28 +142,6 @@ var token = function (text) {
return _tokenImpl(text);
};
// Like token, but marks tokens that need to defy the lexer's
// heuristic about whether the next '/' is a division or
// starts a regex.
var preSlashToken = function (text, divisionNotRegex) {
var impl = _tokenImpl(text);
return describe(impl.description,
function (t) {
// temporarily set divisionPermitted,
// restoring it if we don't match.
var oldValue = t.lexer.divisionPermitted;
var result;
try {
t.lexer.divisionPermitted = divisionNotRegex;
result = impl(t);
return result;
} finally {
if (! result)
t.lexer.divisionPermitted = oldValue;
}
});
};
// NON-CONSUMING PARSER CONSTRUCTORS
var lookAheadTokenClass = function (type, text) {
@@ -156,45 +154,28 @@ var lookAheadToken = function (text) {
///// NON-TERMINAL PARSER CONSTRUCTORS
// run parser(tokenizer) and assert it matches
var runRequired = function (parser, tokenizer) {
return revalue(
tokenizer ? parser(tokenizer) : parser,
function (v, t) {
if (! v)
throw parseError(t || tokenizer, parser);
return v;
});
};
var runMaybeRequired = function (require, parser, tokenizer) {
return require ? runRequired(parser, tokenizer) : parser(tokenizer);
};
// Polymorphic in parsers and results; an experiment.
var named = function (name, parserOrResult) {
return describe(
name,
revalue(
parserOrResult,
function (value) {
if (! value)
return null;
return new ParseNode(name, Array.prototype.slice.call(value));
}));
// Builds a parser that runs `childrenParser` and, on a truthy result,
// wraps that result in a ParseNode called `name`.  A falsy result
// yields null.  `name` is also used as the new parser's `expecting`.
var node = function (name, childrenParser) {
  var wrapChildren = function (t) {
    var parsed = childrenParser.parse(t);
    return parsed ? new ParseNode(name, parsed) : null;
  };
  return new Parser(name, wrapChildren);
};
var or = function (/*parsers*/) {
var args = arguments;
return function (t) {
var result;
for(var i = 0, N = args.length; i < N; i++) {
result = args[i](t);
if (result)
return result;
}
return null;
};
return new Parser(
null,
function (t) {
var result;
for(var i = 0, N = args.length; i < N; i++) {
result = args[i].parse(t);
if (result)
return result;
}
return null;
});
};
// Parses a left-recursive expression with zero or more occurrences
@@ -220,18 +201,18 @@ var binaryLeft = function (termParser, opParser) {
}
}
return describe(
termParser.description,
return new Parser(
termParser.expecting,
function (t) {
var result = termParser(t);
var result = termParser.parse(t);
if (! result)
return null;
var op;
while ((op = opParser(t))) {
result = named(
while ((op = opParser.parse(t))) {
result = new ParseNode(
'binary',
[result, op, runRequired(termParser, t, op)]);
[result, op, termParser.parse(t, {required: true})]);
}
return result;
});
@@ -250,25 +231,24 @@ var list = function (itemParser, sepParser) {
else
array.push(newThing);
};
return describe(
itemParser.description,
return new Parser(
itemParser.expecting,
function (t) {
var result = [];
var firstItem = itemParser(t);
var firstItem = itemParser.parse(t);
if (! firstItem)
return null;
push(result, firstItem);
if (sepParser) {
var sep;
while ((sep = sepParser(t))) {
while ((sep = sepParser.parse(t))) {
push(result, sep);
push(result, runRequired(itemParser, t,
sep.unpack ? sep[sep.length - 1] : sep));
push(result, itemParser.parse(t, {required: true}));
}
} else {
var item;
while ((item = itemParser(t)))
while ((item = itemParser.parse(t)))
push(result, item);
}
return result;
@@ -278,20 +258,17 @@ var list = function (itemParser, sepParser) {
var seq = function (/*parsers*/) {
var args = arguments;
if (! args.length)
return describe("(empty)",
function (t) { return []; });
return new Parser("(empty)",
function (t) { return []; });
var description = args[0].description;
for (var i = 1; i < args.length; i++)
description += " " + args[i].description;
return describe(
description,
return new Parser(
args[0].expecting,
function (t) {
var result = [];
for (var i = 0, N = args.length; i < N; i++) {
// first item in sequence can fail, and we
// fail (without error); after that, error on failure
var r = runMaybeRequired(i > 0, args[i], t);
var r = args[i].parse(t, {required: i > 0});
if (! r)
return null;
@@ -304,8 +281,12 @@ var seq = function (/*parsers*/) {
});
};
var unpack = function (arrayParser) {
return revalue(arrayParser, function (v) {
var unpack = function (arrayOrParser) {
if (isArray(arrayOrParser)) {
arrayOrParser.unpack = true;
return arrayOrParser;
}
return revalue(arrayOrParser, function (v) {
if (v && isArray(v))
v.unpack = true;
return v;
@@ -314,35 +295,36 @@ var unpack = function (arrayParser) {
// lookAhead parser must never consume
var lookAhead = function (lookAheadParser, nextParser) {
return describe(
nextParser.description,
return new Parser(
nextParser.expecting,
function (t) {
if (! lookAheadParser(t))
if (! lookAheadParser.parse(t))
return null;
return nextParser(t);
return nextParser.parse(t);
});
};
var negLookAhead = function (lookAheadParser, nextParser) {
if (! nextParser)
return function (t) {
return lookAheadParser(t) ? null : [];
};
return new Parser(
null,
function (t) {
return lookAheadParser.parse(t) ? null : [];
});
return describe(
nextParser.description,
return new Parser(
nextParser.expecting,
function (t) {
if (lookAheadParser(t))
if (lookAheadParser.parse(t))
return null;
return nextParser(t);
return nextParser.parse(t);
});
};
// parser that looks at nothing and returns result
var constant = function (result) {
// no description
return function (t) {
return result;
};
return new Parser(null,
function (t) { return result; });
};
// afterLookAhead allows the parser to fail rather than
@@ -356,14 +338,13 @@ var constant = function (result) {
// instead of "Expected ;" when the optional expression
// turns out to be an illegal `var`.
var opt = function (parser, afterLookAhead) {
return describe(parser.description,
or(parser, afterLookAhead ? afterLookAhead : seq()));
return expecting(parser.expecting,
or(parser, afterLookAhead ? afterLookAhead : seq()));
};
// note: valueTransformFunc gets the tokenizer as a second argument
// if it's called on a parser. This func is allowed to then
// run more parsers.
var revalue = function (parserOrValue, valueTransformFunc) {
// note: valueTransformFunc gets the tokenizer as a second argument.
// This func is allowed to then run more parsers.
var revalue = function (parser, valueTransformFunc) {
if (typeof valueTransformFunc !== 'function') {
var value = valueTransformFunc;
valueTransformFunc = function (v) {
@@ -371,12 +352,9 @@ var revalue = function (parserOrValue, valueTransformFunc) {
};
}
if (typeof parserOrValue === 'function')
// it's a parser
return describe(parserOrValue.description,
function (t) {
return valueTransformFunc(parserOrValue(t), t);
});
else
return valueTransformFunc(parserOrValue);
return new Parser(
parser.expecting,
function (t) {
return valueTransformFunc(parser.parse(t), t);
});
};