From 132b4897afe6bb877f1bb949b884e83cbcbf54bc Mon Sep 17 00:00:00 2001 From: David Greenspan Date: Tue, 11 Sep 2012 19:36:47 -0700 Subject: [PATCH] Parser object --- packages/jsparse/parser.js | 482 ++++++++++++++++++---------------- packages/jsparse/parserlib.js | 224 +++++++--------- 2 files changed, 360 insertions(+), 346 deletions(-) diff --git a/packages/jsparse/parser.js b/packages/jsparse/parser.js index 89a3c90faf..70f3a2f9b4 100644 --- a/packages/jsparse/parser.js +++ b/packages/jsparse/parser.js @@ -2,90 +2,119 @@ // XXX unit tests +// XXX remove unnecessary ParseNode.NILs in lookaheads +// XXX SeqParser +// XXX find all revalues, see if constant ones are necessary. +// API may be confusing if constant affects only non-null. + // What we don't have from ECMA-262 5.1: // - object literal trailing comma // - object literal get/set var parse = function (tokenizer) { - var noLineTerminatorHere = describe( + var noLineTerminatorHere = new Parser( 'noLineTerminator', function (t) { return t.isLineTerminatorHere ? null : []; }); + + // Like token, but marks tokens that need to defy the lexer's + // heuristic about whether the next '/' is a division or + // starts a regex. + var preSlashToken = function (text, divisionNotRegex) { + var inner = token(text); + return new Parser( + inner.expecting, + function (t) { + // temporarily set divisionPermitted, + // restoring it if we don't match. + var oldValue = t.lexer.divisionPermitted; + var result; + try { + t.lexer.divisionPermitted = divisionNotRegex; + result = inner.parse(t); + return result; + } finally { + if (! result) + t.lexer.divisionPermitted = oldValue; + } + }); + }; + // Function that takes one-item arrays to their single item and names other // arrays with `name`. Works on parsers too. - var nameIfMultipart = function (name, parser) { + var nodeIfMultipart = function (name, arrayParser) { return revalue( - parser, + arrayParser, function (parts) { if (! parts) return null; return (parts.length === 1) ? - parts[0] : named(name, parts); + parts[0] : new ParseNode(name, parts); }); }; // These "pointers" allow grammar circularity, i.e. accessing // later parsers from earlier ones. var expressionPtrFunc = function (noIn) { - return describe( + return new Parser( "expression", function (t) { - return expressionFunc(noIn)(t); + return expressionFunc(noIn).parse(t); }); }; var expressionPtr = expressionPtrFunc(false); var assignmentExpressionPtrFunc = function (noIn) { - return describe( + return new Parser( "expression", function (t) { - return assignmentExpressionFunc(noIn)(t); + return assignmentExpressionFunc(noIn).parse(t); }); }; var assignmentExpressionPtr = assignmentExpressionPtrFunc(false); - var functionBodyPtr = describe( + var functionBodyPtr = new Parser( "functionBody", function (t) { - return functionBody(t); + return functionBody.parse(t); }); - var statementPtr = describe( + var statementPtr = new Parser( "statement", function (t) { - return statement(t); + return statement.parse(t); }); var arrayLiteral = - named('array', - seq(token('['), - unpack(opt(list(token(',')))), - unpack( - opt( - list( - describe( - 'expression', - or(assignmentExpressionPtr, - // count a peeked-at ']' as an expression - // to support elisions at end, e.g. - // `[1,2,3,,,,,,]`. Because it's unpacked, - // the look-ahead won't show up in the - // parse tree. - unpack(lookAheadToken(']')))), - // list seperator is one or more commas - // to support elision - unpack(list(token(',')))), - lookAheadToken(']'))), - token(']'))); + node('array', + seq(token('['), + unpack(opt(list(token(',')))), + unpack( + opt( + list( + expecting( + 'expression', + or(assignmentExpressionPtr, + // count a peeked-at ']' as an expression + // to support elisions at end, e.g. + // `[1,2,3,,,,,,]`. Because it's unpacked, + // the look-ahead won't show up in the + // parse tree. + unpack(lookAheadToken(']')))), + // list seperator is one or more commas + // to support elision + unpack(list(token(',')))), + lookAheadToken(']'))), + token(']'))); - var propertyName = describe('propertyName', or( - named('idPropName', seq(tokenClass('IDENTIFIER'))), - named('numPropName', seq(tokenClass('NUMBER'))), - named('strPropName', seq(tokenClass('STRING'))))); - var nameColonValue = describe( + var propertyName = expecting('propertyName', or( + node('idPropName', seq(tokenClass('IDENTIFIER'))), + node('numPropName', seq(tokenClass('NUMBER'))), + node('strPropName', seq(tokenClass('STRING'))))); + var nameColonValue = expecting( 'name:value', - named('prop', seq(propertyName, token(':'), assignmentExpressionPtr))); + node('prop', seq(propertyName, token(':'), assignmentExpressionPtr))); var objectLiteral = - named('object', + node('object', seq(token('{'), unpack(opt(list(nameColonValue, token(',')), lookAheadToken('}'))), @@ -96,7 +125,7 @@ var parse = function (tokenizer) { return seq(token('function'), (nameRequired ? tokenClass('IDENTIFIER') : or(tokenClass('IDENTIFIER'), - revalue(lookAheadToken('('), named('nil', [])))), + revalue(lookAheadToken('('), ParseNode.NIL))), token('('), unpack(opt(list(tokenClass('IDENTIFIER'), token(',')), lookAheadToken(')'))), @@ -105,23 +134,23 @@ var parse = function (tokenizer) { unpack(functionBodyPtr), token('}')); }; - var functionExpression = named('functionExpr', + var functionExpression = node('functionExpr', functionFunc(false)); var primaryOrFunctionExpression = - describe('expression', - or(named('this', seq(token('this'))), - named('identifier', seq(tokenClass('IDENTIFIER'))), - named('number', seq(tokenClass('NUMBER'))), - named('boolean', seq(tokenClass('BOOLEAN'))), - named('null', seq(tokenClass('NULL'))), - named('regex', seq(tokenClass('REGEX'))), - named('string', seq(tokenClass('STRING'))), - named('parens', + expecting('expression', + or(node('this', seq(token('this'))), + node('identifier', seq(tokenClass('IDENTIFIER'))), + node('number', seq(tokenClass('NUMBER'))), + node('boolean', seq(tokenClass('BOOLEAN'))), + node('null', seq(tokenClass('NULL'))), + node('regex', seq(tokenClass('REGEX'))), + node('string', seq(tokenClass('STRING'))), + node('parens', seq(token('('), expressionPtr, token(')'))), - arrayLiteral, - objectLiteral, - functionExpression)); + arrayLiteral, + objectLiteral, + functionExpression)); var dotEnding = seq(token('.'), tokenClass('IDENTIFIER')); var bracketEnding = seq(token('['), expressionPtr, token(']')); @@ -142,14 +171,14 @@ var parse = function (tokenizer) { // call to "return" a valid l-value, as in `foo(bar) = baz`, // though no built-in or user-specifiable call has this property // (it would have to be defined by a browser or other "host"). - var lhsExpression = describe( + var lhsExpression = new Parser( 'expression', function (t) { // Accumulate all initial "new" keywords, not yet knowing // if they have a corresponding argument list later. var news = []; var n; - while ((n = newKeyword(t))) + while ((n = newKeyword.parse(t))) news.push(n); // Read the primaryOrFunctionExpression that will be the "core" @@ -158,8 +187,8 @@ var parse = function (tokenizer) { // and .foo add-ons. // if we have 'new' keywords, we are committed and must // match an expression or error. - var result = runMaybeRequired( - news.length, primaryOrFunctionExpression, t); + var result = primaryOrFunctionExpression.parse( + t, {required: news.length}); if (! result) return null; @@ -170,15 +199,15 @@ var parse = function (tokenizer) { var done = false; while (! done) { var r; - if ((r = dotEnding(t))) { - result = named('dot', [result].concat(r)); - } else if ((r = bracketEnding(t))) { - result = named('bracket', [result].concat(r)); - } else if ((r = callArgs(t))) { + if ((r = dotEnding.parse(t))) { + result = new ParseNode('dot', [result].concat(r)); + } else if ((r = bracketEnding.parse(t))) { + result = new ParseNode('bracket', [result].concat(r)); + } else if ((r = callArgs.parse(t))) { if (news.length) - result = named('newcall', [news.pop(), result].concat(r)); + result = new ParseNode('newcall', [news.pop(), result].concat(r)); else - result = named('call', [result].concat(r)); + result = new ParseNode('call', [result].concat(r)); } else { done = true; } @@ -188,7 +217,7 @@ var parse = function (tokenizer) { // paren-less constructions (`new Date`) are parsed. We've // already handled `new foo().bar()`, now handle `new new foo().bar`. while (news.length) - result = named('new', [news.pop(), result]); + result = new ParseNode('new', [news.pop(), result]); // mark any LeftHandSideExpression, for the benefit of // assignmentExpression @@ -199,9 +228,9 @@ var parse = function (tokenizer) { var postfixToken = token('++ --'); var postfixLookahead = lookAheadToken('++ --'); - var postfixExpression = describe( + var postfixExpression = expecting( 'expression', - nameIfMultipart( + nodeIfMultipart( 'postfix', seq(lhsExpression, unpack(opt(lookAhead(noLineTerminatorHere, @@ -209,18 +238,19 @@ var parse = function (tokenizer) { postfixToken))))))); var unaryList = opt(list(or(token('delete void typeof'), preSlashToken('++ -- + - ~ !', false)))); - var unaryExpression = describe( + var unaryExpression = new Parser( 'expression', function (t) { - var unaries = unaryList(t); + var unaries = unaryList.parse(t); // if we have unaries, we are committed and // have to match an expression or error. - var result = runMaybeRequired(unaries.length, postfixExpression, t); + var result = postfixExpression.parse( + t, {required: unaries.length}); if (! result) return null; while (unaries.length) - result = named('unary', [unaries.pop(), result]); + result = new ParseNode('unary', [unaries.pop(), result]); return result; }); @@ -250,7 +280,7 @@ var parse = function (tokenizer) { token('|'), token('&&'), token('||')]; - return describe( + return expecting( 'expression', binaryLeft(unaryExpression, binaryOps)); }); @@ -258,9 +288,9 @@ var parse = function (tokenizer) { var conditionalExpressionFunc = memoizeBooleanFunc( function (noIn) { - return describe( + return expecting( 'expression', - nameIfMultipart( + nodeIfMultipart( 'ternary', seq(binaryExpressionFunc(noIn), unpack(opt(seq( token('?'), @@ -273,10 +303,10 @@ var parse = function (tokenizer) { var assignmentExpressionFunc = memoizeBooleanFunc( function (noIn) { - return describe( + return new Parser( 'expression', function (t) { - var r = conditionalExpressionFunc(noIn)(t); + var r = conditionalExpressionFunc(noIn).parse(t); if (! r) return null; @@ -286,15 +316,16 @@ var parse = function (tokenizer) { // and then fold them up at the end. var parts = [r]; var op; - while (r.lhs && (op = assignOp(t))) + while (r.lhs && (op = assignOp.parse(t))) parts.push(op, - runRequired(conditionalExpressionFunc(noIn), t)); + conditionalExpressionFunc(noIn).parse( + t, {required: true})); var result = parts.pop(); while (parts.length) { op = parts.pop(); var lhs = parts.pop(); - result = named('assignment', [lhs, op, result]); + result = new ParseNode('assignment', [lhs, op, result]); } return result; }); @@ -303,9 +334,9 @@ var parse = function (tokenizer) { var expressionFunc = memoizeBooleanFunc( function (noIn) { - return describe( + return expecting( 'expression', - nameIfMultipart( + nodeIfMultipart( 'comma', list(assignmentExpressionFunc(noIn), token(',')))); }); @@ -316,31 +347,32 @@ var parse = function (tokenizer) { var statements = list(statementPtr); // implements JavaScript's semicolon "insertion" rules - var maybeSemicolon = describe( + var maybeSemicolon = expecting( 'semicolon', or(token(';'), revalue( or( lookAheadToken('}'), lookAheadTokenClass('EOF'), - function (t) { - return t.isLineTerminatorHere ? [] : null; - }), named(';', [])))); + new Parser(null, + function (t) { + return t.isLineTerminatorHere ? [] : null; + })), new ParseNode(';', [])))); - var expressionStatement = named( + var expressionStatement = node( 'expressionStmnt', negLookAhead( or(lookAheadToken('{'), lookAheadToken('function')), seq(expression, - describe('semicolon', - or(maybeSemicolon, - // allow presence of colon to terminate - // statement legally, for the benefit of - // expressionOrLabelStatement. Basically assume - // an implicit semicolon. This - // is safe because a colon can never legally - // follow a semicolon anyway. - revalue(lookAheadToken(':'), named(';', []))))))); + expecting('semicolon', + or(maybeSemicolon, + // allow presence of colon to terminate + // statement legally, for the benefit of + // expressionOrLabelStatement. Basically assume + // an implicit semicolon. This + // is safe because a colon can never legally + // follow a semicolon anyway. + revalue(lookAheadToken(':'), new ParseNode(';', []))))))); // it's hard to parse statement labels, as in // `foo: x = 1`, because we can't tell from the @@ -350,44 +382,46 @@ var parse = function (tokenizer) { // then rewrites the result if it is an identifier // followed by a colon. var labelColonAndStatement = seq(token(':'), statementPtr); - var noColon = describe( + var noColon = expecting( 'semicolon', negLookAhead(lookAheadToken(':'))); - var expressionOrLabelStatement = function (t) { - var exprStmnt = expressionStatement(t); - if (! exprStmnt) - return null; + var expressionOrLabelStatement = new Parser( + null, + function (t) { + var exprStmnt = expressionStatement.parse(t); + if (! exprStmnt) + return null; - var expr = exprStmnt.children[0]; - var maybeSemi = exprStmnt.children[1]; - if (expr.name !== 'identifier' || - ! (maybeSemi instanceof ParseNode)) { - // We either have a non-identifier expression or a present - // semicolon. This is not a label. - // - // Fail now if we are looking at a colon, causing an - // error message on input like `1+1:` of the same kind - // you'd get without statement label parsing. - runRequired(noColon, t); - return exprStmnt; - } + var expr = exprStmnt.children[0]; + var maybeSemi = exprStmnt.children[1]; + if (expr.name !== 'identifier' || + ! (maybeSemi instanceof ParseNode)) { + // We either have a non-identifier expression or a present + // semicolon. This is not a label. + // + // Fail now if we are looking at a colon, causing an + // error message on input like `1+1:` of the same kind + // you'd get without statement label parsing. + noColon.parse(t, {required: true}); + return exprStmnt; + } - var rest = labelColonAndStatement(t); - if (! rest) - return exprStmnt; + var rest = labelColonAndStatement.parse(t); + if (! rest) + return exprStmnt; - return named('labelStmnt', - [expr.children[0]].concat(rest)); - }; + return new ParseNode('labelStmnt', + [expr.children[0]].concat(rest)); + }); - var emptyStatement = named('emptyStmnt', seq(token(';'))); // not maybeSemicolon + var emptyStatement = node('emptyStmnt', seq(token(';'))); // not maybeSemicolon - var blockStatement = describe('block', named('blockStmnt', seq( + var blockStatement = expecting('block', node('blockStmnt', seq( token('{'), unpack(opt(statements, lookAheadToken('}'))), token('}')))); var varDeclFunc = memoizeBooleanFunc(function (noIn) { - return named( + return node( 'varDecl', seq(tokenClass('IDENTIFIER'), unpack(opt(seq(token('='), @@ -395,7 +429,7 @@ var parse = function (tokenizer) { }); var varDecl = varDeclFunc(false); - var variableStatement = named( + var variableStatement = node( 'varStmnt', seq(token('var'), unpack(list(varDecl, token(','))), maybeSemicolon)); @@ -404,28 +438,28 @@ var parse = function (tokenizer) { // beginning with a regex literal. var closeParenBeforeStatement = preSlashToken(')', false); - var ifStatement = named( + var ifStatement = node( 'ifStmnt', seq(token('if'), token('('), expression, closeParenBeforeStatement, statementPtr, unpack(opt(seq(token('else'), statementPtr))))); - var secondThirdClauses = describe( + var secondThirdClauses = expecting( 'semicolon', lookAhead(lookAheadToken(';'), seq( - describe('semicolon', token(';')), - opt(expressionPtr, revalue(lookAheadToken(';'), named('nil', []))), - describe('semicolon', token(';')), - opt(expressionPtr, revalue(lookAheadToken(')'), named('nil', [])))))); + expecting('semicolon', token(';')), + opt(expressionPtr, revalue(lookAheadToken(';'), ParseNode.NIL)), + expecting('semicolon', token(';')), + opt(expressionPtr, revalue(lookAheadToken(')'), ParseNode.NIL))))); var inExpr = seq(token('in'), expression); - var inExprExpectingSemi = describe('semicolon', - seq(token('in'), expression)); - var forSpec = revalue(named( + var inExprExpectingSemi = expecting('semicolon', + seq(token('in'), expression)); + var forSpec = revalue(node( 'forSpec', or(seq(token('var'), varDeclFunc(true), - describe( + expecting( 'commaOrIn', or(unpack(inExpr), unpack(seq( @@ -437,76 +471,78 @@ var parse = function (tokenizer) { // get the case where the first clause is empty out of the way. // the lookAhead's return value is the empty placeholder for the // missing expression. - seq(revalue(lookAheadToken(';'), named('nil', [])), unpack(secondThirdClauses)), + seq(revalue(lookAheadToken(';'), ParseNode.NIL), unpack(secondThirdClauses)), // custom parser the non-var case because we have to // read the first expression before we know if there's // an "in". - function (t) { - var firstExpr = expressionFunc(true)(t); - if (! firstExpr) - return null; - var rest = secondThirdClauses(t); - if (! rest) { - // we need a left-hand-side expression for a - // `for (x in y)` loop. - if (! firstExpr.lhs) - throw parseError(t, secondThirdClauses); - // if we don't see 'in' at this point, it's probably - // a missing semicolon - rest = runRequired(inExprExpectingSemi, t); - } + new Parser( + null, + function (t) { + var firstExpr = expressionFunc(true).parse(t); + if (! firstExpr) + return null; + var rest = secondThirdClauses.parse(t); + if (! rest) { + // we need a left-hand-side expression for a + // `for (x in y)` loop. + if (! firstExpr.lhs) + throw parseError(t, secondThirdClauses); + // if we don't see 'in' at this point, it's probably + // a missing semicolon + rest = inExprExpectingSemi.parse(t, {required: true}); + } - return [firstExpr].concat(rest); - })), - function (clauses) { - // There are four kinds of for-loop, and we call the - // part between the parens one of forSpec, forVarSpec, - // forInSpec, and forVarInSpec. Having parsed it - // already, we rewrite the node name based on how - // many items came out. forIn and forVarIn always - // have 3 and 4 items respectively. for has 5 - // (the optional expressions are present as nils). - // forVar has 6 or more, because `for(var x;;);` - // produces [`var` `x` `;` nil `;` nil]. - if (! clauses) - return null; - var numChildren = clauses.children.length; - if (numChildren === 3) - return new ParseNode('forInSpec', clauses.children); - else if (numChildren === 4) - return new ParseNode('forVarInSpec', clauses.children); - else if (numChildren >= 6) - return new ParseNode('forVarSpec', clauses.children); - return clauses; - }); + return [firstExpr].concat(rest); + }))), + function (clauses) { + // There are four kinds of for-loop, and we call the + // part between the parens one of forSpec, forVarSpec, + // forInSpec, and forVarInSpec. Having parsed it + // already, we rewrite the node name based on how + // many items came out. forIn and forVarIn always + // have 3 and 4 items respectively. for has 5 + // (the optional expressions are present as nils). + // forVar has 6 or more, because `for(var x;;);` + // produces [`var` `x` `;` nil `;` nil]. + if (! clauses) + return null; + var numChildren = clauses.children.length; + if (numChildren === 3) + return new ParseNode('forInSpec', clauses.children); + else if (numChildren === 4) + return new ParseNode('forVarInSpec', clauses.children); + else if (numChildren >= 6) + return new ParseNode('forVarSpec', clauses.children); + return clauses; + }); var iterationStatement = or( - named('doStmnt', seq(token('do'), statementPtr, token('while'), + node('doStmnt', seq(token('do'), statementPtr, token('while'), token('('), expression, token(')'), maybeSemicolon)), - named('whileStmnt', seq(token('while'), token('('), expression, + node('whileStmnt', seq(token('while'), token('('), expression, closeParenBeforeStatement, statementPtr)), // semicolons must be real, not maybeSemicolons - named('forStmnt', seq( + node('forStmnt', seq( token('for'), token('('), forSpec, closeParenBeforeStatement, statementPtr))); - var returnStatement = named( + var returnStatement = node( 'returnStmnt', seq(token('return'), or( - lookAhead(noLineTerminatorHere, expression), constant(named('nil', []))), + lookAhead(noLineTerminatorHere, expression), constant(ParseNode.NIL)), maybeSemicolon)); - var continueStatement = named( + var continueStatement = node( 'continueStmnt', seq(token('continue'), or( - lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(named('nil', []))), + lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(ParseNode.NIL)), maybeSemicolon)); - var breakStatement = named( + var breakStatement = node( 'breakStmnt', seq(token('break'), or( - lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(named('nil', []))), + lookAhead(noLineTerminatorHere, tokenClass('IDENTIFIER')), constant(ParseNode.NIL)), maybeSemicolon)); - var throwStatement = named( + var throwStatement = node( 'throwStmnt', seq(token('throw'), lookAhead(revalue(noLineTerminatorHere, @@ -519,23 +555,23 @@ var parse = function (tokenizer) { }), expression), maybeSemicolon)); - var withStatement = named( + var withStatement = node( 'withStmnt', seq(token('with'), token('('), expression, closeParenBeforeStatement, statementPtr)); - var switchCase = named( + var switchCase = node( 'case', seq(token('case'), expression, token(':'), unpack(opt(statements, or(lookAheadToken('}'), lookAheadToken('case default')))))); - var switchDefault = named( + var switchDefault = node( 'default', seq(token('default'), token(':'), unpack(opt(statements, or(lookAheadToken('}'), lookAheadToken('case')))))); - var switchStatement = named( + var switchStatement = node( 'switchStmnt', seq(token('switch'), token('('), expression, token(')'), token('{'), unpack(opt(list(switchCase), @@ -545,70 +581,70 @@ var parse = function (tokenizer) { unpack(opt(list(switchCase)))))), token('}'))); - var catchFinally = describe( + var catchFinally = expecting( 'catch', lookAhead(lookAheadToken('catch finally'), seq( - or(named( + or(node( 'catch', seq(token('catch'), token('('), tokenClass('IDENTIFIER'), token(')'), blockStatement)), - constant(named('nil', []))), - or(named( + constant(ParseNode.NIL)), + or(node( 'finally', seq(token('finally'), blockStatement)), - constant(named('nil', [])))))); - var tryStatement = named( + constant(ParseNode.NIL))))); + var tryStatement = node( 'tryStmnt', seq(token('try'), blockStatement, unpack(catchFinally))); - var debuggerStatement = named( + var debuggerStatement = node( 'debuggerStmnt', seq(token('debugger'), maybeSemicolon)); - var statement = describe('statement', - or(expressionOrLabelStatement, - emptyStatement, - blockStatement, - variableStatement, - ifStatement, - iterationStatement, - returnStatement, - continueStatement, - breakStatement, - withStatement, - switchStatement, - throwStatement, - tryStatement, - debuggerStatement)); + var statement = expecting('statement', + or(expressionOrLabelStatement, + emptyStatement, + blockStatement, + variableStatement, + ifStatement, + iterationStatement, + returnStatement, + continueStatement, + breakStatement, + withStatement, + switchStatement, + throwStatement, + tryStatement, + debuggerStatement)); // PROGRAM - var functionDecl = named('functionDecl', + var functionDecl = node('functionDecl', functionFunc(true)); var sourceElement = or(statement, functionDecl); var sourceElements = list(sourceElement); - var functionBody = describe('functionBody', - opt(sourceElements, - lookAheadToken('}'))); + var functionBody = expecting('functionBody', + opt(sourceElements, + lookAheadToken('}'))); - var program = named('program', + var program = node('program', seq(unpack(opt(sourceElements)), // we rely on the fact that opt(sourceElements) // will never fail, and non-first arguments // to seq are required to succeed -- meaning // this parser will never fail without throwing // a parse error. - describe('statement', - revalue(lookAheadTokenClass("EOF"), - function (v, t) { - if (! v) - return null; - // eat the ending "EOF" so that - // our position is updated - t.consume(); - return unpack([]); - })))); + expecting('statement', + revalue(lookAheadTokenClass("EOF"), + function (v, t) { + if (! v) + return null; + // eat the ending "EOF" so that + // our position is updated + t.consume(); + return unpack([]); + })))); - return program(tokenizer); + return program.parse(tokenizer); }; diff --git a/packages/jsparse/parserlib.js b/packages/jsparse/parserlib.js index 0511b95f9e..9cb9a05737 100644 --- a/packages/jsparse/parserlib.js +++ b/packages/jsparse/parserlib.js @@ -1,7 +1,6 @@ ///// TOKENIZER AND PARSER COMBINATORS // XXX make Parser object with parse method? -// XXX rework describe, call "expecting"? // XXX track line/col position, for errors and maybe token info // XXX unit tests @@ -17,6 +16,26 @@ var ParseNode = function (name, children) { throw new Error("Expected array in new ParseNode(" + name + ", ...)"); }; +ParseNode.NIL = new ParseNode('nil', []); + +var Parser = function (expecting, runFunc) { + this.expecting = expecting; + this._run = runFunc; +}; + +_.extend(Parser.prototype, { + parse: function (t, options) { + var result = this._run(t); + + if (options) { + if (options.required && ! result) + throw parseError(t, this); + } + + return result; + } +}); + Tokenizer = function (codeOrLexer) { // XXX rethink codeOrLexer later this.lexer = (codeOrLexer instanceof Lexer ? codeOrLexer : @@ -66,16 +85,17 @@ _.extend(Tokenizer.prototype, { // A parser that consume()s has to succeed. // Similarly, a parser that fails can't have consumed. -// mutates the parser; don't describe an existing parser. -var describe = function (description, parser) { - parser.description = description; +// mutates the parser +var expecting = function (expecting, parser) { + parser.expecting = expecting; return parser; }; // Call this as `throw parseError(...)`. // `expected` is a parser, `after` is a string. -var parseError = function (t, expected, found) { - var str = (expected.description ? "Expected " + expected.description : +var parseError = function (t, expectedParser, found) { + var str = (expectedParser.expecting ? "Expected " + + expectedParser.expecting : // all parsers that might error should have descriptions, // but just in case: "Unexpected token"); @@ -89,14 +109,14 @@ var parseError = function (t, expected, found) { ///// TERMINAL PARSER CONSTRUCTORS -var _tokenClassImpl = function (type, text, dontConsume) { +var _tokenClassImpl = function (type, text, onlyLook) { var textSet = (text ? makeSet(text.split(' ')) : null); - var description = (text ? text.split(' ').join(', ') : type); - return describe( - description, + var expecting = (text ? text.split(' ').join(', ') : type); + return new Parser( + expecting, function (t) { if (t.peekType == type && (!text || textSet[t.peekText])) { - if (dontConsume) + if (onlyLook) return []; var ret = {text: t.peekText, pos: t.pos}; t.consume(); @@ -106,10 +126,10 @@ var _tokenClassImpl = function (type, text, dontConsume) { }); }; -var _tokenImpl = function (text, dontConsume) { +var _tokenImpl = function (text, onlyLook) { if (/\w/.test(text)) - return _tokenClassImpl('KEYWORD', text, dontConsume); - return _tokenClassImpl('PUNCTUATION', text, dontConsume); + return _tokenClassImpl('KEYWORD', text, onlyLook); + return _tokenClassImpl('PUNCTUATION', text, onlyLook); }; var tokenClass = function (type, text) { @@ -122,28 +142,6 @@ var token = function (text) { return _tokenImpl(text); }; -// Like token, but marks tokens that need to defy the lexer's -// heuristic about whether the next '/' is a division or -// starts a regex. -var preSlashToken = function (text, divisionNotRegex) { - var impl = _tokenImpl(text); - return describe(impl.description, - function (t) { - // temporarily set divisionPermitted, - // restoring it if we don't match. - var oldValue = t.lexer.divisionPermitted; - var result; - try { - t.lexer.divisionPermitted = divisionNotRegex; - result = impl(t); - return result; - } finally { - if (! result) - t.lexer.divisionPermitted = oldValue; - } - }); -}; - // NON-CONSUMING PARSER CONSTRUCTORS var lookAheadTokenClass = function (type, text) { @@ -156,45 +154,28 @@ var lookAheadToken = function (text) { ///// NON-TERMINAL PARSER CONSTRUCTORS -// run parser(tokenizer) and assert it matches -var runRequired = function (parser, tokenizer) { - return revalue( - tokenizer ? parser(tokenizer) : parser, - function (v, t) { - if (! v) - throw parseError(t || tokenizer, parser); - return v; - }); -}; - -var runMaybeRequired = function (require, parser, tokenizer) { - return require ? runRequired(parser, tokenizer) : parser(tokenizer); -}; - -// Polymorphic in parsers and results; an experiment. -var named = function (name, parserOrResult) { - return describe( - name, - revalue( - parserOrResult, - function (value) { - if (! value) - return null; - return new ParseNode(name, Array.prototype.slice.call(value)); - })); +var node = function (name, childrenParser) { + return new Parser(name, function (t) { + var children = childrenParser.parse(t); + if (! children) + return null; + return new ParseNode(name, children); + }); }; var or = function (/*parsers*/) { var args = arguments; - return function (t) { - var result; - for(var i = 0, N = args.length; i < N; i++) { - result = args[i](t); - if (result) - return result; - } - return null; - }; + return new Parser( + null, + function (t) { + var result; + for(var i = 0, N = args.length; i < N; i++) { + result = args[i].parse(t); + if (result) + return result; + } + return null; + }); }; // Parses a left-recursive expression with zero or more occurrences @@ -220,18 +201,18 @@ var binaryLeft = function (termParser, opParser) { } } - return describe( - termParser.description, + return new Parser( + termParser.expecting, function (t) { - var result = termParser(t); + var result = termParser.parse(t); if (! result) return null; var op; - while ((op = opParser(t))) { - result = named( + while ((op = opParser.parse(t))) { + result = new ParseNode( 'binary', - [result, op, runRequired(termParser, t, op)]); + [result, op, termParser.parse(t, {required: true})]); } return result; }); @@ -250,25 +231,24 @@ var list = function (itemParser, sepParser) { else array.push(newThing); }; - return describe( - itemParser.description, + return new Parser( + itemParser.expecting, function (t) { var result = []; - var firstItem = itemParser(t); + var firstItem = itemParser.parse(t); if (! firstItem) return null; push(result, firstItem); if (sepParser) { var sep; - while ((sep = sepParser(t))) { + while ((sep = sepParser.parse(t))) { push(result, sep); - push(result, runRequired(itemParser, t, - sep.unpack ? sep[sep.length - 1] : sep)); + push(result, itemParser.parse(t, {required: true})); } } else { var item; - while ((item = itemParser(t))) + while ((item = itemParser.parse(t))) push(result, item); } return result; @@ -278,20 +258,17 @@ var list = function (itemParser, sepParser) { var seq = function (/*parsers*/) { var args = arguments; if (! args.length) - return describe("(empty)", - function (t) { return []; }); + return new Parser("(empty)", + function (t) { return []; }); - var description = args[0].description; - for (var i = 1; i < args.length; i++) - description += " " + args[i].description; - return describe( - description, + return new Parser( + args[0].expecting, function (t) { var result = []; for (var i = 0, N = args.length; i < N; i++) { // first item in sequence can fail, and we // fail (without error); after that, error on failure - var r = runMaybeRequired(i > 0, args[i], t); + var r = args[i].parse(t, {required: i > 0}); if (! r) return null; @@ -304,8 +281,12 @@ var seq = function (/*parsers*/) { }); }; -var unpack = function (arrayParser) { - return revalue(arrayParser, function (v) { +var unpack = function (arrayOrParser) { + if (isArray(arrayOrParser)) { + arrayOrParser.unpack = true; + return arrayOrParser; + } + return revalue(arrayOrParser, function (v) { if (v && isArray(v)) v.unpack = true; return v; @@ -314,35 +295,36 @@ var unpack = function (arrayParser) { // lookAhead parser must never consume var lookAhead = function (lookAheadParser, nextParser) { - return describe( - nextParser.description, + return new Parser( + nextParser.expecting, function (t) { - if (! lookAheadParser(t)) + if (! lookAheadParser.parse(t)) return null; - return nextParser(t); + return nextParser.parse(t); }); }; + var negLookAhead = function (lookAheadParser, nextParser) { if (! nextParser) - return function (t) { - return lookAheadParser(t) ? null : []; - }; + return new Parser( + null, + function (t) { + return lookAheadParser.parse(t) ? null : []; + }); - return describe( - nextParser.description, + return new Parser( + nextParser.expecting, function (t) { - if (lookAheadParser(t)) + if (lookAheadParser.parse(t)) return null; - return nextParser(t); + return nextParser.parse(t); }); }; // parser that looks at nothing and returns result var constant = function (result) { - // no description - return function (t) { - return result; - }; + return new Parser(null, + function (t) { return result; }); }; // afterLookAhead allows the parser to fail rather than @@ -356,14 +338,13 @@ var constant = function (result) { // instead of "Expected ;" when the optional expression // turns out to be an illegal `var`. var opt = function (parser, afterLookAhead) { - return describe(parser.description, - or(parser, afterLookAhead ? afterLookAhead : seq())); + return expecting(parser.expecting, + or(parser, afterLookAhead ? afterLookAhead : seq())); }; -// note: valueTransformFunc gets the tokenizer as a second argument -// if it's called on a parser. This func is allowed to then -// run more parsers. -var revalue = function (parserOrValue, valueTransformFunc) { +// note: valueTransformFunc gets the tokenizer as a second argument. +// This func is allowed to then run more parsers. +var revalue = function (parser, valueTransformFunc) { if (typeof valueTransformFunc !== 'function') { var value = valueTransformFunc; valueTransformFunc = function (v) { @@ -371,12 +352,9 @@ var revalue = function (parserOrValue, valueTransformFunc) { }; } - if (typeof parserOrValue === 'function') - // it's a parser - return describe(parserOrValue.description, - function (t) { - return valueTransformFunc(parserOrValue(t), t); - }); - else - return valueTransformFunc(parserOrValue); + return new Parser( + parser.expecting, + function (t) { + return valueTransformFunc(parser.parse(t), t); + }); };