From 969c2e528db4d5180f97ff4ba080f6e06db51b60 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Sun, 28 Feb 2010 13:34:52 -0500 Subject: [PATCH] a number of refactors to the Lexer. It should be a good bit clearer to read now. --- lib/lexer.js | 85 +++++++++++++++++++++++++++++++----------------- src/lexer.coffee | 84 ++++++++++++++++++++++++++++++----------------- 2 files changed, 110 insertions(+), 59 deletions(-) diff --git a/lib/lexer.js b/lib/lexer.js index a5a581d2..7d0883dd 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -1,5 +1,5 @@ (function(){ - var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING, STRING_NEWLINES, WHITESPACE; + var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING, STRING_NEWLINES, WHITESPACE, include; if ((typeof process !== "undefined" && process !== null)) { Rewriter = require('./rewriter').Rewriter; } else { @@ -60,9 +60,9 @@ // Scan by attempting to match tokens one character at a time. Slow and steady. Lexer.prototype.tokenize = function tokenize(code) { this.code = code; - // Cleanup code by remove extra line breaks, TODO: chomp + // The remainder of the source code. this.i = 0; - // Current character position we're parsing + // Current character position we're parsing. this.line = 1; // The current line. this.indent = 0; @@ -70,7 +70,7 @@ this.indents = []; // The stack of all indent levels we are currently within. this.tokens = []; - // Collection of all parsed tokens in the form [:TOKEN_TYPE, value] + // Collection of all parsed tokens in the form ['TOKEN_TYPE', value] while (this.i < this.code.length) { this.chunk = this.code.slice(this.i); this.extract_next_token(); @@ -117,25 +117,15 @@ if (!((id = this.match(IDENTIFIER, 1)))) { return false; } - if (this.value() === '::') { - this.tag(1, 'PROTOTYPE_ACCESS'); - } - if (this.value() === '.' && !(this.value(2) === '.')) { - if (this.tag(2) === '?') { - this.tag(1, 'SOAK_ACCESS'); - this.tokens.splice(-2, 1); - } else { - this.tag(1, 'PROPERTY_ACCESS'); - } - } + this.name_access_type(); tag = 'IDENTIFIER'; - if (KEYWORDS.indexOf(id) >= 0 && !((ACCESSORS.indexOf(this.tag()) >= 0) && !this.prev().spaced)) { + if (include(KEYWORDS, id) && !(include(ACCESSORS, this.tag(0)) && !this.prev().spaced)) { tag = id.toUpperCase(); } - if (RESERVED.indexOf(id) >= 0) { - throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + this.line); + if (include(RESERVED, id)) { + this.identifier_error(id); } - if (tag === 'WHEN' && BEFORE_WHEN.indexOf(this.tag()) >= 0) { + if (tag === 'WHEN' && include(BEFORE_WHEN, this.tag())) { tag = 'LEADING_WHEN'; } this.token(tag, id); @@ -166,13 +156,11 @@ }; // Matches heredocs, adjusting indentation to the correct level. Lexer.prototype.heredoc_token = function heredoc_token() { - var doc, indent, match; + var doc, match; if (!((match = this.chunk.match(HEREDOC)))) { return false; } - doc = match[2] || match[4]; - indent = (doc.match(HEREDOC_INDENT) || ['']).sort()[0]; - doc = doc.replace(new RegExp("^" + indent, 'gm'), '').replace(MULTILINER, "\\n").replace(/"/g, '\\"'); + doc = this.sanitize_heredoc(match[2] || match[4]); this.token('STRING', '"' + doc + '"'); this.line += this.count(match[1], "\n"); this.i += match[1].length; @@ -194,7 +182,7 @@ if (!((regex = this.match(REGEX, 1)))) { return false; } - if (NOT_REGEX.indexOf(this.tag()) >= 0) { + if (include(NOT_REGEX, this.tag())) { return false; } this.token('REGEX', regex); @@ -221,9 +209,9 @@ } this.line += indent.match(MULTILINER).length; this.i += indent.length; - next_character = this.chunk.match(MULTI_DENT)[4]; prev = this.prev(2); size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length; + next_character = this.chunk.match(MULTI_DENT)[4]; no_newlines = next_character === '.' || (this.value() && this.value().match(NO_NEWLINE) && prev && (prev[0] !== '.') && !this.value().match(CODE)); if (size === this.indent) { if (no_newlines) { @@ -243,7 +231,7 @@ this.indent = size; return true; }; - // Record an oudent token or tokens, if we're moving back inwards past + // Record an outdent token or tokens, if we're moving back inwards past // multiple recorded indents. Lexer.prototype.outdent_token = function outdent_token(move_out, no_newlines) { var last_indent; @@ -257,7 +245,8 @@ } return true; }; - // Matches and consumes non-meaningful whitespace. + // Matches and consumes non-meaningful whitespace. Tag the previous token + // as being "spaced", because there are some cases where it matters. Lexer.prototype.whitespace_token = function whitespace_token() { var prev, space; if (!((space = this.match(WHITESPACE, 1)))) { @@ -300,8 +289,8 @@ tag = value; if (value.match(ASSIGNMENT)) { tag = 'ASSIGN'; - if (JS_FORBIDDEN.indexOf(this.value()) >= 0) { - throw new Error('SyntaxError: Reserved word "' + this.value() + '" on line ' + this.line + ' can\'t be assigned'); + if (include(JS_FORBIDDEN, this.value)) { + this.assignment_error(); } } else if (value === ';') { tag = 'TERMINATOR'; @@ -312,7 +301,7 @@ } else if (value === ']' && this.soaked_index) { tag = 'SOAKED_INDEX_END'; this.soaked_index = false; - } else if (CALLABLE.indexOf(this.tag()) >= 0 && not_spaced) { + } else if (include(CALLABLE, this.tag()) && not_spaced) { if (value === '(') { tag = 'CALL_START'; } @@ -324,6 +313,37 @@ this.i += value.length; return true; }; + // Token Manipulators ================================================== + // As we consume a new IDENTIFIER, look at the previous token to determine + // if it's a special kind of access. + Lexer.prototype.name_access_type = function name_access_type() { + if (this.value() === '::') { + this.tag(1, 'PROTOTYPE_ACCESS'); + } + if (this.value() === '.' && !(this.value(2) === '.')) { + if (this.tag(2) === '?') { + this.tag(1, 'SOAK_ACCESS'); + return this.tokens.splice(-2, 1); + } else { + return this.tag(1, 'PROPERTY_ACCESS'); + } + } + }; + // Sanitize a heredoc by escaping double quotes and erasing all external + // indentation on the left-hand side. + Lexer.prototype.sanitize_heredoc = function sanitize_heredoc(doc) { + var indent; + indent = (doc.match(HEREDOC_INDENT) || ['']).sort()[0]; + return doc.replace(new RegExp("^" + indent, 'gm'), '').replace(MULTILINER, "\\n").replace(/"/g, '\\"'); + }; + // When you try to use a forbidden word in JavaScript as an identifier. + Lexer.prototype.identifier_error = function identifier_error(word) { + throw new Error('SyntaxError: Reserved word "' + word + '" on line ' + this.line); + }; + // When you try to assign to a reserved word in JavaScript, like "function". + Lexer.prototype.assignment_error = function assignment_error() { + throw new Error('SyntaxError: Reserved word "' + this.value() + '" on line ' + this.line + ' can\'t be assigned'); + }; // Helpers ============================================================= // Add a token to the results, taking note of the line number. Lexer.prototype.token = function token(tag, value) { @@ -408,4 +428,9 @@ }; return Lexer; }).call(this); + // Helper functions: + // Does a list include a value? + include = function include(list, value) { + return list.indexOf(value) >= 0; + }; })(); diff --git a/src/lexer.coffee b/src/lexer.coffee index a9714c7e..60f81974 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -89,12 +89,12 @@ exports.Lexer: class Lexer # Scan by attempting to match tokens one character at a time. Slow and steady. tokenize: (code) -> - @code : code # Cleanup code by remove extra line breaks, TODO: chomp - @i : 0 # Current character position we're parsing - @line : 1 # The current line. - @indent : 0 # The current indent level. - @indents : [] # The stack of all indent levels we are currently within. - @tokens : [] # Collection of all parsed tokens in the form [:TOKEN_TYPE, value] + @code : code # The remainder of the source code. + @i : 0 # Current character position we're parsing. + @line : 1 # The current line. + @indent : 0 # The current indent level. + @indents : [] # The stack of all indent levels we are currently within. + @tokens : [] # Collection of all parsed tokens in the form ['TOKEN_TYPE', value] while @i < @code.length @chunk: @code.slice(@i) @extract_next_token() @@ -120,18 +120,12 @@ exports.Lexer: class Lexer # Matches identifying literals: variables, keywords, method names, etc. identifier_token: -> return false unless id: @match IDENTIFIER, 1 - @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::' - if @value() is '.' and not (@value(2) is '.') - if @tag(2) is '?' - @tag(1, 'SOAK_ACCESS') - @tokens.splice(-2, 1) - else - @tag(1, 'PROPERTY_ACCESS') + @name_access_type() tag: 'IDENTIFIER' - tag: id.toUpperCase() if KEYWORDS.indexOf(id) >= 0 and - not ((ACCESSORS.indexOf(@tag()) >= 0) and not @prev().spaced) - throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + @line) if RESERVED.indexOf(id) >= 0 - tag: 'LEADING_WHEN' if tag is 'WHEN' and BEFORE_WHEN.indexOf(@tag()) >= 0 + tag: id.toUpperCase() if include(KEYWORDS, id) and + not (include(ACCESSORS, @tag(0)) and not @prev().spaced) + @identifier_error id if include RESERVED, id + tag: 'LEADING_WHEN' if tag is 'WHEN' and include BEFORE_WHEN, @tag() @token(tag, id) @i += id.length true @@ -155,11 +149,7 @@ exports.Lexer: class Lexer # Matches heredocs, adjusting indentation to the correct level. heredoc_token: -> return false unless match = @chunk.match(HEREDOC) - doc: match[2] or match[4] - indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0] - doc: doc.replace(new RegExp("^" + indent, 'gm'), '') - .replace(MULTILINER, "\\n") - .replace(/"/g, '\\"') + doc: @sanitize_heredoc match[2] or match[4] @token 'STRING', '"' + doc + '"' @line += @count match[1], "\n" @i += match[1].length @@ -175,7 +165,7 @@ exports.Lexer: class Lexer # Matches regular expression literals. regex_token: -> return false unless regex: @match REGEX, 1 - return false if NOT_REGEX.indexOf(@tag()) >= 0 + return false if include NOT_REGEX, @tag() @token 'REGEX', regex @i += regex.length true @@ -194,10 +184,11 @@ exports.Lexer: class Lexer return false unless indent: @match MULTI_DENT, 1 @line += indent.match(MULTILINER).length @i += indent.length - next_character: @chunk.match(MULTI_DENT)[4] prev: @prev(2) size: indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length - no_newlines: next_character is '.' or (@value() and @value().match(NO_NEWLINE) and prev and (prev[0] isnt '.') and not @value().match(CODE)) + next_character: @chunk.match(MULTI_DENT)[4] + no_newlines: next_character is '.' or (@value() and @value().match(NO_NEWLINE) and + prev and (prev[0] isnt '.') and not @value().match(CODE)) if size is @indent return @suppress_newlines(indent) if no_newlines return @newline_token(indent) @@ -211,7 +202,7 @@ exports.Lexer: class Lexer @indent: size true - # Record an oudent token or tokens, if we're moving back inwards past + # Record an outdent token or tokens, if we're moving back inwards past # multiple recorded indents. outdent_token: (move_out, no_newlines) -> while move_out > 0 and @indents.length @@ -221,7 +212,8 @@ exports.Lexer: class Lexer @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' or no_newlines true - # Matches and consumes non-meaningful whitespace. + # Matches and consumes non-meaningful whitespace. Tag the previous token + # as being "spaced", because there are some cases where it matters. whitespace_token: -> return false unless space: @match WHITESPACE, 1 prev: @prev() @@ -252,7 +244,7 @@ exports.Lexer: class Lexer tag: value if value.match(ASSIGNMENT) tag: 'ASSIGN' - throw new Error('SyntaxError: Reserved word "' + @value() + '" on line ' + @line + ' can\'t be assigned') if JS_FORBIDDEN.indexOf(@value()) >= 0 + @assignment_error() if include JS_FORBIDDEN, @value else if value is ';' tag: 'TERMINATOR' else if value is '[' and @tag() is '?' and not_spaced @@ -262,13 +254,42 @@ exports.Lexer: class Lexer else if value is ']' and @soaked_index tag: 'SOAKED_INDEX_END' @soaked_index: false - else if CALLABLE.indexOf(@tag()) >= 0 and not_spaced + else if include(CALLABLE, @tag()) and not_spaced tag: 'CALL_START' if value is '(' tag: 'INDEX_START' if value is '[' @token tag, value @i += value.length true + # Token Manipulators ================================================== + + # As we consume a new IDENTIFIER, look at the previous token to determine + # if it's a special kind of access. + name_access_type: -> + @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::' + if @value() is '.' and not (@value(2) is '.') + if @tag(2) is '?' + @tag(1, 'SOAK_ACCESS') + @tokens.splice(-2, 1) + else + @tag 1, 'PROPERTY_ACCESS' + + # Sanitize a heredoc by escaping double quotes and erasing all external + # indentation on the left-hand side. + sanitize_heredoc: (doc) -> + indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0] + doc.replace(new RegExp("^" + indent, 'gm'), '') + .replace(MULTILINER, "\\n") + .replace(/"/g, '\\"') + + # When you try to use a forbidden word in JavaScript as an identifier. + identifier_error: (word) -> + throw new Error 'SyntaxError: Reserved word "' + word + '" on line ' + @line + + # When you try to assign to a reserved word in JavaScript, like "function". + assignment_error: -> + throw new Error 'SyntaxError: Reserved word "' + @value() + '" on line ' + @line + ' can\'t be assigned' + # Helpers ============================================================= # Add a token to the results, taking note of the line number. @@ -327,3 +348,8 @@ exports.Lexer: class Lexer # axe it. close_indentation: -> @outdent_token(@indent) + +# Helper functions: + +# Does a list include a value? +include: (list, value) -> list.indexOf(value) >= 0 \ No newline at end of file