diff --git a/documentation/coffee/super.coffee b/documentation/coffee/super.coffee index 0f726513..6a5fd3c5 100644 --- a/documentation/coffee/super.coffee +++ b/documentation/coffee/super.coffee @@ -1,27 +1,22 @@ -Animal: -> +class Animal + move: (meters) -> + alert @name + " moved " + meters + "m." -Animal::move: (meters) -> - alert @name + " moved " + meters + "m." +class Snake extends Animal + constructor: (name) -> + @name: name -Snake: (name) -> - @name: name - this + move: -> + alert "Slithering..." + super 5 -Snake extends Animal +class Horse extends Animal + constructor: (name) -> + @name: name -Snake::move: -> - alert "Slithering..." - super 5 - -Horse: (name) -> - @name: name - this - -Horse extends Animal - -Horse::move: -> - alert "Galloping..." - super 45 + move: -> + alert "Galloping..." + super 45 sam: new Snake "Sammy the Python" tom: new Horse "Tommy the Palomino" diff --git a/examples/code.coffee b/examples/code.coffee index 85b75169..e7c799e0 100644 --- a/examples/code.coffee +++ b/examples/code.coffee @@ -140,24 +140,28 @@ sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad." # Inheritance and calling super. -Animal: -> -Animal::move: (meters) -> - alert(this.name + " moved " + meters + "m.") +class Animal + move: (meters) -> + alert this.name + " moved " + meters + "m." -Snake: (name) -> this.name: name -Snake extends Animal -Snake::move: -> - alert('Slithering...') - super(5) +class Snake extends Animal + constructor: (name) -> + @name: name -Horse: (name) -> this.name: name -Horse extends Animal -Horse::move: -> - alert('Galloping...') - super(45) + move: -> + alert 'Slithering...' + super 5 -sam: new Snake("Sammy the Snake") -tom: new Horse("Tommy the Horse") +class Horse extends Animal + constructor: (name) -> + @name: name + + move: -> + alert 'Galloping...' + super 45 + +sam: new Snake "Sammy the Snake" +tom: new Horse "Tommy the Horse" sam.move() tom.move() diff --git a/examples/computer_science/linked_list.coffee b/examples/computer_science/linked_list.coffee index 1d38e175..d5aea43f 100644 --- a/examples/computer_science/linked_list.coffee +++ b/examples/computer_science/linked_list.coffee @@ -1,90 +1,92 @@ # "Classic" linked list implementation that doesn't keep track of its size. -LinkedList: -> - this._head: null # Pointer to the first item in the list. +class LinkedList + + constructor: -> + this._head: null # Pointer to the first item in the list. -# Appends some data to the end of the list. This method traverses the existing -# list and places the value at the end in a new node. -LinkedList::add: (data) -> + # Appends some data to the end of the list. This method traverses the existing + # list and places the value at the end in a new node. + add: (data) -> - # Create a new node object to wrap the data. - node: {data: data, next: null} + # Create a new node object to wrap the data. + node: {data: data, next: null} - current: this._head ||= node + current: this._head ||= node - if this._head isnt node - current: current.next while current.next - current.next: node + if this._head isnt node + current: current.next while current.next + current.next: node - this + this -# Retrieves the data at the given position in the list. -LinkedList::item: (index) -> + # Retrieves the data at the given position in the list. + item: (index) -> - # Check for out-of-bounds values. - return null if index < 0 + # Check for out-of-bounds values. + return null if index < 0 - current: this._head or null - i: -1 + current: this._head or null + i: -1 - # Advance through the list. - current: current.next while current and index > (i += 1) + # Advance through the list. + current: current.next while current and index > (i += 1) - # Return null if we've reached the end. - current and current.data + # Return null if we've reached the end. + current and current.data -# Remove the item from the given location in the list. -LinkedList::remove: (index) -> + # Remove the item from the given location in the list. + remove: (index) -> - # Check for out-of-bounds values. - return null if index < 0 + # Check for out-of-bounds values. + return null if index < 0 - current: this._head or null - i: -1 + current: this._head or null + i: -1 - # Special case: removing the first item. - if index is 0 - this._head: current.next - else + # Special case: removing the first item. + if index is 0 + this._head: current.next + else - # Find the right location. - [previous, current]: [current, current.next] while index > (i += 1) + # Find the right location. + [previous, current]: [current, current.next] while index > (i += 1) - # Skip over the item to remove. - previous.next: current.next + # Skip over the item to remove. + previous.next: current.next - # Return the value. - current and current.data + # Return the value. + current and current.data -# Calculate the number of items in the list. -LinkedList::size: -> - current: this._head - count: 0 + # Calculate the number of items in the list. + size: -> + current: this._head + count: 0 - while current - count += 1 - current: current.next + while current + count += 1 + current: current.next - count + count -# Convert the list into an array. -LinkedList::toArray: -> - result: [] - current: this._head + # Convert the list into an array. + toArray: -> + result: [] + current: this._head - while current - result.push(current.data) - current: current.next + while current + result.push(current.data) + current: current.next - result + result -# The string representation of the linked list. -LinkedList::toString: -> this.toArray().toString() + # The string representation of the linked list. + toString: -> this.toArray().toString() # Tests. diff --git a/examples/potion.coffee b/examples/potion.coffee index c8c5ddf9..cc20a131 100644 --- a/examples/potion.coffee +++ b/examples/potion.coffee @@ -53,16 +53,16 @@ for key, val of {dog: 'canine', cat: 'feline', fox: 'vulpine'} # Person print = (): # ('My name is ', /name, '.') join print. -Person: -> -Person::print: -> - print('My name is ' + this.name + '.') +class Person + print: -> + print 'My name is ' + this.name + '.' # p = Person () # p /name string print p: new Person() -print(p.name) +print p.name # Policeman = Person class (rank): /rank = rank. @@ -71,12 +71,13 @@ print(p.name) # # Policeman ('Constable') print -Policeman: (rank) -> this.rank: rank -Policeman extends Person -Policeman::print: -> - print('My name is ' + this.name + " and I'm a " + this.rank + '.') +class Policeman extends Person + constructor: (rank) -> + @rank: rank + print: -> + print 'My name is ' + this.name + " and I'm a " + this.rank + '.' -print(new Policeman('Constable')) +print new Policeman 'Constable' # app = [window (width=200, height=400) diff --git a/lib/lexer.js b/lib/lexer.js index 9fef9e1b..59bf3e92 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -1,15 +1,11 @@ (function(){ - var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING, STRING_NEWLINES, WHITESPACE, lex; + var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING, STRING_NEWLINES, WHITESPACE; if ((typeof process !== "undefined" && process !== null)) { Rewriter = require('./rewriter').Rewriter; } else { this.exports = this; Rewriter = this.Rewriter; } - // The lexer reads a stream of CoffeeScript and divvys it up into tagged - // tokens. A minor bit of the ambiguity in the grammar has been avoided by - // pushing some extra smarts into the Lexer. - exports.Lexer = (lex = function lex() { }); // Constants ============================================================ // Keywords that CoffeScript shares in common with JS. JS_KEYWORDS = ["if", "else", "true", "false", "new", "return", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "while", "delete", "instanceof", "typeof", "switch", "super", "extends", "class"]; @@ -56,351 +52,358 @@ ACCESSORS = ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']; // Tokens that, when immediately preceding a 'WHEN', indicate that its leading. BEFORE_WHEN = ['INDENT', 'OUTDENT', 'TERMINATOR']; - // Scan by attempting to match tokens one character at a time. Slow and steady. - lex.prototype.tokenize = function tokenize(code) { - this.code = code; - // Cleanup code by remove extra line breaks, TODO: chomp - this.i = 0; - // Current character position we're parsing - this.line = 1; - // The current line. - this.indent = 0; - // The current indent level. - this.indents = []; - // The stack of all indent levels we are currently within. - this.tokens = []; - // Collection of all parsed tokens in the form [:TOKEN_TYPE, value] - while (this.i < this.code.length) { - this.chunk = this.code.slice(this.i); - this.extract_next_token(); - } - this.close_indentation(); - return (new Rewriter()).rewrite(this.tokens); - }; - // At every position, run through this list of attempted matches, - // short-circuiting if any of them succeed. - lex.prototype.extract_next_token = function extract_next_token() { - if (this.identifier_token()) { - return null; - } - if (this.number_token()) { - return null; - } - if (this.heredoc_token()) { - return null; - } - if (this.string_token()) { - return null; - } - if (this.js_token()) { - return null; - } - if (this.regex_token()) { - return null; - } - if (this.indent_token()) { - return null; - } - if (this.comment_token()) { - return null; - } - if (this.whitespace_token()) { - return null; - } - return this.literal_token(); - }; - // Tokenizers ========================================================== - // Matches identifying literals: variables, keywords, method names, etc. - lex.prototype.identifier_token = function identifier_token() { - var id, tag; - if (!((id = this.match(IDENTIFIER, 1)))) { - return false; - } - if (this.value() === '::') { - this.tag(1, 'PROTOTYPE_ACCESS'); - } - if (this.value() === '.' && !(this.value(2) === '.')) { - if (this.tag(2) === '?') { - this.tag(1, 'SOAK_ACCESS'); - this.tokens.splice(-2, 1); - } else { - this.tag(1, 'PROPERTY_ACCESS'); + // The lexer reads a stream of CoffeeScript and divvys it up into tagged + // tokens. A minor bit of the ambiguity in the grammar has been avoided by + // pushing some extra smarts into the Lexer. + exports.Lexer = (function() { + Lexer = function Lexer() { }; + // Scan by attempting to match tokens one character at a time. Slow and steady. + Lexer.prototype.tokenize = function tokenize(code) { + this.code = code; + // Cleanup code by remove extra line breaks, TODO: chomp + this.i = 0; + // Current character position we're parsing + this.line = 1; + // The current line. + this.indent = 0; + // The current indent level. + this.indents = []; + // The stack of all indent levels we are currently within. + this.tokens = []; + // Collection of all parsed tokens in the form [:TOKEN_TYPE, value] + while (this.i < this.code.length) { + this.chunk = this.code.slice(this.i); + this.extract_next_token(); } - } - tag = 'IDENTIFIER'; - if (KEYWORDS.indexOf(id) >= 0 && !((ACCESSORS.indexOf(this.tag()) >= 0) && !this.prev().spaced)) { - tag = id.toUpperCase(); - } - if (RESERVED.indexOf(id) >= 0) { - throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + this.line); - } - if (tag === 'WHEN' && BEFORE_WHEN.indexOf(this.tag()) >= 0) { - tag = 'LEADING_WHEN'; - } - this.token(tag, id); - this.i += id.length; - return true; - }; - // Matches numbers, including decimals, hex, and exponential notation. - lex.prototype.number_token = function number_token() { - var number; - if (!((number = this.match(NUMBER, 1)))) { - return false; - } - this.token('NUMBER', number); - this.i += number.length; - return true; - }; - // Matches strings, including multi-line strings. - lex.prototype.string_token = function string_token() { - var escaped, string; - if (!((string = this.match(STRING, 1)))) { - return false; - } - escaped = string.replace(STRING_NEWLINES, " \\\n"); - this.token('STRING', escaped); - this.line += this.count(string, "\n"); - this.i += string.length; - return true; - }; - // Matches heredocs, adjusting indentation to the correct level. - lex.prototype.heredoc_token = function heredoc_token() { - var doc, indent, match; - if (!((match = this.chunk.match(HEREDOC)))) { - return false; - } - doc = match[2] || match[4]; - indent = (doc.match(HEREDOC_INDENT) || ['']).sort()[0]; - doc = doc.replace(new RegExp("^" + indent, 'gm'), '').replace(MULTILINER, "\\n").replace('"', '\\"'); - this.token('STRING', '"' + doc + '"'); - this.line += this.count(match[1], "\n"); - this.i += match[1].length; - return true; - }; - // Matches interpolated JavaScript. - lex.prototype.js_token = function js_token() { - var script; - if (!((script = this.match(JS, 1)))) { - return false; - } - this.token('JS', script.replace(JS_CLEANER, '')); - this.i += script.length; - return true; - }; - // Matches regular expression literals. - lex.prototype.regex_token = function regex_token() { - var regex; - if (!((regex = this.match(REGEX, 1)))) { - return false; - } - if (NOT_REGEX.indexOf(this.tag()) >= 0) { - return false; - } - this.token('REGEX', regex); - this.i += regex.length; - return true; - }; - // Matches and conumes comments. - lex.prototype.comment_token = function comment_token() { - var comment; - if (!((comment = this.match(COMMENT, 1)))) { - return false; - } - this.line += (comment.match(MULTILINER) || []).length; - this.token('COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER)); - this.token('TERMINATOR', "\n"); - this.i += comment.length; - return true; - }; - // Record tokens for indentation differing from the previous line. - lex.prototype.indent_token = function indent_token() { - var diff, indent, next_character, no_newlines, prev, size; - if (!((indent = this.match(MULTI_DENT, 1)))) { - return false; - } - this.line += indent.match(MULTILINER).length; - this.i += indent.length; - next_character = this.chunk.match(MULTI_DENT)[4]; - prev = this.prev(2); - no_newlines = next_character === '.' || (this.value() && this.value().match(NO_NEWLINE) && prev && (prev[0] !== '.') && !this.value().match(CODE)); - if (no_newlines) { - return this.suppress_newlines(indent); - } - size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length; - if (size === this.indent) { - return this.newline_token(indent); - } - if (size > this.indent) { - diff = size - this.indent; - this.token('INDENT', diff); - this.indents.push(diff); - } else { - this.outdent_token(this.indent - size); - } - this.indent = size; - return true; - }; - // Record an oudent token or tokens, if we're moving back inwards past - // multiple recorded indents. - lex.prototype.outdent_token = function outdent_token(move_out) { - var last_indent; - while (move_out > 0 && this.indents.length) { - last_indent = this.indents.pop(); - this.token('OUTDENT', last_indent); - move_out -= last_indent; - } - if (!(this.tag() === 'TERMINATOR')) { - this.token('TERMINATOR', "\n"); - } - return true; - }; - // Matches and consumes non-meaningful whitespace. - lex.prototype.whitespace_token = function whitespace_token() { - var prev, space; - if (!((space = this.match(WHITESPACE, 1)))) { - return false; - } - prev = this.prev(); - if (prev) { - prev.spaced = true; - } - this.i += space.length; - return true; - }; - // Multiple newlines get merged together. - // Use a trailing \ to escape newlines. - lex.prototype.newline_token = function newline_token(newlines) { - if (!(this.tag() === 'TERMINATOR')) { - this.token('TERMINATOR', "\n"); - } - return true; - }; - // Tokens to explicitly escape newlines are removed once their job is done. - lex.prototype.suppress_newlines = function suppress_newlines(newlines) { - if (this.value() === "\\") { - this.tokens.pop(); - } - return true; - }; - // We treat all other single characters as a token. Eg.: ( ) , . ! - // Multi-character operators are also literal tokens, so that Racc can assign - // the proper order of operations. - lex.prototype.literal_token = function literal_token() { - var match, not_spaced, tag, value; - match = this.chunk.match(OPERATOR); - value = match && match[1]; - if (value && value.match(CODE)) { - this.tag_parameters(); - } - value = value || this.chunk.substr(0, 1); - not_spaced = !this.prev() || !this.prev().spaced; - tag = value; - if (value.match(ASSIGNMENT)) { - tag = 'ASSIGN'; - if (JS_FORBIDDEN.indexOf(this.value()) >= 0) { - throw new Error('SyntaxError: Reserved word "' + this.value() + '" on line ' + this.line + ' can\'t be assigned'); - } - } else if (value === ';') { - tag = 'TERMINATOR'; - } else if (value === '[' && this.tag() === '?' && not_spaced) { - tag = 'SOAKED_INDEX_START'; - this.soaked_index = true; - this.tokens.pop(); - } else if (value === ']' && this.soaked_index) { - tag = 'SOAKED_INDEX_END'; - this.soaked_index = false; - } else if (CALLABLE.indexOf(this.tag()) >= 0 && not_spaced) { - if (value === '(') { - tag = 'CALL_START'; - } - if (value === '[') { - tag = 'INDEX_START'; - } - } - this.token(tag, value); - this.i += value.length; - return true; - }; - // Helpers ============================================================= - // Add a token to the results, taking note of the line number. - lex.prototype.token = function token(tag, value) { - return this.tokens.push([tag, value, this.line]); - }; - // Look at a tag in the current token stream. - lex.prototype.tag = function tag(index, tag) { - var tok; - if (!((tok = this.prev(index)))) { - return null; - } - if ((typeof tag !== "undefined" && tag !== null)) { - return (tok[0] = tag); - } - return tok[0]; - }; - // Look at a value in the current token stream. - lex.prototype.value = function value(index, val) { - var tok; - if (!((tok = this.prev(index)))) { - return null; - } - if ((typeof val !== "undefined" && val !== null)) { - return (tok[1] = val); - } - return tok[1]; - }; - // Look at a previous token. - lex.prototype.prev = function prev(index) { - return this.tokens[this.tokens.length - (index || 1)]; - }; - // Count the occurences of a character in a string. - lex.prototype.count = function count(string, letter) { - var num, pos; - num = 0; - pos = string.indexOf(letter); - while (pos !== -1) { - num += 1; - pos = string.indexOf(letter, pos + 1); - } - return num; - }; - // Attempt to match a string against the current chunk, returning the indexed - // match. - lex.prototype.match = function match(regex, index) { - var m; - if (!((m = this.chunk.match(regex)))) { - return false; - } - return m ? m[index] : false; - }; - // A source of ambiguity in our grammar was parameter lists in function - // definitions (as opposed to argument lists in function calls). Tag - // parameter identifiers in order to avoid this. Also, parameter lists can - // make use of splats. - lex.prototype.tag_parameters = function tag_parameters() { - var _a, i, tok; - if (this.tag() !== ')') { - return null; - } - i = 0; - while (true) { - i += 1; - tok = this.prev(i); - if (!tok) { + this.close_indentation(); + return (new Rewriter()).rewrite(this.tokens); + }; + // At every position, run through this list of attempted matches, + // short-circuiting if any of them succeed. + Lexer.prototype.extract_next_token = function extract_next_token() { + if (this.identifier_token()) { return null; } - if ((_a = tok[0]) === 'IDENTIFIER') { - tok[0] = 'PARAM'; - } else if (_a === ')') { - tok[0] = 'PARAM_END'; - } else if (_a === '(') { - return (tok[0] = 'PARAM_START'); + if (this.number_token()) { + return null; } - } - return true; - }; - // Close up all remaining open blocks. IF the first token is an indent, - // axe it. - lex.prototype.close_indentation = function close_indentation() { - return this.outdent_token(this.indent); - }; + if (this.heredoc_token()) { + return null; + } + if (this.string_token()) { + return null; + } + if (this.js_token()) { + return null; + } + if (this.regex_token()) { + return null; + } + if (this.indent_token()) { + return null; + } + if (this.comment_token()) { + return null; + } + if (this.whitespace_token()) { + return null; + } + return this.literal_token(); + }; + // Tokenizers ========================================================== + // Matches identifying literals: variables, keywords, method names, etc. + Lexer.prototype.identifier_token = function identifier_token() { + var id, tag; + if (!((id = this.match(IDENTIFIER, 1)))) { + return false; + } + if (this.value() === '::') { + this.tag(1, 'PROTOTYPE_ACCESS'); + } + if (this.value() === '.' && !(this.value(2) === '.')) { + if (this.tag(2) === '?') { + this.tag(1, 'SOAK_ACCESS'); + this.tokens.splice(-2, 1); + } else { + this.tag(1, 'PROPERTY_ACCESS'); + } + } + tag = 'IDENTIFIER'; + if (KEYWORDS.indexOf(id) >= 0 && !((ACCESSORS.indexOf(this.tag()) >= 0) && !this.prev().spaced)) { + tag = id.toUpperCase(); + } + if (RESERVED.indexOf(id) >= 0) { + throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + this.line); + } + if (tag === 'WHEN' && BEFORE_WHEN.indexOf(this.tag()) >= 0) { + tag = 'LEADING_WHEN'; + } + this.token(tag, id); + this.i += id.length; + return true; + }; + // Matches numbers, including decimals, hex, and exponential notation. + Lexer.prototype.number_token = function number_token() { + var number; + if (!((number = this.match(NUMBER, 1)))) { + return false; + } + this.token('NUMBER', number); + this.i += number.length; + return true; + }; + // Matches strings, including multi-line strings. + Lexer.prototype.string_token = function string_token() { + var escaped, string; + if (!((string = this.match(STRING, 1)))) { + return false; + } + escaped = string.replace(STRING_NEWLINES, " \\\n"); + this.token('STRING', escaped); + this.line += this.count(string, "\n"); + this.i += string.length; + return true; + }; + // Matches heredocs, adjusting indentation to the correct level. + Lexer.prototype.heredoc_token = function heredoc_token() { + var doc, indent, match; + if (!((match = this.chunk.match(HEREDOC)))) { + return false; + } + doc = match[2] || match[4]; + indent = (doc.match(HEREDOC_INDENT) || ['']).sort()[0]; + doc = doc.replace(new RegExp("^" + indent, 'gm'), '').replace(MULTILINER, "\\n").replace('"', '\\"'); + this.token('STRING', '"' + doc + '"'); + this.line += this.count(match[1], "\n"); + this.i += match[1].length; + return true; + }; + // Matches interpolated JavaScript. + Lexer.prototype.js_token = function js_token() { + var script; + if (!((script = this.match(JS, 1)))) { + return false; + } + this.token('JS', script.replace(JS_CLEANER, '')); + this.i += script.length; + return true; + }; + // Matches regular expression literals. + Lexer.prototype.regex_token = function regex_token() { + var regex; + if (!((regex = this.match(REGEX, 1)))) { + return false; + } + if (NOT_REGEX.indexOf(this.tag()) >= 0) { + return false; + } + this.token('REGEX', regex); + this.i += regex.length; + return true; + }; + // Matches and conumes comments. + Lexer.prototype.comment_token = function comment_token() { + var comment; + if (!((comment = this.match(COMMENT, 1)))) { + return false; + } + this.line += (comment.match(MULTILINER) || []).length; + this.token('COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER)); + this.token('TERMINATOR', "\n"); + this.i += comment.length; + return true; + }; + // Record tokens for indentation differing from the previous line. + Lexer.prototype.indent_token = function indent_token() { + var diff, indent, next_character, no_newlines, prev, size; + if (!((indent = this.match(MULTI_DENT, 1)))) { + return false; + } + this.line += indent.match(MULTILINER).length; + this.i += indent.length; + next_character = this.chunk.match(MULTI_DENT)[4]; + prev = this.prev(2); + no_newlines = next_character === '.' || (this.value() && this.value().match(NO_NEWLINE) && prev && (prev[0] !== '.') && !this.value().match(CODE)); + if (no_newlines) { + return this.suppress_newlines(indent); + } + size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length; + if (size === this.indent) { + return this.newline_token(indent); + } + if (size > this.indent) { + diff = size - this.indent; + this.token('INDENT', diff); + this.indents.push(diff); + } else { + this.outdent_token(this.indent - size); + } + this.indent = size; + return true; + }; + // Record an oudent token or tokens, if we're moving back inwards past + // multiple recorded indents. + Lexer.prototype.outdent_token = function outdent_token(move_out) { + var last_indent; + while (move_out > 0 && this.indents.length) { + last_indent = this.indents.pop(); + this.token('OUTDENT', last_indent); + move_out -= last_indent; + } + if (!(this.tag() === 'TERMINATOR')) { + this.token('TERMINATOR', "\n"); + } + return true; + }; + // Matches and consumes non-meaningful whitespace. + Lexer.prototype.whitespace_token = function whitespace_token() { + var prev, space; + if (!((space = this.match(WHITESPACE, 1)))) { + return false; + } + prev = this.prev(); + if (prev) { + prev.spaced = true; + } + this.i += space.length; + return true; + }; + // Multiple newlines get merged together. + // Use a trailing \ to escape newlines. + Lexer.prototype.newline_token = function newline_token(newlines) { + if (!(this.tag() === 'TERMINATOR')) { + this.token('TERMINATOR', "\n"); + } + return true; + }; + // Tokens to explicitly escape newlines are removed once their job is done. + Lexer.prototype.suppress_newlines = function suppress_newlines(newlines) { + if (this.value() === "\\") { + this.tokens.pop(); + } + return true; + }; + // We treat all other single characters as a token. Eg.: ( ) , . ! + // Multi-character operators are also literal tokens, so that Racc can assign + // the proper order of operations. + Lexer.prototype.literal_token = function literal_token() { + var match, not_spaced, tag, value; + match = this.chunk.match(OPERATOR); + value = match && match[1]; + if (value && value.match(CODE)) { + this.tag_parameters(); + } + value = value || this.chunk.substr(0, 1); + not_spaced = !this.prev() || !this.prev().spaced; + tag = value; + if (value.match(ASSIGNMENT)) { + tag = 'ASSIGN'; + if (JS_FORBIDDEN.indexOf(this.value()) >= 0) { + throw new Error('SyntaxError: Reserved word "' + this.value() + '" on line ' + this.line + ' can\'t be assigned'); + } + } else if (value === ';') { + tag = 'TERMINATOR'; + } else if (value === '[' && this.tag() === '?' && not_spaced) { + tag = 'SOAKED_INDEX_START'; + this.soaked_index = true; + this.tokens.pop(); + } else if (value === ']' && this.soaked_index) { + tag = 'SOAKED_INDEX_END'; + this.soaked_index = false; + } else if (CALLABLE.indexOf(this.tag()) >= 0 && not_spaced) { + if (value === '(') { + tag = 'CALL_START'; + } + if (value === '[') { + tag = 'INDEX_START'; + } + } + this.token(tag, value); + this.i += value.length; + return true; + }; + // Helpers ============================================================= + // Add a token to the results, taking note of the line number. + Lexer.prototype.token = function token(tag, value) { + return this.tokens.push([tag, value, this.line]); + }; + // Look at a tag in the current token stream. + Lexer.prototype.tag = function tag(index, tag) { + var tok; + if (!((tok = this.prev(index)))) { + return null; + } + if ((typeof tag !== "undefined" && tag !== null)) { + return (tok[0] = tag); + } + return tok[0]; + }; + // Look at a value in the current token stream. + Lexer.prototype.value = function value(index, val) { + var tok; + if (!((tok = this.prev(index)))) { + return null; + } + if ((typeof val !== "undefined" && val !== null)) { + return (tok[1] = val); + } + return tok[1]; + }; + // Look at a previous token. + Lexer.prototype.prev = function prev(index) { + return this.tokens[this.tokens.length - (index || 1)]; + }; + // Count the occurences of a character in a string. + Lexer.prototype.count = function count(string, letter) { + var num, pos; + num = 0; + pos = string.indexOf(letter); + while (pos !== -1) { + num += 1; + pos = string.indexOf(letter, pos + 1); + } + return num; + }; + // Attempt to match a string against the current chunk, returning the indexed + // match. + Lexer.prototype.match = function match(regex, index) { + var m; + if (!((m = this.chunk.match(regex)))) { + return false; + } + return m ? m[index] : false; + }; + // A source of ambiguity in our grammar was parameter lists in function + // definitions (as opposed to argument lists in function calls). Tag + // parameter identifiers in order to avoid this. Also, parameter lists can + // make use of splats. + Lexer.prototype.tag_parameters = function tag_parameters() { + var _a, i, tok; + if (this.tag() !== ')') { + return null; + } + i = 0; + while (true) { + i += 1; + tok = this.prev(i); + if (!tok) { + return null; + } + if ((_a = tok[0]) === 'IDENTIFIER') { + tok[0] = 'PARAM'; + } else if (_a === ')') { + tok[0] = 'PARAM_END'; + } else if (_a === '(') { + return (tok[0] = 'PARAM_START'); + } + } + return true; + }; + // Close up all remaining open blocks. IF the first token is an indent, + // axe it. + Lexer.prototype.close_indentation = function close_indentation() { + return this.outdent_token(this.indent); + }; + return Lexer; + }).call(this); })(); diff --git a/src/lexer.coffee b/src/lexer.coffee index 65816048..6390572b 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -4,11 +4,6 @@ else this.exports: this Rewriter: this.Rewriter -# The lexer reads a stream of CoffeeScript and divvys it up into tagged -# tokens. A minor bit of the ambiguity in the grammar has been avoided by -# pushing some extra smarts into the Lexer. -exports.Lexer: lex: -> - # Constants ============================================================ # Keywords that CoffeScript shares in common with JS. @@ -86,241 +81,246 @@ ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@'] # Tokens that, when immediately preceding a 'WHEN', indicate that its leading. BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR'] -# Scan by attempting to match tokens one character at a time. Slow and steady. -lex::tokenize: (code) -> - @code : code # Cleanup code by remove extra line breaks, TODO: chomp - @i : 0 # Current character position we're parsing - @line : 1 # The current line. - @indent : 0 # The current indent level. - @indents : [] # The stack of all indent levels we are currently within. - @tokens : [] # Collection of all parsed tokens in the form [:TOKEN_TYPE, value] - while @i < @code.length - @chunk: @code.slice(@i) - @extract_next_token() - @close_indentation() - (new Rewriter()).rewrite @tokens +# The lexer reads a stream of CoffeeScript and divvys it up into tagged +# tokens. A minor bit of the ambiguity in the grammar has been avoided by +# pushing some extra smarts into the Lexer. +exports.Lexer: class Lexer -# At every position, run through this list of attempted matches, -# short-circuiting if any of them succeed. -lex::extract_next_token: -> - return if @identifier_token() - return if @number_token() - return if @heredoc_token() - return if @string_token() - return if @js_token() - return if @regex_token() - return if @indent_token() - return if @comment_token() - return if @whitespace_token() - return @literal_token() - -# Tokenizers ========================================================== - -# Matches identifying literals: variables, keywords, method names, etc. -lex::identifier_token: -> - return false unless id: @match IDENTIFIER, 1 - @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::' - if @value() is '.' and not (@value(2) is '.') - if @tag(2) is '?' - @tag(1, 'SOAK_ACCESS') - @tokens.splice(-2, 1) + # Scan by attempting to match tokens one character at a time. Slow and steady. + tokenize: (code) -> + @code : code # Cleanup code by remove extra line breaks, TODO: chomp + @i : 0 # Current character position we're parsing + @line : 1 # The current line. + @indent : 0 # The current indent level. + @indents : [] # The stack of all indent levels we are currently within. + @tokens : [] # Collection of all parsed tokens in the form [:TOKEN_TYPE, value] + while @i < @code.length + @chunk: @code.slice(@i) + @extract_next_token() + @close_indentation() + (new Rewriter()).rewrite @tokens + + # At every position, run through this list of attempted matches, + # short-circuiting if any of them succeed. + extract_next_token: -> + return if @identifier_token() + return if @number_token() + return if @heredoc_token() + return if @string_token() + return if @js_token() + return if @regex_token() + return if @indent_token() + return if @comment_token() + return if @whitespace_token() + return @literal_token() + + # Tokenizers ========================================================== + + # Matches identifying literals: variables, keywords, method names, etc. + identifier_token: -> + return false unless id: @match IDENTIFIER, 1 + @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::' + if @value() is '.' and not (@value(2) is '.') + if @tag(2) is '?' + @tag(1, 'SOAK_ACCESS') + @tokens.splice(-2, 1) + else + @tag(1, 'PROPERTY_ACCESS') + tag: 'IDENTIFIER' + tag: id.toUpperCase() if KEYWORDS.indexOf(id) >= 0 and + not ((ACCESSORS.indexOf(@tag()) >= 0) and not @prev().spaced) + throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + @line) if RESERVED.indexOf(id) >= 0 + tag: 'LEADING_WHEN' if tag is 'WHEN' and BEFORE_WHEN.indexOf(@tag()) >= 0 + @token(tag, id) + @i += id.length + true + + # Matches numbers, including decimals, hex, and exponential notation. + number_token: -> + return false unless number: @match NUMBER, 1 + @token 'NUMBER', number + @i += number.length + true + + # Matches strings, including multi-line strings. + string_token: -> + return false unless string: @match STRING, 1 + escaped: string.replace STRING_NEWLINES, " \\\n" + @token 'STRING', escaped + @line += @count string, "\n" + @i += string.length + true + + # Matches heredocs, adjusting indentation to the correct level. + heredoc_token: -> + return false unless match = @chunk.match(HEREDOC) + doc: match[2] or match[4] + indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0] + doc: doc.replace(new RegExp("^" + indent, 'gm'), '') + .replace(MULTILINER, "\\n") + .replace('"', '\\"') + @token 'STRING', '"' + doc + '"' + @line += @count match[1], "\n" + @i += match[1].length + true + + # Matches interpolated JavaScript. + js_token: -> + return false unless script: @match JS, 1 + @token 'JS', script.replace(JS_CLEANER, '') + @i += script.length + true + + # Matches regular expression literals. + regex_token: -> + return false unless regex: @match REGEX, 1 + return false if NOT_REGEX.indexOf(@tag()) >= 0 + @token 'REGEX', regex + @i += regex.length + true + + # Matches and conumes comments. + comment_token: -> + return false unless comment: @match COMMENT, 1 + @line += (comment.match(MULTILINER) or []).length + @token 'COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER) + @token 'TERMINATOR', "\n" + @i += comment.length + true + + # Record tokens for indentation differing from the previous line. + indent_token: -> + return false unless indent: @match MULTI_DENT, 1 + @line += indent.match(MULTILINER).length + @i += indent.length + next_character: @chunk.match(MULTI_DENT)[4] + prev: @prev(2) + no_newlines: next_character is '.' or (@value() and @value().match(NO_NEWLINE) and prev and (prev[0] isnt '.') and not @value().match(CODE)) + return @suppress_newlines(indent) if no_newlines + size: indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length + return @newline_token(indent) if size is @indent + if size > @indent + diff: size - @indent + @token 'INDENT', diff + @indents.push diff else - @tag(1, 'PROPERTY_ACCESS') - tag: 'IDENTIFIER' - tag: id.toUpperCase() if KEYWORDS.indexOf(id) >= 0 and - not ((ACCESSORS.indexOf(@tag()) >= 0) and not @prev().spaced) - throw new Error('SyntaxError: Reserved word "' + id + '" on line ' + @line) if RESERVED.indexOf(id) >= 0 - tag: 'LEADING_WHEN' if tag is 'WHEN' and BEFORE_WHEN.indexOf(@tag()) >= 0 - @token(tag, id) - @i += id.length - true - -# Matches numbers, including decimals, hex, and exponential notation. -lex::number_token: -> - return false unless number: @match NUMBER, 1 - @token 'NUMBER', number - @i += number.length - true - -# Matches strings, including multi-line strings. -lex::string_token: -> - return false unless string: @match STRING, 1 - escaped: string.replace STRING_NEWLINES, " \\\n" - @token 'STRING', escaped - @line += @count string, "\n" - @i += string.length - true - -# Matches heredocs, adjusting indentation to the correct level. -lex::heredoc_token: -> - return false unless match = @chunk.match(HEREDOC) - doc: match[2] or match[4] - indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0] - doc: doc.replace(new RegExp("^" + indent, 'gm'), '') - .replace(MULTILINER, "\\n") - .replace('"', '\\"') - @token 'STRING', '"' + doc + '"' - @line += @count match[1], "\n" - @i += match[1].length - true - -# Matches interpolated JavaScript. -lex::js_token: -> - return false unless script: @match JS, 1 - @token 'JS', script.replace(JS_CLEANER, '') - @i += script.length - true - -# Matches regular expression literals. -lex::regex_token: -> - return false unless regex: @match REGEX, 1 - return false if NOT_REGEX.indexOf(@tag()) >= 0 - @token 'REGEX', regex - @i += regex.length - true - -# Matches and conumes comments. -lex::comment_token: -> - return false unless comment: @match COMMENT, 1 - @line += (comment.match(MULTILINER) or []).length - @token 'COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER) - @token 'TERMINATOR', "\n" - @i += comment.length - true - -# Record tokens for indentation differing from the previous line. -lex::indent_token: -> - return false unless indent: @match MULTI_DENT, 1 - @line += indent.match(MULTILINER).length - @i += indent.length - next_character: @chunk.match(MULTI_DENT)[4] - prev: @prev(2) - no_newlines: next_character is '.' or (@value() and @value().match(NO_NEWLINE) and prev and (prev[0] isnt '.') and not @value().match(CODE)) - return @suppress_newlines(indent) if no_newlines - size: indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length - return @newline_token(indent) if size is @indent - if size > @indent - diff: size - @indent - @token 'INDENT', diff - @indents.push diff - else - @outdent_token @indent - size - @indent: size - true - -# Record an oudent token or tokens, if we're moving back inwards past -# multiple recorded indents. -lex::outdent_token: (move_out) -> - while move_out > 0 and @indents.length - last_indent: @indents.pop() - @token 'OUTDENT', last_indent - move_out -= last_indent - @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' - true - -# Matches and consumes non-meaningful whitespace. -lex::whitespace_token: -> - return false unless space: @match WHITESPACE, 1 - prev: @prev() - prev.spaced: true if prev - @i += space.length - true - -# Multiple newlines get merged together. -# Use a trailing \ to escape newlines. -lex::newline_token: (newlines) -> - @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' - true - -# Tokens to explicitly escape newlines are removed once their job is done. -lex::suppress_newlines: (newlines) -> - @tokens.pop() if @value() is "\\" - true - -# We treat all other single characters as a token. Eg.: ( ) , . ! -# Multi-character operators are also literal tokens, so that Racc can assign -# the proper order of operations. -lex::literal_token: -> - match: @chunk.match(OPERATOR) - value: match and match[1] - @tag_parameters() if value and value.match(CODE) - value ||= @chunk.substr(0, 1) - not_spaced: not @prev() or not @prev().spaced - tag: value - if value.match(ASSIGNMENT) - tag: 'ASSIGN' - throw new Error('SyntaxError: Reserved word "' + @value() + '" on line ' + @line + ' can\'t be assigned') if JS_FORBIDDEN.indexOf(@value()) >= 0 - else if value is ';' - tag: 'TERMINATOR' - else if value is '[' and @tag() is '?' and not_spaced - tag: 'SOAKED_INDEX_START' - @soaked_index: true - @tokens.pop() - else if value is ']' and @soaked_index - tag: 'SOAKED_INDEX_END' - @soaked_index: false - else if CALLABLE.indexOf(@tag()) >= 0 and not_spaced - tag: 'CALL_START' if value is '(' - tag: 'INDEX_START' if value is '[' - @token tag, value - @i += value.length - true - -# Helpers ============================================================= - -# Add a token to the results, taking note of the line number. -lex::token: (tag, value) -> - @tokens.push([tag, value, @line]) - -# Look at a tag in the current token stream. -lex::tag: (index, tag) -> - return unless tok: @prev(index) - return tok[0]: tag if tag? - tok[0] - -# Look at a value in the current token stream. -lex::value: (index, val) -> - return unless tok: @prev(index) - return tok[1]: val if val? - tok[1] - -# Look at a previous token. -lex::prev: (index) -> - @tokens[@tokens.length - (index or 1)] - -# Count the occurences of a character in a string. -lex::count: (string, letter) -> - num: 0 - pos: string.indexOf(letter) - while pos isnt -1 - num += 1 - pos: string.indexOf(letter, pos + 1) - num - -# Attempt to match a string against the current chunk, returning the indexed -# match. -lex::match: (regex, index) -> - return false unless m: @chunk.match(regex) - if m then m[index] else false - -# A source of ambiguity in our grammar was parameter lists in function -# definitions (as opposed to argument lists in function calls). Tag -# parameter identifiers in order to avoid this. Also, parameter lists can -# make use of splats. -lex::tag_parameters: -> - return if @tag() isnt ')' - i: 0 - while true - i += 1 - tok: @prev(i) - return if not tok - switch tok[0] - when 'IDENTIFIER' then tok[0]: 'PARAM' - when ')' then tok[0]: 'PARAM_END' - when '(' then return tok[0]: 'PARAM_START' - true - -# Close up all remaining open blocks. IF the first token is an indent, -# axe it. -lex::close_indentation: -> - @outdent_token(@indent) + @outdent_token @indent - size + @indent: size + true + + # Record an oudent token or tokens, if we're moving back inwards past + # multiple recorded indents. + outdent_token: (move_out) -> + while move_out > 0 and @indents.length + last_indent: @indents.pop() + @token 'OUTDENT', last_indent + move_out -= last_indent + @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' + true + + # Matches and consumes non-meaningful whitespace. + whitespace_token: -> + return false unless space: @match WHITESPACE, 1 + prev: @prev() + prev.spaced: true if prev + @i += space.length + true + + # Multiple newlines get merged together. + # Use a trailing \ to escape newlines. + newline_token: (newlines) -> + @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' + true + + # Tokens to explicitly escape newlines are removed once their job is done. + suppress_newlines: (newlines) -> + @tokens.pop() if @value() is "\\" + true + + # We treat all other single characters as a token. Eg.: ( ) , . ! + # Multi-character operators are also literal tokens, so that Racc can assign + # the proper order of operations. + literal_token: -> + match: @chunk.match(OPERATOR) + value: match and match[1] + @tag_parameters() if value and value.match(CODE) + value ||= @chunk.substr(0, 1) + not_spaced: not @prev() or not @prev().spaced + tag: value + if value.match(ASSIGNMENT) + tag: 'ASSIGN' + throw new Error('SyntaxError: Reserved word "' + @value() + '" on line ' + @line + ' can\'t be assigned') if JS_FORBIDDEN.indexOf(@value()) >= 0 + else if value is ';' + tag: 'TERMINATOR' + else if value is '[' and @tag() is '?' and not_spaced + tag: 'SOAKED_INDEX_START' + @soaked_index: true + @tokens.pop() + else if value is ']' and @soaked_index + tag: 'SOAKED_INDEX_END' + @soaked_index: false + else if CALLABLE.indexOf(@tag()) >= 0 and not_spaced + tag: 'CALL_START' if value is '(' + tag: 'INDEX_START' if value is '[' + @token tag, value + @i += value.length + true + + # Helpers ============================================================= + + # Add a token to the results, taking note of the line number. + token: (tag, value) -> + @tokens.push([tag, value, @line]) + + # Look at a tag in the current token stream. + tag: (index, tag) -> + return unless tok: @prev(index) + return tok[0]: tag if tag? + tok[0] + + # Look at a value in the current token stream. + value: (index, val) -> + return unless tok: @prev(index) + return tok[1]: val if val? + tok[1] + + # Look at a previous token. + prev: (index) -> + @tokens[@tokens.length - (index or 1)] + + # Count the occurences of a character in a string. + count: (string, letter) -> + num: 0 + pos: string.indexOf(letter) + while pos isnt -1 + num += 1 + pos: string.indexOf(letter, pos + 1) + num + + # Attempt to match a string against the current chunk, returning the indexed + # match. + match: (regex, index) -> + return false unless m: @chunk.match(regex) + if m then m[index] else false + + # A source of ambiguity in our grammar was parameter lists in function + # definitions (as opposed to argument lists in function calls). Tag + # parameter identifiers in order to avoid this. Also, parameter lists can + # make use of splats. + tag_parameters: -> + return if @tag() isnt ')' + i: 0 + while true + i += 1 + tok: @prev(i) + return if not tok + switch tok[0] + when 'IDENTIFIER' then tok[0]: 'PARAM' + when ')' then tok[0]: 'PARAM_END' + when '(' then return tok[0]: 'PARAM_START' + true + + # Close up all remaining open blocks. IF the first token is an indent, + # axe it. + close_indentation: -> + @outdent_token(@indent)