# The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
# matches against the beginning of the source code. When a match is found,
# a token is produced, we consume the match, and start again. Tokens are in the
# form:
#
#     [tag, value, line_number]
#
# Which is a format that can be fed directly into [Jison](http://github.com/zaach/jison).
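#
# For example, lexing `answer: 42` produces tokens along the lines of:
#
#     ['IDENTIFIER', 'answer', 0], ['ASSIGN', ':', 0], ['NUMBER', '42', 0]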
# Set up the Lexer for both Node.js and the browser, depending on where we are.
if process?
  Rewriter: require('./rewriter').Rewriter
else
  this.exports: this
  Rewriter: this.Rewriter
# Constants
# ---------
# Keywords that CoffeeScript shares in common with JavaScript.
JS_KEYWORDS: [
"if", "else",
"true", "false",
"new", "return",
"try", "catch", "finally", "throw",
"break", "continue",
"for", "in", "while",
"delete", "instanceof", "typeof",
"switch", "super", "extends", "class"
]
# CoffeeScript-only keywords, which we're more relaxed about allowing. They can't
# be used standalone, but you can reference them as an attached property.
COFFEE_KEYWORDS: [
"then", "unless",
"yes", "no", "on", "off",
"and", "or", "is", "isnt", "not",
"of", "by", "where", "when"
]
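# For example, a bare `on` lexes as the keyword token `ON`, but after an
# accessor it stays a plain identifier (see `identifier_token` below):
#
#     lights: on       # `on` becomes the token ['ON', 'on', ...]
#     switches.on      # `on` remains an IDENTIFIER after a PROPERTY_ACCESS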
# The combined list of keywords is the superset that gets passed verbatim to
# the parser.
KEYWORDS: JS_KEYWORDS.concat COFFEE_KEYWORDS
# The list of keywords that are reserved by JavaScript, but not used, or are
# used by CoffeeScript internally. We throw an error when these are encountered,
# to avoid having a JavaScript error at runtime.
RESERVED: [
"case", "default", "do", "function", "var", "void", "with"
"const", "let", "debugger", "enum", "export", "import", "native",
"__extends", "__hasProp"
]
# The superset of both JavaScript keywords and reserved words, none of which may
# be used as identifiers or properties.
JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED
# Token matching regexes.
IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/
NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
INTERPOLATION : /(^|[\s\S]*?(?:[\\]|\\\\)?)\$([a-zA-Z_@]\w*|{[\s\S]*?(?:[^\\]|\\\\)})/
OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/
WHITESPACE : /^([ \t]+)/
COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/
CODE : /^((-|=)>)/
REGEX : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/
MULTI_DENT : /^((\n([ \t]*))+)(\.)?/
LAST_DENTS : /\n([ \t]*)/g
LAST_DENT : /\n([ \t]*)/
ASSIGNMENT : /^(:|=)$/
# Token cleaning regexes.
JS_CLEANER : /(^`|`$)/g
MULTILINER : /\n/g
STRING_NEWLINES : /\n[ \t]*/g
COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg
NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/
HEREDOC_INDENT : /^[ \t]+/mg
# Tokens which a regular expression will never immediately follow, but which
# a division operator might.
#
# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
#
# Our list is shorter, due to sans-parentheses method calls.
NOT_REGEX: [
  'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE'
]
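# For example, a slash after a NUMBER can only mean division, while a slash
# after an identifier may well open a regex argument to a paren-free call:
#
#     10 / 2 / 5       # both slashes are division
#     puts /\d+/       # the slash starts a REGEX literal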
# Tokens which could legitimately be invoked or indexed. An opening
# parenthesis or bracket following these tokens will be recorded as the start
# of a function invocation or indexing operation.
CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@']
# Tokens that indicate an access -- keywords immediately following will be
# treated as identifiers.
ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']
# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN`
# occurs at the start of a line. We disambiguate these from trailing whens to
# avoid an ambiguity in the grammar.
BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR']
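# For example:
#
#     switch day
#       when 'Sun' then rest()                   # follows an INDENT: LEADING_WHEN
#     evens: (x for x in list when x % 2 is 0)   # a trailing `when`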
# The Lexer Class
# ---------------
# The Lexer class reads a stream of CoffeeScript and divvies it up into tagged
# tokens. A minor bit of the ambiguity in the grammar has been avoided by
# pushing some extra smarts into the Lexer.
exports.Lexer: class Lexer
  # Scan by attempting to match tokens one at a time. Slow and steady.
  tokenize: (code, options) ->
    o        : options or {}
    @code    : code         # The remainder of the source code.
    @i       : 0            # Current character position we're parsing.
    @line    : o.line or 0  # The current line.
    @indent  : 0            # The current indent level.
    @indents : []           # The stack of all indent levels we are currently within.
    @tokens  : []           # Collection of all parsed tokens in the form ['TOKEN_TYPE', value, line]
    while @i < @code.length
      @chunk: @code.slice(@i)
      @extract_next_token()
    @close_indentation()
    return @tokens if o.rewrite is no
    (new Rewriter()).rewrite @tokens
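  # A sketch of typical use; the exact output depends on the Rewriter, but it
  # runs roughly like:
  #
  #     (new Lexer()).tokenize "answer: 42"
  #     # => [['IDENTIFIER', 'answer', 0], ['ASSIGN', ':', 0], ['NUMBER', '42', 0], ...]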
  # At every position, run through this list of attempted matches,
  # short-circuiting if any of them succeed.
  extract_next_token: ->
    return if @identifier_token()
    return if @number_token()
    return if @heredoc_token()
    return if @string_token()
    return if @js_token()
    return if @regex_token()
    return if @comment_token()
    return if @line_token()
    return if @whitespace_token()
    return @literal_token()
  # Tokenizers
  # ----------
  # Matches identifying literals: variables, keywords, method names, etc.
  identifier_token: ->
    return false unless id: @match IDENTIFIER, 1
    @name_access_type()
    tag: 'IDENTIFIER'
    tag: id.toUpperCase() if include(KEYWORDS, id) and
      not (include(ACCESSORS, @tag(0)) and not @prev().spaced)
    @identifier_error id if include RESERVED, id
    tag: 'LEADING_WHEN' if tag is 'WHEN' and include BEFORE_WHEN, @tag()
    @token(tag, id)
    @i += id.length
    true
  # Matches numbers, including decimals, hex, and exponential notation.
  number_token: ->
    return false unless number: @match NUMBER, 1
    @token 'NUMBER', number
    @i += number.length
    true
  # Matches strings, including multi-line strings.
  string_token: ->
    string: @balanced_token ['"', '"'], ['${', '}']
    string: @balanced_token ["'", "'"] if string is false
    return false unless string
    @interpolate_string string.replace STRING_NEWLINES, " \\\n"
    @line += count string, "\n"
    @i += string.length
    true
  # Matches heredocs, adjusting indentation to the correct level.
  heredoc_token: ->
    return false unless match = @chunk.match(HEREDOC)
    doc: @sanitize_heredoc match[2] or match[4]
    @token 'STRING', "\"$doc\""
    @line += count match[1], "\n"
    @i += match[1].length
    true
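  # For instance, a hypothetical heredoc like:
  #
  #     html: '''
  #           <p>
  #             Hello.
  #           </p>
  #           '''
  #
  # keeps its internal indentation, while the shared left margin is erased
  # by `sanitize_heredoc` below.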
  # Matches interpolated JavaScript.
  js_token: ->
    return false unless script: @balanced_token ['`', '`']
    @token 'JS', script.replace(JS_CLEANER, '')
    @i += script.length
    true
  # Matches regular expression literals.
  regex_token: ->
    return false unless regex: @match REGEX, 1
    return false if include NOT_REGEX, @tag()
    @token 'REGEX', regex
    @i += regex.length
    true
  # Matches a balanced group such as a single or double-quoted string. Pass in
  # a series of delimiters, all of which must be balanced correctly within the
  # token's contents.
  balanced_token: (delimited...) ->
    levels: []
    i: 0
    while i < @chunk.length
      for pair in delimited
        [open, close]: pair
        if levels.length and starts @chunk, '\\', i
          i += 1
          break
        else if levels.length and starts(@chunk, close, i) and levels[levels.length - 1] is pair
          levels.pop()
          i += close.length - 1
          i += 1 unless levels.length
          break
        else if starts @chunk, open, i
          levels.push(pair)
          i += open.length - 1
          break
      break unless levels.length
      i += 1
    throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" if levels.length
    return false if i is 0
    return @chunk.substring(0, i)
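  # The stack of levels is what lets the delimiters nest; a hypothetical
  #
  #     "outer ${ "inner ${ deepest }" } done"
  #
  # pushes and pops a pair for each `"` and each `${`...`}`, so the token is
  # only cut off at the final closing quote.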
  # Matches and consumes comments.
  comment_token: ->
    return false unless comment: @match COMMENT, 1
    @line += (comment.match(MULTILINER) or []).length
    lines: comment.replace(COMMENT_CLEANER, '').split(MULTILINER)
    @token 'COMMENT', compact lines
    @token 'TERMINATOR', "\n"
    @i += comment.length
    true
  # Matches newlines, indents, and outdents, and determines which is which.
  line_token: ->
    return false unless indent: @match MULTI_DENT, 1
    @line += indent.match(MULTILINER).length
    @i += indent.length
    prev: @prev(2)
    size: indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length
    next_character: @chunk.match(MULTI_DENT)[4]
    no_newlines: next_character is '.' or (@value() and @value().match(NO_NEWLINE) and
      prev and (prev[0] isnt '.') and not @value().match(CODE))
    if size is @indent
      return @suppress_newlines(indent) if no_newlines
      return @newline_token(indent)
    else if size > @indent
      return @suppress_newlines(indent) if no_newlines
      diff: size - @indent
      @token 'INDENT', diff
      @indents.push diff
    else
      @outdent_token @indent - size, no_newlines
    @indent: size
    true
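  # As a sketch, lexing:
  #
  #     if sunny
  #       play()
  #     work()
  #
  # emits INDENT 2 before `play` and OUTDENT 2 (plus a TERMINATOR) before `work`.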
  # Record an outdent token or tokens, if we happen to be moving back inwards
  # past multiple recorded indents.
  outdent_token: (move_out, no_newlines) ->
    while move_out > 0 and @indents.length
      last_indent: @indents.pop()
      @token 'OUTDENT', last_indent
      move_out -= last_indent
    @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' or no_newlines
    true
  # Matches and consumes non-meaningful whitespace. Tag the previous token
  # as being "spaced", because there are some cases where it makes a difference.
  whitespace_token: ->
    return false unless space: @match WHITESPACE, 1
    prev: @prev()
    prev.spaced: true if prev
    @i += space.length
    true
  # Generate a newline token. Multiple newlines get merged together.
  newline_token: (newlines) ->
    @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR'
    true
  # Use a `\` at a line-ending to suppress the newline.
  # The slash is removed here once its job is done.
  suppress_newlines: (newlines) ->
    @tokens.pop() if @value() is "\\"
    true
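  # Both of these continue onto the second line without a TERMINATOR (sketch):
  #
  #     total: one +
  #            two
  #     total: one \
  #            + two
  #
  # In the second form the `\` token itself is popped back off.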
  # We treat all other single characters as a token. Eg.: `( ) , . !`
  # Multi-character operators are also literal tokens, so that Jison can assign
  # the proper order of operations.
  literal_token: ->
    match: @chunk.match(OPERATOR)
    value: match and match[1]
    @tag_parameters() if value and value.match(CODE)
    value ||= @chunk.substr(0, 1)
    not_spaced: not @prev() or not @prev().spaced
    tag: value
    if value.match(ASSIGNMENT)
      tag: 'ASSIGN'
      @assignment_error() if include JS_FORBIDDEN, @value()
    else if value is ';'
      tag: 'TERMINATOR'
    else if value is '[' and @tag() is '?' and not_spaced
      tag: 'SOAKED_INDEX_START'
      @soaked_index: true
      @tokens.pop()
    else if value is ']' and @soaked_index
      tag: 'SOAKED_INDEX_END'
      @soaked_index: false
    else if include(CALLABLE, @tag()) and not_spaced
      tag: 'CALL_START'  if value is '('
      tag: 'INDEX_START' if value is '['
    @token tag, value
    @i += value.length
    true
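  # Spacing decides how brackets are tagged. A few sketches:
  #
  #     fn(x)        # '(' follows a CALLABLE with no space: CALL_START
  #     fn (x)       # spaced, so '(' is left as plain grouping
  #     list?[0]     # '?' then '[' collapse into SOAKED_INDEX_START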
  # Token Manipulators
  # ------------------
  # As we consume a new `IDENTIFIER`, look at the previous token to determine
  # if it's a special kind of accessor.
  name_access_type: ->
    @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::'
    if @value() is '.' and not (@value(2) is '.')
      if @tag(2) is '?'
        @tag(1, 'SOAK_ACCESS')
        @tokens.splice(-2, 1)
      else
        @tag 1, 'PROPERTY_ACCESS'
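  # The three accessor flavors, as they retag the token before the new
  # identifier:
  #
  #     user.name      # '.'  is retagged PROPERTY_ACCESS
  #     User::name     # '::' is retagged PROTOTYPE_ACCESS
  #     user?.name     # '.'  becomes SOAK_ACCESS; the '?' is spliced out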
  # Sanitize a heredoc by escaping double quotes and erasing all external
  # indentation on the left-hand side.
  sanitize_heredoc: (doc) ->
    indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0]
    doc.replace(new RegExp("^" + indent, 'gm'), '')
       .replace(MULTILINER, "\\n")
       .replace(/"/g, '\\"')
  # A source of ambiguity in our grammar was parameter lists in function
  # definitions (as opposed to argument lists in function calls). Tag
  # parameter identifiers in order to avoid this. Also, parameter lists can
  # make use of splats.
  tag_parameters: ->
    return if @tag() isnt ')'
    i: 0
    while true
      i += 1
      tok: @prev(i)
      return if not tok
      switch tok[0]
        when 'IDENTIFIER' then tok[0]: 'PARAM'
        when ')'          then tok[0]: 'PARAM_END'
        when '('          then return tok[0]: 'PARAM_START'
    true
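  # Walking backwards from the ')' of a hypothetical `(a, b) ->`, the tokens
  # end up retagged as PARAM_START, PARAM, ',', PARAM, PARAM_END.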
  # Close up all remaining open blocks at the end of the file.
  close_indentation: ->
    @outdent_token(@indent)
  # Error for when you try to use a forbidden word in JavaScript as
  # an identifier.
  identifier_error: (word) ->
    throw new Error "SyntaxError: Reserved word \"$word\" on line ${@line + 1}"
  # Error for when you try to assign to a reserved word in JavaScript,
  # like "function" or "default".
  assignment_error: ->
    throw new Error "SyntaxError: Reserved word \"${@value()}\" on line ${@line + 1} can't be assigned"
  # Expand variables and expressions inside double-quoted strings using
  # [ECMA Harmony's interpolation syntax](http://wiki.ecmascript.org/doku.php?id=strawman:string_interpolation).
  #
  #     "Hello $name."
  #     "Hello ${name.capitalize()}."
  #
  interpolate_string: (str) ->
    if str.length < 3 or not starts str, '"'
      @token 'STRING', str
    else
      lexer:  new Lexer()
      tokens: []
      quote:  str.substring(0, 1)
      str:    str.substring(1, str.length - 1)
      while str.length
        match: str.match INTERPOLATION
        if match
          [group, before, interp]: match
          if starts before, '\\', before.length - 1
            prev: before.substring(0, before.length - 1)
            tokens.push ['STRING', "$quote$prev$$interp$quote"] if before.length
          else
            tokens.push ['STRING', "$quote$before$quote"] if before.length
          if starts interp, '{'
            inner: interp.substring(1, interp.length - 1)
            nested: lexer.tokenize "($inner)", {rewrite: no, line: @line}
            nested.pop()
            tokens.push ['TOKENS', nested]
          else
            interp: "this.${ interp.substring(1) }" if starts interp, '@'
            tokens.push ['IDENTIFIER', interp]
          str: str.substring(group.length)
        else
          tokens.push ['STRING', "$quote$str$quote"]
          str: ''
      if tokens.length > 1
        for i in [tokens.length - 1..1]
          [prev, tok]: [tokens[i - 1], tokens[i]]
          if tok[0] is 'STRING' and prev[0] is 'STRING'
            [prev, tok]: [prev[1].substring(1, prev[1].length - 1), tok[1].substring(1, tok[1].length - 1)]
            tokens.splice i - 1, 2, ['STRING', "$quote$prev$tok$quote"]
      for each, i in tokens
        if each[0] is 'TOKENS'
          @token nested[0], nested[1] for nested in each[1]
        else
          @token each[0], each[1]
        @token '+', '+' if i < tokens.length - 1
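  # So a hypothetical "Hello $name." is emitted, roughly, as the concatenation:
  #
  #     ['STRING', '"Hello "'], ['+', '+'], ['IDENTIFIER', 'name'], ['+', '+'], ['STRING', '"."']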
  # Helpers
  # -------
  # Add a token to the results, taking note of the line number.
  token: (tag, value) ->
    @tokens.push([tag, value, @line])
  # Peek at a tag in the current token stream.
  tag: (index, tag) ->
    return unless tok: @prev(index)
    return tok[0]: tag if tag?
    tok[0]
  # Peek at a value in the current token stream.
  value: (index, val) ->
    return unless tok: @prev(index)
    return tok[1]: val if val?
    tok[1]
  # Peek at a previous token, entire.
  prev: (index) ->
    @tokens[@tokens.length - (index or 1)]
  # Attempt to match a string against the current chunk, returning the indexed
  # match if successful, and `false` otherwise.
  match: (regex, index) ->
    return false unless m: @chunk.match(regex)
    if m then m[index] else false
# Utility Functions
# -----------------
# Does a list include a value?
include: (list, value) ->
  list.indexOf(value) >= 0
# Peek at the beginning of a given string to see if it matches a sequence.
starts: (string, literal, start) ->
  string.substring(start, (start or 0) + literal.length) is literal
# Trim out all falsy values from an array.
compact: (array) -> item for item in array when item
# Count the number of occurrences of a character in a string.
count: (string, letter) ->
  num: 0
  pos: string.indexOf(letter)
  while pos isnt -1
    num += 1
    pos: string.indexOf(letter, pos + 1)
  num