From 9ec427ba806c08cc7f1cc7bfb0787e143ddece6b Mon Sep 17 00:00:00 2001
From: Simon Lydell <simon.lydell@gmail.com>
Date: Tue, 6 Jan 2015 21:32:14 +0100
Subject: [PATCH] Fix #2516, #3560: Unicode space handling

It is possible to match only valid JavaScript identifiers with a really long
regex (like coco and CoffeeScriptRedux do), but CoffeeScript uses a much
simpler one, which allows a bit too much.

Quoting jashkenas/coffeescript#1718 #issuecomment-2152464 @jashkenas:

> But it still seems very much across the "worth it" line. You'll get the
> SyntaxError as soon as it hits JS, and performance aside -- even the increase
> in filesize for our browser coffee-script.js lib seems too much, considering
> this is something no one ever does, apart from experimentation.

In short, CoffeeScript treats any non-ASCII character as part of an identifier.
However, Unicode spaces should be excluded since having blank characters as part
of a _word_ is very confusing. This commit does so, while still keeping the
regex really simple.
---
 lib/coffee-script/lexer.js |  2 +-
 src/lexer.coffee           |  3 ++-
 test/compilation.coffee    | 26 ++++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/lib/coffee-script/lexer.js b/lib/coffee-script/lexer.js
index 149f1935..9e4eb109 100644
--- a/lib/coffee-script/lexer.js
+++ b/lib/coffee-script/lexer.js
@@ -815,7 +815,7 @@
 
   BOM = 65279;
 
-  IDENTIFIER = /^([$A-Za-z_\x7f-\uffff][$\w\x7f-\uffff]*)([^\n\S]*:(?!:))?/;
+  IDENTIFIER = /^(?!\d)((?:(?!\s)[$\w\x7f-\uffff])+)([^\n\S]*:(?!:))?/;
 
   NUMBER = /^0b[01]+|^0o[0-7]+|^0x[\da-f]+|^\d*\.?\d+(?:e[+-]?\d+)?/i;
 
diff --git a/src/lexer.coffee b/src/lexer.coffee
index 0ed1d71d..11cea8e9 100644
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -731,7 +731,8 @@ BOM = 65279
 
 # Token matching regexes.
 IDENTIFIER = /// ^
-  ( [$A-Za-z_\x7f-\uffff][$\w\x7f-\uffff]* )
+  (?!\d)
+  ( (?: (?!\s)[$\w\x7f-\uffff] )+ )
   ( [^\n\S]* : (?!:) )?  # Is this a property name?
 ///
 
diff --git a/test/compilation.coffee b/test/compilation.coffee
index b43e0b67..b1b5247a 100644
--- a/test/compilation.coffee
+++ b/test/compilation.coffee
@@ -52,6 +52,32 @@ test "Issue #986: Unicode identifiers", ->
   λ = 5
   eq λ, 5
 
+test "#2516: Unicode spaces should not be part of identifiers", ->
+  a = (x) -> x * 2
+  b = 3
+  eq 6, a b # U+00A0 NO-BREAK SPACE
+  eq 6, a b # U+1680 OGHAM SPACE MARK
+  eq 6, a b # U+2000 EN QUAD
+  eq 6, a b # U+2001 EM QUAD
+  eq 6, a b # U+2002 EN SPACE
+  eq 6, a b # U+2003 EM SPACE
+  eq 6, a b # U+2004 THREE-PER-EM SPACE
+  eq 6, a b # U+2005 FOUR-PER-EM SPACE
+  eq 6, a b # U+2006 SIX-PER-EM SPACE
+  eq 6, a b # U+2007 FIGURE SPACE
+  eq 6, a b # U+2008 PUNCTUATION SPACE
+  eq 6, a b # U+2009 THIN SPACE
+  eq 6, a b # U+200A HAIR SPACE
+  eq 6, a b # U+202F NARROW NO-BREAK SPACE
+  eq 6, a b # U+205F MEDIUM MATHEMATICAL SPACE
+  eq 6, a　b # U+3000 IDEOGRAPHIC SPACE
+
+  # #3560: Non-breaking space (U+00A0) (before `'c'`)
+  eq 5, {c: 5}[ 'c' ]
+
+  # A line where every space is non-breaking
+  eq 1 + 1, 2  
+
 test "don't accidentally stringify keywords", ->
   ok (-> this == 'this')() is false