Make Tree-sitter grammars' contentRegExp work as intended

2026-04-28 03:01:47 -04:00 · 2018-06-04 16:35:32 -07:00
parent fceb6b0063
commit 4dbfadd0d5
3 changed files with 97 additions and 44 deletions
--- a/spec/grammar-registry-spec.js
+++ b/spec/grammar-registry-spec.js
@@ -283,32 +283,6 @@ describe('GrammarRegistry', () => {
      expect(atom.grammars.selectGrammar('/hu.git/config').name).toBe('Null Grammar')
    })

-    describe('when the grammar has a contentRegExp field', () => {
-      it('favors grammars whose contentRegExp matches a prefix of the file\'s content', () => {
-        atom.grammars.addGrammar({
-          id: 'javascript-1',
-          fileTypes: ['js']
-        })
-        atom.grammars.addGrammar({
-          id: 'flow-javascript',
-          contentRegExp: new RegExp('//.*@flow'),
-          fileTypes: ['js']
-        })
-        atom.grammars.addGrammar({
-          id: 'javascript-2',
-          fileTypes: ['js']
-        })
-
-        const selectedGrammar = atom.grammars.selectGrammar('test.js', dedent`
-          // Copyright EvilCorp
-          // @flow
-
-          module.exports = function () { return 1 + 1 }
-        `)
-        expect(selectedGrammar.id).toBe('flow-javascript')
-      })
-    })
-
    it("uses the filePath's shebang line if the grammar cannot be determined by the extension or basename", async () => {
      await atom.packages.activatePackage('language-javascript')
      await atom.packages.activatePackage('language-ruby')
@@ -442,6 +416,84 @@ describe('GrammarRegistry', () => {
        expect(grammar instanceof TreeSitterGrammar).toBe(true)
      })
    })
+
+    describe('tree-sitter grammars with content regexes', () => {
+      it('recognizes C++ header files', () => {
+        atom.config.set('core.useTreeSitterParsers', true)
+        grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-c.cson'))
+        grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-cpp.cson'))
+        grammarRegistry.loadGrammarSync(require.resolve('language-coffee-script/grammars/coffeescript.cson'))
+
+        let grammar = grammarRegistry.selectGrammar('test.h', dedent `
+          #include <string.h>
+
+          typedef struct {
+            void verb();
+          } Noun;
+        `)
+        expect(grammar.name).toBe('C')
+
+        grammar = grammarRegistry.selectGrammar('test.h', dedent `
+          #include <string>
+
+          class Noun {
+           public:
+            void verb();
+          };
+        `)
+        expect(grammar.name).toBe('C++')
+
+        // The word `class` only indicates C++ in `.h` files, not in all files.
+        grammar = grammarRegistry.selectGrammar('test.coffee', dedent `
+          module.exports =
+          class Noun
+            verb: -> true
+        `)
+        expect(grammar.name).toBe('CoffeeScript')
+      })
+
+      it('recognizes shell scripts with shebang lines', () => {
+        atom.config.set('core.useTreeSitterParsers', true)
+        grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/shell-unix-bash.cson'))
+        grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/tree-sitter-bash.cson'))
+
+        let grammar = grammarRegistry.selectGrammar('test.h', dedent `
+          #!/bin/bash
+
+          echo "hi"
+        `)
+        expect(grammar.name).toBe('Shell Script')
+        expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
+
+        atom.config.set('core.useTreeSitterParsers', false)
+        grammar = grammarRegistry.selectGrammar('test.h', dedent `
+          #!/bin/bash
+
+          echo "hi"
+        `)
+        expect(grammar.name).toBe('Shell Script')
+        expect(grammar instanceof TreeSitterGrammar).toBeFalsy()
+      })
+
+      it('recognizes JavaScript files that use Flow', () => {
+        atom.config.set('core.useTreeSitterParsers', true)
+        grammarRegistry.loadGrammarSync(require.resolve('language-javascript/grammars/tree-sitter-javascript.cson'))
+        grammarRegistry.loadGrammarSync(require.resolve('language-typescript/grammars/tree-sitter-flow.cson'))
+
+        let grammar = grammarRegistry.selectGrammar('test.js', dedent`
+          // Copyright something
+          // @flow
+
+          module.exports = function () { return 1 + 1 }
+        `)
+        expect(grammar.name).toBe('Flow JavaScript')
+
+        grammar = grammarRegistry.selectGrammar('test.js', dedent`
+          module.exports = function () { return 1 + 1 }
+        `)
+        expect(grammar.name).toBe('JavaScript')
+      })
+    })
  })

  describe('.removeGrammar(grammar)', () => {
--- a/spec/package-manager-spec.js
+++ b/spec/package-manager-spec.js
@@ -1032,6 +1032,7 @@ describe('PackageManager', () => {
      })

      it('loads any tree-sitter grammars defined in the package', async () => {
+        atom.config.set('core.useTreeSitterParsers', true)
        await atom.packages.activatePackage('package-with-tree-sitter-grammar')
        const grammar = atom.grammars.selectGrammar('test.somelang')
        expect(grammar.name).toBe('Some Language')
--- a/src/grammar-registry.js
+++ b/src/grammar-registry.js
@@ -10,7 +10,6 @@ const Token = require('./token')
 const fs = require('fs-plus')
 const {Point, Range} = require('text-buffer')

-const GRAMMAR_TYPE_BONUS = 1000
 const PATH_SPLIT_REGEX = new RegExp('[/.]')

 // Extended: This class holds the grammars used for tokenizing.
@@ -213,12 +212,23 @@ class GrammarRegistry {
    if (score > 0 && !grammar.bundledPackage) {
      score += 0.125
    }
-    if (this.grammarMatchesContents(grammar, contents)) {
-      score += 0.25
-    }

-    if (score > 0 && this.isGrammarPreferredType(grammar)) {
-      score += GRAMMAR_TYPE_BONUS
+    if (grammar instanceof TreeSitterGrammar) {
+      if (this.config.get('core.useTreeSitterParsers')) {
+        score += 0.05
+      } else {
+        score = -Infinity
+      }
+
+      if (grammar.contentRegExp) {
+        if (grammar.contentRegExp.test(contents)) {
+          score += 0.25
+        } else {
+          score -= 0.25
+        }
+      }
+    } else if (this.grammarMatchesPrefix(grammar, contents)) {
+      score += 0.25
    }

    return score
@@ -256,12 +266,8 @@ class GrammarRegistry {
    return pathScore
  }

-  grammarMatchesContents (grammar, contents) {
-    if (contents == null) return false
-
-    if (grammar.contentRegExp) { // TreeSitter grammars
-      return grammar.contentRegExp.test(contents)
-    } else if (grammar.firstLineRegex) { // FirstMate grammars
+  grammarMatchesPrefix (grammar, contents) {
+    if (contents && grammar.firstLineRegex) {
      let escaped = false
      let numberOfNewlinesInRegex = 0
      for (let character of grammar.firstLineRegex.source) {
@@ -511,12 +517,6 @@ class GrammarRegistry {
    return this.textmateRegistry.scopeForId(id)
  }

-  isGrammarPreferredType (grammar) {
-    return this.config.get('core.useTreeSitterParsers')
-      ? grammar instanceof TreeSitterGrammar
-      : grammar instanceof FirstMate.Grammar
-  }
-
  normalizeLanguageId (languageId) {
    if (this.config.get('core.useTreeSitterParsers')) {
      return this.treeSitterLanguageIdsByTextMateScopeName.get(languageId) || languageId