Support two different content-matching regexes for tree-sitter grammars

* `firstLineRegex` is only tested against the first line of the file. If it matches, the grammar is allowed to be used for the file.
* `contentRegex` is only tested if the grammar matched the file name or the grammar's `firstLineRegex` matched. It is used to break ties between grammars such as C and C++, or Flow and JavaScript.
2026-01-25 14:59:03 -05:00 · 2018-08-09 17:19:32 -07:00
parent 36bb4d51e9
commit 6f41353e9f
4 changed files with 79 additions and 21 deletions
--- a/spec/grammar-registry-spec.js
+++ b/spec/grammar-registry-spec.js
@@ -472,6 +472,19 @@ describe('GrammarRegistry', () => {
        expect(grammar.name).toBe('C++')
      })

+      it('does not apply content regexes from grammars without filetype or first line matches', () => {
+        atom.config.set('core.useTreeSitterParsers', true)
+        grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-cpp.cson'))
+
+        let grammar = grammarRegistry.selectGrammar('', dedent `
+          class Foo
+            # this is ruby, not C++
+          end
+        `)
+
+        expect(grammar.name).toBe('Null Grammar')
+      })
+
      it('recognizes shell scripts with shebang lines', () => {
        atom.config.set('core.useTreeSitterParsers', true)
        grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/shell-unix-bash.cson'))
@@ -485,6 +498,14 @@ describe('GrammarRegistry', () => {
        expect(grammar.name).toBe('Shell Script')
        expect(grammar instanceof TreeSitterGrammar).toBeTruthy()

+        grammar = grammarRegistry.selectGrammar('test.h', dedent `
+          # vim: set ft=bash
+
+          echo "hi"
+        `)
+        expect(grammar.name).toBe('Shell Script')
+        expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
+
        atom.config.set('core.useTreeSitterParsers', false)
        grammar = grammarRegistry.selectGrammar('test.h', dedent `
          #!/bin/bash
--- a/src/grammar-registry.js
+++ b/src/grammar-registry.js
@@ -208,25 +208,35 @@ class GrammarRegistry {
      contents = fs.readFileSync(filePath, 'utf8')
    }

+    // Initially identify matching grammars based on the filename and the first
+    // line of the file.
    let score = this.getGrammarPathScore(grammar, filePath)
-    if (score > 0 && !grammar.bundledPackage) {
-      score += 0.125
-    }
+    if (this.grammarMatchesPrefix(grammar, contents)) score += 0.5

-    if (grammar instanceof TreeSitterGrammar) {
-      if (!this.config.get('core.useTreeSitterParsers')) return -Infinity
+    // If multiple grammars match by one of the above criteria, break ties.
+    if (score > 0) {

-      if (grammar.contentRegExp) {
-        if (grammar.contentRegExp.test(contents)) {
-          score += 0.25
+      // Prefer either TextMate or Tree-sitter grammars based on the user's settings.
+      if (grammar instanceof TreeSitterGrammar) {
+        if (this.config.get('core.useTreeSitterParsers')) {
+          score += 0.1
        } else {
-          score -= 0.25
+          return -Infinity
        }
      }

-      if (score > 0) score += 0.5
-    } else if (this.grammarMatchesPrefix(grammar, contents)) {
-      score += 0.25
+      // Prefer grammars with matching content regexes. Prefer a grammar with no content regex
+      // over one with a non-matching content regex.
+      if (grammar.contentRegex) {
+        if (grammar.contentRegex.test(contents)) {
+          score += 0.05
+        } else {
+          score -= 0.05
+        }
+      }
+
+      // Prefer grammars that the user has manually installed over bundled grammars.
+      if (!grammar.bundledPackage) score += 0.01
    }

    return score
@@ -282,8 +292,13 @@ class GrammarRegistry {
        }
      }

-      const lines = contents.split('\n')
-      return grammar.firstLineRegex.testSync(lines.slice(0, numberOfNewlinesInRegex + 1).join('\n'))
+      const prefix = contents.split('\n').slice(0, numberOfNewlinesInRegex + 1).join('\n')
+      if (grammar.firstLineRegex.testSync) {
+        return grammar.firstLineRegex.testSync(prefix)
+      } else {
+
+        return grammar.firstLineRegex.test(prefix)
+      }
    } else {
      return false
    }
@@ -395,7 +410,7 @@ class GrammarRegistry {
  // * `injectionPoint` An {Object} with the following keys:
  //   * `type` The {String} type of syntax node that may embed other languages
  //   * `language` A {Function} that is called with syntax nodes of the specified `type` and
-  //     returns a {String} that will be tested against other grammars' `injectionRegExp` in
+  //     returns a {String} that will be tested against other grammars' `injectionRegex` in
  //     order to determine what language should be embedded.
  //   * `content` A {Function} that is called with syntax nodes of the specified `type` and
  //     returns another syntax node or array of syntax nodes that contain the embedded source code.
@@ -542,12 +557,22 @@ class GrammarRegistry {
  }

  treeSitterGrammarForLanguageString (languageString) {
+    let longestMatchLength = 0
+    let grammarWithLongestMatch = null
    for (const id in this.treeSitterGrammarsById) {
      const grammar = this.treeSitterGrammarsById[id]
-      if (grammar.injectionRegExp && grammar.injectionRegExp.test(languageString)) {
-        return grammar
+      if (grammar.injectionRegex) {
+        const match = languageString.match(grammar.injectionRegex)
+        if (match) {
+          const {length} = match[0]
+          if (length > longestMatchLength) {
+            grammarWithLongestMatch = grammar
+            longestMatchLength = length
+          }
+        }
      }
    }
+    return grammarWithLongestMatch
  }

  normalizeLanguageId (languageId) {
--- a/src/tree-sitter-grammar.js
+++ b/src/tree-sitter-grammar.js
@@ -9,8 +9,12 @@ class TreeSitterGrammar {
    this.id = params.id
    this.name = params.name
    this.legacyScopeName = params.legacyScopeName
-    if (params.contentRegExp) this.contentRegExp = new RegExp(params.contentRegExp)
-    if (params.injectionRegExp) this.injectionRegExp = new RegExp(params.injectionRegExp)
+
+    // TODO - Remove the `RegExp` spelling and only support `Regex`, once all of the existing
+    // Tree-sitter grammars are updated to spell it `Regex`.
+    this.contentRegex = buildRegex(params.contentRegex || params.contentRegExp)
+    this.injectionRegex = buildRegex(params.injectionRegex || params.injectionRegExp)
+    this.firstLineRegex = buildRegex(params.firstLineRegex)

    this.folds = params.folds || []
    this.folds.forEach(normalizeFoldSpecification)
@@ -36,7 +40,7 @@ class TreeSitterGrammar {
    }

    this.scopeMap = new SyntaxScopeMap(scopeSelectors)
-    this.fileTypes = params.fileTypes
+    this.fileTypes = params.fileTypes || []
    this.injectionPoints = params.injectionPoints || []

    // TODO - When we upgrade to a new enough version of node, use `require.resolve`
@@ -125,3 +129,11 @@ function normalizeFoldSpecification (spec) {
  if (spec.start) normalizeFoldSpecification(spec.start)
  if (spec.end) normalizeFoldSpecification(spec.end)
 }
+
+function buildRegex (value) {
+  // Accepts a regex source string, or an array of alternatives (each is wrapped
+  // in a group and joined with `|`). Returns a RegExp, or null for anything else.
+  if (Array.isArray(value)) value = value.map(_ => `(${_})`).join('|')
+  if (typeof value === 'string') return new RegExp(value)
+  return null
+}
--- a/src/tree-sitter-language-mode.js
+++ b/src/tree-sitter-language-mode.js
@@ -534,7 +534,7 @@ class LanguageLayer {
  }

  updateInjections (grammar) {
-    if (grammar.injectionRegExp) {
+    if (grammar.injectionRegex) {
      if (!this.currentParsePromise) this.currentParsePromise = Promise.resolve()
      this.currentParsePromise = this.currentParsePromise.then(async () => {
        await this._populateInjections(MAX_RANGE, null)