Support two different content-matching regexes for tree-sitter grammars

* firstLineRegex is only tested against the first line of the file, and
  if it matches, it allows the grammar to be used for the file.
* contentRegex is only tested if the grammar matched the file name
  or the grammar's firstLineRegex matched. It is used to break ties
  between grammars such as C and C++, or Flow and JS.
This commit is contained in:
Max Brunsfeld
2018-08-09 17:19:32 -07:00
parent 36bb4d51e9
commit 6f41353e9f
4 changed files with 79 additions and 21 deletions

View File

@@ -472,6 +472,19 @@ describe('GrammarRegistry', () => {
expect(grammar.name).toBe('C++')
})
it('does not apply content regexes from grammars without filetype or first line matches', () => {
atom.config.set('core.useTreeSitterParsers', true)
grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-cpp.cson'))
let grammar = grammarRegistry.selectGrammar('', dedent `
class Foo
# this is ruby, not C++
end
`)
expect(grammar.name).toBe('Null Grammar')
})
it('recognizes shell scripts with shebang lines', () => {
atom.config.set('core.useTreeSitterParsers', true)
grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/shell-unix-bash.cson'))
@@ -485,6 +498,14 @@ describe('GrammarRegistry', () => {
expect(grammar.name).toBe('Shell Script')
expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
grammar = grammarRegistry.selectGrammar('test.h', dedent `
# vim: set ft=bash
echo "hi"
`)
expect(grammar.name).toBe('Shell Script')
expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
atom.config.set('core.useTreeSitterParsers', false)
grammar = grammarRegistry.selectGrammar('test.h', dedent `
#!/bin/bash

View File

@@ -208,25 +208,35 @@ class GrammarRegistry {
contents = fs.readFileSync(filePath, 'utf8')
}
// Initially identify matching grammars based on the filename and the first
// line of the file.
let score = this.getGrammarPathScore(grammar, filePath)
if (score > 0 && !grammar.bundledPackage) {
score += 0.125
}
if (this.grammarMatchesPrefix(grammar, contents)) score += 0.5
if (grammar instanceof TreeSitterGrammar) {
if (!this.config.get('core.useTreeSitterParsers')) return -Infinity
// If multiple grammars match by one of the above criteria, break ties.
if (score > 0) {
if (grammar.contentRegExp) {
if (grammar.contentRegExp.test(contents)) {
score += 0.25
// Prefer either TextMate or Tree-sitter grammars based on the user's settings.
if (grammar instanceof TreeSitterGrammar) {
if (this.config.get('core.useTreeSitterParsers')) {
score += 0.1
} else {
score -= 0.25
return -Infinity
}
}
if (score > 0) score += 0.5
} else if (this.grammarMatchesPrefix(grammar, contents)) {
score += 0.25
// Prefer grammars with matching content regexes. Prefer a grammar with no content regex
// over one with a non-matching content regex.
if (grammar.contentRegex) {
if (grammar.contentRegex.test(contents)) {
score += 0.05
} else {
score -= 0.05
}
}
// Prefer grammars that the user has manually installed over bundled grammars.
if (!grammar.bundledPackage) score += 0.01
}
return score
@@ -282,8 +292,13 @@ class GrammarRegistry {
}
}
const lines = contents.split('\n')
return grammar.firstLineRegex.testSync(lines.slice(0, numberOfNewlinesInRegex + 1).join('\n'))
const prefix = contents.split('\n').slice(0, numberOfNewlinesInRegex + 1).join('\n')
if (grammar.firstLineRegex.testSync) {
return grammar.firstLineRegex.testSync(prefix)
} else {
return grammar.firstLineRegex.test(prefix)
}
} else {
return false
}
@@ -395,7 +410,7 @@ class GrammarRegistry {
// * `injectionPoint` An {Object} with the following keys:
// * `type` The {String} type of syntax node that may embed other languages
// * `language` A {Function} that is called with syntax nodes of the specified `type` and
// returns a {String} that will be tested against other grammars' `injectionRegExp` in
// returns a {String} that will be tested against other grammars' `injectionRegex` in
// order to determine what language should be embedded.
// * `content` A {Function} that is called with syntax nodes of the specified `type` and
// returns another syntax node or array of syntax nodes that contain the embedded source code.
@@ -542,12 +557,22 @@ class GrammarRegistry {
}
treeSitterGrammarForLanguageString (languageString) {
let longestMatchLength = 0
let grammarWithLongestMatch = null
for (const id in this.treeSitterGrammarsById) {
const grammar = this.treeSitterGrammarsById[id]
if (grammar.injectionRegExp && grammar.injectionRegExp.test(languageString)) {
return grammar
if (grammar.injectionRegex) {
const match = languageString.match(grammar.injectionRegex)
if (match) {
const {length} = match[0]
if (length > longestMatchLength) {
grammarWithLongestMatch = grammar
longestMatchLength = length
}
}
}
}
return grammarWithLongestMatch
}
normalizeLanguageId (languageId) {

View File

@@ -9,8 +9,12 @@ class TreeSitterGrammar {
this.id = params.id
this.name = params.name
this.legacyScopeName = params.legacyScopeName
if (params.contentRegExp) this.contentRegExp = new RegExp(params.contentRegExp)
if (params.injectionRegExp) this.injectionRegExp = new RegExp(params.injectionRegExp)
// TODO - Remove the `RegExp` spelling and only support `Regex`, once all of the existing
// Tree-sitter grammars are updated to spell it `Regex`.
this.contentRegex = buildRegex(params.contentRegex || params.contentRegExp)
this.injectionRegex = buildRegex(params.injectionRegex || params.injectionRegExp)
this.firstLineRegex = buildRegex(params.firstLineRegex)
this.folds = params.folds || []
this.folds.forEach(normalizeFoldSpecification)
@@ -36,7 +40,7 @@ class TreeSitterGrammar {
}
this.scopeMap = new SyntaxScopeMap(scopeSelectors)
this.fileTypes = params.fileTypes
this.fileTypes = params.fileTypes || []
this.injectionPoints = params.injectionPoints || []
// TODO - When we upgrade to a new enough version of node, use `require.resolve`
@@ -125,3 +129,11 @@ function normalizeFoldSpecification (spec) {
if (spec.start) normalizeFoldSpecification(spec.start)
if (spec.end) normalizeFoldSpecification(spec.end)
}
// Builds a RegExp from a pattern specification found in a grammar file.
//
// * `value` A pattern {String}, or an {Array} of pattern {String}s that are
//   combined as alternatives (allowed for readability of the grammar file).
//
// Returns a {RegExp}, or `null` if `value` is neither a string nor an array.
function buildRegex (value) {
  // Wrap each alternative in a NON-capturing group. A capturing wrapper would
  // take capture-group number 1, so a `\1` written inside an alternative would
  // point at the still-open wrapper (which matches the empty string) instead of
  // the pattern's own first group.
  // Note: groups are still numbered across the whole joined pattern, so a
  // numbered backreference in a later alternative also counts the groups of
  // the alternatives before it.
  const source = Array.isArray(value)
    ? value.map(alternative => `(?:${alternative})`).join('|')
    : value
  if (typeof source === 'string') return new RegExp(source)
  return null
}

View File

@@ -534,7 +534,7 @@ class LanguageLayer {
}
updateInjections (grammar) {
if (grammar.injectionRegExp) {
if (grammar.injectionRegex) {
if (!this.currentParsePromise) this.currentParsePromise = Promise.resolve()
this.currentParsePromise = this.currentParsePromise.then(async () => {
await this._populateInjections(MAX_RANGE, null)