mirror of
https://github.com/atom/atom.git
synced 2026-01-25 14:59:03 -05:00
Support two different content-matching regexes for tree-sitter grammars
* `firstLineRegex` is tested only against the first line of the file. If it matches, the grammar is allowed to be used for the file.
* `contentRegex` is tested only if the grammar already matched the file name or its `firstLineRegex` matched. It is used to break ties between grammars, such as C and C++, or Flow and JS.
This commit is contained in:
@@ -472,6 +472,19 @@ describe('GrammarRegistry', () => {
|
||||
expect(grammar.name).toBe('C++')
|
||||
})
|
||||
|
||||
it('does not apply content regexes from grammars without filetype or first line matches', () => {
|
||||
atom.config.set('core.useTreeSitterParsers', true)
|
||||
grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-cpp.cson'))
|
||||
|
||||
let grammar = grammarRegistry.selectGrammar('', dedent `
|
||||
class Foo
|
||||
# this is ruby, not C++
|
||||
end
|
||||
`)
|
||||
|
||||
expect(grammar.name).toBe('Null Grammar')
|
||||
})
|
||||
|
||||
it('recognizes shell scripts with shebang lines', () => {
|
||||
atom.config.set('core.useTreeSitterParsers', true)
|
||||
grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/shell-unix-bash.cson'))
|
||||
@@ -485,6 +498,14 @@ describe('GrammarRegistry', () => {
|
||||
expect(grammar.name).toBe('Shell Script')
|
||||
expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
|
||||
|
||||
grammar = grammarRegistry.selectGrammar('test.h', dedent `
|
||||
# vim: set ft=bash
|
||||
|
||||
echo "hi"
|
||||
`)
|
||||
expect(grammar.name).toBe('Shell Script')
|
||||
expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
|
||||
|
||||
atom.config.set('core.useTreeSitterParsers', false)
|
||||
grammar = grammarRegistry.selectGrammar('test.h', dedent `
|
||||
#!/bin/bash
|
||||
|
||||
@@ -208,25 +208,35 @@ class GrammarRegistry {
|
||||
contents = fs.readFileSync(filePath, 'utf8')
|
||||
}
|
||||
|
||||
// Initially identify matching grammars based on the filename and the first
|
||||
// line of the file.
|
||||
let score = this.getGrammarPathScore(grammar, filePath)
|
||||
if (score > 0 && !grammar.bundledPackage) {
|
||||
score += 0.125
|
||||
}
|
||||
if (this.grammarMatchesPrefix(grammar, contents)) score += 0.5
|
||||
|
||||
if (grammar instanceof TreeSitterGrammar) {
|
||||
if (!this.config.get('core.useTreeSitterParsers')) return -Infinity
|
||||
// If multiple grammars match by one of the above criteria, break ties.
|
||||
if (score > 0) {
|
||||
|
||||
if (grammar.contentRegExp) {
|
||||
if (grammar.contentRegExp.test(contents)) {
|
||||
score += 0.25
|
||||
// Prefer either TextMate or Tree-sitter grammars based on the user's settings.
|
||||
if (grammar instanceof TreeSitterGrammar) {
|
||||
if (this.config.get('core.useTreeSitterParsers')) {
|
||||
score += 0.1
|
||||
} else {
|
||||
score -= 0.25
|
||||
return -Infinity
|
||||
}
|
||||
}
|
||||
|
||||
if (score > 0) score += 0.5
|
||||
} else if (this.grammarMatchesPrefix(grammar, contents)) {
|
||||
score += 0.25
|
||||
// Prefer grammars with matching content regexes. Prefer a grammar with no content regex
|
||||
// over one with a non-matching content regex.
|
||||
if (grammar.contentRegex) {
|
||||
if (grammar.contentRegex.test(contents)) {
|
||||
score += 0.05
|
||||
} else {
|
||||
score -= 0.05
|
||||
}
|
||||
}
|
||||
|
||||
// Prefer grammars that the user has manually installed over bundled grammars.
|
||||
if (!grammar.bundledPackage) score += 0.01
|
||||
}
|
||||
|
||||
return score
|
||||
@@ -282,8 +292,13 @@ class GrammarRegistry {
|
||||
}
|
||||
}
|
||||
|
||||
const lines = contents.split('\n')
|
||||
return grammar.firstLineRegex.testSync(lines.slice(0, numberOfNewlinesInRegex + 1).join('\n'))
|
||||
const prefix = contents.split('\n').slice(0, numberOfNewlinesInRegex + 1).join('\n')
|
||||
if (grammar.firstLineRegex.testSync) {
|
||||
return grammar.firstLineRegex.testSync(prefix)
|
||||
} else {
|
||||
|
||||
return grammar.firstLineRegex.test(prefix)
|
||||
}
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
@@ -395,7 +410,7 @@ class GrammarRegistry {
|
||||
// * `injectionPoint` An {Object} with the following keys:
|
||||
// * `type` The {String} type of syntax node that may embed other languages
|
||||
// * `language` A {Function} that is called with syntax nodes of the specified `type` and
|
||||
// returns a {String} that will be tested against other grammars' `injectionRegExp` in
|
||||
// returns a {String} that will be tested against other grammars' `injectionRegex` in
|
||||
// order to determine what language should be embedded.
|
||||
// * `content` A {Function} that is called with syntax nodes of the specified `type` and
|
||||
// returns another syntax node or array of syntax nodes that contain the embedded source code.
|
||||
@@ -542,12 +557,22 @@ class GrammarRegistry {
|
||||
}
|
||||
|
||||
treeSitterGrammarForLanguageString (languageString) {
|
||||
let longestMatchLength = 0
|
||||
let grammarWithLongestMatch = null
|
||||
for (const id in this.treeSitterGrammarsById) {
|
||||
const grammar = this.treeSitterGrammarsById[id]
|
||||
if (grammar.injectionRegExp && grammar.injectionRegExp.test(languageString)) {
|
||||
return grammar
|
||||
if (grammar.injectionRegex) {
|
||||
const match = languageString.match(grammar.injectionRegex)
|
||||
if (match) {
|
||||
const {length} = match[0]
|
||||
if (length > longestMatchLength) {
|
||||
grammarWithLongestMatch = grammar
|
||||
longestMatchLength = length
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return grammarWithLongestMatch
|
||||
}
|
||||
|
||||
normalizeLanguageId (languageId) {
|
||||
|
||||
@@ -9,8 +9,12 @@ class TreeSitterGrammar {
|
||||
this.id = params.id
|
||||
this.name = params.name
|
||||
this.legacyScopeName = params.legacyScopeName
|
||||
if (params.contentRegExp) this.contentRegExp = new RegExp(params.contentRegExp)
|
||||
if (params.injectionRegExp) this.injectionRegExp = new RegExp(params.injectionRegExp)
|
||||
|
||||
// TODO - Remove the `RegExp` spelling and only support `Regex`, once all of the existing
|
||||
// Tree-sitter grammars are updated to spell it `Regex`.
|
||||
this.contentRegex = buildRegex(params.contentRegex || params.contentRegExp)
|
||||
this.injectionRegex = buildRegex(params.injectionRegex || params.injectionRegExp)
|
||||
this.firstLineRegex = buildRegex(params.firstLineRegex)
|
||||
|
||||
this.folds = params.folds || []
|
||||
this.folds.forEach(normalizeFoldSpecification)
|
||||
@@ -36,7 +40,7 @@ class TreeSitterGrammar {
|
||||
}
|
||||
|
||||
this.scopeMap = new SyntaxScopeMap(scopeSelectors)
|
||||
this.fileTypes = params.fileTypes
|
||||
this.fileTypes = params.fileTypes || []
|
||||
this.injectionPoints = params.injectionPoints || []
|
||||
|
||||
// TODO - When we upgrade to a new enough version of node, use `require.resolve`
|
||||
@@ -125,3 +129,11 @@ function normalizeFoldSpecification (spec) {
|
||||
if (spec.start) normalizeFoldSpecification(spec.start)
|
||||
if (spec.end) normalizeFoldSpecification(spec.end)
|
||||
}
|
||||
|
||||
// Compile a regex specification taken from a grammar file.
//
// * `value` A pattern source {String}, or an {Array} of pattern source
//   {String}s that are treated as alternatives. Allowing an array keeps long
//   alternations readable in the grammar file.
//
// Returns a {RegExp}, or `null` when `value` is neither a string nor a
// non-empty array.
function buildRegex (value) {
  let source = value

  if (Array.isArray(value)) {
    // An empty list would join to '' and compile to a regex that matches
    // every input, so treat it as "no regex specified" instead.
    if (value.length === 0) return null

    // Wrap each alternative in its own group so that a top-level `|` inside
    // one entry cannot leak into its neighbours.
    source = value.map(alternative => `(${alternative})`).join('|')
  }

  return typeof source === 'string' ? new RegExp(source) : null
}
|
||||
|
||||
@@ -534,7 +534,7 @@ class LanguageLayer {
|
||||
}
|
||||
|
||||
updateInjections (grammar) {
|
||||
if (grammar.injectionRegExp) {
|
||||
if (grammar.injectionRegex) {
|
||||
if (!this.currentParsePromise) this.currentParsePromise = Promise.resolve()
|
||||
this.currentParsePromise = this.currentParsePromise.then(async () => {
|
||||
await this._populateInjections(MAX_RANGE, null)
|
||||
|
||||
Reference in New Issue
Block a user