Make Tree-sitter grammars' contentRegExp work as intended

This commit is contained in:
Max Brunsfeld
2018-06-04 16:35:32 -07:00
parent fceb6b0063
commit 4dbfadd0d5
3 changed files with 97 additions and 44 deletions

View File

@@ -283,32 +283,6 @@ describe('GrammarRegistry', () => {
expect(atom.grammars.selectGrammar('/hu.git/config').name).toBe('Null Grammar')
})
// Spec group covering grammar selection when a registered grammar carries a
// `contentRegExp` field.
describe('when the grammar has a contentRegExp field', () => {
// Three grammars are registered for the same `js` file type; only the middle
// one ('flow-javascript') defines a `contentRegExp`. The sample text has a
// `// @flow` comment on its second line, and the spec expects that grammar
// to be the one selected.
it('favors grammars whose contentRegExp matches a prefix of the file\'s content', () => {
atom.grammars.addGrammar({
id: 'javascript-1',
fileTypes: ['js']
})
atom.grammars.addGrammar({
id: 'flow-javascript',
contentRegExp: new RegExp('//.*@flow'),
fileTypes: ['js']
})
// Registered after the regex-bearing grammar, so the expected result is not
// simply the first or the last grammar added.
atom.grammars.addGrammar({
id: 'javascript-2',
fileTypes: ['js']
})
const selectedGrammar = atom.grammars.selectGrammar('test.js', dedent`
// Copyright EvilCorp
// @flow
module.exports = function () { return 1 + 1 }
`)
expect(selectedGrammar.id).toBe('flow-javascript')
})
})
it("uses the filePath's shebang line if the grammar cannot be determined by the extension or basename", async () => {
await atom.packages.activatePackage('language-javascript')
await atom.packages.activatePackage('language-ruby')
@@ -442,6 +416,84 @@ describe('GrammarRegistry', () => {
expect(grammar instanceof TreeSitterGrammar).toBe(true)
})
})
describe('tree-sitter grammars with content regexes', () => {
// Verifies that, for a `.h` path, the file contents decide between the C and
// C++ grammars, and that the `class` keyword does not affect a `.coffee` path.
it('recognizes C++ header files', () => {
// Opt in to Tree-sitter grammars for this spec.
atom.config.set('core.useTreeSitterParsers', true)
// Load the C and C++ Tree-sitter grammar files plus the CoffeeScript grammar file.
grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-c.cson'))
grammarRegistry.loadGrammarSync(require.resolve('language-c/grammars/tree-sitter-cpp.cson'))
grammarRegistry.loadGrammarSync(require.resolve('language-coffee-script/grammars/coffeescript.cson'))
// Case 1: a header that includes <string.h> and typedefs a struct is expected to select C.
let grammar = grammarRegistry.selectGrammar('test.h', dedent `
#include <string.h>
typedef struct {
void verb();
} Noun;
`)
expect(grammar.name).toBe('C')
// Case 2: a header that includes <string> and declares a `class` with a
// `public:` section is expected to select C++.
grammar = grammarRegistry.selectGrammar('test.h', dedent `
#include <string>
class Noun {
public:
void verb();
};
`)
expect(grammar.name).toBe('C++')
// The word `class` only indicates C++ in `.h` files, not in all files.
grammar = grammarRegistry.selectGrammar('test.coffee', dedent `
module.exports =
class Noun
verb: -> true
`)
expect(grammar.name).toBe('CoffeeScript')
})
// Verifies that content starting with `#!/bin/bash` selects a grammar named
// 'Shell Script', and that the `core.useTreeSitterParsers` setting decides
// whether the selected grammar is a TreeSitterGrammar instance.
it('recognizes shell scripts with shebang lines', () => {
atom.config.set('core.useTreeSitterParsers', true)
// Two grammar files from language-shellscript are loaded: `shell-unix-bash.cson`
// and `tree-sitter-bash.cson`. Both assertions below expect the name 'Shell Script'.
grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/shell-unix-bash.cson'))
grammarRegistry.loadGrammarSync(require.resolve('language-shellscript/grammars/tree-sitter-bash.cson'))
// NOTE(review): the path passed here is 'test.h', not a shell extension —
// presumably so that only the shebang line can drive the selection; confirm.
let grammar = grammarRegistry.selectGrammar('test.h', dedent `
#!/bin/bash
echo "hi"
`)
expect(grammar.name).toBe('Shell Script')
expect(grammar instanceof TreeSitterGrammar).toBeTruthy()
// Same path and contents with Tree-sitter disabled: the name is unchanged,
// but the selected grammar must no longer be a TreeSitterGrammar.
atom.config.set('core.useTreeSitterParsers', false)
grammar = grammarRegistry.selectGrammar('test.h', dedent `
#!/bin/bash
echo "hi"
`)
expect(grammar.name).toBe('Shell Script')
expect(grammar instanceof TreeSitterGrammar).toBeFalsy()
})
// Verifies that a `.js` file selects 'Flow JavaScript' when its contents
// include a `// @flow` comment, and 'JavaScript' when they do not.
it('recognizes JavaScript files that use Flow', () => {
atom.config.set('core.useTreeSitterParsers', true)
// Load the Tree-sitter JavaScript grammar file and the Tree-sitter Flow
// grammar file (the latter is resolved from the language-typescript package).
grammarRegistry.loadGrammarSync(require.resolve('language-javascript/grammars/tree-sitter-javascript.cson'))
grammarRegistry.loadGrammarSync(require.resolve('language-typescript/grammars/tree-sitter-flow.cson'))
// The `@flow` marker sits on the second line, after another comment line.
let grammar = grammarRegistry.selectGrammar('test.js', dedent`
// Copyright something
// @flow
module.exports = function () { return 1 + 1 }
`)
expect(grammar.name).toBe('Flow JavaScript')
// Same path, no `@flow` marker in the contents.
grammar = grammarRegistry.selectGrammar('test.js', dedent`
module.exports = function () { return 1 + 1 }
`)
expect(grammar.name).toBe('JavaScript')
})
})
})
describe('.removeGrammar(grammar)', () => {

View File

@@ -1032,6 +1032,7 @@ describe('PackageManager', () => {
})
it('loads any tree-sitter grammars defined in the package', async () => {
atom.config.set('core.useTreeSitterParsers', true)
await atom.packages.activatePackage('package-with-tree-sitter-grammar')
const grammar = atom.grammars.selectGrammar('test.somelang')
expect(grammar.name).toBe('Some Language')

View File

@@ -10,7 +10,6 @@ const Token = require('./token')
const fs = require('fs-plus')
const {Point, Range} = require('text-buffer')
const GRAMMAR_TYPE_BONUS = 1000
const PATH_SPLIT_REGEX = new RegExp('[/.]')
// Extended: This class holds the grammars used for tokenizing.
@@ -213,12 +212,23 @@ class GrammarRegistry {
if (score > 0 && !grammar.bundledPackage) {
score += 0.125
}
if (this.grammarMatchesContents(grammar, contents)) {
score += 0.25
}
if (score > 0 && this.isGrammarPreferredType(grammar)) {
score += GRAMMAR_TYPE_BONUS
if (grammar instanceof TreeSitterGrammar) {
if (this.config.get('core.useTreeSitterParsers')) {
score += 0.05
} else {
score = -Infinity
}
if (grammar.contentRegExp) {
if (grammar.contentRegExp.test(contents)) {
score += 0.25
} else {
score -= 0.25
}
}
} else if (this.grammarMatchesPrefix(grammar, contents)) {
score += 0.25
}
return score
@@ -256,12 +266,8 @@ class GrammarRegistry {
return pathScore
}
grammarMatchesContents (grammar, contents) {
if (contents == null) return false
if (grammar.contentRegExp) { // TreeSitter grammars
return grammar.contentRegExp.test(contents)
} else if (grammar.firstLineRegex) { // FirstMate grammars
grammarMatchesPrefix (grammar, contents) {
if (contents && grammar.firstLineRegex) {
let escaped = false
let numberOfNewlinesInRegex = 0
for (let character of grammar.firstLineRegex.source) {
@@ -511,12 +517,6 @@ class GrammarRegistry {
return this.textmateRegistry.scopeForId(id)
}
isGrammarPreferredType (grammar) {
return this.config.get('core.useTreeSitterParsers')
? grammar instanceof TreeSitterGrammar
: grammar instanceof FirstMate.Grammar
}
normalizeLanguageId (languageId) {
if (this.config.get('core.useTreeSitterParsers')) {
return this.treeSitterLanguageIdsByTextMateScopeName.get(languageId) || languageId