Files
context-mod/tests/languageProcessing.test.ts
FoxxMD 7542947029 feat(sentiment): More improvements to sentiment and language processing
* Implement separate language detection functionality
* Clearer/simpler sentiment processing
* Add languageHints to help coerce low confidence language detection
* Add test cases for lang detection, sentiment detection, and sentiment tests
* Fix neutral range -- was not using normalized score range
2022-05-25 12:59:37 -04:00

343 lines
15 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import {describe, it} from 'mocha';
import chai,{assert} from 'chai';
import chaiAsPromised from 'chai-as-promised';
import {
getContentLanguage,
getLanguageTypeFromValue,
getStringSentiment, parseTextToNumberComparison,
testSentiment
} from "../src/Common/LangaugeProcessing";
import {GenericComparison, RangedComparison} from "../src/Common/Infrastructure/Comparisons";
chai.use(chaiAsPromised);
const longNeutralEnglish = "This is a normal english sentence without emotion";
const longNeutralEnglish2 = 'I am neutral on the current subject';
const longNeutralEnglish3 = 'The midterms were an election that happened';
const longNegativeEnglish = "I hate when idiots drive their bad cars terribly. 😡";
const longPositiveEnglish = "We love to be happy and laugh on this wonderful, amazing day";
const shortIndistinctEnglish = "metal gear";
const shortIndistinctEnglish2 = "idk hole ref";
const shortPositiveEnglish = "haha fun";
const shortNegativeEnglish = "fuck you";
const shortSlangPositiveEnglish = "lol lmao";
const shortSlangNegativeEnglish = "get fuked";
const longIndonesian = "setiap kali scroll mesti nampak dia nie haih";
const shortIndonesian = "Saya bangga saya rasis";
const shortPolish = 'Dobry wieczór';
const longRussian = 'Чит на золото для аватарии без скачивания бесплатно';
const longItalian = 'Sembra ormai passato un secolo, visto che gli anime sono praticamente scomparsi dalla televisione.';
const shortRomanian = 'Tu știi unde sta?';
const longRomanian = 'Deci , daca aveti chef de un mic protest , va astept la aceste coordonate';
const longFrench = "Japprouve et à ce moment là ça se soigne plus malheureusement";
const longSpanish = "La segunda parece una mezcla entre una convención de fanáticos de los monster truck y un vertedero.";
const longPositiveSpanish = 'me encanta esta hermosa cancion';
const longPositiveSpanish2 = 'Increíble muy divertido gracias por compartir';
const longGerman = "bin mir auch sicher, dass zb mein 65er halb so viel wiegt wie ein kasten Bier";
const shortEmojiNegative = "France 😫 😞 :(";
const shortEmojiPositive = "France 😂 😄 😁";
describe('Language Detection', function () {
describe('Derives language from user input', async function () {
it('gets from valid, case-insensitive alpha2', async function () {
const lang = await getLanguageTypeFromValue('eN');
assert.equal(lang.alpha2, 'en');
});
it('gets from valid, case-insensitive alpha3', async function () {
const lang = await getLanguageTypeFromValue('eNg');
assert.equal(lang.alpha2, 'en');
});
it('gets from valid, case-insensitive language name', async function () {
const lang = await getLanguageTypeFromValue('EnGlIsH');
assert.equal(lang.alpha2, 'en');
});
it('throws on invalid value', function () {
assert.isRejected(getLanguageTypeFromValue('pofdsfa'))
});
})
describe('Recognizes the language in moderately long content well', function () {
it('should recognize english', async function () {
const lang = await getContentLanguage(longPositiveEnglish);
assert.equal(lang.language.alpha2, 'en');
assert.isFalse(lang.usedDefault);
assert.isAtLeast(lang.bestGuess.score, 0.9);
});
it('should recognize french', async function () {
const lang = await getContentLanguage(longFrench);
assert.equal(lang.language.alpha2, 'fr');
assert.isFalse(lang.usedDefault);
assert.isAtLeast(lang.bestGuess.score, 0.9);
});
it('should recognize spanish', async function () {
const lang = await getContentLanguage(longSpanish);
assert.equal(lang.language.alpha2, 'es');
assert.isFalse(lang.usedDefault);
assert.isAtLeast(lang.bestGuess.score, 0.9);
});
it('should recognize german', async function () {
const lang = await getContentLanguage(longGerman);
assert.equal(lang.language.alpha2, 'de');
assert.isFalse(lang.usedDefault);
assert.isAtLeast(lang.bestGuess.score, 0.9);
});
it('should recognize indonesian', async function () {
const lang = await getContentLanguage(longIndonesian);
assert.equal(lang.language.alpha2, 'id');
assert.isFalse(lang.usedDefault);
assert.isAtLeast(lang.bestGuess.score, 0.9);
});
});
describe('Correctly handles short content classification', function () {
it('uses default language', async function () {
for (const content of [shortIndistinctEnglish, shortIndistinctEnglish2, shortIndonesian]) {
const lang = await getContentLanguage(content);
assert.equal(lang.language.alpha2, 'en', content);
assert.isTrue(lang.usedDefault, content);
}
});
it('uses best guess when default language is not provided', async function () {
for (const content of [shortIndistinctEnglish, shortIndistinctEnglish2, shortIndonesian]) {
const lang = await getContentLanguage(content, {defaultLanguage: false});
assert.isFalse(lang.usedDefault);
}
});
});
});
describe('Sentiment', function() {
describe('Is conservative when no default language is used for short content', function() {
it('should return unusable result for short, ambiguous english content', async function() {
for(const content of [shortIndistinctEnglish, shortIndistinctEnglish2]) {
const res = await getStringSentiment(content, {defaultLanguage: false});
assert.isFalse(res.usableScore);
}
});
it('should return unusable result for short, non-english content', async function() {
for(const content of [shortIndonesian, shortPolish, shortRomanian]) {
const res = await getStringSentiment(content, {defaultLanguage: false});
assert.isFalse(res.usableScore);
}
});
});
describe('Is conservative when language confidence is high for unsupported languages', function() {
it('should return unusable result for long, non-english content', async function() {
for(const content of [longIndonesian, longRussian, longItalian, longRomanian]) {
const res = await getStringSentiment(content);
assert.isFalse(res.usableScore, content);
}
});
});
describe('vader/wink supersedes low confidence language guess', function() {
it('should return usable result when valid words found by vader/wink', async function() {
for(const content of [shortPositiveEnglish,shortNegativeEnglish]) {
const res = await getStringSentiment(content, {defaultLanguage: false});
assert.isTrue(res.usableScore);
}
});
it('should return usable result when valid slang found by vader/wink', async function() {
for(const content of [shortSlangPositiveEnglish,shortSlangNegativeEnglish]) {
const res = await getStringSentiment(content, {defaultLanguage: false});
assert.isTrue(res.usableScore);
}
});
it('should return usable result when valid emojis found by vader/wink', async function() {
for(const content of [shortEmojiPositive,shortEmojiNegative]) {
const res = await getStringSentiment(content, {defaultLanguage: false});
assert.isTrue(res.usableScore);
}
});
})
describe('Detects correct sentiment', function() {
describe('In English', function() {
it('should detect positive sentiment', async function() {
for(const content of [shortEmojiPositive,longPositiveEnglish, shortPositiveEnglish, shortSlangPositiveEnglish]) {
const res = await getStringSentiment(content);
assert.isTrue(res.usableScore);
assert.isAtLeast(res.scoreWeighted, 0.1);
}
});
it('should detect negative sentiment', async function() {
for(const content of [shortEmojiNegative,longNegativeEnglish, shortNegativeEnglish, shortSlangNegativeEnglish]) {
const res = await getStringSentiment(content);
assert.isTrue(res.usableScore);
assert.isAtMost(res.scoreWeighted, -0.1);
}
});
it('should detect neutral sentiment', async function() {
for(const content of [longNeutralEnglish, longNeutralEnglish2, longNeutralEnglish3]) {
const res = await getStringSentiment(content);
assert.isTrue(res.usableScore, content);
assert.isAtMost(res.scoreWeighted, 0.1, content);
assert.isAtLeast(res.scoreWeighted, -0.1, content);
}
});
it('should detect neutral sentiment for short content when english is default language', async function() {
for(const content of [shortIndistinctEnglish, shortIndistinctEnglish2, shortPolish]) {
const res = await getStringSentiment(content);
assert.isTrue(res.usableScore);
assert.isAtMost(res.scoreWeighted, 0.1, content);
assert.isAtLeast(res.scoreWeighted, -0.1, content);
}
});
});
describe('In Spanish', function() {
it('should detect positive ', async function() {
for(const content of [longPositiveSpanish, longPositiveSpanish2]) {
const res = await getStringSentiment(content);
assert.isTrue(res.usableScore, longPositiveSpanish2);
assert.isAtLeast(res.scoreWeighted, 0.1, longPositiveSpanish2);
}
});
});
});
describe('Testing', function () {
describe('Parsing user input to comparison', function() {
it(`parses 'is neutral'`, function() {
const res = parseTextToNumberComparison('is neutral') as RangedComparison;
assert.deepEqual(res.range, [-0.1, 0.1]);
assert.isFalse(res.not);
});
it(`parses 'is not neutral'`, function() {
const res = parseTextToNumberComparison('is not neutral') as RangedComparison;
assert.deepEqual(res.range, [-0.1, 0.1]);
assert.isTrue(res.not);
});
it(`parses 'is positive'`, function() {
const res = parseTextToNumberComparison('is positive') as GenericComparison;
assert.equal(res.operator, '>=');
assert.equal(res.value, 0.1);
});
it(`parses 'is very positive'`, function() {
const res = parseTextToNumberComparison('is very positive') as GenericComparison;
assert.equal(res.operator, '>=');
assert.equal(res.value, 0.3);
});
it(`parses 'is extremely positive'`, function() {
const res = parseTextToNumberComparison('is extremely positive') as GenericComparison;
assert.equal(res.operator, '>=');
assert.equal(res.value, 0.6);
});
it(`parses 'is negative'`, function() {
const res = parseTextToNumberComparison('is negative') as GenericComparison;
assert.equal(res.operator, '<=');
assert.equal(res.value, -0.1);
});
it(`parses 'is very negative'`, function() {
const res = parseTextToNumberComparison('is very negative') as GenericComparison;
assert.equal(res.operator, '<=');
assert.equal(res.value, -0.3);
});
it(`parses 'is extremely negative'`, function() {
const res = parseTextToNumberComparison('is extremely negative') as GenericComparison;
assert.equal(res.operator, '<=');
assert.equal(res.value, -0.6);
});
it(`parses negative negations`, function() {
const res = parseTextToNumberComparison('is not extremely negative') as GenericComparison;
assert.equal(res.operator, '>');
assert.equal(res.value, -0.6);
});
it(`parses positive negations`, function() {
const res = parseTextToNumberComparison('is not positive') as GenericComparison;
assert.equal(res.operator, '<');
assert.equal(res.value, 0.1);
});
});
it('should fail test if score is unusable', async function() {
const comparison = parseTextToNumberComparison('is positive');
for(const content of [shortIndistinctEnglish, shortIndistinctEnglish2, shortPolish, longRomanian]) {
const sentimentResult = await getStringSentiment(content, {defaultLanguage: false});
const testResult = testSentiment(sentimentResult, comparison);
assert.isFalse(testResult.passes);
}
});
it('should handle generic comparisons', async function() {
const comparison = parseTextToNumberComparison('is positive');
for(const content of [shortEmojiPositive,longPositiveEnglish, shortPositiveEnglish, shortSlangPositiveEnglish]) {
const sentimentResult = await getStringSentiment(content, {defaultLanguage: false});
const testResult = testSentiment(sentimentResult, comparison);
assert.isTrue(testResult.passes);
}
});
it('should handle ranged comparisons', async function() {
const comparison = parseTextToNumberComparison('is neutral');
for(const content of [longNeutralEnglish, longNeutralEnglish2, longNeutralEnglish3]) {
const sentimentResult = await getStringSentiment(content, {defaultLanguage: false});
const testResult = testSentiment(sentimentResult, comparison);
assert.isTrue(testResult.passes);
}
});
it('should handle negated ranged comparisons', async function() {
const comparison = parseTextToNumberComparison('is not neutral');
for(const content of [longPositiveEnglish, longPositiveSpanish, longNegativeEnglish]) {
const sentimentResult = await getStringSentiment(content, {defaultLanguage: false});
const testResult = testSentiment(sentimentResult, comparison);
assert.isTrue(testResult.passes, content);
}
});
});
});