From f8967d55c4e148b458b08ce29f9784fd61394f99 Mon Sep 17 00:00:00 2001 From: FoxxMD Date: Mon, 31 Jan 2022 14:08:21 -0500 Subject: [PATCH] feat(repeat): Use newer text comparison technique to improve repeat detection * Use same technique as repost rule which has high accuracy and fewer false-positives * Implement ability to see similarity score, case sensitivity, and text transformations --- src/Common/interfaces.ts | 53 +++++++++++++ src/Rule/RepeatActivityRule.ts | 134 ++++++++++++++++++++------------- src/Rule/RepostRule.ts | 55 +------------- src/Schema/App.json | 21 +++++- src/Schema/OperatorConfig.json | 6 +- src/Schema/Rule.json | 21 +++++- src/Schema/RuleSet.json | 21 +++++- 7 files changed, 195 insertions(+), 116 deletions(-) diff --git a/src/Common/interfaces.ts b/src/Common/interfaces.ts index 24cf188..10ff9e9 100644 --- a/src/Common/interfaces.ts +++ b/src/Common/interfaces.ts @@ -2100,3 +2100,56 @@ export interface FilterResult { join: JoinOperands passed: boolean } + +export interface TextTransformOptions { + /** + * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined. 
+ * + * * If `transformationsActivity` IS NOT defined then these transformations will be performed on BOTH the activity text (submission title or comment) AND the repost candidate text + * * If `transformationsActivity` IS defined then these transformations are only performed on repost candidate text + * */ + transformations?: SearchAndReplaceRegExp[] + + /** + * Specify a separate set of transformations for the activity text (submission title or comment) + * + * To perform no transformations when `transformations` is defined set this to an empty array (`[]`) + * */ + transformationsActivity?: SearchAndReplaceRegExp[] +} + +export interface TextMatchOptions { + /** + * The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match + * + * Note: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere + * + * Defaults to `85` (85%) + * + * @default 85 + * @example [85] + * */ + matchScore?: number + + /** + * The minimum number of words in the activity being checked for which this rule will run on + * + * If the word count is below the minimum the rule fails + * + * Defaults to 2 + * + * @default 2 + * @example [2] + * */ + minWordCount?: number + + /** + * Should text matching be case sensitive? 
+ * + * Defaults to false + * + * @default false + * @example [false] + **/ + caseSensitive?: boolean +} diff --git a/src/Rule/RepeatActivityRule.ts b/src/Rule/RepeatActivityRule.ts index 8de44ef..34db8c7 100644 --- a/src/Rule/RepeatActivityRule.ts +++ b/src/Rule/RepeatActivityRule.ts @@ -1,17 +1,28 @@ import {Rule, RuleJSONConfig, RuleOptions, RuleResult} from "./index"; import {Comment} from "snoowrap"; import { - activityWindowText, asSubmission, - comparisonTextOp, FAIL, getActivitySubredditName, isExternalUrlSubmission, isRedditMedia, - parseGenericValueComparison, parseSubredditName, - parseUsableLinkIdentifier as linkParser, PASS, subredditStateIsNameOnly, toStrongSubredditState + activityWindowText, + asSubmission, + comparisonTextOp, + FAIL, + getActivitySubredditName, + isExternalUrlSubmission, + isRedditMedia, + parseGenericValueComparison, + parseSubredditName, + parseUsableLinkIdentifier as linkParser, + PASS, + searchAndReplace, + stringSameness, + subredditStateIsNameOnly, + toStrongSubredditState } from "../util"; import { ActivityWindow, ActivityWindowType, - ReferenceSubmission, + ReferenceSubmission, SearchAndReplaceRegExp, StrongSubredditState, - SubredditState + SubredditState, TextMatchOptions, TextTransformOptions } from "../Common/interfaces"; import Submission from "snoowrap/dist/objects/Submission"; import dayjs from "dayjs"; @@ -29,27 +40,6 @@ interface RepeatActivityReducer { allSets: RepeatActivityData[] } -const getActivityIdentifier = (activity: (Submission | Comment), length = 200) => { - let identifier: string; - if (asSubmission(activity)) { - if (activity.is_self) { - identifier = `${activity.title}${activity.selftext.slice(0, length)}`; - } else if(isRedditMedia(activity)) { - identifier = activity.title; - } else { - identifier = parseUsableLinkIdentifier(activity.url) as string; - } - } else { - identifier = activity.body.slice(0, length); - } - return identifier; -} - -const fuzzyOptions = { - includeScore: true, - distance: 
15 -}; - export class RepeatActivityRule extends Rule { threshold: string; window: ActivityWindowType; @@ -62,6 +52,9 @@ export class RepeatActivityRule extends Rule { activityFilterFunc: (x: Submission|Comment) => Promise = async (x) => true; keepRemoved: boolean; minWordCount: number; + transformations: SearchAndReplaceRegExp[] + caseSensitive: boolean + matchScore: number constructor(options: RepeatActivityOptions) { super(options); @@ -75,7 +68,13 @@ export class RepeatActivityRule extends Rule { include = [], exclude = [], keepRemoved = false, + transformations = [], + caseSensitive = true, + matchScore = 85, } = options; + this.matchScore = matchScore; + this.transformations = transformations; + this.caseSensitive = caseSensitive; this.minWordCount = minWordCount; this.keepRemoved = keepRemoved; this.threshold = threshold; @@ -136,6 +135,37 @@ export class RepeatActivityRule extends Rule { } } + getActivityIdentifier(activity: (Submission | Comment), length = 200, transform = true) { + let identifier: string; + if (asSubmission(activity)) { + if (activity.is_self) { + identifier = `${activity.title}${activity.selftext.slice(0, length)}`; + } else if(isRedditMedia(activity)) { + identifier = activity.title; + } else { + identifier = parseUsableLinkIdentifier(activity.url) as string; + } + } else { + identifier = activity.body.slice(0, length); + } + + if(!transform) { + return identifier; + } + + // apply any transforms + if (this.transformations.length > 0) { + identifier = searchAndReplace(identifier, this.transformations); + } + + // perform after transformations so as not to mess up regex's depending on case + if(!this.caseSensitive) { + identifier = identifier.toLowerCase(); + } + + return identifier; + } + async process(item: Submission|Comment): Promise<[boolean, RuleResult]> { let referenceUrl; if(asSubmission(item) && this.useSubmissionAsReference) { @@ -162,9 +192,10 @@ export class RepeatActivityRule extends Rule { const acc = await accProm; const 
{openSets = [], allSets = []} = acc; - let identifier = getActivityIdentifier(activity); + let identifier = this.getActivityIdentifier(activity); + const isUrl = isExternalUrlSubmission(activity); - let fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5}); + //let fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5}); const validSub = await this.activityFilterFunc(activity); let minMet = identifier.length >= this.minWordCount; @@ -174,12 +205,15 @@ export class RepeatActivityRule extends Rule { let currIdentifierInOpen = false; const bufferedActivities = this.gapAllowance === undefined || this.gapAllowance === 0 ? [] : activities.slice(Math.max(0, index - this.gapAllowance), Math.max(0, index)); for (const o of openSets) { - const res = fu.search(o.identifier); - const match = res.length > 0; - if (match && validSub && minMet) { + const strMatchResults = stringSameness(o.identifier, identifier); + if (strMatchResults.highScoreWeighted >= this.matchScore && minMet) { updatedOpenSets.push({...o, sets: [...o.sets, activity]}); currIdentifierInOpen = true; - } else if (bufferedActivities.some(x => fu.search(getActivityIdentifier(x)).length > 0) && validSub && minMet) { + } else if (bufferedActivities.some(x => { + let buffIdentifier = this.getActivityIdentifier(x); + const buffMatch = stringSameness(identifier, buffIdentifier); + return buffMatch.highScoreWeighted >= this.matchScore; + }) && validSub && minMet) { updatedOpenSets.push(o); } else if(!currIdentifierInOpen && !isUrl) { updatedAllSets.push(o); @@ -193,15 +227,18 @@ export class RepeatActivityRule extends Rule { // could be that a spammer is using different URLs for each submission but similar submission titles so search by title as well const sub = activity as Submission; identifier = sub.title; - fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5}); + //fu = new Fuse([identifier], !isUrl ? 
fuzzyOptions : {...fuzzyOptions, distance: 5}); minMet = identifier.length >= this.minWordCount; for (const o of openSets) { - const res = fu.search(o.identifier); - const match = res.length > 0; - if (match && validSub && minMet) { + const strMatchResults = stringSameness(o.identifier, identifier); + if (strMatchResults.highScoreWeighted >= this.matchScore && minMet) { updatedOpenSets.push({...o, sets: [...o.sets, activity]}); currIdentifierInOpen = true; - } else if (bufferedActivities.some(x => fu.search(getActivityIdentifier(x)).length > 0) && validSub && minMet && !updatedOpenSets.includes(o)) { + } else if (bufferedActivities.some(x => { + let buffIdentifier = this.getActivityIdentifier(x); + const buffMatch = stringSameness(identifier, buffIdentifier); + return buffMatch.highScoreWeighted >= this.matchScore; + }) && validSub && minMet && !updatedOpenSets.includes(o)) { updatedOpenSets.push(o); } else if(!updatedAllSets.includes(o)) { updatedAllSets.push(o); @@ -232,7 +269,7 @@ export class RepeatActivityRule extends Rule { let applicableGroupedActivities = identifierGroupedActivities; if (this.useSubmissionAsReference) { applicableGroupedActivities = new Map(); - let identifier = getActivityIdentifier(item); + let identifier = this.getActivityIdentifier(item); let referenceSubmissions = identifierGroupedActivities.get(identifier); if(referenceSubmissions === undefined && isExternalUrlSubmission(item)) { // if external url sub then try by title @@ -240,7 +277,7 @@ export class RepeatActivityRule extends Rule { referenceSubmissions = identifierGroupedActivities.get(identifier); if(referenceSubmissions === undefined) { // didn't get by title so go back to url since that's the default - identifier = getActivityIdentifier(item); + identifier = this.getActivityIdentifier(item); } } @@ -265,7 +302,7 @@ export class RepeatActivityRule extends Rule { }; for (let set of value) { const test = comparisonTextOp(set.length, operator, thresholdValue); - const md = 
set.map((x: (Comment | Submission)) => `[${asSubmission(x) ? x.title : getActivityIdentifier(x, 50)}](https://reddit.com${x.permalink}) in ${x.subreddit_name_prefixed} on ${dayjs(x.created_utc * 1000).utc().format()}`); + const md = set.map((x: (Comment | Submission)) => `[${asSubmission(x) ? x.title : this.getActivityIdentifier(x, 50)}](https://reddit.com${x.permalink}) in ${x.subreddit_name_prefixed} on ${dayjs(x.created_utc * 1000).utc().format()}`); summaryData.sets.push(set); summaryData.largestTrigger = Math.max(summaryData.largestTrigger, set.length); @@ -325,7 +362,7 @@ interface SummaryData { triggeringSetsMarkdown: string[] } -interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission { +interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission, TextMatchOptions { /** * The number of repeat submissions that will trigger the rule * @default ">= 5" @@ -383,18 +420,9 @@ interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission { keepRemoved?: boolean /** - * For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat - * - * EX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5` - * - * **For self-text submissions** -- title + body text - * - * **For comments* -- body text - * - * @default 1 - * @example [1] + * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined. 
* */ - minWordCount?: number, + transformations?: SearchAndReplaceRegExp[] } export interface RepeatActivityOptions extends RepeatActivityConfig, RuleOptions { diff --git a/src/Rule/RepostRule.ts b/src/Rule/RepostRule.ts index bdb594f..85e9118 100644 --- a/src/Rule/RepostRule.ts +++ b/src/Rule/RepostRule.ts @@ -18,7 +18,7 @@ import { RepostItem, RepostItemResult, SearchAndReplaceRegExp, - SearchFacetType, + SearchFacetType, TextMatchOptions, TextTransformOptions, } from "../Common/interfaces"; import objectHash from "object-hash"; import {getActivities, getAttributionIdentifier} from "../Utils/SnoowrapUtils"; @@ -30,59 +30,6 @@ import {rest} from "lodash"; const parseYtIdentifier = parseUsableLinkIdentifier(); -export interface TextMatchOptions { - /** - * The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match - * - * Note: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere - * - * Defaults to `85` (85%) - * - * @default 85 - * @example [85] - * */ - matchScore?: number - - /** - * The minimum number of words in the activity being checked for which this rule will run on - * - * If the word count is below the minimum the rule fails - * - * Defaults to 2 - * - * @default 2 - * @example [2] - * */ - minWordCount?: number - - /** - * Should text matching be case sensitive? - * - * Defaults to false - * - * @default false - * @example [false] - **/ - caseSensitive?: boolean -} - -export interface TextTransformOptions { - /** - * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined. 
- * - * * If `transformationsActivity` IS NOT defined then these transformations will be performed on BOTH the activity text (submission title or comment) AND the repost candidate text - * * If `transformationsActivity` IS defined then these transformations are only performed on repost candidate text - * */ - transformations?: SearchAndReplaceRegExp[] - - /** - * Specify a separate set of transformations for the activity text (submission title or comment) - * - * To perform no transformations when `transformations` is defined set this to an empty array (`[]`) - * */ - transformationsActivity?: SearchAndReplaceRegExp[] -} - export interface SearchFacetJSONConfig extends TextMatchOptions, TextTransformOptions, ActivityWindow { kind: SearchFacetType | SearchFacetType[] } diff --git a/src/Schema/App.json b/src/Schema/App.json index 774d643..81f7601 100644 --- a/src/Schema/App.json +++ b/src/Schema/App.json @@ -2667,6 +2667,11 @@ } ] }, + "caseSensitive": { + "default": false, + "description": "Should text matching be case sensitive?\n\nDefaults to false", + "type": "boolean" + }, "exclude": { "description": "If present, activities will be counted only if they are **NOT** found in this list of Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`", "examples": [ @@ -2757,9 +2762,14 @@ ], "type": "string" }, + "matchScore": { + "default": 85, + "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)", + "type": "number" + }, "minWordCount": { - "default": 1, - "description": "For activities that are text-based this is the 
minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text", + "default": 2, + "description": "The minimum number of words in the activity being checked for which this rule will run on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2", "type": "number" }, "name": { @@ -2775,6 +2785,13 @@ "description": "The number of repeat submissions that will trigger the rule", "type": "string" }, + "transformations": { + "description": "A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.", + "items": { + "$ref": "#/definitions/SearchAndReplaceRegExp" + }, + "type": "array" + }, "useSubmissionAsReference": { "default": true, "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.", diff --git a/src/Schema/OperatorConfig.json b/src/Schema/OperatorConfig.json index 94df7c7..ee27ceb 100644 --- a/src/Schema/OperatorConfig.json +++ b/src/Schema/OperatorConfig.json @@ -633,6 +633,9 @@ }, "file": { "allOf": [ + { + "$ref": "#/definitions/Omit" + }, { "properties": { "dirname": { @@ -658,9 +661,6 @@ } }, "type": "object" - }, - { - "$ref": "#/definitions/Omit" } ], "description": "Options for Rotating File logging" diff --git a/src/Schema/Rule.json b/src/Schema/Rule.json index e48fa20..830d9f5 100644 --- a/src/Schema/Rule.json +++ b/src/Schema/Rule.json @@ -1460,6 +1460,11 @@ } ] }, + "caseSensitive": { + "default": false, + "description": "Should text matching be case sensitive?\n\nDefaults to false", + "type": "boolean" + }, "exclude": { "description": "If present, activities will be counted only if they are **NOT** found in this list of 
Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`", "examples": [ @@ -1550,9 +1555,14 @@ ], "type": "string" }, + "matchScore": { + "default": 85, + "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)", + "type": "number" + }, "minWordCount": { - "default": 1, - "description": "For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text", + "default": 2, + "description": "The minimum number of words in the activity being checked for which this rule will run on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2", "type": "number" }, "name": { @@ -1568,6 +1578,13 @@ "description": "The number of repeat submissions that will trigger the rule", "type": "string" }, + "transformations": { + "description": "A set of search-and-replace operations to perform on text values before performing a match. 
Transformations are performed in the order they are defined.", + "items": { + "$ref": "#/definitions/SearchAndReplaceRegExp" + }, + "type": "array" + }, "useSubmissionAsReference": { "default": true, "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.", diff --git a/src/Schema/RuleSet.json b/src/Schema/RuleSet.json index 3e3769e..04f1482 100644 --- a/src/Schema/RuleSet.json +++ b/src/Schema/RuleSet.json @@ -1434,6 +1434,11 @@ } ] }, + "caseSensitive": { + "default": false, + "description": "Should text matching be case sensitive?\n\nDefaults to false", + "type": "boolean" + }, "exclude": { "description": "If present, activities will be counted only if they are **NOT** found in this list of Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`", "examples": [ @@ -1524,9 +1529,14 @@ ], "type": "string" }, + "matchScore": { + "default": 85, + "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)", + "type": "number" + }, "minWordCount": { - "default": 1, - "description": "For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text", + "default": 2, + "description": "The minimum number of words in the activity being checked for which this rule will run 
on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2", "type": "number" }, "name": { @@ -1542,6 +1552,13 @@ "description": "The number of repeat submissions that will trigger the rule", "type": "string" }, + "transformations": { + "description": "A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.", + "items": { + "$ref": "#/definitions/SearchAndReplaceRegExp" + }, + "type": "array" + }, "useSubmissionAsReference": { "default": true, "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.",