From f8967d55c4e148b458b08ce29f9784fd61394f99 Mon Sep 17 00:00:00 2001
From: FoxxMD <FoxxMD@users.noreply.github.com>
Date: Mon, 31 Jan 2022 14:08:21 -0500
Subject: [PATCH] feat(repeat): Use newer text comparison technique to improve
 repeat detection

* Use same technique as repost rule which has high accuracy and fewer false positives
* Implement ability to set similarity score, case sensitivity, and text transformations
---
 src/Common/interfaces.ts       |  53 +++++++++++++
 src/Rule/RepeatActivityRule.ts | 134 ++++++++++++++++++++-------------
 src/Rule/RepostRule.ts         |  55 +-------------
 src/Schema/App.json            |  21 +++++-
 src/Schema/OperatorConfig.json |   6 +-
 src/Schema/Rule.json           |  21 +++++-
 src/Schema/RuleSet.json        |  21 +++++-
 7 files changed, 195 insertions(+), 116 deletions(-)
diff --git a/src/Common/interfaces.ts b/src/Common/interfaces.ts
index 24cf188..10ff9e9 100644
--- a/src/Common/interfaces.ts
+++ b/src/Common/interfaces.ts
@@ -2100,3 +2100,56 @@ export interface FilterResult<T> {
     join: JoinOperands
     passed: boolean
 }
+
+export interface TextTransformOptions {
+    /**
+     * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.
+     *
+     * * If `transformationsActivity` IS NOT defined then these transformations will be performed on BOTH the activity text (submission title or comment) AND the repost candidate text
+     * * If `transformationsActivity` IS defined then these transformations are only performed on repost candidate text
+     * */
+    transformations?: SearchAndReplaceRegExp[]
+
+    /**
+     * Specify a separate set of transformations for the activity text (submission title or comment)
+     *
+     * To perform no transformations when `transformations` is defined set this to an empty array (`[]`)
+     * */
+    transformationsActivity?: SearchAndReplaceRegExp[]
+}
+
+export interface TextMatchOptions {
+    /**
+     * The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match
+     *
+     * Note: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere
+     *
+     * Defaults to `85` (85%)
+     *
+     * @default 85
+     * @example [85]
+     * */
+    matchScore?: number
+
+    /**
+     * The minimum number of words in the activity being checked for which this rule will run on
+     *
+     * If the word count is below the minimum the rule fails
+     *
+     * Defaults to 2
+     *
+     * @default 2
+     * @example [2]
+     * */
+    minWordCount?: number
+
+    /**
+     * Should text matching be case sensitive?
+     *
+     * Defaults to false
+     *
+     * @default false
+     * @example [false]
+     **/
+    caseSensitive?: boolean
+}
diff --git a/src/Rule/RepeatActivityRule.ts b/src/Rule/RepeatActivityRule.ts
index 8de44ef..34db8c7 100644
--- a/src/Rule/RepeatActivityRule.ts
+++ b/src/Rule/RepeatActivityRule.ts
@@ -1,17 +1,28 @@
 import {Rule, RuleJSONConfig, RuleOptions, RuleResult} from "./index";
 import {Comment} from "snoowrap";
 import {
-    activityWindowText, asSubmission,
-    comparisonTextOp, FAIL, getActivitySubredditName, isExternalUrlSubmission, isRedditMedia,
-    parseGenericValueComparison, parseSubredditName,
-    parseUsableLinkIdentifier as linkParser, PASS, subredditStateIsNameOnly, toStrongSubredditState
+    activityWindowText,
+    asSubmission,
+    comparisonTextOp,
+    FAIL,
+    getActivitySubredditName,
+    isExternalUrlSubmission,
+    isRedditMedia,
+    parseGenericValueComparison,
+    parseSubredditName,
+    parseUsableLinkIdentifier as linkParser,
+    PASS,
+    searchAndReplace,
+    stringSameness,
+    subredditStateIsNameOnly,
+    toStrongSubredditState
 } from "../util";
 import {
     ActivityWindow,
     ActivityWindowType,
-    ReferenceSubmission,
+    ReferenceSubmission, SearchAndReplaceRegExp,
     StrongSubredditState,
-    SubredditState
+    SubredditState, TextMatchOptions, TextTransformOptions
 } from "../Common/interfaces";
 import Submission from "snoowrap/dist/objects/Submission";
 import dayjs from "dayjs";
@@ -29,27 +40,6 @@ interface RepeatActivityReducer {
     allSets: RepeatActivityData[]
 }
 
-const getActivityIdentifier = (activity: (Submission | Comment), length = 200) => {
-    let identifier: string;
-    if (asSubmission(activity)) {
-        if (activity.is_self) {
-            identifier = `${activity.title}${activity.selftext.slice(0, length)}`;
-        } else if(isRedditMedia(activity)) {
-            identifier = activity.title;
-        } else {
-            identifier = parseUsableLinkIdentifier(activity.url) as string;
-        }
-    } else {
-        identifier = activity.body.slice(0, length);
-    }
-    return identifier;
-}
-
-const fuzzyOptions = {
-    includeScore: true,
-    distance: 15
-};
-
 export class RepeatActivityRule extends Rule {
     threshold: string;
     window: ActivityWindowType;
@@ -62,6 +52,9 @@ export class RepeatActivityRule extends Rule {
     activityFilterFunc: (x: Submission|Comment) => Promise<boolean> = async (x) => true;
     keepRemoved: boolean;
     minWordCount: number;
+    transformations: SearchAndReplaceRegExp[]
+    caseSensitive: boolean
+    matchScore: number
 
     constructor(options: RepeatActivityOptions) {
         super(options);
@@ -75,7 +68,13 @@ export class RepeatActivityRule extends Rule {
             include = [],
             exclude = [],
             keepRemoved = false,
+            transformations = [],
+            caseSensitive = true,
+            matchScore = 85,
         } = options;
+        this.matchScore = matchScore;
+        this.transformations = transformations;
+        this.caseSensitive = caseSensitive;
         this.minWordCount = minWordCount;
         this.keepRemoved = keepRemoved;
         this.threshold = threshold;
@@ -136,6 +135,37 @@ export class RepeatActivityRule extends Rule {
         }
     }
 
+    getActivityIdentifier(activity: (Submission | Comment), length = 200, transform = true) {
+        let identifier: string;
+        if (asSubmission(activity)) {
+            if (activity.is_self) {
+                identifier = `${activity.title}${activity.selftext.slice(0, length)}`;
+            } else if(isRedditMedia(activity)) {
+                identifier = activity.title;
+            } else {
+                identifier = parseUsableLinkIdentifier(activity.url) as string;
+            }
+        } else {
+            identifier = activity.body.slice(0, length);
+        }
+
+        if(!transform) {
+            return identifier;
+        }
+
+        // apply any transforms
+        if (this.transformations.length > 0) {
+            identifier = searchAndReplace(identifier, this.transformations);
+        }
+
+        // perform after transformations so as not to mess up regex's depending on case
+        if(!this.caseSensitive) {
+            identifier = identifier.toLowerCase();
+        }
+
+        return identifier;
+    }
+
     async process(item: Submission|Comment): Promise<[boolean, RuleResult]> {
         let referenceUrl;
         if(asSubmission(item) && this.useSubmissionAsReference) {
@@ -162,9 +192,10 @@ export class RepeatActivityRule extends Rule {
             const acc = await accProm;
             const {openSets = [], allSets = []} = acc;
 
-            let identifier = getActivityIdentifier(activity);
+            let identifier = this.getActivityIdentifier(activity);
+
             const isUrl = isExternalUrlSubmission(activity);
-            let fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5});
+            //let fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5});
             const validSub = await this.activityFilterFunc(activity);
             let minMet = identifier.length >= this.minWordCount;
 
@@ -174,12 +205,15 @@ export class RepeatActivityRule extends Rule {
             let currIdentifierInOpen = false;
             const bufferedActivities = this.gapAllowance === undefined || this.gapAllowance === 0 ? [] : activities.slice(Math.max(0, index - this.gapAllowance), Math.max(0, index));
             for (const o of openSets) {
-                const res = fu.search(o.identifier);
-                const match = res.length > 0;
-                if (match && validSub && minMet) {
+                const strMatchResults = stringSameness(o.identifier, identifier);
+                if (strMatchResults.highScoreWeighted >= this.matchScore && minMet) {
                     updatedOpenSets.push({...o, sets: [...o.sets, activity]});
                     currIdentifierInOpen = true;
-                } else if (bufferedActivities.some(x => fu.search(getActivityIdentifier(x)).length > 0) && validSub && minMet) {
+                } else if (bufferedActivities.some(x => {
+                    let buffIdentifier = this.getActivityIdentifier(x);
+                    const buffMatch = stringSameness(identifier, buffIdentifier);
+                    return buffMatch.highScoreWeighted >= this.matchScore;
+                }) && validSub && minMet) {
                     updatedOpenSets.push(o);
                 } else if(!currIdentifierInOpen && !isUrl) {
                     updatedAllSets.push(o);
@@ -193,15 +227,18 @@ export class RepeatActivityRule extends Rule {
                     // could be that a spammer is using different URLs for each submission but similar submission titles so search by title as well
                     const sub = activity as Submission;
                     identifier = sub.title;
-                    fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5});
+                    //fu = new Fuse([identifier], !isUrl ? fuzzyOptions : {...fuzzyOptions, distance: 5});
                     minMet = identifier.length >= this.minWordCount;
                     for (const o of openSets) {
-                        const res = fu.search(o.identifier);
-                        const match = res.length > 0;
-                        if (match && validSub && minMet) {
+                        const strMatchResults = stringSameness(o.identifier, identifier);
+                        if (strMatchResults.highScoreWeighted >= this.matchScore && minMet) {
                             updatedOpenSets.push({...o, sets: [...o.sets, activity]});
                             currIdentifierInOpen = true;
-                        } else if (bufferedActivities.some(x => fu.search(getActivityIdentifier(x)).length > 0) && validSub && minMet && !updatedOpenSets.includes(o)) {
+                        } else if (bufferedActivities.some(x => {
+                            let buffIdentifier = this.getActivityIdentifier(x);
+                            const buffMatch = stringSameness(identifier, buffIdentifier);
+                            return buffMatch.highScoreWeighted >= this.matchScore;
+                        }) && validSub && minMet && !updatedOpenSets.includes(o)) {
                             updatedOpenSets.push(o);
                         } else if(!updatedAllSets.includes(o)) {
                             updatedAllSets.push(o);
@@ -232,7 +269,7 @@ export class RepeatActivityRule extends Rule {
         let applicableGroupedActivities = identifierGroupedActivities;
         if (this.useSubmissionAsReference) {
             applicableGroupedActivities = new Map();
-            let identifier = getActivityIdentifier(item);
+            let identifier = this.getActivityIdentifier(item);
             let referenceSubmissions = identifierGroupedActivities.get(identifier);
             if(referenceSubmissions === undefined && isExternalUrlSubmission(item)) {
                 // if external url sub then try by title
@@ -240,7 +277,7 @@ export class RepeatActivityRule extends Rule {
                 referenceSubmissions = identifierGroupedActivities.get(identifier);
                 if(referenceSubmissions === undefined) {
                     // didn't get by title so go back to url since that's the default
-                    identifier = getActivityIdentifier(item);
+                    identifier = this.getActivityIdentifier(item);
                 }
             }
 
@@ -265,7 +302,7 @@ export class RepeatActivityRule extends Rule {
             };
             for (let set of value) {
                 const test = comparisonTextOp(set.length, operator, thresholdValue);
-                const md = set.map((x: (Comment | Submission)) => `[${asSubmission(x) ? x.title : getActivityIdentifier(x, 50)}](https://reddit.com${x.permalink}) in ${x.subreddit_name_prefixed} on ${dayjs(x.created_utc * 1000).utc().format()}`);
+                const md = set.map((x: (Comment | Submission)) => `[${asSubmission(x) ? x.title : this.getActivityIdentifier(x, 50)}](https://reddit.com${x.permalink}) in ${x.subreddit_name_prefixed} on ${dayjs(x.created_utc * 1000).utc().format()}`);
 
                 summaryData.sets.push(set);
                 summaryData.largestTrigger = Math.max(summaryData.largestTrigger, set.length);
@@ -325,7 +362,7 @@ interface SummaryData {
     triggeringSetsMarkdown: string[]
 }
 
-interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission {
+interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission, TextMatchOptions {
     /**
      * The number of repeat submissions that will trigger the rule
      * @default ">= 5"
@@ -383,18 +420,9 @@ interface RepeatActivityConfig extends ActivityWindow, ReferenceSubmission {
     keepRemoved?: boolean
 
     /**
-     * For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat
-     *
-     * EX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`
-     *
-     * **For self-text submissions** -- title + body text
-     *
-     * **For comments* -- body text
-     *
-     * @default 1
-     * @example [1]
+     * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.
      * */
-    minWordCount?: number,
+    transformations?: SearchAndReplaceRegExp[]
 }
 
 export interface RepeatActivityOptions extends RepeatActivityConfig, RuleOptions {
diff --git a/src/Rule/RepostRule.ts b/src/Rule/RepostRule.ts
index bdb594f..85e9118 100644
--- a/src/Rule/RepostRule.ts
+++ b/src/Rule/RepostRule.ts
@@ -18,7 +18,7 @@ import {
     RepostItem,
     RepostItemResult,
     SearchAndReplaceRegExp,
-    SearchFacetType,
+    SearchFacetType, TextMatchOptions, TextTransformOptions,
 } from "../Common/interfaces";
 import objectHash from "object-hash";
 import {getActivities, getAttributionIdentifier} from "../Utils/SnoowrapUtils";
@@ -30,59 +30,6 @@ import {rest} from "lodash";
 
 const parseYtIdentifier = parseUsableLinkIdentifier();
 
-export interface TextMatchOptions {
-    /**
-     * The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match
-     *
-     * Note: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere
-     *
-     * Defaults to `85` (85%)
-     *
-     * @default 85
-     * @example [85]
-     * */
-    matchScore?: number
-
-    /**
-     * The minimum number of words in the activity being checked for which this rule will run on
-     *
-     * If the word count is below the minimum the rule fails
-     *
-     * Defaults to 2
-     *
-     * @default 2
-     * @example [2]
-     * */
-    minWordCount?: number
-
-    /**
-     * Should text matching be case sensitive?
-     *
-     * Defaults to false
-     *
-     * @default false
-     * @example [false]
-     **/
-    caseSensitive?: boolean
-}
-
-export interface TextTransformOptions {
-    /**
-     * A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.
-     *
-     * * If `transformationsActivity` IS NOT defined then these transformations will be performed on BOTH the activity text (submission title or comment) AND the repost candidate text
-     * * If `transformationsActivity` IS defined then these transformations are only performed on repost candidate text
-     * */
-    transformations?: SearchAndReplaceRegExp[]
-
-    /**
-     * Specify a separate set of transformations for the activity text (submission title or comment)
-     *
-     * To perform no transformations when `transformations` is defined set this to an empty array (`[]`)
-     * */
-    transformationsActivity?: SearchAndReplaceRegExp[]
-}
-
 export interface SearchFacetJSONConfig extends TextMatchOptions, TextTransformOptions, ActivityWindow {
     kind: SearchFacetType | SearchFacetType[]
 }
diff --git a/src/Schema/App.json b/src/Schema/App.json
index 774d643..81f7601 100644
--- a/src/Schema/App.json
+++ b/src/Schema/App.json
@@ -2667,6 +2667,11 @@
                         }
                     ]
                 },
+                "caseSensitive": {
+                    "default": false,
+                    "description": "Should text matching be case sensitive?\n\nDefaults to false",
+                    "type": "boolean"
+                },
                 "exclude": {
                     "description": "If present, activities will be counted only if they are **NOT** found in this list of Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`",
                     "examples": [
@@ -2757,9 +2762,14 @@
                     ],
                     "type": "string"
                 },
+                "matchScore": {
+                    "default": 85,
+                    "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)",
+                    "type": "number"
+                },
                 "minWordCount": {
-                    "default": 1,
-                    "description": "For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text",
+                    "default": 2,
+                    "description": "The minimum number of words in the activity being checked for which this rule will run on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2",
                     "type": "number"
                 },
                 "name": {
@@ -2775,6 +2785,13 @@
                     "description": "The number of repeat submissions that will trigger the rule",
                     "type": "string"
                 },
+                "transformations": {
+                    "description": "A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.",
+                    "items": {
+                        "$ref": "#/definitions/SearchAndReplaceRegExp"
+                    },
+                    "type": "array"
+                },
                 "useSubmissionAsReference": {
                     "default": true,
                     "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.",
diff --git a/src/Schema/OperatorConfig.json b/src/Schema/OperatorConfig.json
index 94df7c7..ee27ceb 100644
--- a/src/Schema/OperatorConfig.json
+++ b/src/Schema/OperatorConfig.json
@@ -633,6 +633,9 @@
                 },
                 "file": {
                     "allOf": [
+                        {
+                            "$ref": "#/definitions/Omit<DailyRotateFileTransportOptions,\"stream\"|\"dirname\"|\"options\"|\"handleRejections\"|\"format\"|\"handleExceptions\"|\"log\"|\"logv\"|\"close\">"
+                        },
                         {
                             "properties": {
                                 "dirname": {
@@ -658,9 +661,6 @@
                                 }
                             },
                             "type": "object"
-                        },
-                        {
-                            "$ref": "#/definitions/Omit<DailyRotateFileTransportOptions,\"stream\"|\"dirname\"|\"options\"|\"handleRejections\"|\"format\"|\"handleExceptions\"|\"log\"|\"logv\"|\"close\">"
                         }
                     ],
                     "description": "Options for Rotating File logging"
diff --git a/src/Schema/Rule.json b/src/Schema/Rule.json
index e48fa20..830d9f5 100644
--- a/src/Schema/Rule.json
+++ b/src/Schema/Rule.json
@@ -1460,6 +1460,11 @@
                         }
                     ]
                 },
+                "caseSensitive": {
+                    "default": false,
+                    "description": "Should text matching be case sensitive?\n\nDefaults to false",
+                    "type": "boolean"
+                },
                 "exclude": {
                     "description": "If present, activities will be counted only if they are **NOT** found in this list of Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`",
                     "examples": [
@@ -1550,9 +1555,14 @@
                     ],
                     "type": "string"
                 },
+                "matchScore": {
+                    "default": 85,
+                    "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)",
+                    "type": "number"
+                },
                 "minWordCount": {
-                    "default": 1,
-                    "description": "For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text",
+                    "default": 2,
+                    "description": "The minimum number of words in the activity being checked for which this rule will run on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2",
                     "type": "number"
                 },
                 "name": {
@@ -1568,6 +1578,13 @@
                     "description": "The number of repeat submissions that will trigger the rule",
                     "type": "string"
                 },
+                "transformations": {
+                    "description": "A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.",
+                    "items": {
+                        "$ref": "#/definitions/SearchAndReplaceRegExp"
+                    },
+                    "type": "array"
+                },
                 "useSubmissionAsReference": {
                     "default": true,
                     "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.",
diff --git a/src/Schema/RuleSet.json b/src/Schema/RuleSet.json
index 3e3769e..04f1482 100644
--- a/src/Schema/RuleSet.json
+++ b/src/Schema/RuleSet.json
@@ -1434,6 +1434,11 @@
                         }
                     ]
                 },
+                "caseSensitive": {
+                    "default": false,
+                    "description": "Should text matching be case sensitive?\n\nDefaults to false",
+                    "type": "boolean"
+                },
                 "exclude": {
                     "description": "If present, activities will be counted only if they are **NOT** found in this list of Subreddits\n\nEach value in the list can be either:\n\n * string (name of subreddit)\n * regular expression to run on the subreddit name\n * `SubredditState`\n\nEX `[\"mealtimevideos\",\"askscience\", \"/onlyfans*\\/i\", {\"over18\": true}]`",
                     "examples": [
@@ -1524,9 +1529,14 @@
                     ],
                     "type": "string"
                 },
+                "matchScore": {
+                    "default": 85,
+                    "description": "The percentage, as a whole number, of a repost title/comment that must match the title/comment being checked in order to consider both a match\n\nNote: Setting to 0 will make every candidate considered a match -- useful if you want to match if the URL has been reposted anywhere\n\nDefaults to `85` (85%)",
+                    "type": "number"
+                },
                 "minWordCount": {
-                    "default": 1,
-                    "description": "For activities that are text-based this is the minimum number of words required for the activity to be considered for a repeat\n\nEX if `minimumWordCount=5` and a comment is `what about you` then it is ignored because `3 is less than 5`\n\n**For self-text submissions** -- title + body text\n\n**For comments* -- body text",
+                    "default": 2,
+                    "description": "The minimum number of words in the activity being checked for which this rule will run on\n\nIf the word count is below the minimum the rule fails\n\nDefaults to 2",
                     "type": "number"
                 },
                 "name": {
@@ -1542,6 +1552,13 @@
                     "description": "The number of repeat submissions that will trigger the rule",
                     "type": "string"
                 },
+                "transformations": {
+                    "description": "A set of search-and-replace operations to perform on text values before performing a match. Transformations are performed in the order they are defined.",
+                    "items": {
+                        "$ref": "#/definitions/SearchAndReplaceRegExp"
+                    },
+                    "type": "array"
+                },
                 "useSubmissionAsReference": {
                     "default": true,
                     "description": "If activity is a Submission and is a link (not self-post) then only look at Submissions that contain this link, otherwise consider all activities.",