Add benchmark and changes for rapidfuzz (#752)

Co-authored-by: Pushpam Punjabi <pushpampunjabi@yahooo.com>
2026-01-09 14:18:06 -05:00 · 2022-04-10 23:36:33 +05:30
parent ec2bc492e6
commit f73ec35929
3 changed files with 58 additions and 4 deletions
--- a/Scripts/benchmark_distance.py
+++ b/Scripts/benchmark_distance.py
@@ -0,0 +1,54 @@
+"""
+Benchmark rapidfuzz and python-Levenshtein time
+
+Author:
+    Pushpam Punjabi
+    Machine Learning Engineer
+"""
+
+import random
+from datetime import datetime
+
+import numpy as np
+from Levenshtein import ratio
+from rapidfuzz import fuzz
+
+print("\nGenerating experiment...")
+
+# Create random sentences
+CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+lengths = np.arange(10, 1000)
+NUM_PAIRS = 10000
+
+# Initialize string lists
+x = []
+y = []
+
+# Generate random strings
+for _ in range(NUM_PAIRS):
+    x_len = random.choice(lengths)
+    y_len = random.choice(lengths)
+    temp_x = []
+    temp_y = []
+    for _ in range(x_len):
+        temp_x.append(random.choice(CHARS))
+    for _ in range(y_len):
+        temp_y.append(random.choice(CHARS))
+    x.append("".join(temp_x))
+    y.append("".join(temp_y))
+
+print("Generated experiment.\n\nRunning benchmark...")
+
+# Benchmark time for python-Levenshtein
+start = datetime.now()
+for sen_x, sen_y in zip(x, y):
+    value = ratio(sen_x, sen_y)
+end = datetime.now()
+print(f"\npython-Levenshtein time: {end - start}")
+
+# Benchmark time for rapidfuzz
+start = datetime.now()
+for sen_x, sen_y in zip(x, y):
+    value = fuzz.ratio(sen_x, sen_y) / 100
+end = datetime.now()
+print(f"rapidfuzz time: {end - start}\n")
--- a/Scripts/operations.py
+++ b/Scripts/operations.py
@@ -12,7 +12,7 @@ import unicodedata
 import time
 import itertools
 from datetime import datetime
-from Levenshtein import ratio
+from rapidfuzz import fuzz
 from googleapiclient.errors import HttpError

 ##########################################################################################
@@ -628,7 +628,7 @@ def check_duplicates(current, config, miscData, allVideoCommentsDict, videoID):
                matchedIndexes.append(i)
                matchedIndexes.append(j)
                break
-              elif ratio(x,y) > levenshtein:
+              elif fuzz.ratio(x,y) / 100 > levenshtein:
                # List the indexes of the matched comments in the list
                matchedIndexes.append(i)
                matchedIndexes.append(j)
@@ -703,7 +703,7 @@ def check_reposts(current, config, miscData, allVideoCommentsDict, videoID):
      for j in range(0,i-1): # Only need to check against comments that came before it, so have index less than current
        olderCommentText = flatCommentList[j]['commentText']
        if len(scrutinizedText) >= minLength and flatCommentList[j]['authorChannelID'] != scrutinizedAuthorID and x['commentID'] not in current.matchedCommentsDict and x['commentID'] not in current.duplicateCommentsDict:
-          if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and ratio(scrutinizedText, olderCommentText) > levenshtein):
+          if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and fuzz.ratio(scrutinizedText, olderCommentText) / 100 > levenshtein):
            # List the indexes of the matched comments in the list
            x['originalCommentID'] = flatCommentList[j]['commentID']
            add_spam(current, config, miscData, x, videoID, matchReason="Repost")
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,5 +5,5 @@ colorama==0.4.4
 rtfunicode==2.0
 certifi>=2021.10.8
 six>=1.16.0
-python-Levenshtein>=0.12.2
+rapidfuzz>=2.0.7
 regex>=2022.1.18