mirror of
https://github.com/ThioJoe/YT-Spammer-Purge.git
synced 2026-01-09 14:18:06 -05:00
Add benchmark and changes for rapidfuzz (#752)
Co-authored-by: Pushpam Punjabi <pushpampunjabi@yahooo.com>
This commit is contained in:
54
Scripts/benchmark_distance.py
Normal file
54
Scripts/benchmark_distance.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""
|
||||
Benchmark rapidfuzz and python-Levenshtein time
|
||||
|
||||
Author:
|
||||
Pushpam Punjabi
|
||||
Machine Learning Engineer
|
||||
"""
|
||||
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
from Levenshtein import ratio
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
# Imported here so this section is self-contained: perf_counter is a
# monotonic, high-resolution clock intended for measuring intervals, unlike
# datetime.now(), which follows the (adjustable) wall clock.
from datetime import timedelta
from time import perf_counter

print("\nGenerating experiment...")

# Alphabet, candidate lengths and number of string pairs for the experiment
CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
lengths = np.arange(10, 1000)
NUM_PAIRS = 10000

# x[i] and y[i] form the i-th pair of random strings to compare
x = []
y = []

# Generate random strings; each side of a pair gets its own random length
for _ in range(NUM_PAIRS):
    # int() turns the numpy integer picked from `lengths` into a plain int
    x_len = int(random.choice(lengths))
    y_len = int(random.choice(lengths))
    x.append("".join(random.choices(CHARS, k=x_len)))
    y.append("".join(random.choices(CHARS, k=y_len)))

print("Generated experiment.\n\nRunning benchmark...")

# Benchmark time for python-Levenshtein
start = perf_counter()
for sen_x, sen_y in zip(x, y):
    value = ratio(sen_x, sen_y)
end = perf_counter()
# timedelta keeps the H:MM:SS.ffffff output format of the report
print(f"\npython-Levenshtein time: {timedelta(seconds=end - start)}")

# Benchmark time for rapidfuzz
start = perf_counter()
for sen_x, sen_y in zip(x, y):
    # The division by 100 is part of the measured work on purpose: it is
    # included so the result is on the same 0-1 scale as ratio() above.
    value = fuzz.ratio(sen_x, sen_y) / 100
end = perf_counter()
print(f"rapidfuzz time: {timedelta(seconds=end - start)}\n")
|
||||
@@ -12,7 +12,7 @@ import unicodedata
|
||||
import time
|
||||
import itertools
|
||||
from datetime import datetime
|
||||
from Levenshtein import ratio
|
||||
from rapidfuzz import fuzz
|
||||
from googleapiclient.errors import HttpError
|
||||
|
||||
##########################################################################################
|
||||
@@ -628,7 +628,7 @@ def check_duplicates(current, config, miscData, allVideoCommentsDict, videoID):
|
||||
matchedIndexes.append(i)
|
||||
matchedIndexes.append(j)
|
||||
break
|
||||
elif ratio(x,y) > levenshtein:
|
||||
elif fuzz.ratio(x,y) / 100 > levenshtein:
|
||||
# List the indexes of the matched comments in the list
|
||||
matchedIndexes.append(i)
|
||||
matchedIndexes.append(j)
|
||||
@@ -703,7 +703,7 @@ def check_reposts(current, config, miscData, allVideoCommentsDict, videoID):
|
||||
for j in range(0,i-1): # Only need to check against comments that came before it, so have index less than current
|
||||
olderCommentText = flatCommentList[j]['commentText']
|
||||
if len(scrutinizedText) >= minLength and flatCommentList[j]['authorChannelID'] != scrutinizedAuthorID and x['commentID'] not in current.matchedCommentsDict and x['commentID'] not in current.duplicateCommentsDict:
|
||||
if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and ratio(scrutinizedText, olderCommentText) > levenshtein):
|
||||
if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and fuzz.ratio(scrutinizedText, olderCommentText) / 100 > levenshtein):
|
||||
# List the indexes of the matched comments in the list
|
||||
x['originalCommentID'] = flatCommentList[j]['commentID']
|
||||
add_spam(current, config, miscData, x, videoID, matchReason="Repost")
|
||||
|
||||
@@ -5,5 +5,5 @@ colorama==0.4.4
|
||||
rtfunicode==2.0
|
||||
certifi>=2021.10.8
|
||||
six>=1.16.0
|
||||
python-Levenshtein>=0.12.2
|
||||
rapidfuzz>=2.0.7
|
||||
regex>=2022.1.18
|
||||
|
||||
Reference in New Issue
Block a user