Add benchmark and changes for rapidfuzz (#752)

Co-authored-by: Pushpam Punjabi <pushpampunjabi@yahooo.com>
This commit is contained in:
Pushpam Punjabi
2022-04-10 23:36:33 +05:30
committed by GitHub
parent ec2bc492e6
commit f73ec35929
3 changed files with 58 additions and 4 deletions

View File

@@ -0,0 +1,54 @@
"""
Benchmark rapidfuzz and python-Levenshtein time
Author:
Pushpam Punjabi
Machine Learning Engineer
"""
import random
import time
from datetime import datetime, timedelta

import numpy as np
from Levenshtein import ratio
from rapidfuzz import fuzz
# Alphabet the random benchmark strings are drawn from
CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
# String lengths are drawn uniformly from [MIN_LENGTH, MAX_LENGTH)
MIN_LENGTH = 10
MAX_LENGTH = 1000
# Number of (x, y) string pairs scored by each library
NUM_PAIRS = 10000


def generate_pairs(num_pairs=NUM_PAIRS, min_length=MIN_LENGTH, max_length=MAX_LENGTH):
    """Return a list of ``num_pairs`` tuples of random strings.

    Each string is built from CHARS and has an independently chosen length
    in the half-open range [min_length, max_length).
    """

    def random_string():
        length = random.randrange(min_length, max_length)
        return "".join(random.choices(CHARS, k=length))

    return [(random_string(), random_string()) for _ in range(num_pairs)]


def format_elapsed(seconds):
    """Format a duration in seconds as ``H:MM:SS[.ffffff]``."""
    return str(timedelta(seconds=seconds))


def main():
    """Time python-Levenshtein and rapidfuzz on the same random string pairs."""
    print("\nGenerating experiment...")
    pairs = generate_pairs()
    print("Generated experiment.\n\nRunning benchmark...")

    # Benchmark time for python-Levenshtein.
    # perf_counter is monotonic and high-resolution, so the measurement is
    # not disturbed by system clock adjustments the way datetime.now() is.
    start = time.perf_counter()
    for sen_x, sen_y in pairs:
        _ = ratio(sen_x, sen_y)
    elapsed = time.perf_counter() - start
    print(f"\npython-Levenshtein time: {format_elapsed(elapsed)}")

    # Benchmark time for rapidfuzz.
    # The division by 100 is timed on purpose: it is part of the expression
    # being benchmarked as the replacement for ratio().
    start = time.perf_counter()
    for sen_x, sen_y in pairs:
        _ = fuzz.ratio(sen_x, sen_y) / 100
    elapsed = time.perf_counter() - start
    print(f"rapidfuzz time: {format_elapsed(elapsed)}\n")


if __name__ == "__main__":
    main()

View File

@@ -12,7 +12,7 @@ import unicodedata
import time
import itertools
from datetime import datetime
from Levenshtein import ratio
from rapidfuzz import fuzz
from googleapiclient.errors import HttpError
##########################################################################################
@@ -628,7 +628,7 @@ def check_duplicates(current, config, miscData, allVideoCommentsDict, videoID):
matchedIndexes.append(i)
matchedIndexes.append(j)
break
elif ratio(x,y) > levenshtein:
elif fuzz.ratio(x,y) / 100 > levenshtein:
# List the indexes of the matched comments in the list
matchedIndexes.append(i)
matchedIndexes.append(j)
@@ -703,7 +703,7 @@ def check_reposts(current, config, miscData, allVideoCommentsDict, videoID):
for j in range(0,i-1): # Only need to check against comments that came before it, so have index less than current
olderCommentText = flatCommentList[j]['commentText']
if len(scrutinizedText) >= minLength and flatCommentList[j]['authorChannelID'] != scrutinizedAuthorID and x['commentID'] not in current.matchedCommentsDict and x['commentID'] not in current.duplicateCommentsDict:
if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and ratio(scrutinizedText, olderCommentText) > levenshtein):
if (not fuzzy and scrutinizedText == olderCommentText) or (fuzzy and fuzz.ratio(scrutinizedText, olderCommentText) / 100 > levenshtein):
# List the indexes of the matched comments in the list
x['originalCommentID'] = flatCommentList[j]['commentID']
add_spam(current, config, miscData, x, videoID, matchReason="Repost")

View File

@@ -5,5 +5,5 @@ colorama==0.4.4
rtfunicode==2.0
certifi>=2021.10.8
six>=1.16.0
python-Levenshtein>=0.12.2
rapidfuzz>=2.0.7
regex>=2022.1.18