feat(image): Implement perceptual hashing image comparison

* Generate perceptual hashes using blockhash-js of images that can be cached/stored
* Take advantage of reddit thumbnail code ImageData to hash lower-res to begin with (but represent full url)
* Refactor imageDetection config so hash and pixel approaches have different configs
* Cache phash results to reduce reddit traffic and speed up performance

Addresses cpu/memory issues with pixel comparison. Allow pixel for finer comparisons if needed using tiered thresholds. Closes #26
This commit is contained in:
FoxxMD
2021-10-11 15:28:48 -04:00
parent 4b3bea661d
commit 463a4dc0eb
11 changed files with 684 additions and 23 deletions

14
package-lock.json generated
View File

@@ -38,6 +38,7 @@
"js-yaml": "^4.1.0",
"json5": "^2.2.0",
"jsonwebtoken": "^8.5.1",
"leven": "^3.1.0",
"lodash": "^4.17.21",
"lru-cache": "^6.0.0",
"monaco-editor": "^0.27.0",
@@ -2764,6 +2765,14 @@
"resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
"integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A=="
},
"node_modules/leven": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz",
"integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==",
"engines": {
"node": ">=6"
}
},
"node_modules/lodash": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
@@ -7046,6 +7055,11 @@
"resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
"integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A=="
},
"leven": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz",
"integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A=="
},
"lodash": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",

View File

@@ -54,6 +54,7 @@
"js-yaml": "^4.1.0",
"json5": "^2.2.0",
"jsonwebtoken": "^8.5.1",
"leven": "^3.1.0",
"lodash": "^4.17.21",
"lru-cache": "^6.0.0",
"monaco-editor": "^0.27.0",

View File

@@ -5,6 +5,7 @@ import {absPercentDifference, getSharpAsync, isValidImageURL} from "../util";
import sizeOf from "image-size";
import SimpleError from "../Utils/SimpleError";
import {Sharp} from "sharp";
import {blockhash} from "./blockhash/blockhash";
export interface ImageDataOptions {
width?: number,
@@ -21,6 +22,7 @@ class ImageData {
variants: ImageData[] = []
preferredResolution?: [number, number]
sharpImg!: Sharp
hashResult!: string
actualResolution?: [number, number]
constructor(data: ImageDataOptions, aggressive = false) {
@@ -38,6 +40,20 @@ class ImageData {
return await (await this.sharp()).clone().toFormat(format).toBuffer();
}
async hash(bits: number, useVariantIfPossible = true): Promise<string> {
if(this.hashResult === undefined) {
let ref: ImageData | undefined;
if(useVariantIfPossible && this.preferredResolution !== undefined) {
ref = this.getSimilarResolutionVariant(this.preferredResolution[0], this.preferredResolution[1]);
}
if(ref === undefined) {
ref = this;
}
this.hashResult = await blockhash((await ref.sharp()).clone(), bits);
}
return this.hashResult;
}
async sharp(): Promise<Sharp> {
if (this.sharpImg === undefined) {
try {
@@ -92,6 +108,10 @@ class ImageData {
return this.width !== undefined && this.height !== undefined;
}
// URL stripped of query string and fragment (origin + pathname).
// Used as a stable identity for the image (e.g. the imgHash cache key) —
// presumably so links differing only in query params map to the same
// image; verify against callers.
get baseUrl() {
    return `${this.url.origin}${this.url.pathname}`;
}
setPreferredResolutionByWidth(prefWidth: number) {
let height: number | undefined = undefined,
width: number | undefined = undefined;

View File

@@ -0,0 +1,234 @@
// Perceptual image hash calculation tool based on algorithm described in
// Block Mean Value Based Image Perceptual Hashing by Bian Yang, Fan Gu and Xiamu Niu
//
// Copyright 2014 Commons Machinery http://commonsmachinery.se/
// Distributed under an MIT license, please see LICENSE in the top dir.
// https://github.com/commonsmachinery/blockhash-js/blob/master/index.js
import {Sharp} from "sharp";
// Raw decoded pixel data plus dimensions. `data` is RGBA (4 bytes per
// pixel) as produced by sharp's ensureAlpha().raw() — the hashing
// functions below index it as (y * width + x) * 4 and read byte +3 as alpha.
interface BlockImageData {
    data: Buffer,
    width: number,
    height: number
}
// Number of set bits in each 4-bit value 0-15; used to count differing
// bits between two hex digits via XOR.
const one_bits = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];

/**
 * Calculate the hamming distance for two hashes in hex format.
 *
 * @param hash1 hex-encoded hash
 * @param hash2 hex-encoded hash, must be the same length as hash1
 * @returns number of bits that differ between the two hashes
 * @throws Error if lengths differ or a non-hex character is encountered
 *         (previously a bad character silently produced NaN)
 */
export const hammingDistance = (hash1: string, hash2: string): number => {
    if (hash1.length !== hash2.length) {
        throw new Error("Can't compare hashes with different length");
    }
    let distance = 0;
    for (let i = 0; i < hash1.length; i++) {
        const n1 = parseInt(hash1[i], 16);
        const n2 = parseInt(hash2[i], 16);
        if (Number.isNaN(n1) || Number.isNaN(n2)) {
            throw new Error(`Hashes must be hex strings (bad character at index ${i})`);
        }
        distance += one_bits[n1 ^ n2];
    }
    return distance;
};
/**
 * Median of a numeric array; averages the two middle values for
 * even-length input. Does not mutate the input (sorts a copy).
 */
const median = (data: number[]): number => {
    const sorted = data.slice().sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    if (sorted.length % 2 === 0) {
        return (sorted[mid - 1] + sorted[mid]) / 2.0;
    }
    return sorted[mid];
};

/**
 * Convert block brightness totals into 0/1 bits, in place, by comparing
 * each block against the median of its horizontal band.
 *
 * @param blocks flattened block totals; length must be divisible by 4
 * @param pixels_per_block pixel count per block, used to judge whether a
 *        band's median sits in the upper or lower half of the value space
 */
const translate_blocks_to_bits = (blocks: number[], pixels_per_block: number): void => {
    const half_block_value = pixels_per_block * 256 * 3 / 2;
    const bandsize = blocks.length / 4;

    // Compare medians across four horizontal bands
    for (let i = 0; i < 4; i++) {
        const m = median(blocks.slice(i * bandsize, (i + 1) * bandsize));
        for (let j = i * bandsize; j < (i + 1) * bandsize; j++) {
            const v = blocks[j];

            // Output a 1 if the block is brighter than the median.
            // With images dominated by black or white, the median may
            // end up being 0 or the max value, and thus having a lot
            // of blocks of value equal to the median. To avoid
            // generating hashes of all zeros or ones, in that case output
            // 0 if the median is in the lower value space, 1 otherwise
            blocks[j] = Number(v > m || (Math.abs(v - m) < 1 && m > half_block_value));
        }
    }
};
/**
 * Encode an array of 0/1 bit values as a lowercase hex string,
 * consuming four bits per hex digit (length should be a multiple of 4).
 */
var bits_to_hexhash = function(bitsArray: number[]) {
    const digits: string[] = [];
    for (let i = 0; i < bitsArray.length; i += 4) {
        const nibble = bitsArray.slice(i, i + 4).join('');
        digits.push(parseInt(nibble, 2).toString(16));
    }
    return digits.join('');
};
/**
 * Block-mean hash for images whose dimensions divide evenly by `bits`:
 * sums the RGB values of every (blockWidth x blockHeight) block, then maps
 * the block totals to bits (translate_blocks_to_bits) and hex-encodes them.
 */
var bmvbhash_even = function(data: BlockImageData, bits: number) {
    const blockWidth = Math.floor(data.width / bits);
    const blockHeight = Math.floor(data.height / bits);
    const blockTotals: number[] = [];

    for (let by = 0; by < bits; by++) {
        for (let bx = 0; bx < bits; bx++) {
            let sum = 0;
            for (let py = by * blockHeight; py < (by + 1) * blockHeight; py++) {
                for (let px = bx * blockWidth; px < (bx + 1) * blockWidth; px++) {
                    const idx = (py * data.width + px) * 4;
                    // Fully transparent pixels count as white (255 * 3)
                    sum += data.data[idx + 3] === 0
                        ? 765
                        : data.data[idx] + data.data[idx + 1] + data.data[idx + 2];
                }
            }
            blockTotals.push(sum);
        }
    }

    translate_blocks_to_bits(blockTotals, blockWidth * blockHeight);
    return bits_to_hexhash(blockTotals);
};
/**
 * General block-mean value hash (the "method 2" of blockhash-js). Unlike
 * bmvbhash_even this handles dimensions that do NOT divide evenly by `bits`:
 * each pixel's brightness is distributed across the (up to four) blocks it
 * overlaps, weighted by its fractional overlap in x and y.
 *
 * NOTE(review): ported from commonsmachinery/blockhash-js; the fractional
 * weight math is order-sensitive, so the body is kept verbatim.
 */
var bmvbhash = function(data: BlockImageData, bits: number) {
    var result = [];

    var i, j, x, y;
    var block_width, block_height;
    var weight_top, weight_bottom, weight_left, weight_right;
    var block_top, block_bottom, block_left, block_right;
    var y_mod, y_frac, y_int;
    var x_mod, x_frac, x_int;
    var blocks: number[][] = [];

    var even_x = data.width % bits === 0;
    var even_y = data.height % bits === 0;

    // Fast path: evenly divisible dimensions need no fractional weighting
    if (even_x && even_y) {
        return bmvbhash_even(data, bits);
    }

    // initialize blocks array with 0s
    for (i = 0; i < bits; i++) {
        blocks.push([]);
        for (j = 0; j < bits; j++) {
            blocks[i].push(0);
        }
    }

    // Fractional block dimensions (non-integer when not evenly divisible)
    block_width = data.width / bits;
    block_height = data.height / bits;

    for (y = 0; y < data.height; y++) {
        if (even_y) {
            // don't bother dividing y, if the size evenly divides by bits
            block_top = block_bottom = Math.floor(y / block_height);
            weight_top = 1;
            weight_bottom = 0;
        } else {
            // Split this pixel row's contribution between the block above
            // (weight_top) and below (weight_bottom) by fractional overlap
            y_mod = (y + 1) % block_height;
            y_frac = y_mod - Math.floor(y_mod);
            y_int = y_mod - y_frac;

            weight_top = (1 - y_frac);
            weight_bottom = (y_frac);

            // y_int will be 0 on bottom/right borders and on block boundaries
            if (y_int > 0 || (y + 1) === data.height) {
                block_top = block_bottom = Math.floor(y / block_height);
            } else {
                block_top = Math.floor(y / block_height);
                block_bottom = Math.ceil(y / block_height);
            }
        }

        for (x = 0; x < data.width; x++) {
            var ii = (y * data.width + x) * 4;

            // Sum of R+G+B; fully transparent pixels count as white (255*3)
            var avgvalue, alpha = data.data[ii+3];
            if (alpha === 0) {
                avgvalue = 765;
            } else {
                avgvalue = data.data[ii] + data.data[ii+1] + data.data[ii+2];
            }

            if (even_x) {
                block_left = block_right = Math.floor(x / block_width);
                weight_left = 1;
                weight_right = 0;
            } else {
                // Same fractional split as above, along the x axis
                x_mod = (x + 1) % block_width;
                x_frac = x_mod - Math.floor(x_mod);
                x_int = x_mod - x_frac;

                weight_left = (1 - x_frac);
                weight_right = x_frac;

                // x_int will be 0 on bottom/right borders and on block boundaries
                if (x_int > 0 || (x + 1) === data.width) {
                    block_left = block_right = Math.floor(x / block_width);
                } else {
                    block_left = Math.floor(x / block_width);
                    block_right = Math.ceil(x / block_width);
                }
            }

            // add weighted pixel value to relevant blocks
            blocks[block_top][block_left] += avgvalue * weight_top * weight_left;
            blocks[block_top][block_right] += avgvalue * weight_top * weight_right;
            blocks[block_bottom][block_left] += avgvalue * weight_bottom * weight_left;
            blocks[block_bottom][block_right] += avgvalue * weight_bottom * weight_right;
        }
    }

    // Flatten the 2D block grid row-major, then convert totals to bits/hex
    for (i = 0; i < bits; i++) {
        for (j = 0; j < bits; j++) {
            result.push(blocks[i][j]);
        }
    }

    translate_blocks_to_bits(result, block_width * block_height);
    return bits_to_hexhash(result);
};
/**
 * Dispatch to a hashing method: 1 = even-block only (bmvbhash_even),
 * 2 = general (bmvbhash, handles non-evenly-divisible dimensions).
 * Throws on any other method id.
 */
var blockhashData = function(imgData: BlockImageData, bits: number, method: number) {
    switch (method) {
        case 1:
            return bmvbhash_even(imgData, bits);
        case 2:
            return bmvbhash(imgData, bits);
        default:
            throw new Error("Bad hashing method");
    }
};
export const blockhash = async function(src: Sharp, bits: number, method: number = 2): Promise<string> {
const {data: buff, info} = await src.ensureAlpha().raw().toBuffer({resolveWithObject: true});
return blockhashData({
width: info.width,
height: info.height,
data: buff,
}, bits, method);
};

View File

@@ -257,13 +257,131 @@ export interface ImageDetection {
* */
fetchBehavior?: 'extension' | 'unknown' | 'all',
/**
* The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.
* The percentage, as a whole number, of difference between two images at which point they will not be considered the same.
*
* Will be used as `hash.hardThreshold` and `pixel.threshold` if those values are not specified
*
* Default is `5`
*
* @default 5
* */
threshold?: number
/**
* Use perceptual hashing (blockhash-js) to compare images
*
* Pros:
*
* * very fast
* * low cpu/memory usage
* * results can be cached
*
* Cons:
*
* * not as accurate as pixel comparison
* * weaker for text-heavy images
* * mostly color-blind
*
* Best uses:
*
* * Detecting (general) duplicate images
* * Comparing large number of images
* */
hash?: {
/**
* Enabled by default.
*
* If both `hash` and `pixel` are enabled then `pixel` will be used to verify image comparison when hashes match
*
* @default true
* */
enable?: boolean
/**
* Bit count determines accuracy of hash and granularity of hash comparison (comparison to other hashes)
*
* Default is `32`
*
* **NOTE:** Hashes of different sizes (bits) cannot be compared. If you are caching results make sure all rules where results may be shared use the same bit count to ensure hashes can be compared. Otherwise hashes will be recomputed.
*
* @default 32
* */
bits?: number
/**
* Number of seconds to cache image hash
* */
ttl?: number
/**
* High Confidence Threshold
*
* If the difference in comparison is equal to or less than this number the images are considered the same and pixel comparison WILL NOT occur
*
* Defaults to the parent-level `threshold` value if not present
*
* Use `null` if you want pixel comparison to ALWAYS occur (softThreshold must be present)
* */
hardThreshold?: number | null
/**
* Low Confidence Threshold -- only used if `pixel` is enabled
*
* If the difference in comparison is
*
* 1) equal to or less than this value and
* 2) the value is greater than `hardThreshold`
*
* the images will be compared using the `pixel` method
* */
softThreshold?: number
}
/**
* Use pixel counting to compare images
*
* Pros:
*
* * most accurate
* * strong with text or color-only changes
*
* Cons:
*
* * much slower than hashing
* * memory/cpu intensive
*
* Best uses:
*
* Comparing text-only images
* Comparisons requiring a high degree of accuracy, or where changes are subtle
* */
pixel?: {
/**
* Disabled by default.
*
* @default false
* */
enable?: boolean
/**
* The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.
* */
threshold?: number
}
}
/**
 * Fully-resolved version of ImageDetection: all top-level and nested
 * defaults have been applied, so consumers do not need to null-check
 * the required fields.
 */
export interface StrongImageDetection {
    enable: boolean,
    fetchBehavior: 'extension' | 'unknown' | 'all'
    threshold: number,
    hash: {
        enable: boolean
        bits: number
        // optional: undefined means hash results are not cached
        ttl?: number
        // null means never short-circuit on hash alone (always fall through)
        hardThreshold: number | null
        softThreshold?: number
    }
    pixel: {
        enable: boolean
        threshold: number
    }
}
// export interface ImageData {

View File

@@ -7,7 +7,9 @@ import pMap from 'p-map';
import subImageMatch from 'matches-subimage';
import {
activityWindowText,
asSubmission, compareImages,
asSubmission, bitsToHexLength,
// blockHashImage,
compareImages,
comparisonTextOp,
FAIL,
formatNumber,
@@ -29,10 +31,12 @@ import {
ActivityWindowType, CommentState,
//ImageData,
ImageDetection,
ReferenceSubmission, StrongSubredditState, SubmissionState,
ReferenceSubmission, StrongImageDetection, StrongSubredditState, SubmissionState,
SubredditCriteria, SubredditState
} from "../Common/interfaces";
import ImageData from "../Common/ImageData";
import {blockhash, hammingDistance} from "../Common/blockhash/blockhash";
import leven from "leven";
const parseLink = parseUsableLinkIdentifier();
@@ -40,7 +44,7 @@ export class RecentActivityRule extends Rule {
window: ActivityWindowType;
thresholds: ActivityThreshold[];
useSubmissionAsReference: boolean;
imageDetection: Required<ImageDetection>
imageDetection: StrongImageDetection
lookAt?: 'comments' | 'submissions';
constructor(options: RecentActivityRuleOptions) {
@@ -55,13 +59,39 @@ export class RecentActivityRule extends Rule {
const {
enable = false,
fetchBehavior = 'extension',
threshold = 5
threshold = 5,
hash = {},
pixel = {},
} = imageDetection || {};
const {
enable: hEnable = true,
bits = 16,
ttl = 60,
hardThreshold = threshold,
softThreshold
} = hash || {};
const {
enable: pEnable = true,
threshold: pThreshold = threshold,
} = pixel || {};
this.imageDetection = {
enable,
fetchBehavior,
threshold
threshold,
hash: {
enable: hEnable,
hardThreshold,
softThreshold,
bits,
ttl,
},
pixel: {
enable: pEnable,
threshold: pThreshold
}
};
this.lookAt = lookAt;
this.useSubmissionAsReference = useSubmissionAsReference;
@@ -113,11 +143,28 @@ export class RecentActivityRule extends Rule {
if (this.imageDetection.enable) {
try {
referenceImage = ImageData.fromSubmission(item);
await referenceImage.sharp();
referenceImage.setPreferredResolutionByWidth(1000);
if (referenceImage.preferredResolution !== undefined) {
await (referenceImage.getSimilarResolutionVariant(...referenceImage.preferredResolution) as ImageData).sharp();
referenceImage.setPreferredResolutionByWidth(800);
if(this.imageDetection.hash.enable) {
let refHash: string | undefined;
if(this.imageDetection.hash.ttl !== undefined) {
refHash = await this.resources.getImageHash(referenceImage);
if(refHash === undefined) {
refHash = await referenceImage.hash(this.imageDetection.hash.bits);
await this.resources.setImageHash(referenceImage, refHash, this.imageDetection.hash.ttl);
} else if(refHash.length !== bitsToHexLength(this.imageDetection.hash.bits)) {
this.logger.warn('Reference image hash length did not correspond to bits specified in config. Recomputing...');
refHash = await referenceImage.hash(this.imageDetection.hash.bits);
await this.resources.setImageHash(referenceImage, refHash, this.imageDetection.hash.ttl);
}
} else {
refHash = await referenceImage.hash(this.imageDetection.hash.bits);
}
}
//await referenceImage.sharp();
// await referenceImage.hash();
// if (referenceImage.preferredResolution !== undefined) {
// await (referenceImage.getSimilarResolutionVariant(...referenceImage.preferredResolution) as ImageData).sharp();
// }
} catch (err) {
this.logger.verbose(err.message);
}
@@ -145,14 +192,58 @@ export class RecentActivityRule extends Rule {
if (referenceImage !== undefined) {
try {
let imgData = ImageData.fromSubmission(x);
try {
const [compareResult, sameImage] = await compareImages(referenceImage, imgData, this.imageDetection.threshold / 100);
analysisTimes.push(compareResult.analysisTime);
if (sameImage) {
imgData.setPreferredResolutionByWidth(800);
if(this.imageDetection.hash.enable) {
let compareHash: string | undefined;
if(this.imageDetection.hash.ttl !== undefined) {
compareHash = await this.resources.getImageHash(imgData);
}
if(compareHash === undefined)
{
compareHash = await imgData.hash(this.imageDetection.hash.bits);
if(this.imageDetection.hash.ttl !== undefined) {
await this.resources.setImageHash(imgData, compareHash, this.imageDetection.hash.ttl);
}
}
const refHash = await referenceImage.hash(this.imageDetection.hash.bits);
if(refHash.length !== compareHash.length) {
this.logger.debug(`Hash lengths were not the same! Will need to recompute compare hash to match reference.\n\nReference: ${referenceImage.baseUrl} has is ${refHash.length} char long | Comparing: ${imgData.baseUrl} has is ${compareHash} ${compareHash.length} long`);
compareHash = await imgData.hash(this.imageDetection.hash.bits)
}
const distance = leven(refHash, compareHash);
const diff = (distance/refHash.length)*100;
// return image if hard is defined and diff is less
if(null !== this.imageDetection.hash.hardThreshold && diff <= this.imageDetection.hash.hardThreshold) {
return x;
}
} catch (err) {
this.logger.warn(`Unexpected error encountered while comparing images, will skip comparison => ${err.message}`);
// hard is either not defined or diff was greater than hard
// if soft is defined
if (this.imageDetection.hash.softThreshold !== undefined) {
// and diff is greater than soft allowance
if(diff > this.imageDetection.hash.softThreshold) {
// not similar enough
return null;
}
// similar enough, will continue on to pixel (if enabled!)
} else {
// only hard was defined and did not pass
return null;
}
}
// at this point either hash was not enabled or it was and we hit soft threshold but not hard
if(this.imageDetection.pixel.enable) {
try {
const [compareResult, sameImage] = await compareImages(referenceImage, imgData, this.imageDetection.pixel.threshold / 100);
analysisTimes.push(compareResult.analysisTime);
if (sameImage) {
return x;
}
} catch (err) {
this.logger.warn(`Unexpected error encountered while pixel-comparing images, will skip comparison => ${err.message}`);
}
}
} catch (err) {
if(!err.message.includes('did not end with a valid image extension')) {
@@ -164,10 +255,12 @@ export class RecentActivityRule extends Rule {
}
// parallel all the things
this.logger.profile('asyncCompare');
const results = await pMap(viableActivity, ci, {concurrency: imageCompareMaxConcurrencyGuess});
this.logger.profile('asyncCompare', {level: 'debug', message: 'Total time for image download and compare'});
const results = await pMap(viableActivity, ci, {concurrency: 1});
this.logger.profile('asyncCompare', {level: 'debug', message: 'Total time for image comparison (incl download/cache calls)'});
const totalAnalysisTime = analysisTimes.reduce((acc, x) => acc + x,0);
this.logger.debug(`Reference image compared ${analysisTimes.length} times. Timings: Avg ${formatNumber(totalAnalysisTime / analysisTimes.length, {toFixed: 0})}ms | Max: ${Math.max(...analysisTimes)}ms | Min: ${Math.min(...analysisTimes)}ms | Total: ${totalAnalysisTime}ms (${formatNumber(totalAnalysisTime/1000)}s)`);
if(analysisTimes.length > 0) {
this.logger.debug(`Reference image pixel-compared ${analysisTimes.length} times. Timings: Avg ${formatNumber(totalAnalysisTime / analysisTimes.length, {toFixed: 0})}ms | Max: ${Math.max(...analysisTimes)}ms | Min: ${Math.min(...analysisTimes)}ms | Total: ${totalAnalysisTime}ms (${formatNumber(totalAnalysisTime/1000)}s)`);
}
filteredActivity = filteredActivity.concat(results.filter(x => x !== null));
if (longRun !== undefined) {
clearTimeout(longRun);

View File

@@ -1645,9 +1645,55 @@
],
"type": "string"
},
"hash": {
"description": "Use perceptual hashing (blockhash-js) to compare images\n\nPros:\n\n* very fast\n* low cpu/memory usage\n* results can be cached\n\nCons:\n\n* not as accurate as pixel comparison\n* weaker for text-heavy images\n* mostly color-blind\n\nBest uses:\n\n* Detecting (general) duplicate images\n* Comparing large number of images",
"properties": {
"bits": {
"default": 32,
"description": "Bit count determines accuracy of hash and granularity of hash comparison (comparison to other hashes)\n\nDefault is `32`\n\n**NOTE:** Hashes of different sizes (bitS) cannot be compared. If you are caching results make sure all rules where results may be shared use the same bit count to ensure hashes can be compared. Otherwise hashes will be recomputed.",
"type": "number"
},
"enable": {
"default": true,
"description": "Enabled by default.\n\nIf both `hash` and `pixel` are enabled then `pixel` will be used to verify image comparison when hashes matches",
"type": "boolean"
},
"hardThreshold": {
"description": "High Confidence Threshold\n\nIf the difference in comparison is equal to or less than this number the images are considered the same and pixel comparison WILL NOT occur\n\nDefaults to the parent-level `threshold` value if not present\n\nUse `null` if you want pixel comparison to ALWAYS occur (softThreshold must be present)",
"type": [
"null",
"number"
]
},
"softThreshold": {
"description": "Low Confidence Threshold -- only used if `pixel` is enabled\n\nIf the difference in comparison is\n\n1) equal to or less than this value and\n2) the value is greater than `hardThreshold`\n\nthe images will be compared using the `pixel` method",
"type": "number"
},
"ttl": {
"description": "Number of seconds to cache image hash",
"type": "number"
}
},
"type": "object"
},
"pixel": {
"description": "Use pixel counting to compare images\n\nPros:\n\n* most accurate\n* strong with text or color-only changes\n\nCons:\n\n* much slower than hashing\n* memory/cpu intensive\n\nBest uses:\n\n* Comparison text-only images\n* Comparison requires high degree of accuracy or changes are subtle",
"properties": {
"enable": {
"default": false,
"description": "Disabled by default.",
"type": "boolean"
},
"threshold": {
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.",
"type": "number"
}
},
"type": "object"
},
"threshold": {
"default": 5,
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.\n\nDefault is `5`",
"description": "The percentage, as a whole number, of difference between two images at which point they will not be considered the same.\n\nWill be used as `hash.hardThreshold` and `pixel.threshold` if those values are not specified\n\nDefault is `5`",
"type": "number"
}
},

View File

@@ -885,9 +885,55 @@
],
"type": "string"
},
"hash": {
"description": "Use perceptual hashing (blockhash-js) to compare images\n\nPros:\n\n* very fast\n* low cpu/memory usage\n* results can be cached\n\nCons:\n\n* not as accurate as pixel comparison\n* weaker for text-heavy images\n* mostly color-blind\n\nBest uses:\n\n* Detecting (general) duplicate images\n* Comparing large number of images",
"properties": {
"bits": {
"default": 32,
"description": "Bit count determines accuracy of hash and granularity of hash comparison (comparison to other hashes)\n\nDefault is `32`\n\n**NOTE:** Hashes of different sizes (bitS) cannot be compared. If you are caching results make sure all rules where results may be shared use the same bit count to ensure hashes can be compared. Otherwise hashes will be recomputed.",
"type": "number"
},
"enable": {
"default": true,
"description": "Enabled by default.\n\nIf both `hash` and `pixel` are enabled then `pixel` will be used to verify image comparison when hashes matches",
"type": "boolean"
},
"hardThreshold": {
"description": "High Confidence Threshold\n\nIf the difference in comparison is equal to or less than this number the images are considered the same and pixel comparison WILL NOT occur\n\nDefaults to the parent-level `threshold` value if not present\n\nUse `null` if you want pixel comparison to ALWAYS occur (softThreshold must be present)",
"type": [
"null",
"number"
]
},
"softThreshold": {
"description": "Low Confidence Threshold -- only used if `pixel` is enabled\n\nIf the difference in comparison is\n\n1) equal to or less than this value and\n2) the value is greater than `hardThreshold`\n\nthe images will be compared using the `pixel` method",
"type": "number"
},
"ttl": {
"description": "Number of seconds to cache image hash",
"type": "number"
}
},
"type": "object"
},
"pixel": {
"description": "Use pixel counting to compare images\n\nPros:\n\n* most accurate\n* strong with text or color-only changes\n\nCons:\n\n* much slower than hashing\n* memory/cpu intensive\n\nBest uses:\n\n* Comparison text-only images\n* Comparison requires high degree of accuracy or changes are subtle",
"properties": {
"enable": {
"default": false,
"description": "Disabled by default.",
"type": "boolean"
},
"threshold": {
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.",
"type": "number"
}
},
"type": "object"
},
"threshold": {
"default": 5,
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.\n\nDefault is `5`",
"description": "The percentage, as a whole number, of difference between two images at which point they will not be considered the same.\n\nWill be used as `hash.hardThreshold` and `pixel.threshold` if those values are not specified\n\nDefault is `5`",
"type": "number"
}
},

View File

@@ -862,9 +862,55 @@
],
"type": "string"
},
"hash": {
"description": "Use perceptual hashing (blockhash-js) to compare images\n\nPros:\n\n* very fast\n* low cpu/memory usage\n* results can be cached\n\nCons:\n\n* not as accurate as pixel comparison\n* weaker for text-heavy images\n* mostly color-blind\n\nBest uses:\n\n* Detecting (general) duplicate images\n* Comparing large number of images",
"properties": {
"bits": {
"default": 32,
"description": "Bit count determines accuracy of hash and granularity of hash comparison (comparison to other hashes)\n\nDefault is `32`\n\n**NOTE:** Hashes of different sizes (bitS) cannot be compared. If you are caching results make sure all rules where results may be shared use the same bit count to ensure hashes can be compared. Otherwise hashes will be recomputed.",
"type": "number"
},
"enable": {
"default": true,
"description": "Enabled by default.\n\nIf both `hash` and `pixel` are enabled then `pixel` will be used to verify image comparison when hashes matches",
"type": "boolean"
},
"hardThreshold": {
"description": "High Confidence Threshold\n\nIf the difference in comparison is equal to or less than this number the images are considered the same and pixel comparison WILL NOT occur\n\nDefaults to the parent-level `threshold` value if not present\n\nUse `null` if you want pixel comparison to ALWAYS occur (softThreshold must be present)",
"type": [
"null",
"number"
]
},
"softThreshold": {
"description": "Low Confidence Threshold -- only used if `pixel` is enabled\n\nIf the difference in comparison is\n\n1) equal to or less than this value and\n2) the value is greater than `hardThreshold`\n\nthe images will be compared using the `pixel` method",
"type": "number"
},
"ttl": {
"description": "Number of seconds to cache image hash",
"type": "number"
}
},
"type": "object"
},
"pixel": {
"description": "Use pixel counting to compare images\n\nPros:\n\n* most accurate\n* strong with text or color-only changes\n\nCons:\n\n* much slower than hashing\n* memory/cpu intensive\n\nBest uses:\n\n* Comparison text-only images\n* Comparison requires high degree of accuracy or changes are subtle",
"properties": {
"enable": {
"default": false,
"description": "Disabled by default.",
"type": "boolean"
},
"threshold": {
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.",
"type": "number"
}
},
"type": "object"
},
"threshold": {
"default": 5,
"description": "The percentage, as a whole number, of pixels that are **different** between the two images at which point the images are not considered the same.\n\nDefault is `5`",
"description": "The percentage, as a whole number, of difference between two images at which point they will not be considered the same.\n\nWill be used as `hash.hardThreshold` and `pixel.threshold` if those values are not specified\n\nDefault is `5`",
"type": "number"
}
},

View File

@@ -50,6 +50,7 @@ import {cacheTTLDefaults, createHistoricalDefaults, historicalDefaults} from "..
import {check} from "tcp-port-used";
import {ExtendedSnoowrap} from "../Utils/SnoowrapClients";
import dayjs from "dayjs";
import ImageData from "../Common/ImageData";
// Default footer appended to bot comments; {{...}} tokens are Mustache
// placeholders filled at render time (see generateFooter/Mustache.render).
// Fix: message read "if you any ideas" — missing the word "have".
export const DEFAULT_FOOTER = '\r\n*****\r\nThis action was performed by [a bot.]({{botLink}}) Mention a moderator or [send a modmail]({{modmailLink}}) if you have any ideas, questions, or concerns about this action.';
@@ -420,7 +421,13 @@ export class SubredditResources {
/**
 * Check whether a subreddit entry exists in the cache, recording
 * subreddit cache stats (requests, per-key counts, misses) for the lookup.
 * Returns false without touching the cache when subreddit caching is
 * disabled (subredditTTL === false).
 */
async hasSubreddit(name: string) {
    if (this.subredditTTL !== false) {
        const hash = `sub-${name}`;
        this.stats.cache.subreddit.requests++
        this.stats.cache.subreddit.requestTimestamps.push(Date.now());
        await this.stats.cache.subreddit.identifierRequestCount.set(hash, (await this.stats.cache.subreddit.identifierRequestCount.wrap(hash, () => 0) as number) + 1);
        const val = await this.cache.get(hash);
        // Both undefined and null count as a miss
        if(val === undefined || val === null) {
            this.stats.cache.subreddit.miss++;
        }
        return val !== undefined && val !== null;
    }
    return false;
}
@@ -897,6 +904,8 @@ export class SubredditResources {
const userName = getActivityAuthorName(item.author);
const hash = `commentUserResult-${userName}-${item.link_id}-${objectHash.sha1(checkConfig)}`;
this.stats.cache.commentCheck.requests++;
this.stats.cache.commentCheck.requestTimestamps.push(Date.now());
await this.stats.cache.commentCheck.identifierRequestCount.set(hash, (await this.stats.cache.commentCheck.identifierRequestCount.wrap(hash, () => 0) as number) + 1);
let result = await this.cache.get(hash) as UserResultCache | undefined | null;
if(result === null) {
result = undefined;
@@ -927,6 +936,33 @@ export class SubredditResources {
const footerRawContent = await this.getContent(footer, item.subreddit);
return he.decode(Mustache.render(footerRawContent, {subName, permaLink, modmailLink, botLink: BOT_LINK}));
}
/**
 * Look up a previously cached perceptual hash for an image, keyed by its
 * base URL (origin + pathname), recording imageHash cache stats for the
 * lookup. Returns undefined on a cache miss.
 */
async getImageHash(img: ImageData): Promise<string|undefined> {
    const hash = `imgHash-${img.baseUrl}`;
    const result = await this.cache.get(hash) as string | undefined | null;
    this.stats.cache.imageHash.requests++;
    this.stats.cache.imageHash.requestTimestamps.push(Date.now());
    await this.stats.cache.imageHash.identifierRequestCount.set(hash, (await this.stats.cache.imageHash.identifierRequestCount.wrap(hash, () => 0) as number) + 1);
    if (result !== undefined && result !== null) {
        return result;
    }
    // Fix: misses were previously recorded against commentCheck stats
    this.stats.cache.imageHash.miss++;
    return undefined;
}
/**
 * Cache a computed perceptual hash for an image under its base URL.
 *
 * @param img image the hash belongs to
 * @param hash hex-encoded perceptual hash
 * @param ttl cache lifetime in seconds
 */
async setImageHash(img: ImageData, hash: string, ttl: number): Promise<void> {
    const key = `imgHash-${img.baseUrl}`;
    await this.cache.set(key, hash, {ttl});
}
}
export class BotResourcesManager {

View File

@@ -52,6 +52,8 @@ import fetch, {Response} from "node-fetch";
import { URL } from "url";
import ImageData from "./Common/ImageData";
import {Sharp, SharpOptions} from "sharp";
// @ts-ignore
import {blockhashData, hammingDistance} from 'blockhash';
//import {ResembleSingleCallbackComparisonResult} from "resemblejs";
// want to guess how many concurrent image comparisons we should be doing
@@ -1049,7 +1051,8 @@ export const cacheStats = (): ResourceStats => {
submission: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0},
comment: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0},
subreddit: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0},
commentCheck: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0}
commentCheck: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0},
imageHash: {requests: 0, miss: 0, identifierRequestCount: statMetricCache(), requestTimestamps: timestampArr(), averageTimeBetweenHits: 'N/A', identifierAverageHit: 0}
};
}
@@ -1406,3 +1409,7 @@ export const subredditStateIsNameOnly = (state: SubredditState | StrongSubreddit
/**
 * Absolute difference between num1 and num2 expressed as a percentage of
 * num1. Note the result is relative to the FIRST argument (asymmetric);
 * num1 === 0 yields Infinity or NaN.
 */
export const absPercentDifference = (num1: number, num2: number) => {
    const relativeDiff = (num1 - num2) / num1;
    return Math.abs(relativeDiff) * 100;
}
/**
 * Hex-string length of a blockhash computed with `bits` blocks per side:
 * bits² total bits at 4 bits per hex digit.
 */
export const bitsToHexLength = (bits: number): number => {
    return (bits * bits) / 4;
}