// Mirror of https://github.com/samsonjs/spirit-tracker.git
// (synced 2026-03-25 09:25:51 +00:00; 290 lines, 7 KiB, JavaScript)
// viz/app/linker/similarity.js
|
|
import { tokenizeQuery, normSearchText } from "../sku.js";
|
|
|
|
// Ultra-common / low-signal tokens in bottle names. Tokens in this set are
// ignored when names are compared, so they cannot inflate overlap scores.
const SIM_STOP_TOKENS = new Set([
  // Articles, conjunctions, prepositions.
  "the", "a", "an", "and", "of", "to", "in", "for", "with",
  // Age-statement words (the number itself is kept as a separate token).
  "year", "years", "yr", "yrs", "old",
  // Generic category words.
  "whisky", "whiskey", "scotch", "single", "malt",
  // Generic bottling descriptors.
  "cask", "finish", "edition", "release", "batch", "strength", "abv", "proof", "anniversary",
]);
|
|
|
|
// Matches the standalone word "smws", case-insensitively.
const SMWS_WORD_RE = /\bsmws\b/i;
// Captures a dotted numeric code: 1-3 digits, a dot, then 1-4 digits (e.g. "26.195").
const SMWS_CODE_RE = /\b(\d{1,3}\.\d{1,4})\b/;

/**
 * Pull the dotted numeric code out of a name that mentions "SMWS".
 *
 * @param {*} name - Product name; falsy values are treated as "".
 * @returns {string} The first dotted code found (e.g. "26.195"), or "" when
 *   the name does not contain the word "smws" or contains no such code.
 */
export function smwsKeyFromName(name) {
  const text = String(name || "");
  // The code is only meaningful when the name explicitly says "SMWS".
  if (!SMWS_WORD_RE.test(text)) return "";

  const found = SMWS_CODE_RE.exec(text);
  return found ? found[1] : "";
}
|
|
|
|
// An English ordinal such as "1st", "2nd", "3rd", "12th"; group 1 is the digits.
const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;

/**
 * Reduce a token to its numeric key.
 *
 * @param {*} t - Token; falsy values are treated as "".
 * @returns {string} The digit string for an all-digit token ("12" -> "12") or
 *   an ordinal ("12th" -> "12"); "" for anything else, including decimals.
 */
export function numKey(t) {
  const text = String(t || "")
    .trim()
    .toLowerCase();
  if (!text) return "";

  // Plain integers are returned as written (leading zeros are preserved).
  if (/^\d+$/.test(text)) return text;

  const ordinal = ORDINAL_RE.exec(text);
  return ordinal ? ordinal[1] : "";
}
|
|
|
|
/**
 * Whether a token is an integer or an ordinal, i.e. whether `numKey` yields a
 * non-empty key for it.
 *
 * @param {*} t - Token to test.
 * @returns {boolean}
 */
function isNumberToken(t) {
  return numKey(t) !== "";
}
|
|
|
|
// Age-statement patterns, tried in order; the first pattern that matches
// anywhere in the text wins. Group 1 is the 1-2 digit age.
const AGE_PATTERNS = [
  // "12 yr", "12yrs", "12 year", "aged 12 years"
  /\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i,
  // "12yo", "12 yo"
  /\b(\d{1,2})\s*yo\b/i,
];

/**
 * Extract an age statement from a name.
 *
 * @param {*} normName - Name text; falsy values are treated as "".
 *   NOTE(review): callers appear to pass already-normalized text - confirm.
 * @returns {string} The age as a decimal string without leading zeros
 *   ("08 yr" -> "8"), or "" when no age statement is found. Numbers with three
 *   or more digits never match.
 */
export function extractAgeFromText(normName) {
  const text = String(normName || "");
  if (!text) return "";

  for (const pattern of AGE_PATTERNS) {
    const found = pattern.exec(text);
    if (found && found[1]) return String(Number.parseInt(found[1], 10));
  }

  return "";
}
|
|
|
|
// Spelling variants folded onto one canonical token before comparison.
const SIM_EQUIV = new Map([
  ["years", "yr"],
  ["year", "yr"],
  ["yrs", "yr"],
  ["yr", "yr"],
  ["whiskey", "whisky"],
  ["whisky", "whisky"],
  ["bourbon", "bourbon"],
]);

// Stand-alone volume-unit tokens.
const VOL_UNIT = new Set(["ml", "l", "cl", "oz", "liter", "liters", "litre", "litres"]);
// A number glued to a volume unit: 700ml, 1.14l
const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i;
// A percentage: 46%, 40.0%
const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/;
// A bare integer or decimal.
const PLAIN_NUMBER_RE = /^\d+(?:\.\d+)?$/;

/**
 * Normalize and filter name tokens for similarity comparison.
 *
 * Each token is trimmed and lower-cased, then:
 *  - dropped when empty or without any letter/digit;
 *  - dropped when it is an inline volume ("700ml") or percentage ("46%");
 *  - canonicalized through SIM_EQUIV, and ordinals/integers are reduced to
 *    their digits via `numKey` ("12th" -> "12");
 *  - dropped when it is a volume unit, "abv" or "proof";
 *  - dropped together with the following token when it is a number followed
 *    by a volume unit ("700", "ml");
 *  - dropped when it is a non-numeric stop token (SIM_STOP_TOKENS);
 *  - dropped when an identical token was already emitted.
 *
 * @param {Array<*>} tokens - Raw tokens; a non-array is treated as empty.
 * @returns {string[]} Surviving tokens, de-duplicated, in first-seen order.
 */
export function filterSimTokens(tokens) {
  const clean = (raw) =>
    String(raw || "")
      .trim()
      .toLowerCase();

  const list = Array.isArray(tokens) ? tokens : [];
  const kept = [];
  const emitted = new Set();

  // Index loop on purpose: a number followed by a unit consumes two tokens.
  for (let i = 0; i < list.length; i++) {
    let tok = clean(list[i]);
    if (!tok) continue;
    if (!/[a-z0-9]/i.test(tok)) continue;

    if (VOL_INLINE_RE.test(tok) || PCT_INLINE_RE.test(tok)) continue;

    tok = SIM_EQUIV.get(tok) || tok;

    const digits = numKey(tok);
    if (digits) tok = digits;

    if (VOL_UNIT.has(tok) || tok === "abv" || tok === "proof") continue;

    if (PLAIN_NUMBER_RE.test(tok)) {
      const following = clean(list[i + 1]);
      const followingNorm = SIM_EQUIV.get(following) || following;
      if (VOL_UNIT.has(followingNorm)) {
        i++; // also skip the unit token
        continue;
      }
    }

    // Numbers are never treated as stop tokens.
    if (!isNumberToken(tok) && SIM_STOP_TOKENS.has(tok)) continue;

    if (emitted.has(tok)) continue;
    emitted.add(tok);
    kept.push(tok);
  }

  return kept;
}
|
|
|
|
/**
 * Multiplier that penalizes two token lists whose numbers disagree.
 *
 * @param {Array<*>} aTokens - Tokens of the first name (nullish -> empty).
 * @param {Array<*>} bTokens - Tokens of the second name (nullish -> empty).
 * @returns {number} 1.0 when either side has no numeric token or when the two
 *   sides share at least one numeric key (per `numKey`); 0.28 when both sides
 *   have numbers but none in common.
 */
export function numberMismatchPenalty(aTokens, bTokens) {
  const numericKeys = (tokens) => new Set((tokens || []).map((t) => numKey(t)).filter(Boolean));

  const aNums = numericKeys(aTokens);
  const bNums = numericKeys(bTokens);
  if (aNums.size === 0 || bNums.size === 0) return 1.0;

  for (const n of aNums) {
    if (bNums.has(n)) return 1.0;
  }
  return 0.28;
}
|
|
|
|
/**
 * Levenshtein edit distance (insert / delete / substitute, each costing 1)
 * between two strings, compared by UTF-16 code unit.
 *
 * @param {*} a - First string; falsy values are treated as "".
 * @param {*} b - Second string; falsy values are treated as "".
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
export function levenshtein(a, b) {
  let outer = String(a || "");
  let inner = String(b || "");
  if (!outer.length) return inner.length;
  if (!inner.length) return outer.length;

  // The distance is symmetric, so keep the single DP row on the shorter string.
  if (inner.length > outer.length) [outer, inner] = [inner, outer];

  const width = inner.length;
  // row[j] holds the distance between the processed prefix of `outer` and inner[0..j).
  const row = Array.from({ length: width + 1 }, (_, j) => j);

  for (let i = 1; i <= outer.length; i++) {
    let diagonal = row[0]; // value from the previous row, previous column
    row[0] = i;
    const unit = outer.charCodeAt(i - 1);

    for (let j = 1; j <= width; j++) {
      const above = row[j];
      const substitution = unit === inner.charCodeAt(j - 1) ? 0 : 1;
      row[j] = Math.min(above + 1, row[j - 1] + 1, diagonal + substitution);
      diagonal = above;
    }
  }

  return row[width];
}
|
|
|
|
/**
 * F1-style containment score between two token lists.
 *
 * Both lists are passed through `filterSimTokens`. With `hit` the number of
 * tokens of the smaller set that also occur in the larger one, the score is
 * the harmonic mean of hit / |smaller| and hit / |larger|.
 *
 * @param {Array<*>} aTokens - Raw tokens of the first name (nullish -> empty).
 * @param {Array<*>} bTokens - Raw tokens of the second name (nullish -> empty).
 * @returns {number} Value in [0, 1]; 0 when either side has no token left
 *   after filtering or when nothing is shared.
 */
export function tokenContainmentScore(aTokens, bTokens) {
  const aFiltered = filterSimTokens(aTokens || []);
  const bFiltered = filterSimTokens(bTokens || []);
  if (!aFiltered.length || !bFiltered.length) return 0;

  const aSet = new Set(aFiltered);
  const bSet = new Set(bFiltered);

  // Ties go to `a` as the smaller side.
  const aIsSmaller = aSet.size <= bSet.size;
  const smaller = aIsSmaller ? aSet : bSet;
  const larger = aIsSmaller ? bSet : aSet;

  let hit = 0;
  for (const tok of smaller) {
    if (larger.has(tok)) hit++;
  }

  const recall = hit / Math.max(1, smaller.size);
  const precision = hit / Math.max(1, larger.size);

  // The 1e-9 floor avoids 0/0 when nothing is shared.
  return (2 * precision * recall) / Math.max(1e-9, precision + recall);
}
|
|
|
|
/**
 * Heuristic similarity between two product names.
 *
 * Combines: whether the first filtered token matches, overlap of the remaining
 * tokens, Levenshtein similarity of the normalized names, token containment,
 * a number-mismatch penalty and an age-statement boost/penalty. The result is
 * a non-negative weight, not a probability: it is not capped at 1.
 *
 * @param {*} aName - First name; normalized with `normSearchText`.
 * @param {*} bName - Second name; normalized with `normSearchText`.
 * @returns {number} 0 when either normalized name is empty or has no tokens
 *   left after filtering; otherwise the combined score (higher = more similar).
 */
export function similarityScore(aName, bName) {
  const a = normSearchText(aName);
  const b = normSearchText(bName);
  if (!a || !b) return 0;

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);
  const bothHaveAge = Boolean(aAge && bAge);
  const ageMatch = bothHaveAge && aAge === bAge;
  const ageMismatch = bothHaveAge && aAge !== bAge;

  const aRawTokens = tokenizeQuery(a);
  const bRawTokens = tokenizeQuery(b);

  const aTokens = filterSimTokens(aRawTokens);
  const bTokens = filterSimTokens(bRawTokens);
  if (!aTokens.length || !bTokens.length) return 0;

  const contain = tokenContainmentScore(aRawTokens, bRawTokens);

  // 1 when both names start with the same filtered token, else 0.
  const firstMatch = aTokens[0] === bTokens[0] ? 1 : 0;

  // Overlap of everything after the first token, relative to the longer tail.
  const aTail = new Set(aTokens.slice(1));
  const bTail = new Set(bTokens.slice(1));
  let shared = 0;
  for (const tok of aTail) {
    if (bTail.has(tok)) shared++;
  }
  const overlapTail = shared / Math.max(1, Math.max(aTail.size, bTail.size));

  // Character-level similarity of the full normalized names.
  const distance = levenshtein(a, b);
  const longest = Math.max(1, Math.max(a.length, b.length));
  const levSim = 1 - distance / longest;

  // Without a first-token match, tail overlap only counts as far as
  // containment supports it, and much less for short names.
  let gate = firstMatch ? 1.0 : Math.min(0.8, 0.06 + 0.95 * contain);
  const fewestTokens = Math.min(aTokens.length, bTokens.length);
  if (!firstMatch && fewestTokens <= 3 && contain < 0.78) gate *= 0.18;

  const numGate = numberMismatchPenalty(aTokens, bTokens);

  const levWeight = firstMatch ? 1.0 : 0.1 + 0.7 * contain;
  let score = numGate * (firstMatch * 3.0 + overlapTail * 2.2 * gate + levSim * levWeight);

  if (ageMatch) score *= 2.2;
  else if (ageMismatch) score *= 0.18;

  score *= 1 + 0.9 * contain;

  return score;
}
|
|
|
|
/**
 * Variant of the name-similarity score that works on caller-supplied tokens
 * and normalized names and does not compute a Levenshtein distance.
 *
 * Combines: whether the first filtered token matches, overlap of the remaining
 * tokens, a small bonus when the first 10 characters of both names are equal,
 * token containment, a number-mismatch penalty and an age-statement
 * boost/penalty. The result is a non-negative weight, not capped at 1.
 *
 * @param {Array<*>} aTokens - Raw tokens of the first name (nullish -> empty).
 * @param {Array<*>} bTokens - Raw tokens of the second name (nullish -> empty).
 * @param {*} aNormName - Name text of the first item, used for the age and
 *   prefix checks. NOTE(review): presumably already normalized - confirm.
 * @param {*} bNormName - Same, for the second item.
 * @returns {number} 0 when either side has no tokens left after filtering;
 *   otherwise the combined score (higher = more similar).
 */
export function fastSimilarityScore(aTokens, bTokens, aNormName, bNormName) {
  const aRawTokens = aTokens || [];
  const bRawTokens = bTokens || [];

  const aFiltered = filterSimTokens(aRawTokens);
  const bFiltered = filterSimTokens(bRawTokens);
  if (!aFiltered.length || !bFiltered.length) return 0;

  const a = String(aNormName || "");
  const b = String(bNormName || "");

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);
  const bothHaveAge = Boolean(aAge && bAge);
  const ageMatch = bothHaveAge && aAge === bAge;
  const ageMismatch = bothHaveAge && aAge !== bAge;

  const contain = tokenContainmentScore(aRawTokens, bRawTokens);

  // 1 when both names start with the same filtered token, else 0.
  const firstMatch = aFiltered[0] === bFiltered[0] ? 1 : 0;

  // Overlap of everything after the first token, relative to the longer tail.
  const aTail = aFiltered.slice(1);
  const bTail = bFiltered.slice(1);
  const bTailSet = new Set(bTail);
  let shared = 0;
  for (const tok of aTail) {
    if (bTailSet.has(tok)) shared++;
  }
  const overlapTail = shared / Math.max(1, Math.max(aTail.length, bTail.length));

  // Small bonus when the names also agree on their first 10 characters.
  const aHead = a.slice(0, 10);
  const bHead = b.slice(0, 10);
  const pref = firstMatch && aHead && bHead && aHead === bHead ? 0.2 : 0;

  // Without a first-token match, tail overlap only counts as far as
  // containment supports it, and much less for short names.
  let gate = firstMatch ? 1.0 : Math.min(0.8, 0.06 + 0.95 * contain);
  const fewestTokens = Math.min(aFiltered.length, bFiltered.length);
  if (!firstMatch && fewestTokens <= 3 && contain < 0.78) gate *= 0.18;

  const numGate = numberMismatchPenalty(aFiltered, bFiltered);

  let score = numGate * (firstMatch * 2.4 + overlapTail * 2.0 * gate + pref);

  if (ageMatch) score *= 2.0;
  else if (ageMismatch) score *= 0.2;

  score *= 1 + 0.9 * contain;

  return score;
}
|