spirit-tracker/viz/app/linker/similarity.js
Brennan Wilkes (Text Groove) 7a33d51c90 UX Improvements
2026-02-10 16:45:22 -08:00

290 lines
7 KiB
JavaScript

// viz/app/linker/similarity.js
import { tokenizeQuery, normSearchText } from "../sku.js";
// Tokens that show up in so many bottle names that they carry little signal
// for similarity; filterSimTokens() drops them (after canonicalisation).
const SIM_STOP_TOKENS = new Set([
  // Articles, conjunctions, prepositions.
  ...["the", "a", "an", "and", "of", "to", "in", "for", "with"],
  // Age-statement wording (the number itself is kept elsewhere).
  ...["year", "years", "yr", "yrs", "old"],
  // Generic category words.
  ...["whisky", "whiskey", "scotch", "single", "malt"],
  // Generic bottling / release descriptors.
  ...["cask", "finish", "edition", "release", "batch"],
  // Strength wording and other filler.
  ...["strength", "abv", "proof", "anniversary"],
]);
// The standalone word "smws", any letter case.
const SMWS_WORD_RE = /\bsmws\b/i;
// A dotted numeric code: 1-3 digits, a dot, 1-4 digits (e.g. "1.292").
const SMWS_CODE_RE = /\b(\d{1,3}\.\d{1,4})\b/;

/**
 * Pull the dotted numeric code out of a name that mentions "SMWS".
 *
 * @param {*} name - Bottle name; falsy values are treated as "".
 * @returns {string} The first `digits.digits` code found, or "" when the name
 *   does not contain the word "smws" or contains no such code.
 */
export function smwsKeyFromName(name) {
  const text = String(name || "");
  // Only look for a code when the name explicitly mentions SMWS; a dotted
  // number on its own is not enough.
  const mentionsSmws = SMWS_WORD_RE.test(text);
  const code = mentionsSmws ? SMWS_CODE_RE.exec(text) : null;
  return code === null ? "" : code[1];
}
// Digits followed by an English ordinal suffix, e.g. "12th", "1st".
const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;

/**
 * Reduce a token to its numeric key.
 *
 * @param {*} t - Token; falsy values are treated as "".
 * @returns {string} The digit string for a pure-integer token ("12") or an
 *   ordinal token ("12th" -> "12"); "" for anything else.
 */
export function numKey(t) {
  const token = String(t || "").trim().toLowerCase();
  // A plain run of digits is already its own key.
  const isPlainInteger = token !== "" && /^\d+$/.test(token);
  if (isPlainInteger) return token;
  // Otherwise accept only ordinals and keep just the digits.
  const ordinal = ORDINAL_RE.exec(token);
  return ordinal === null ? "" : ordinal[1];
}
/**
 * Whether a token has a numeric key according to numKey().
 *
 * @param {*} t - Token to classify.
 * @returns {boolean} true when numKey(t) yields a non-empty key.
 */
function isNumberToken(t) {
  const key = numKey(t);
  return Boolean(key);
}
/**
 * Find an age statement (1-2 digits) in a name.
 *
 * Two spellings are recognised, tried in this order regardless of where they
 * occur in the text:
 *   1. "<n> yr|yrs|year|years", optionally preceded by "aged";
 *   2. "<n> yo".
 *
 * @param {*} normName - Text to scan; falsy values are treated as "".
 *   Matching is case-insensitive.
 * @returns {string} The age as a decimal string without leading zeros
 *   ("08" -> "8"), or "" when no age statement is found.
 */
export function extractAgeFromText(normName) {
  const text = String(normName || "");
  const agePatterns = [
    /\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i,
    /\b(\d{1,2})\s*yo\b/i,
  ];
  for (const pattern of agePatterns) {
    const found = pattern.exec(text);
    // parseInt + String normalises away any leading zero.
    if (found && found[1]) return String(parseInt(found[1], 10));
  }
  return "";
}
// Lookup tables for filterSimTokens(). They are constant, so they are built
// once at module load instead of on every call.

// Spelling / plural variants collapsed to one canonical token.
const SIM_EQUIV = new Map([
  ["years", "yr"],
  ["year", "yr"],
  ["yrs", "yr"],
  ["yr", "yr"],
  ["whiskey", "whisky"],
  ["whisky", "whisky"],
  ["bourbon", "bourbon"],
]);
// Standalone volume-unit tokens.
const VOL_UNIT = new Set(["ml", "l", "cl", "oz", "liter", "liters", "litre", "litres"]);
// Number fused with a volume unit.
const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i; // 700ml, 1.14l
// Number fused with a percent sign.
const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/; // 46%, 40.0%

/**
 * Normalise a token list for similarity comparison.
 *
 * For each token, in order:
 *   - trim and lowercase; drop it if empty or if it has no ASCII letter/digit;
 *   - drop inline volumes ("700ml") and inline percentages ("46%");
 *   - map spelling variants to a canonical form (SIM_EQUIV);
 *   - reduce integers / ordinals to their numKey() ("12th" -> "12");
 *   - drop standalone volume units, "abv" and "proof";
 *   - if it is a number directly followed by a volume unit, drop both;
 *   - drop non-numeric stop tokens (SIM_STOP_TOKENS);
 *   - drop repeats, keeping the first occurrence.
 *
 * @param {*} tokens - Token list; anything that is not an array is treated as
 *   an empty list.
 * @returns {string[]} The surviving tokens, de-duplicated, in input order.
 */
export function filterSimTokens(tokens) {
  const out = [];
  const seen = new Set();
  const arr = Array.isArray(tokens) ? tokens : [];
  for (let i = 0; i < arr.length; i++) {
    let t = String(arr[i] || "")
      .trim()
      .toLowerCase();
    if (!t) continue;
    if (!/[a-z0-9]/i.test(t)) continue;
    if (VOL_INLINE_RE.test(t)) continue;
    if (PCT_INLINE_RE.test(t)) continue;

    t = SIM_EQUIV.get(t) || t;
    const nk = numKey(t);
    if (nk) t = nk;

    if (VOL_UNIT.has(t) || t === "abv" || t === "proof") continue;

    // "750 ml" style: a bare number whose next token is a volume unit is a
    // bottle size, so skip the number and consume the unit with it.
    if (/^\d+(?:\.\d+)?$/.test(t)) {
      const next = String(arr[i + 1] || "")
        .trim()
        .toLowerCase();
      const nextNorm = SIM_EQUIV.get(next) || next;
      if (VOL_UNIT.has(nextNorm)) {
        i++;
        continue;
      }
    }

    if (!isNumberToken(t) && SIM_STOP_TOKENS.has(t)) continue;
    if (seen.has(t)) continue;
    seen.add(t);
    out.push(t);
  }
  return out;
}
/**
 * Multiplier that penalises two token lists whose numbers disagree.
 *
 * @param {string[]} [aTokens] - First token list (falsy -> empty).
 * @param {string[]} [bTokens] - Second token list (falsy -> empty).
 * @returns {number} 1.0 when either side has no numeric token or when the two
 *   sides share at least one numeric key; 0.28 when both sides have numeric
 *   tokens and none are shared.
 */
export function numberMismatchPenalty(aTokens, bTokens) {
  // Distinct, non-empty numKey() values of a token list.
  const numericKeys = (tokens) =>
    new Set((tokens || []).map((tok) => numKey(tok)).filter((key) => key));

  const left = numericKeys(aTokens);
  const right = numericKeys(bTokens);

  // Nothing to compare unless both names carry a number.
  if (left.size === 0 || right.size === 0) return 1.0;

  const sharesNumber = [...left].some((key) => right.has(key));
  return sharesNumber ? 1.0 : 0.28;
}
/**
 * Levenshtein edit distance between two strings, with unit cost for
 * insertion, deletion and substitution. Comparison is per UTF-16 code unit.
 *
 * @param {*} a - First string; falsy values are treated as "".
 * @param {*} b - Second string; falsy values are treated as "".
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
export function levenshtein(a, b) {
  const src = String(a || "");
  const dst = String(b || "");
  if (src.length === 0) return dst.length;
  if (dst.length === 0) return src.length;

  const cols = dst.length + 1;
  // prevRow[j] = distance between the first i-1 units of src and the first j of dst.
  let prevRow = Array.from({ length: cols }, (_, j) => j);
  let currRow = new Array(cols);

  for (let i = 1; i <= src.length; i += 1) {
    const srcCode = src.charCodeAt(i - 1);
    currRow[0] = i;
    for (let j = 1; j <= dst.length; j += 1) {
      const substitution = prevRow[j - 1] + (srcCode === dst.charCodeAt(j - 1) ? 0 : 1);
      const deletion = prevRow[j] + 1;
      const insertion = currRow[j - 1] + 1;
      currRow[j] = Math.min(deletion, insertion, substitution);
    }
    // Reuse the two buffers instead of allocating a new row per character.
    [prevRow, currRow] = [currRow, prevRow];
  }
  return prevRow[dst.length];
}
/**
 * Overlap score between two token lists after filterSimTokens().
 *
 * With `hit` = number of tokens of the smaller filtered set that also occur in
 * the larger one, the result is the harmonic mean (F1) of
 * `hit / smaller.size` and `hit / larger.size`.
 *
 * @param {string[]} [aTokens] - First raw token list (falsy -> empty).
 * @param {string[]} [bTokens] - Second raw token list (falsy -> empty).
 * @returns {number} Score in [0, 1]; 0 when either list is empty after
 *   filtering or when nothing is shared.
 */
export function tokenContainmentScore(aTokens, bTokens) {
  const left = filterSimTokens(aTokens || []);
  const right = filterSimTokens(bTokens || []);
  if (left.length === 0 || right.length === 0) return 0;

  const leftSet = new Set(left);
  const rightSet = new Set(right);
  // On a size tie the first argument is treated as the smaller side.
  const [small, big] = leftSet.size <= rightSet.size ? [leftSet, rightSet] : [rightSet, leftSet];

  let hit = 0;
  small.forEach((tok) => {
    if (big.has(tok)) hit += 1;
  });

  const recall = hit / Math.max(1, small.size);
  const precision = hit / Math.max(1, big.size);
  // The 1e-9 floor avoids 0/0 when there is no overlap at all.
  return (2 * precision * recall) / Math.max(1e-9, precision + recall);
}
/**
 * Heuristic similarity score between two bottle names (higher = more alike).
 *
 * Both names go through normSearchText() and tokenizeQuery() (imported from
 * ../sku.js) and are then filtered with filterSimTokens(). The score combines:
 *   - a bonus when the first filtered tokens are identical;
 *   - overlap of the remaining ("tail") tokens, damped by a gate when the
 *     first tokens differ;
 *   - Levenshtein similarity of the normalised strings;
 *   - numberMismatchPenalty() on the filtered tokens;
 *   - an age multiplier (x2.2 same age, x0.18 different ages, none if either
 *     name has no age statement);
 *   - a final boost of (1 + 0.9 * tokenContainmentScore).
 *
 * @param {*} aName - First name.
 * @param {*} bName - Second name.
 * @returns {number} Non-negative score; 0 when either name normalises to a
 *   falsy value or has no tokens left after filtering.
 */
export function similarityScore(aName, bName) {
  const a = normSearchText(aName);
  const b = normSearchText(bName);
  if (!a || !b) return 0;

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);

  const aRawTokens = tokenizeQuery(a);
  const bRawTokens = tokenizeQuery(b);
  const aKept = filterSimTokens(aRawTokens);
  const bKept = filterSimTokens(bRawTokens);
  if (aKept.length === 0 || bKept.length === 0) return 0;

  const contain = tokenContainmentScore(aRawTokens, bRawTokens);

  // Split each filtered list into its leading token and the rest.
  const [aLead, ...aRest] = aKept;
  const [bLead, ...bRest] = bKept;
  const sameLead = Boolean(aLead) && aLead === bLead;

  // Fraction of tail tokens shared, relative to the larger tail.
  const aTail = new Set(aRest);
  const bTail = new Set(bRest);
  let shared = 0;
  aTail.forEach((tok) => {
    if (bTail.has(tok)) shared += 1;
  });
  const overlapTail = shared / Math.max(1, aTail.size, bTail.size);

  // Whole-string edit similarity.
  const distance = levenshtein(a, b);
  const levSim = 1 - distance / Math.max(1, a.length, b.length);

  // When the leading tokens differ, tail overlap only counts in proportion to
  // containment, and barely at all for short names with weak containment.
  let gate;
  if (sameLead) {
    gate = 1.0;
  } else {
    gate = Math.min(0.8, 0.06 + 0.95 * contain);
    const fewTokens = Math.min(aKept.length, bKept.length) <= 3;
    if (fewTokens && contain < 0.78) gate *= 0.18;
  }

  const leadTerm = sameLead ? 3.0 : 0;
  const tailTerm = overlapTail * 2.2 * gate;
  const levWeight = sameLead ? 1.0 : 0.1 + 0.7 * contain;
  const levTerm = levSim * levWeight;

  let score = numberMismatchPenalty(aKept, bKept) * (leadTerm + tailTerm + levTerm);

  // Age statements only matter when both names have one.
  if (aAge && bAge) score *= aAge === bAge ? 2.2 : 0.18;

  score *= 1 + 0.9 * contain;
  return score;
}
/**
 * Variant of the name-similarity score that takes pre-computed tokens and
 * normalised names and does not compute a Levenshtein distance.
 *
 * The score combines:
 *   - a bonus when the first filtered tokens are identical;
 *   - overlap of the remaining ("tail") tokens, damped by a gate when the
 *     first tokens differ;
 *   - a small bonus when the first tokens match and the first 10 characters
 *     of both normalised names are identical;
 *   - numberMismatchPenalty() on the filtered tokens;
 *   - an age multiplier (x2.0 same age, x0.2 different ages, none if either
 *     name has no age statement);
 *   - a final boost of (1 + 0.9 * tokenContainmentScore).
 *
 * @param {string[]} [aTokens] - Raw tokens of the first name (falsy -> empty).
 * @param {string[]} [bTokens] - Raw tokens of the second name (falsy -> empty).
 * @param {*} aNormName - Normalised first name (falsy -> "").
 * @param {*} bNormName - Normalised second name (falsy -> "").
 * @returns {number} Non-negative score; 0 when either token list is empty
 *   after filtering.
 */
export function fastSimilarityScore(aTokens, bTokens, aNormName, bNormName) {
  const aRawTokens = aTokens || [];
  const bRawTokens = bTokens || [];
  const aKept = filterSimTokens(aRawTokens);
  const bKept = filterSimTokens(bRawTokens);
  if (aKept.length === 0 || bKept.length === 0) return 0;

  const a = String(aNormName || "");
  const b = String(bNormName || "");
  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);

  const contain = tokenContainmentScore(aRawTokens, bRawTokens);

  // Split each filtered list into its leading token and the rest.
  const [aLead, ...aTail] = aKept;
  const [bLead, ...bTail] = bKept;
  const sameLead = Boolean(aLead) && aLead === bLead;

  // Fraction of tail tokens shared, relative to the longer tail.
  const bTailSet = new Set(bTail);
  const shared = aTail.filter((tok) => bTailSet.has(tok)).length;
  const overlapTail = shared / Math.max(1, aTail.length, bTail.length);

  // Bonus for an identical (non-empty) 10-character prefix.
  const aPrefix = a.slice(0, 10);
  const bPrefix = b.slice(0, 10);
  const pref = sameLead && aPrefix !== "" && aPrefix === bPrefix ? 0.2 : 0;

  // When the leading tokens differ, tail overlap only counts in proportion to
  // containment, and barely at all for short names with weak containment.
  let gate;
  if (sameLead) {
    gate = 1.0;
  } else {
    gate = Math.min(0.8, 0.06 + 0.95 * contain);
    const fewTokens = Math.min(aKept.length, bKept.length) <= 3;
    if (fewTokens && contain < 0.78) gate *= 0.18;
  }

  const leadTerm = sameLead ? 2.4 : 0;
  const tailTerm = overlapTail * 2.0 * gate;

  let score = numberMismatchPenalty(aKept, bKept) * (leadTerm + tailTerm + pref);

  // Age statements only matter when both names have one.
  if (aAge && bAge) score *= aAge === bAge ? 2.0 : 0.2;

  score *= 1 + 0.9 * contain;
  return score;
}