diff --git a/viz/app/linker_page.js b/viz/app/linker_page.js index 222363f..9235009 100644 --- a/viz/app/linker_page.js +++ b/viz/app/linker_page.js @@ -26,6 +26,7 @@ import { } from "./pending.js"; /* ---------------- Similarity helpers ---------------- */ + // Ignore ultra-common / low-signal tokens in bottle names. const SIM_STOP_TOKENS = new Set([ "the", @@ -60,7 +61,7 @@ const SIM_STOP_TOKENS = new Set([ "abv", "proof", - // helps your Benromach “20th Anniversary” case + // helps e.g. "20th Anniversary" "anniversary", ]); @@ -213,67 +214,6 @@ function levenshtein(a, b) { return dp[m]; } - -/* ---------------- Size helpers ---------------- */ - -const SIZE_TOLERANCE_ML = 8; // tolerate minor formatting noise (e.g. 749 vs 750) - -function parseSizesMlFromText(text) { - const s = String(text || "").toLowerCase(); - if (!s) return []; - - const out = new Set(); - - // 750ml, 700 ml, 1140ml, 1.14l, 70cl, etc. - const re = /\b(\d+(?:\.\d+)?)\s*(ml|cl|l|litre|litres|liter|liters)\b/g; - let m; - while ((m = re.exec(s))) { - const val = parseFloat(m[1]); - const unit = m[2]; - if (!isFinite(val) || val <= 0) continue; - - let ml = 0; - if (unit === "ml") ml = Math.round(val); - else if (unit === "cl") ml = Math.round(val * 10); - else ml = Math.round(val * 1000); // l/litre/liter - - // sanity: ignore crazy - if (ml >= 50 && ml <= 5000) out.add(ml); - } - - return Array.from(out); -} - -function mergeSizeSet(intoSet, sizesArr) { - if (!intoSet || !sizesArr) return; - for (const x of sizesArr) { - const n = Number(x); - if (Number.isFinite(n) && n > 0) intoSet.add(n); - } -} - -function sizeSetsMatch(aSet, bSet) { - if (!aSet?.size || !bSet?.size) return false; - for (const a of aSet) { - for (const b of bSet) { - if (Math.abs(a - b) <= SIZE_TOLERANCE_ML) return true; - } - } - return false; -} - -function sizePenalty(aSet, bSet) { - // If either side has no known sizes, don't punish much. - if (!aSet?.size || !bSet?.size) return 1.0; - - // If any size matches (within tolerance), no penalty. - if (sizeSetsMatch(aSet, bSet)) return 1.0; - - // Both have sizes but none match => probably different products (750 vs 1140). - return 0.08; -} - - function tokenContainmentScore(aTokens, bTokens) { // Measures how well the smaller token set is contained in the larger one. // Returns 0..1 (1 = perfect containment). @@ -406,6 +346,57 @@ function fastSimilarityScore(aTokens, bTokens, aNormName, bNormName) { return s; } +/* ---------------- Size helpers ---------------- */ + +const SIZE_TOLERANCE_ML = 8; // tolerate minor formatting noise (e.g. 749 vs 750) + +function parseSizesMlFromText(text) { + const s = String(text || "").toLowerCase(); + if (!s) return []; + + const out = new Set(); + + // 750ml, 700 ml, 1140ml, 1.14l, 70cl, etc. + const re = /\b(\d+(?:\.\d+)?)\s*(ml|cl|l|litre|litres|liter|liters)\b/g; + let m; + while ((m = re.exec(s))) { + const val = parseFloat(m[1]); + const unit = m[2]; + if (!isFinite(val) || val <= 0) continue; + + let ml = 0; + if (unit === "ml") ml = Math.round(val); + else if (unit === "cl") ml = Math.round(val * 10); + else ml = Math.round(val * 1000); // l/litre/liter + + // sanity: ignore crazy + if (ml >= 50 && ml <= 5000) out.add(ml); + } + + return Array.from(out); +} + +function sizeSetsMatch(aSet, bSet) { + if (!aSet?.size || !bSet?.size) return false; + for (const a of aSet) { + for (const b of bSet) { + if (Math.abs(a - b) <= SIZE_TOLERANCE_ML) return true; + } + } + return false; +} + +function sizePenalty(aSet, bSet) { + // If either side has no known sizes, don't punish much. + if (!aSet?.size || !bSet?.size) return 1.0; + + // If any size matches (within tolerance), no penalty. + if (sizeSetsMatch(aSet, bSet)) return 1.0; + + // Both have sizes but none match => probably different products (750 vs 1140). + return 0.08; +} + /* ---------------- Store-overlap rule ---------------- */ function storesOverlap(aItem, bItem) { @@ -580,6 +571,7 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus, isI const pinnedSku = String(pinned.sku || ""); const otherSku = otherPinnedSku ? String(otherPinnedSku) : ""; + const base = String(pinned.name || ""); const pinNorm = normSearchText(pinned.name || ""); const pinRawToks = tokenizeQuery(pinNorm); @@ -587,20 +579,35 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus, isI // "brand" = first meaningful token (usually distillery) const pinBrand = pinToks[0] || ""; - const pinAge = extractAgeFromText(pinNorm); + const pinnedSmws = smwsKeyFromName(pinned.name || ""); const scored = []; for (const it of allAgg) { if (!it) continue; const itSku = String(it.sku || ""); - if (!itSku || itSku === pinnedSku || (otherSku && itSku === otherSku)) continue; + if (!itSku) continue; + + if (itSku === pinnedSku) continue; + if (otherSku && itSku === otherSku) continue; if (storesOverlap(pinned, it)) continue; if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(pinnedSku, itSku)) continue; + // SMWS exact NUM.NUM match => force to top (requires SMWS + code match) + if (pinnedSmws) { + const k = smwsKeyFromName(it.name || ""); + if (k && k === pinnedSmws) { + const stores = it.stores ? it.stores.size : 0; + const hasPrice = it.cheapestPriceNum != null ? 1 : 0; + const s = 1e9 + stores * 10 + hasPrice; // tie-break within exact matches + scored.push({ it, s }); + continue; + } + } + const itNorm = normSearchText(it.name || ""); if (!itNorm) continue; @@ -608,20 +615,17 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus, isI const itToks = filterSimTokens(itRawToks); const itBrand = itToks[0] || ""; - // HARD brand gate: if brands disagree, skip. - // This eliminates Tamnavulin/Jura/etc from a Benromach pin. + // HARD brand gate: eliminates Tamnavulin/Jura/etc when Benromach pinned if (pinBrand && itBrand && pinBrand !== itBrand) continue; let s = similarityScore(base, it.name || ""); if (s <= 0) continue; if (typeof sizePenaltyFn === "function") { - s *= sizePenaltyFn(pinnedSku, String(it.sku || "")); + s *= sizePenaltyFn(pinnedSku, itSku); + if (s <= 0) continue; } - if (s > 0) scored.push({ it, s }); - - // Extra age boost when pinned has an age and candidate matches it. const itAge = extractAgeFromText(itNorm); if (pinAge && itAge) { @@ -629,14 +633,18 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus, isI else s *= 0.15; } - scored.push({ it, s }); + // Small boost if either side is an unknown sku (u:...) + const aUnknown = pinnedSku.startsWith("u:"); + const bUnknown = itSku.startsWith("u:"); + if (aUnknown || bUnknown) s *= 1.12; + + if (s > 0) scored.push({ it, s }); } scored.sort((a, b) => b.s - a.s); return scored.slice(0, limit).map((x) => x.it); } - function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnoredPairFn) { const itemsAll = allAgg.filter((it) => !!it); @@ -1018,60 +1026,70 @@ export async function renderSkuLinker($app) { const mappedSkus = buildMappedSkuSet(meta.links || [], rules); let ignoreSet = rules.ignoreSet; - /* ---------------- Canonical-group size cache ---------------- */ + /* ---------------- Canonical-group size cache (FAST) ---------------- */ - // sizes observed for a specific skuKey (from allRows + agg name) - const SKU_SIZE_CACHE = new Map(); // skuKey -> Set + // skuKey -> Set + const SKU_SIZE_CACHE = new Map(); - function skuSizesMl(skuKey) { - const k = String(skuKey || ""); - if (!k) return new Set(); - const prev = SKU_SIZE_CACHE.get(k); - if (prev) return prev; - - const set = new Set(); - - // include agg display name (often best normalized name) - const agg = allAgg.find((x) => String(x?.sku || "") === k); - if (agg?.name) mergeSizeSet(set, parseSizesMlFromText(agg.name)); - - // include any row names for this skuKey - for (const r of allRows) { - if (!r || r.removed) continue; - if (String(keySkuForRow(r) || "") !== k) continue; - mergeSizeSet(set, parseSizesMlFromText(r.name || r.title || r.productName || "")); - } - - SKU_SIZE_CACHE.set(k, set); + function ensureSkuSet(k) { + let set = SKU_SIZE_CACHE.get(k); + if (!set) SKU_SIZE_CACHE.set(k, (set = new Set())); return set; } - // canonicalSku -> Set (sizes anywhere in that group) - const CANON_SIZE_CACHE = new Map(); - - for (const it of allAgg) { - const skuKey = String(it?.sku || ""); + // 1) One pass over rows (O(allRows)) + for (const r of allRows) { + if (!r || r.removed) continue; + const skuKey = String(keySkuForRow(r) || "").trim(); if (!skuKey) continue; - const canon = String(rules.canonicalSku(skuKey) || skuKey); - let set = CANON_SIZE_CACHE.get(canon); - if (!set) CANON_SIZE_CACHE.set(canon, (set = new Set())); - const s = skuSizesMl(skuKey); - for (const x of s) set.add(x); + + const name = r.name || r.title || r.productName || ""; + const sizes = parseSizesMlFromText(name); + if (!sizes.length) continue; + + const set = ensureSkuSet(skuKey); + for (const x of sizes) set.add(x); } - function groupSizesMl(skuKey) { - const canon = String(rules.canonicalSku(String(skuKey || "")) || ""); - return canon ? (CANON_SIZE_CACHE.get(canon) || new Set()) : new Set(); + // 2) One pass over aggregated names (O(allAgg)) + for (const it of allAgg) { + const skuKey = String(it?.sku || "").trim(); + if (!skuKey || !it?.name) continue; + const sizes = parseSizesMlFromText(it.name); + if (!sizes.length) continue; + + const set = ensureSkuSet(skuKey); + for (const x of sizes) set.add(x); + } + + // 3) canon -> Set (O(allAgg)) + const CANON_SIZE_CACHE = new Map(); + + function ensureCanonSet(k) { + let set = CANON_SIZE_CACHE.get(k); + if (!set) CANON_SIZE_CACHE.set(k, (set = new Set())); + return set; + } + + for (const it of allAgg) { + const skuKey = String(it?.sku || "").trim(); + if (!skuKey) continue; + + const canon = String(rules.canonicalSku(skuKey) || skuKey); + const canonSet = ensureCanonSet(canon); + + const skuSet = SKU_SIZE_CACHE.get(skuKey); + if (skuSet) for (const x of skuSet) canonSet.add(x); } function sizePenaltyForPair(aSku, bSku) { - const A = groupSizesMl(aSku); - const B = groupSizesMl(bSku); + const aCanon = String(rules.canonicalSku(String(aSku || "")) || ""); + const bCanon = String(rules.canonicalSku(String(bSku || "")) || ""); + const A = aCanon ? (CANON_SIZE_CACHE.get(aCanon) || new Set()) : new Set(); + const B = bCanon ? (CANON_SIZE_CACHE.get(bCanon) || new Set()) : new Set(); return sizePenalty(A, B); } - - function isIgnoredPair(a, b) { return rules.isIgnoredPair(String(a || ""), String(b || "")); } @@ -1148,7 +1166,15 @@ export async function renderSkuLinker($app) { // auto-suggestions: never include mapped skus if (otherPinned) - return recommendSimilar(allAgg, otherPinned, 60, otherSku, mappedSkus, isIgnoredPair, sizePenaltyForPair); + return recommendSimilar( + allAgg, + otherPinned, + 60, + otherSku, + mappedSkus, + isIgnoredPair, + sizePenaltyForPair + ); if (initialPairs && initialPairs.length) { const list = side === "L" ? initialPairs.map((p) => p.a) : initialPairs.map((p) => p.b);