/**
 * Fast & cheap similarity score: shared-token ratio plus a small prefix hint.
 * Used for initial pairing only (to rank candidates before the more expensive
 * full score is computed on the best few).
 *
 * Both token lists are de-duplicated before counting so the result is
 * symmetric in its arguments and never inflated by repeated tokens.
 *
 * @param {string[]} aTokens - tokens of the first name; null/undefined is treated as empty
 * @param {string[]} bTokens - tokens of the second name; null/undefined is treated as empty
 * @param {string} aNormName - normalized first name, used only for the prefix bonus
 * @param {string} bNormName - normalized second name, used only for the prefix bonus
 * @returns {number} 0 when either token list is empty; otherwise
 *   `overlap * 2.0 + prefixBonus`, where overlap is in 0..1 and the bonus is 0 or 0.2
 */
function fastSimilarityScore(aTokens, bTokens, aNormName, bNormName) {
  const aSet = new Set(aTokens || []);
  const bSet = new Set(bTokens || []);
  if (!aSet.size || !bSet.size) return 0;

  // count distinct shared tokens (token sets are small)
  let inter = 0;
  for (const t of aSet) if (bSet.has(t)) inter++;

  // both sizes are >= 1 here, so the denominator can never be zero
  const overlap = inter / Math.max(aSet.size, bSet.size);

  // small bonus when both names start with the same (non-empty) first 10 characters
  const aPref = String(aNormName || "").slice(0, 10);
  const bPref = String(bNormName || "").slice(0, 10);
  const pref = aPref && aPref === bPref ? 0.2 : 0;

  return overlap * 2.0 + pref;
}
- const items = allAgg.filter((it) => { - if (!it) return false; - if (isUnknownSkuKey(it.sku)) return false; - if (mappedSkus && mappedSkus.has(String(it.sku))) return false; - return true; - }); - - // Build token -> items index - const tokMap = new Map(); - const itemTokens = new Map(); - for (const it of items) { - const toks = Array.from(new Set(tokenizeQuery(it.name || ""))).filter(Boolean); - itemTokens.set(it.sku, toks); - for (const t of toks) { - let arr = tokMap.get(t); - if (!arr) tokMap.set(t, (arr = [])); - arr.push(it); - } - } - - // Best match per item from shared-token candidates - const bestByPairKey = new Map(); // "a|b" canonical -> {a,b,score} - for (const a of items) { - const toks = itemTokens.get(a.sku) || []; - const cand = new Set(); - for (const t of toks) { - const arr = tokMap.get(t); - if (!arr) continue; - for (const b of arr) { - if (!b) continue; - if (b.sku === a.sku) continue; // identical SKU never - cand.add(b); - } - } - - let bestB = null; - let bestS = 0; - for (const b of cand) { - const s = similarityScore(a.name || "", b.name || ""); - if (s > bestS) { - bestS = s; - bestB = b; - } - } - - // require some similarity - if (!bestB || bestS < 0.55) continue; - - const aSku = String(a.sku); - const bSku = String(bestB.sku); - const key = aSku < bSku ? 
`${aSku}|${bSku}` : `${bSku}|${aSku}`; - - const prev = bestByPairKey.get(key); - if (!prev || bestS > prev.score) bestByPairKey.set(key, { a, b: bestB, score: bestS }); - } - - const pairs = Array.from(bestByPairKey.values()); - pairs.sort((x, y) => y.score - x.score); - - // ensure we don't reuse a SKU across multiple initial pairs - const used = new Set(); - const out = []; - for (const p of pairs) { - const aSku = String(p.a.sku), - bSku = String(p.b.sku); - if (used.has(aSku) || used.has(bSku)) continue; - used.add(aSku); - used.add(bSku); - out.push({ a: p.a, b: p.b, score: p.score }); - if (out.length >= limitPairs) break; - } - return out; -} - function topSuggestions(allAgg, limit, otherPinnedSku, mappedSkus) { const scored = []; for (const it of allAgg) { @@ -669,6 +616,7 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus) { if (it.sku === pinned.sku) continue; if (otherPinnedSku && String(it.sku) === String(otherPinnedSku)) continue; + // keep this reasonably cheap (recommend list sizes are capped) const s = similarityScore(base, it.name || ""); if (s > 0) scored.push({ it, s }); } @@ -676,6 +624,117 @@ function recommendSimilar(allAgg, pinned, limit, otherPinnedSku, mappedSkus) { return scored.slice(0, limit).map((x) => x.it); } +// FAST initial pairing: avoids global O(n^2). Token index with caps + small candidate set + cheap scoring. 
/**
 * FAST initial pairing: suggests (a, b) item pairs whose names look similar,
 * whose SKUs differ, and whose SKUs are neither unknown nor already mapped.
 *
 * Work is bounded by caps rather than comparing every item with every other:
 * only seeds returned by `topSuggestions` are expanded, each token bucket and
 * each seed's candidate set is capped, candidates are ranked with the cheap
 * `fastSimilarityScore`, and only the top few are re-scored with the full
 * `similarityScore`.
 *
 * All internal lookups are keyed by `String(sku)` so numeric and string SKUs
 * behave the same.
 *
 * @param {Array<{sku: *, name: string}>} allAgg - aggregated items; null/undefined is treated as empty
 * @param {Set<string>} [mappedSkus] - SKUs (as strings) that are already mapped and must be excluded
 * @param {number} limitPairs - maximum number of pairs to return; values <= 0 yield an empty list
 * @returns {Array<{a: Object, b: Object, score: number}>} pairs sorted by descending score;
 *   no SKU appears in more than one pair
 */
function computeInitialPairsFast(allAgg, mappedSkus, limitPairs) {
  const MAX_SEEDS = 220; // how many seed items are expanded
  const TOKEN_BUCKET_CAP = 180; // per-token cap to prevent huge buckets
  const MAX_TOKENS_PER_ITEM = 10;
  const MAX_CAND_TOTAL = 90; // candidates gathered per seed
  const MAX_FINE = 6; // only run the expensive score on the top few
  const MIN_SCORE = 0.6; // threshold to avoid garbage; kept moderate

  const items = (allAgg || []).filter((it) => {
    if (!it) return false;
    if (isUnknownSkuKey(it.sku)) return false;
    if (mappedSkus && mappedSkus.has(String(it.sku))) return false;
    return true;
  });

  // pick a small seed set (fast + good enough)
  const seeds = topSuggestions(items, Math.min(MAX_SEEDS, items.length), "", mappedSkus);

  const tokMap = new Map(); // token -> item[]
  const itemTokens = new Map(); // String(sku) -> tokens[]
  const itemNormName = new Map(); // String(sku) -> normalized name

  for (const it of items) {
    // Key by the string form: every lookup below uses String(sku), and Map
    // keys are compared without type coercion (101 !== "101").
    const skuKey = String(it.sku || "");
    if (!skuKey) continue; // such items could never be a seed or a candidate

    const toks = Array.from(new Set(tokenizeQuery(it.name || "")))
      .filter(Boolean)
      .slice(0, MAX_TOKENS_PER_ITEM);
    itemTokens.set(skuKey, toks);
    itemNormName.set(skuKey, normSearchText(it.name || ""));

    for (const t of toks) {
      let bucket = tokMap.get(t);
      if (!bucket) tokMap.set(t, (bucket = []));
      if (bucket.length < TOKEN_BUCKET_CAP) bucket.push(it);
    }
  }

  const bestByPair = new Map(); // canonical "a|b" -> {a, b, score}

  for (const a of seeds) {
    const aSku = String(a.sku || "");
    const aToks = itemTokens.get(aSku) || [];
    if (!aSku || !aToks.length) continue;

    // Gather candidates sharing at least one token with the seed.
    // Already-mapped SKUs need no re-check here: buckets only hold `items`,
    // which were filtered against mappedSkus above.
    const cand = new Map(); // String(sku) -> item
    for (const t of aToks) {
      const bucket = tokMap.get(t);
      if (!bucket) continue;

      for (let i = 0; i < bucket.length && cand.size < MAX_CAND_TOTAL; i++) {
        const b = bucket[i];
        if (!b) continue;
        const bSku = String(b.sku || "");
        if (!bSku || bSku === aSku) continue;
        // defensive: re-check the unknown-SKU rule on the normalized string key
        if (isUnknownSkuKey(bSku)) continue;
        cand.set(bSku, b);
      }
      if (cand.size >= MAX_CAND_TOTAL) break;
    }
    if (!cand.size) continue;

    // cheap rank by token overlap
    const aNameN = itemNormName.get(aSku) || "";
    const cheap = [];
    for (const [bSku, b] of cand) {
      const bToks = itemTokens.get(bSku) || [];
      const bNameN = itemNormName.get(bSku) || "";
      const s = fastSimilarityScore(aToks, bToks, aNameN, bNameN);
      if (s > 0) cheap.push({ b, s });
    }
    if (!cheap.length) continue;
    cheap.sort((x, y) => y.s - x.s);

    // refine the top few with the full (expensive) score
    let bestB = null;
    let bestS = 0;
    for (const x of cheap.slice(0, MAX_FINE)) {
      const s = similarityScore(a.name || "", x.b.name || "");
      if (s > bestS) {
        bestS = s;
        bestB = x.b;
      }
    }
    if (!bestB || bestS < MIN_SCORE) continue;

    // canonical key so (a, b) and (b, a) collapse into one entry
    const bSku = String(bestB.sku || "");
    const key = aSku < bSku ? `${aSku}|${bSku}` : `${bSku}|${aSku}`;
    const prev = bestByPair.get(key);
    if (!prev || bestS > prev.score) bestByPair.set(key, { a, b: bestB, score: bestS });
  }

  const pairs = Array.from(bestByPair.values());
  pairs.sort((x, y) => y.score - x.score);

  // avoid reusing SKUs across initial pairs
  const used = new Set();
  const out = [];
  for (const p of pairs) {
    // checked before pushing so that limitPairs <= 0 returns no pairs at all
    if (out.length >= limitPairs) break;

    const aSku = String(p.a.sku || "");
    const bSku = String(p.b.sku || "");
    if (!aSku || !bSku || aSku === bSku) continue;
    if (used.has(aSku) || used.has(bSku)) continue;
    used.add(aSku);
    used.add(bSku);
    out.push({ a: p.a, b: p.b, score: p.score });
  }
  return out;
}