diff --git a/viz/app/linker/suggestions.js b/viz/app/linker/suggestions.js index f0ac3cb..7da7d0b 100644 --- a/viz/app/linker/suggestions.js +++ b/viz/app/linker/suggestions.js @@ -257,245 +257,318 @@ export function recommendSimilar( - -export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnoredPairFn, sameStoreFn) { - const itemsAll = allAgg.filter((it) => !!it); - - const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0; - const rnd = mulberry32(seed); - const itemsShuf = itemsAll.slice(); - shuffleInPlace(itemsShuf, rnd); - - const WORK_CAP = 5000; - const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf; - - const work = workAll.filter((it) => !(mappedSkus && mappedSkus.has(String(it.sku)))); - - function itemRank(it) { - const stores = it.stores ? it.stores.size : 0; - const hasPrice = it.cheapestPriceNum != null ? 1 : 0; - const hasName = it.name ? 1 : 0; - const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0; - return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25; - } - - function smwsPairsFirst(workArr, limit) { - const buckets = new Map(); // code -> items[] - for (const it of workArr) { - if (!it) continue; - const sku = String(it.sku || ""); - if (!sku) continue; - - const code = smwsKeyFromName(it.name || ""); - if (!code) continue; - - let arr = buckets.get(code); - if (!arr) buckets.set(code, (arr = [])); - arr.push(it); + export function computeInitialPairsFast( + allAgg, + mappedSkus, + limitPairs, + isIgnoredPairFn, + sameStoreFn, + sizePenaltyFn // ✅ NEW: pass sizePenaltyForPair in + ) { + const itemsAll = allAgg.filter((it) => !!it); + + const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0; + const rnd = mulberry32(seed); + const itemsShuf = itemsAll.slice(); + shuffleInPlace(itemsShuf, rnd); + + // Bigger cap is fine; still bounded + const WORK_CAP = Math.min(9000, itemsShuf.length); + const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf; + + // Unmapped-only view for normal similarity stage + const work = workAll.filter((it) => { + if (!it) return false; + return !(mappedSkus && mappedSkus.has(String(it.sku))); + }); + + function itemRank(it) { + const stores = it.stores ? it.stores.size : 0; + const hasPrice = it.cheapestPriceNum != null ? 1 : 0; + const hasName = it.name ? 1 : 0; + const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0; + return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25; } - - const candPairs = []; - - for (const arr0 of buckets.values()) { - if (!arr0 || arr0.length < 2) continue; - - const arr = arr0 - .slice() - .sort((a, b) => itemRank(b) - itemRank(a)) - .slice(0, 80); - - const mapped = []; - const unmapped = []; - for (const it of arr) { + + // --- SMWS exact-code pairs first (kept as-is, but apply sameStore/isIgnored) --- + function smwsPairsFirst(workArr, limit) { + const buckets = new Map(); // code -> items[] + for (const it of workArr) { + if (!it) continue; const sku = String(it.sku || ""); - if (mappedSkus && mappedSkus.has(sku)) mapped.push(it); - else unmapped.push(it); + if (!sku) continue; + const code = smwsKeyFromName(it.name || ""); + if (!code) continue; + let arr = buckets.get(code); + if (!arr) buckets.set(code, (arr = [])); + arr.push(it); } - - const anchor = (mapped.length ? mapped : unmapped) - .slice() - .sort((a, b) => itemRank(b) - itemRank(a))[0]; - - if (!anchor) continue; - - if (unmapped.length) { - for (const u of unmapped) { + + const candPairs = []; + + for (const arr0 of buckets.values()) { + if (!arr0 || arr0.length < 2) continue; + + const arr = arr0 + .slice() + .sort((a, b) => itemRank(b) - itemRank(a)) + .slice(0, 80); + + // Prefer an unmapped anchor if possible; otherwise best overall + const anchor = arr.slice().sort((a, b) => itemRank(b) - itemRank(a))[0]; + if (!anchor) continue; + + for (const u of arr) { + if (u === anchor) continue; const a = anchor; const b = u; const aSku = String(a.sku || ""); const bSku = String(b.sku || ""); if (!aSku || !bSku || aSku === bSku) continue; + + // Only link *unmapped* targets in this stage + if (mappedSkus && mappedSkus.has(bSku)) continue; + if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) continue; if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) continue; - + const s = 1e9 + itemRank(a) + itemRank(b); - candPairs.push({ a, b, score: s, aIsMapped: mappedSkus && mappedSkus.has(aSku) }); + candPairs.push({ a, b, score: s }); } } + + candPairs.sort((x, y) => y.score - x.score); + + const usedUnmapped = new Set(); + const out0 = []; + for (const p of candPairs) { + const bSku = String(p.b.sku || ""); + if (!bSku) continue; + if (usedUnmapped.has(bSku)) continue; + usedUnmapped.add(bSku); + out0.push(p); + if (out0.length >= limit) break; + } + + return { pairs: out0, usedUnmapped }; } - - candPairs.sort((x, y) => y.score - x.score); - - const usedUnmapped = new Set(); - const anchorUse = new Map(); - const ANCHOR_REUSE_CAP = 6; - - const out0 = []; - for (const p of candPairs) { + + const smwsFirst = smwsPairsFirst(workAll, limitPairs); + const used = new Set(smwsFirst.usedUnmapped); + const out = smwsFirst.pairs.slice(); + if (out.length >= limitPairs) return out.slice(0, limitPairs); + + // --- Improved general pairing logic (uses same “good” scoring knobs) --- + + const seeds = topSuggestions(work, Math.min(220, work.length), "", mappedSkus).filter( + (it) => !used.has(String(it?.sku || "")) + ); + + // Build token buckets over *normalized* names (better hits) + const TOKEN_BUCKET_CAP = 700; + const tokMap = new Map(); // token -> items[] + const itemRawToks = new Map(); // sku -> raw tokens + const itemNorm = new Map(); // sku -> norm name + const itemFilt = new Map(); // sku -> filtered tokens (for first-token logic) + + for (const it of work) { + const sku = String(it.sku || ""); + if (!sku) continue; + + const n = normSearchText(it.name || ""); + const raw = tokenizeQuery(n); + const filt = filterSimTokens(raw); + + itemNorm.set(sku, n); + itemRawToks.set(sku, raw); + itemFilt.set(sku, filt); + + // bucket using a handful of filtered tokens (higher signal) + for (const t of filt.slice(0, 12)) { + let arr = tokMap.get(t); + if (!arr) tokMap.set(t, (arr = [])); + if (arr.length < TOKEN_BUCKET_CAP) arr.push(it); + } + } + + const bestByPair = new Map(); + const MAX_CAND_TOTAL = 450; + const MAX_CHEAP = 40; + const MAX_FINE = 18; + + for (const a of seeds) { + const aSku = String(a.sku || ""); + if (!aSku || used.has(aSku)) continue; + + const aNorm = itemNorm.get(aSku) || normSearchText(a.name || ""); + const aRaw = itemRawToks.get(aSku) || tokenizeQuery(aNorm); + const aFilt = itemFilt.get(aSku) || filterSimTokens(aRaw); + if (!aFilt.length) continue; + + const aBrand = aFilt[0] || ""; + const aAge = extractAgeFromText(aNorm); + + // Gather candidates from token buckets + const cand = new Map(); + for (const t of aFilt.slice(0, 10)) { + const arr = tokMap.get(t); + if (!arr) continue; + + for (let i = 0; i < arr.length && cand.size < MAX_CAND_TOTAL; i++) { + const b = arr[i]; + if (!b) continue; + const bSku = String(b.sku || ""); + if (!bSku || bSku === aSku) continue; + if (used.has(bSku)) continue; + if (mappedSkus && mappedSkus.has(bSku)) continue; + + if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) continue; + if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) continue; + + cand.set(bSku, b); + } + if (cand.size >= MAX_CAND_TOTAL) break; + } + if (!cand.size) continue; + + // Cheap score stage (fastSimilarity + containment + size + age + first-token mismatch penalty) + const cheap = []; + for (const b of cand.values()) { + const bSku = String(b.sku || ""); + const bNorm = itemNorm.get(bSku) || normSearchText(b.name || ""); + const bRaw = itemRawToks.get(bSku) || tokenizeQuery(bNorm); + const bFilt = itemFilt.get(bSku) || filterSimTokens(bRaw); + if (!bFilt.length) continue; + + const contain = tokenContainmentScore(aRaw, bRaw); + const bBrand = bFilt[0] || ""; + const firstMatch = aBrand && bBrand && aBrand === bBrand; + + let s = fastSimilarityScore(aRaw, bRaw, aNorm, bNorm); + if (s <= 0) s = 0.01 + 0.25 * contain; + + if (!firstMatch) { + const smallN = Math.min(aFilt.length || 0, bFilt.length || 0); + let mult = 0.10 + 0.95 * contain; + if (smallN <= 3 && contain < 0.78) mult *= 0.22; + s *= Math.min(1.0, mult); + } + + if (typeof sizePenaltyFn === "function") s *= sizePenaltyFn(aSku, bSku); + + const bAge = extractAgeFromText(bNorm); + if (aAge && bAge) { + if (aAge === bAge) s *= 1.6; + else s *= 0.22; + } + + if (String(aSku).startsWith("u:") || String(bSku).startsWith("u:")) s *= 1.06; + + if (s > 0) cheap.push({ b, s, bNorm, bRaw, bFilt, contain, firstMatch, bAge }); + } + + if (!cheap.length) continue; + cheap.sort((x, y) => y.s - x.s); + + // Fine stage (expensive similarityScore + same penalties again) + let bestB = null; + let bestS = 0; + + for (const x of cheap.slice(0, MAX_FINE)) { + const b = x.b; + const bSku = String(b.sku || ""); + + let s = similarityScore(a.name || "", b.name || ""); + if (s <= 0) continue; + + // first-token mismatch soft penalty + if (!x.firstMatch) { + const smallN = Math.min(aFilt.length || 0, (x.bFilt || []).length || 0); + let mult = 0.10 + 0.95 * x.contain; + if (smallN <= 3 && x.contain < 0.78) mult *= 0.22; + s *= Math.min(1.0, mult); + if (s <= 0) continue; + } + + if (typeof sizePenaltyFn === "function") { + s *= sizePenaltyFn(aSku, bSku); + if (s <= 0) continue; + } + + if (aAge && x.bAge) { + if (aAge === x.bAge) s *= 2.0; + else s *= 0.15; + } + + if (String(aSku).startsWith("u:") || String(bSku).startsWith("u:")) s *= 1.10; + + if (s > bestS) { + bestS = s; + bestB = b; + } + } + + // Threshold (slightly lower than before, because we now punish mismatches more intelligently) + if (!bestB || bestS < 0.50) continue; + + const bSku = String(bestB.sku || ""); + if (!bSku || used.has(bSku)) continue; + + const key = aSku < bSku ? `${aSku}|${bSku}` : `${bSku}|${aSku}`; + const prev = bestByPair.get(key); + if (!prev || bestS > prev.score) bestByPair.set(key, { a, b: bestB, score: bestS }); + } + + const pairs = Array.from(bestByPair.values()); + pairs.sort((x, y) => y.score - x.score); + + // ---- light randomness inside a top band (same behavior as before) ---- + const need = Math.max(0, limitPairs - out.length); + if (!need) return out.slice(0, limitPairs); + + const TOP_BAND = Math.min(700, pairs.length); + const JITTER = 0.08; + + const band = pairs.slice(0, TOP_BAND).map((p) => { + const jitter = (rnd() - 0.5) * JITTER; + return { ...p, _rank: p.score * (1 + jitter) }; + }); + band.sort((a, b) => b._rank - a._rank); + + function tryTake(p) { const aSku = String(p.a.sku || ""); const bSku = String(p.b.sku || ""); - if (!aSku || !bSku) continue; - - if (usedUnmapped.has(bSku)) continue; - - const k = aSku; - const n = anchorUse.get(k) || 0; - if (n >= ANCHOR_REUSE_CAP) continue; - - usedUnmapped.add(bSku); - anchorUse.set(k, n + 1); - out0.push(p); - - if (out0.length >= limit) break; + if (!aSku || !bSku || aSku === bSku) return false; + if (used.has(aSku) || used.has(bSku)) return false; + if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) return false; + if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) return false; + + used.add(aSku); + used.add(bSku); + out.push({ a: p.a, b: p.b, score: p.score }); + return true; } - - return { pairs: out0, usedUnmapped }; - } - - const smwsFirst = smwsPairsFirst(workAll, limitPairs); - const used = new Set(smwsFirst.usedUnmapped); - const out = smwsFirst.pairs.slice(); - - if (out.length >= limitPairs) return out.slice(0, limitPairs); - - const seeds = topSuggestions(work, Math.min(150, work.length), "", mappedSkus).filter( - (it) => !used.has(String(it?.sku || "")) - ); - - const TOKEN_BUCKET_CAP = 500; - const tokMap = new Map(); - const itemTokens = new Map(); - const itemNormName = new Map(); - - for (const it of work) { - const toks = Array.from(new Set(tokenizeQuery(it.name || ""))).filter(Boolean).slice(0, 10); - itemTokens.set(it.sku, toks); - itemNormName.set(it.sku, normSearchText(it.name || "")); - for (const t of toks) { - let arr = tokMap.get(t); - if (!arr) tokMap.set(t, (arr = [])); - if (arr.length < TOKEN_BUCKET_CAP) arr.push(it); - } - } - - const bestByPair = new Map(); - const MAX_CAND_TOTAL = 250; - const MAX_FINE = 10; - - for (const a of seeds) { - const aSku = String(a.sku || ""); - if (!aSku || used.has(aSku)) continue; - - const aToks = itemTokens.get(aSku) || []; - if (!aToks.length) continue; - - const cand = new Map(); - for (const t of aToks) { - const arr = tokMap.get(t); - if (!arr) continue; - - for (let i = 0; i < arr.length && cand.size < MAX_CAND_TOTAL; i++) { - const b = arr[i]; - if (!b) continue; - const bSku = String(b.sku || ""); - if (!bSku || bSku === aSku) continue; - if (used.has(bSku)) continue; - if (mappedSkus && mappedSkus.has(bSku)) continue; - - if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) continue; - if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) continue; - - cand.set(bSku, b); - } - if (cand.size >= MAX_CAND_TOTAL) break; - } - if (!cand.size) continue; - - const aNameN = itemNormName.get(aSku) || ""; - const cheap = []; - for (const b of cand.values()) { - const bSku = String(b.sku || ""); - const bToks = itemTokens.get(bSku) || []; - const bNameN = itemNormName.get(bSku) || ""; - const s = fastSimilarityScore(aToks, bToks, aNameN, bNameN); - if (s > 0) cheap.push({ b, s }); - } - if (!cheap.length) continue; - cheap.sort((x, y) => y.s - x.s); - - let bestB = null; - let bestS = 0; - for (const x of cheap.slice(0, MAX_FINE)) { - const s = similarityScore(a.name || "", x.b.name || ""); - if (s > bestS) { - bestS = s; - bestB = x.b; - } - } - - if (!bestB || bestS < 0.6) continue; - - const bSku = String(bestB.sku || ""); - if (!bSku || used.has(bSku)) continue; - - const key = aSku < bSku ? `${aSku}|${bSku}` : `${bSku}|${aSku}`; - const prev = bestByPair.get(key); - if (!prev || bestS > prev.score) bestByPair.set(key, { a, b: bestB, score: bestS }); - } - - const pairs = Array.from(bestByPair.values()); - pairs.sort((x, y) => y.score - x.score); - - const need = Math.max(0, limitPairs - out.length); - if (!need) return out.slice(0, limitPairs); - - const TOP_BAND = Math.min(600, pairs.length); - const JITTER = 0.08; - - const band = pairs.slice(0, TOP_BAND).map((p) => { - const jitter = (rnd() - 0.5) * JITTER; - return { ...p, _rank: p.score * (1 + jitter) }; - }); - band.sort((a, b) => b._rank - a._rank); - - function tryTake(p) { - const aSku = String(p.a.sku || ""); - const bSku = String(p.b.sku || ""); - if (!aSku || !bSku || aSku === bSku) return false; - if (used.has(aSku) || used.has(bSku)) return false; - if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) return false; - - used.add(aSku); - used.add(bSku); - out.push({ a: p.a, b: p.b, score: p.score }); - return true; - } - - for (const p of band) { - if (out.length >= limitPairs) break; - tryTake(p); - } - - if (out.length < limitPairs) { - for (let i = TOP_BAND; i < pairs.length; i++) { + + for (const p of band) { if (out.length >= limitPairs) break; - tryTake(pairs[i]); + tryTake(p); } + + if (out.length < limitPairs) { + for (let i = TOP_BAND; i < pairs.length; i++) { + if (out.length >= limitPairs) break; + tryTake(pairs[i]); + } + } + + return out.slice(0, limitPairs); } + - return out.slice(0, limitPairs); -} -function fnv1a32u(str) { + + + + function fnv1a32u(str) { let h = 0x811c9dc5; str = String(str || ""); for (let i = 0; i < str.length; i++) { diff --git a/viz/app/linker_page.js b/viz/app/linker_page.js index 51d0143..2991d17 100644 --- a/viz/app/linker_page.js +++ b/viz/app/linker_page.js @@ -139,7 +139,30 @@ export async function renderSkuLinker($app) { return String(rules.canonicalSku(aSku)) === String(rules.canonicalSku(bSku)); } - const initialPairs = computeInitialPairsFast(allAgg, mappedSkus, 28, isIgnoredPair, sameStoreCanon); + let initialPairs = null; + + function getInitialPairsIfNeeded() { + // never compute if either side is pinned + if (pinnedL || pinnedR) return null; + + // never compute if URL query param was used (preselect flow) + if (shouldReloadAfterLink) return null; + + if (initialPairs) return initialPairs; + + initialPairs = computeInitialPairsFast( + allAgg, + mappedSkus, + 28, + isIgnoredPair, + sameStoreCanon, + sizePenaltyForPair // ✅ NEW + ); + + return initialPairs; + } + + let pinnedL = null; let pinnedR = null; @@ -221,16 +244,17 @@ export async function renderSkuLinker($app) { sameGroup ); - if (initialPairs && initialPairs.length) { - const list = side === "L" ? initialPairs.map((p) => p.a) : initialPairs.map((p) => p.b); - return list.filter( - (it) => - it && - it.sku !== otherSku && - (!mappedSkus.has(String(it.sku)) || smwsKeyFromName(it.name || "")) - ); - } - + const pairs = getInitialPairsIfNeeded(); + if (pairs && pairs.length) { + const list = side === "L" ? pairs.map((p) => p.a) : pairs.map((p) => p.b); + return list.filter( + (it) => + it && + it.sku !== otherSku && + (!mappedSkus.has(String(it.sku)) || smwsKeyFromName(it.name || "")) + ); + } + return topSuggestions(allAgg, 60, otherSku, mappedSkus); }