spirit-tracker/viz/app/linker/suggestions.js
Brennan Wilkes (Text Groove) 9552daeef8 link sku
2026-02-01 11:32:09 -08:00

580 lines
19 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// viz/app/linker/suggestions.js
import { tokenizeQuery, normSearchText } from "../sku.js";
import {
smwsKeyFromName,
extractAgeFromText,
filterSimTokens,
tokenContainmentScore,
fastSimilarityScore,
similarityScore,
} from "./similarity.js";
/* ---------------- Randomization helpers ---------------- */
/**
 * Creates a small, fast, deterministic pseudo-random generator (mulberry32
 * mixing function) from a numeric seed.
 *
 * The internal counter is wrapped to an unsigned 32-bit integer on every
 * step. Only the low 32 bits of the counter ever take part in the mixing, so
 * wrapping produces exactly the same numbers while preventing the counter
 * from growing past 2^53, where double-precision additions start to round
 * and the sequence would silently degrade.
 *
 * @param {number} seed - Seed value; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next float in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    // Keep the counter in uint32 range so the addition is always exact.
    state = (state + 0x6d2b79f5) >>> 0;
    let x = Math.imul(state ^ (state >>> 15), 1 | state);
    x ^= x + Math.imul(x ^ (x >>> 7), 61 | x);
    // Convert the mixed 32-bit value to a float in [0, 1).
    return ((x ^ (x >>> 14)) >>> 0) / 4294967296;
  };
}
/**
 * Shuffles an array in place (Fisher–Yates, walking from the last index down).
 *
 * @param {Array} arr - Array to shuffle; mutated.
 * @param {() => number} rnd - Random source; called once per index from
 *   `arr.length - 1` down to 1, expected to return floats in [0, 1).
 * @returns {Array} The same array instance, for chaining.
 */
function shuffleInPlace(arr, rnd) {
  let last = arr.length - 1;
  while (last > 0) {
    // Pick a partner index in [0, last] and swap it into the tail position.
    const pick = (rnd() * (last + 1)) | 0;
    [arr[last], arr[pick]] = [arr[pick], arr[last]];
    last -= 1;
  }
  return arr;
}
/* ---------------- Suggestion helpers ---------------- */
/**
 * Returns the most "link-worthy" items when there is no pinned item to
 * compare against, ranked by a simple popularity/completeness score.
 *
 * Score = stores * 2 + hasPrice * 1.2 + hasName * 1.0 + unknownSku * 0.6.
 * Ties keep their input order (Array.prototype.sort is stable).
 *
 * @param {Iterable<object>} allAgg - Aggregated items; falsy entries are skipped.
 *   Each item is read for `sku`, `name`, `stores` (uses `.size`) and
 *   `cheapestPriceNum`.
 * @param {number} limit - Maximum number of items to return.
 * @param {string} [otherPinnedSku] - SKU pinned on the other side; excluded.
 * @param {{has: (sku: string) => boolean}} [mappedSkus] - Already-mapped SKUs; excluded.
 * @returns {object[]} Up to `limit` items, best first.
 */
export function topSuggestions(allAgg, limit, otherPinnedSku, mappedSkus) {
  const scored = [];
  for (const it of allAgg) {
    if (!it) continue;
    const sku = String(it.sku);
    if (mappedSkus && mappedSkus.has(sku)) continue;
    if (otherPinnedSku && sku === String(otherPinnedSku)) continue;

    const stores = it.stores ? it.stores.size : 0;
    // `!= null` on purpose: a missing (undefined) price must not count as a
    // price, matching the other price checks in this file.
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0;
    scored.push({ it, s: stores * 2 + hasPrice * 1.2 + hasName * 1.0 + unknown * 0.6 });
  }
  scored.sort((a, b) => b.s - a.s);
  return scored.slice(0, limit).map((x) => x.it);
}
// NOTE: recommendSimilar() depends on the fnv1a32u(str) hash helper, which
// must be defined elsewhere in this file.
/**
 * Recommends catalog items that are likely the same product as `pinned`.
 *
 * Pipeline:
 *  1. Without a usable pinned name, defer to `topSuggestions`.
 *  2. Cheap stage: scan (a rotated window of) the catalog, drop hard-blocked
 *     pairs, score the rest with `fastSimilarityScore` plus soft penalties,
 *     and keep roughly the best MAX_CHEAP_KEEP candidates.
 *  3. Fine stage: re-score the top MAX_FINE candidates with the more
 *     expensive `similarityScore` and the same penalties.
 *  4. If the fine stage yields nothing, fall back to a popularity ranking of
 *     the first 250 eligible items.
 *
 * @param {object[]} allAgg - Aggregated catalog items (`sku`, `name`,
 *   `stores` with `.size`, `cheapestPriceNum` are read).
 * @param {object} pinned - The item to find matches for.
 * @param {number} limit - Maximum number of items to return.
 * @param {string} [otherPinnedSku] - SKU pinned on the other side; excluded.
 * @param {{has: (sku: string) => boolean}} [mappedSkus] - Already-mapped SKUs;
 *   excluded from the similarity scan.
 * @param {(a: string, b: string) => boolean} [isIgnoredPairFn] - Hard block.
 * @param {(a: string, b: string) => number} [sizePenaltyFn] - Score multiplier.
 * @param {(a: string, b: string) => boolean} [sameStoreFn] - Hard block.
 * @param {(a: string, b: string) => boolean} [sameGroupFn] - Hard block.
 * @returns {object[]} Up to `limit` items, best match first.
 */
export function recommendSimilar(
  allAgg,
  pinned,
  limit,
  otherPinnedSku,
  mappedSkus,
  isIgnoredPairFn,
  sizePenaltyFn,
  sameStoreFn,
  sameGroupFn
) {
  if (!pinned || !pinned.name) return topSuggestions(allAgg, limit, otherPinnedSku, mappedSkus);

  const pinnedSku = String(pinned.sku || "");
  const otherSku = otherPinnedSku ? String(otherPinnedSku) : "";
  const base = String(pinned.name || "");
  const pinNorm = normSearchText(pinned.name || "");
  const pinRawToks = tokenizeQuery(pinNorm);
  const pinToks = filterSimTokens(pinRawToks);
  const pinBrand = pinToks[0] || "";
  const pinAge = extractAgeFromText(pinNorm);
  const pinnedSmws = smwsKeyFromName(pinned.name || "");
  const pinnedIsUnknown = pinnedSku.startsWith("u:");

  // ---- Tuning knobs ----
  const MAX_SCAN = 5000; // scan cap for huge catalogs
  const FULL_SCAN_UNDER = 12000; // scan everything if the catalog is this small
  const MAX_CHEAP_KEEP = 320; // candidates kept from the cheap stage
  const MAX_FINE = 70; // expensive score only on the top N
  // ----------------------

  const byScoreDesc = (a, b) => b.s - a.s;

  // Amortized top-K keeper: only sorts/truncates once the buffer reaches 2k.
  function pushTopK(arr, item, k) {
    arr.push(item);
    if (arr.length >= k * 2) {
      arr.sort(byScoreDesc);
      arr.length = k;
    }
  }

  // Hard blocks shared by the similarity scan and the fallback.
  const isBlockedPair = (itSku) =>
    (typeof sameStoreFn === "function" && sameStoreFn(pinnedSku, itSku)) ||
    (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(pinnedSku, itSku)) ||
    (typeof sameGroupFn === "function" && sameGroupFn(pinnedSku, itSku));

  // Soft multiplier (never above 1) applied when the first filtered tokens
  // differ; short names with low containment are punished harder.
  const mismatchMultiplier = (candidateTokCount, contain) => {
    const smallN = Math.min(pinToks.length || 0, candidateTokCount || 0);
    let mult = 0.10 + 0.95 * contain;
    if (smallN <= 3 && contain < 0.78) mult *= 0.22;
    return Math.min(1.0, mult);
  };

  const cheap = [];
  const nAll = allAgg.length || 0;
  if (!nAll) return [];

  // Scan the whole catalog when it is not huge.
  const scanN = nAll <= FULL_SCAN_UNDER ? nAll : Math.min(MAX_SCAN, nAll);
  // Rotate the start (derived from the pinned SKU/name hash) so a capped scan
  // is not biased toward the front of the catalog, while still covering
  // scanN consecutive entries.
  const start = (fnv1a32u(pinnedSku || pinNorm) % nAll) >>> 0;
  // Optional debug: uncomment to verify we're actually hitting the region you expect
  // console.log("[linker] recommendSimilar scan2", { pinnedSku, nAll, scanN, start, startName: allAgg[start]?.name });

  for (let i = 0; i < scanN; i++) {
    const it = allAgg[(start + i) % nAll];
    if (!it) continue;
    const itSku = String(it.sku || "");
    if (!itSku) continue;
    if (itSku === pinnedSku) continue;
    if (otherSku && itSku === otherSku) continue;

    // Hard blocks only; everything below is a soft score adjustment.
    if (isBlockedPair(itSku)) continue;
    if (mappedSkus && mappedSkus.has(itSku)) continue;

    // Exact SMWS key match: give it a huge cheap-stage score so it always
    // survives into the fine stage.
    if (pinnedSmws) {
      const k = smwsKeyFromName(it.name || "");
      if (k && k === pinnedSmws) {
        const stores = it.stores ? it.stores.size : 0;
        const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
        pushTopK(
          cheap,
          { it, s: 1e9 + stores * 10 + hasPrice, itNorm: "", itRawToks: null },
          MAX_CHEAP_KEEP
        );
        continue;
      }
    }

    const itNorm = normSearchText(it.name || "");
    if (!itNorm) continue;
    const itRawToks = tokenizeQuery(itNorm);
    const itToks = filterSimTokens(itRawToks);
    if (!itToks.length) continue;

    const itBrand = itToks[0] || "";
    const firstMatch = pinBrand && itBrand && pinBrand === itBrand;
    const contain = tokenContainmentScore(pinRawToks, itRawToks);

    // Cheap score first; a non-positive score gets a small containment-based floor.
    let s0 = fastSimilarityScore(pinRawToks, itRawToks, pinNorm, itNorm);
    if (s0 <= 0) s0 = 0.01 + 0.25 * contain;

    // Soft first-token mismatch penalty (never blocks).
    if (!firstMatch) s0 *= mismatchMultiplier(itToks.length, contain);

    // Size penalty early.
    if (typeof sizePenaltyFn === "function") {
      s0 *= sizePenaltyFn(pinnedSku, itSku);
    }

    // Age handling early: only applies when both names carry an age.
    const itAge = extractAgeFromText(itNorm);
    if (pinAge && itAge) s0 *= pinAge === itAge ? 1.6 : 0.22;

    // Small boost when either side is an unknown ("u:") SKU.
    if (pinnedIsUnknown || itSku.startsWith("u:")) s0 *= 1.08;

    pushTopK(cheap, { it, s: s0, itNorm, itRawToks }, MAX_CHEAP_KEEP);
  }

  // Final sort/trim for the cheap stage.
  cheap.sort(byScoreDesc);
  if (cheap.length > MAX_CHEAP_KEEP) cheap.length = MAX_CHEAP_KEEP;

  // Fine stage: expensive scoring only on the top candidates.
  const fine = [];
  for (const x of cheap.slice(0, MAX_FINE)) {
    const it = x.it;
    const itSku = String(it.sku || "");
    let s = similarityScore(base, it.name || "");
    if (s <= 0) continue;

    // SMWS shortcut entries carry no cached text; compute it lazily here.
    const itNorm = x.itNorm || normSearchText(it.name || "");
    const itRawToks = x.itRawToks || tokenizeQuery(itNorm);
    const itToks = filterSimTokens(itRawToks);
    const itBrand = itToks[0] || "";
    const firstMatch = pinBrand && itBrand && pinBrand === itBrand;
    const contain = tokenContainmentScore(pinRawToks, itRawToks);

    if (!firstMatch) {
      s *= mismatchMultiplier(itToks.length, contain);
      if (s <= 0) continue;
    }
    if (typeof sizePenaltyFn === "function") {
      s *= sizePenaltyFn(pinnedSku, itSku);
      if (s <= 0) continue;
    }
    const itAge = extractAgeFromText(itNorm);
    if (pinAge && itAge) s *= pinAge === itAge ? 2.0 : 0.15;
    if (pinnedIsUnknown || itSku.startsWith("u:")) s *= 1.12;

    fine.push({ it, s });
  }
  fine.sort(byScoreDesc);
  const out = fine.slice(0, limit).map((x) => x.it);
  if (out.length) return out;

  // Fallback: popularity ranking over the first 250 eligible items.
  // NOTE(review): unlike the scan above, this does not exclude `mappedSkus`;
  // confirm whether that is intentional.
  const fallback = [];
  for (const it of allAgg) {
    if (!it) continue;
    const itSku = String(it.sku || "");
    if (!itSku) continue;
    if (itSku === pinnedSku) continue;
    if (otherSku && itSku === otherSku) continue;
    if (isBlockedPair(itSku)) continue;

    const stores = it.stores ? it.stores.size : 0;
    // `!= null` so a missing (undefined) price does not count as a price,
    // consistent with the SMWS branch above.
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    fallback.push({ it, s: stores * 2 + hasPrice * 1.2 + hasName * 1.0 });
    if (fallback.length >= 250) break;
  }
  fallback.sort(byScoreDesc);
  return fallback.slice(0, limit).map((x) => x.it);
}
/**
 * Builds an initial list of suggested link pairs `{ a, b, score }`.
 *
 * Steps:
 *  1. Shuffle the catalog with a fresh random seed and cap the working set.
 *  2. SMWS stage: items whose names share the same SMWS key are paired with
 *     the highest-ranked item of that key (the "anchor"); only unmapped
 *     items may be the `b` side. These pairs get a score above 1e9.
 *  3. General stage: for each seed item, gather candidates that share
 *     filtered name tokens, score them cheaply, re-score the best few with
 *     `similarityScore`, and keep the best partner if it scores >= 0.50.
 *  4. Apply a small random jitter inside the top band and greedily take
 *     pairs whose SKUs have not been used yet.
 *
 * The result is intentionally not reproducible between calls (random seed).
 *
 * @param {object[]} allAgg - Aggregated catalog items; falsy entries ignored.
 * @param {{has: (sku: string) => boolean}} [mappedSkus] - Already-mapped SKUs.
 * @param {number} limitPairs - Maximum number of pairs to return.
 * @param {(a: string, b: string) => boolean} [isIgnoredPairFn] - Hard block.
 * @param {(a: string, b: string) => boolean} [sameStoreFn] - Hard block.
 * @param {(a: string, b: string) => number} [sizePenaltyFn] - Score multiplier.
 * @returns {Array<{a: object, b: object, score: number}>} Suggested pairs.
 */
export function computeInitialPairsFast(
  allAgg,
  mappedSkus,
  limitPairs,
  isIgnoredPairFn,
  sameStoreFn,
  sizePenaltyFn
) {
  const itemsAll = allAgg.filter((it) => !!it);

  const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0;
  const rnd = mulberry32(seed);
  const itemsShuf = itemsAll.slice();
  shuffleInPlace(itemsShuf, rnd);

  // Bounded working set (random sample of the catalog when it is larger).
  const WORK_CAP = 9000;
  const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf;

  // Unmapped-only view for the general similarity stage.
  const work = workAll.filter((it) => !(mappedSkus && mappedSkus.has(String(it.sku))));

  const isSameStore = (aSku, bSku) => typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku);
  const isIgnored = (aSku, bSku) =>
    typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku);

  // Soft multiplier (never above 1) applied when the first filtered tokens
  // differ; short names with low containment are punished harder.
  const mismatchMultiplier = (aTokCount, bTokCount, contain) => {
    const smallN = Math.min(aTokCount || 0, bTokCount || 0);
    let mult = 0.10 + 0.95 * contain;
    if (smallN <= 3 && contain < 0.78) mult *= 0.22;
    return Math.min(1.0, mult);
  };

  function itemRank(it) {
    const stores = it.stores ? it.stores.size : 0;
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0;
    return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25;
  }

  // --- SMWS exact-key pairs first (sameStore / isIgnored still apply) ---
  function smwsPairsFirst(workArr, limit) {
    const buckets = new Map(); // SMWS key -> items[]
    for (const it of workArr) {
      if (!it) continue;
      const sku = String(it.sku || "");
      if (!sku) continue;
      const code = smwsKeyFromName(it.name || "");
      if (!code) continue;
      let arr = buckets.get(code);
      if (!arr) buckets.set(code, (arr = []));
      arr.push(it);
    }

    const candPairs = [];
    for (const bucket of buckets.values()) {
      if (!bucket || bucket.length < 2) continue;
      const ranked = bucket
        .slice()
        .sort((a, b) => itemRank(b) - itemRank(a))
        .slice(0, 80);

      // `ranked` is already sorted best-first, so the anchor is simply its
      // head (the anchor itself may be mapped or unmapped).
      const anchor = ranked[0];
      if (!anchor) continue;
      const aSku = String(anchor.sku || "");

      for (const b of ranked) {
        if (b === anchor) continue;
        const bSku = String(b.sku || "");
        if (!aSku || !bSku || aSku === bSku) continue;
        // Only link *unmapped* targets in this stage.
        if (mappedSkus && mappedSkus.has(bSku)) continue;
        if (isSameStore(aSku, bSku)) continue;
        if (isIgnored(aSku, bSku)) continue;
        candPairs.push({ a: anchor, b, score: 1e9 + itemRank(anchor) + itemRank(b) });
      }
    }
    candPairs.sort((x, y) => y.score - x.score);

    // Each unmapped target appears at most once; anchors may repeat.
    const usedUnmapped = new Set();
    const out0 = [];
    for (const p of candPairs) {
      const bSku = String(p.b.sku || "");
      if (!bSku) continue;
      if (usedUnmapped.has(bSku)) continue;
      usedUnmapped.add(bSku);
      out0.push(p);
      if (out0.length >= limit) break;
    }
    return { pairs: out0, usedUnmapped };
  }

  const smwsFirst = smwsPairsFirst(workAll, limitPairs);
  const used = new Set(smwsFirst.usedUnmapped);
  const out = smwsFirst.pairs.slice();
  if (out.length >= limitPairs) return out.slice(0, limitPairs);

  // --- General pairing stage ---
  const seeds = topSuggestions(work, Math.min(220, work.length), "", mappedSkus).filter(
    (it) => !used.has(String(it?.sku || ""))
  );

  // Token buckets over normalized names, plus per-SKU caches of the text
  // features so they are computed once.
  const TOKEN_BUCKET_CAP = 700;
  const tokMap = new Map(); // token -> items[]
  const itemRawToks = new Map(); // sku -> raw tokens
  const itemNorm = new Map(); // sku -> normalized name
  const itemFilt = new Map(); // sku -> filtered tokens (for first-token logic)
  for (const it of work) {
    const sku = String(it.sku || "");
    if (!sku) continue;
    const n = normSearchText(it.name || "");
    const raw = tokenizeQuery(n);
    const filt = filterSimTokens(raw);
    itemNorm.set(sku, n);
    itemRawToks.set(sku, raw);
    itemFilt.set(sku, filt);
    // Bucket by the first few filtered tokens only.
    for (const t of filt.slice(0, 12)) {
      let arr = tokMap.get(t);
      if (!arr) tokMap.set(t, (arr = []));
      if (arr.length < TOKEN_BUCKET_CAP) arr.push(it);
    }
  }

  const bestByPair = new Map(); // "skuA|skuB" (sorted) -> best pair seen
  const MAX_CAND_TOTAL = 450;
  const MAX_FINE = 18;

  for (const a of seeds) {
    const aSku = String(a.sku || "");
    if (!aSku || used.has(aSku)) continue;
    const aNorm = itemNorm.get(aSku) || normSearchText(a.name || "");
    const aRaw = itemRawToks.get(aSku) || tokenizeQuery(aNorm);
    const aFilt = itemFilt.get(aSku) || filterSimTokens(aRaw);
    if (!aFilt.length) continue;
    const aBrand = aFilt[0] || "";
    const aAge = extractAgeFromText(aNorm);
    const aIsUnknown = aSku.startsWith("u:");

    // Gather candidates from the token buckets of the seed's first tokens.
    const cand = new Map();
    for (const t of aFilt.slice(0, 10)) {
      const arr = tokMap.get(t);
      if (!arr) continue;
      for (let i = 0; i < arr.length && cand.size < MAX_CAND_TOTAL; i++) {
        const b = arr[i];
        if (!b) continue;
        const bSku = String(b.sku || "");
        if (!bSku || bSku === aSku) continue;
        if (used.has(bSku)) continue;
        if (mappedSkus && mappedSkus.has(bSku)) continue;
        if (isIgnored(aSku, bSku)) continue;
        if (isSameStore(aSku, bSku)) continue;
        cand.set(bSku, b);
      }
      if (cand.size >= MAX_CAND_TOTAL) break;
    }
    if (!cand.size) continue;

    // Cheap stage: fast similarity + first-token, size, age and unknown adjustments.
    const cheap = [];
    for (const b of cand.values()) {
      const bSku = String(b.sku || "");
      const bNorm = itemNorm.get(bSku) || normSearchText(b.name || "");
      const bRaw = itemRawToks.get(bSku) || tokenizeQuery(bNorm);
      const bFilt = itemFilt.get(bSku) || filterSimTokens(bRaw);
      if (!bFilt.length) continue;

      const contain = tokenContainmentScore(aRaw, bRaw);
      const bBrand = bFilt[0] || "";
      const firstMatch = aBrand && bBrand && aBrand === bBrand;

      let s = fastSimilarityScore(aRaw, bRaw, aNorm, bNorm);
      if (s <= 0) s = 0.01 + 0.25 * contain;
      if (!firstMatch) s *= mismatchMultiplier(aFilt.length, bFilt.length, contain);
      if (typeof sizePenaltyFn === "function") s *= sizePenaltyFn(aSku, bSku);

      const bAge = extractAgeFromText(bNorm);
      if (aAge && bAge) s *= aAge === bAge ? 1.6 : 0.22;
      if (aIsUnknown || bSku.startsWith("u:")) s *= 1.06;

      if (s > 0) cheap.push({ b, s, bFilt, contain, firstMatch, bAge });
    }
    if (!cheap.length) continue;
    cheap.sort((x, y) => y.s - x.s);

    // Fine stage: expensive similarityScore + the same adjustments again.
    let bestB = null;
    let bestS = 0;
    for (const x of cheap.slice(0, MAX_FINE)) {
      const b = x.b;
      const bSku = String(b.sku || "");
      let s = similarityScore(a.name || "", b.name || "");
      if (s <= 0) continue;

      if (!x.firstMatch) {
        s *= mismatchMultiplier(aFilt.length, x.bFilt.length, x.contain);
        if (s <= 0) continue;
      }
      if (typeof sizePenaltyFn === "function") {
        s *= sizePenaltyFn(aSku, bSku);
        if (s <= 0) continue;
      }
      if (aAge && x.bAge) s *= aAge === x.bAge ? 2.0 : 0.15;
      if (aIsUnknown || bSku.startsWith("u:")) s *= 1.10;

      if (s > bestS) {
        bestS = s;
        bestB = b;
      }
    }

    // Minimum fine score for a pair to be suggested at all.
    if (!bestB || bestS < 0.50) continue;
    const bSku = String(bestB.sku || "");
    if (!bSku || used.has(bSku)) continue;

    // Order-independent key so A->B and B->A collapse into one entry.
    const key = aSku < bSku ? `${aSku}|${bSku}` : `${bSku}|${aSku}`;
    const prev = bestByPair.get(key);
    if (!prev || bestS > prev.score) bestByPair.set(key, { a, b: bestB, score: bestS });
  }

  const pairs = Array.from(bestByPair.values());
  pairs.sort((x, y) => y.score - x.score);

  // ---- Light randomness inside a top band ----
  const need = Math.max(0, limitPairs - out.length);
  if (!need) return out.slice(0, limitPairs);

  const TOP_BAND = Math.min(700, pairs.length);
  const JITTER = 0.08; // scores are scaled by a random factor within +/- 4%
  const band = pairs.slice(0, TOP_BAND).map((p) => {
    const jitter = (rnd() - 0.5) * JITTER;
    return { ...p, _rank: p.score * (1 + jitter) };
  });
  band.sort((a, b) => b._rank - a._rank);

  // Greedy take: each SKU may appear in at most one general-stage pair.
  function tryTake(p) {
    const aSku = String(p.a.sku || "");
    const bSku = String(p.b.sku || "");
    if (!aSku || !bSku || aSku === bSku) return false;
    if (used.has(aSku) || used.has(bSku)) return false;
    if (isSameStore(aSku, bSku)) return false;
    if (isIgnored(aSku, bSku)) return false;
    used.add(aSku);
    used.add(bSku);
    out.push({ a: p.a, b: p.b, score: p.score });
    return true;
  }

  for (const p of band) {
    if (out.length >= limitPairs) break;
    tryTake(p);
  }
  // Top band exhausted: continue with the remaining pairs in score order.
  if (out.length < limitPairs) {
    for (let i = TOP_BAND; i < pairs.length; i++) {
      if (out.length >= limitPairs) break;
      tryTake(pairs[i]);
    }
  }
  return out.slice(0, limitPairs);
}
/**
 * 32-bit FNV-1a hash computed over the UTF-16 code units of a string.
 *
 * @param {*} str - Value to hash; falsy values hash as the empty string,
 *   everything else is converted with String().
 * @returns {number} Unsigned 32-bit hash.
 */
function fnv1a32u(str) {
  const text = String(str || "");
  let hash = 0x811c9dc5; // FNV offset basis
  let pos = 0;
  while (pos < text.length) {
    // XOR in the next code unit, then multiply by the FNV prime (mod 2^32).
    hash = Math.imul(hash ^ text.charCodeAt(pos), 0x01000193);
    pos += 1;
  }
  return hash >>> 0;
}