This commit is contained in:
Brennan Wilkes (Text Groove) 2026-02-01 11:32:09 -08:00
parent 2356eb8f8f
commit 9552daeef8
2 changed files with 326 additions and 229 deletions

View file

@ -257,8 +257,14 @@ export function recommendSimilar(
export function computeInitialPairsFast(
export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnoredPairFn, sameStoreFn) { allAgg,
mappedSkus,
limitPairs,
isIgnoredPairFn,
sameStoreFn,
sizePenaltyFn // ✅ NEW: pass sizePenaltyForPair in
) {
const itemsAll = allAgg.filter((it) => !!it); const itemsAll = allAgg.filter((it) => !!it);
const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0; const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0;
@ -266,10 +272,15 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
const itemsShuf = itemsAll.slice(); const itemsShuf = itemsAll.slice();
shuffleInPlace(itemsShuf, rnd); shuffleInPlace(itemsShuf, rnd);
const WORK_CAP = 5000; // Bigger cap is fine; still bounded
const WORK_CAP = Math.min(9000, itemsShuf.length);
const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf; const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf;
const work = workAll.filter((it) => !(mappedSkus && mappedSkus.has(String(it.sku)))); // Unmapped-only view for normal similarity stage
const work = workAll.filter((it) => {
if (!it) return false;
return !(mappedSkus && mappedSkus.has(String(it.sku)));
});
function itemRank(it) { function itemRank(it) {
const stores = it.stores ? it.stores.size : 0; const stores = it.stores ? it.stores.size : 0;
@ -279,16 +290,15 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25; return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25;
} }
// --- SMWS exact-code pairs first (kept as-is, but apply sameStore/isIgnored) ---
function smwsPairsFirst(workArr, limit) { function smwsPairsFirst(workArr, limit) {
const buckets = new Map(); // code -> items[] const buckets = new Map(); // code -> items[]
for (const it of workArr) { for (const it of workArr) {
if (!it) continue; if (!it) continue;
const sku = String(it.sku || ""); const sku = String(it.sku || "");
if (!sku) continue; if (!sku) continue;
const code = smwsKeyFromName(it.name || ""); const code = smwsKeyFromName(it.name || "");
if (!code) continue; if (!code) continue;
let arr = buckets.get(code); let arr = buckets.get(code);
if (!arr) buckets.set(code, (arr = [])); if (!arr) buckets.set(code, (arr = []));
arr.push(it); arr.push(it);
@ -304,58 +314,39 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
.sort((a, b) => itemRank(b) - itemRank(a)) .sort((a, b) => itemRank(b) - itemRank(a))
.slice(0, 80); .slice(0, 80);
const mapped = []; // Prefer an unmapped anchor if possible; otherwise best overall
const unmapped = []; const anchor = arr.slice().sort((a, b) => itemRank(b) - itemRank(a))[0];
for (const it of arr) {
const sku = String(it.sku || "");
if (mappedSkus && mappedSkus.has(sku)) mapped.push(it);
else unmapped.push(it);
}
const anchor = (mapped.length ? mapped : unmapped)
.slice()
.sort((a, b) => itemRank(b) - itemRank(a))[0];
if (!anchor) continue; if (!anchor) continue;
if (unmapped.length) { for (const u of arr) {
for (const u of unmapped) { if (u === anchor) continue;
const a = anchor; const a = anchor;
const b = u; const b = u;
const aSku = String(a.sku || ""); const aSku = String(a.sku || "");
const bSku = String(b.sku || ""); const bSku = String(b.sku || "");
if (!aSku || !bSku || aSku === bSku) continue; if (!aSku || !bSku || aSku === bSku) continue;
// Only link *unmapped* targets in this stage
if (mappedSkus && mappedSkus.has(bSku)) continue;
if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) continue; if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) continue;
if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) continue; if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) continue;
const s = 1e9 + itemRank(a) + itemRank(b); const s = 1e9 + itemRank(a) + itemRank(b);
candPairs.push({ a, b, score: s, aIsMapped: mappedSkus && mappedSkus.has(aSku) }); candPairs.push({ a, b, score: s });
}
} }
} }
candPairs.sort((x, y) => y.score - x.score); candPairs.sort((x, y) => y.score - x.score);
const usedUnmapped = new Set(); const usedUnmapped = new Set();
const anchorUse = new Map();
const ANCHOR_REUSE_CAP = 6;
const out0 = []; const out0 = [];
for (const p of candPairs) { for (const p of candPairs) {
const aSku = String(p.a.sku || "");
const bSku = String(p.b.sku || ""); const bSku = String(p.b.sku || "");
if (!aSku || !bSku) continue; if (!bSku) continue;
if (usedUnmapped.has(bSku)) continue; if (usedUnmapped.has(bSku)) continue;
const k = aSku;
const n = anchorUse.get(k) || 0;
if (n >= ANCHOR_REUSE_CAP) continue;
usedUnmapped.add(bSku); usedUnmapped.add(bSku);
anchorUse.set(k, n + 1);
out0.push(p); out0.push(p);
if (out0.length >= limit) break; if (out0.length >= limit) break;
} }
@ -365,23 +356,35 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
const smwsFirst = smwsPairsFirst(workAll, limitPairs); const smwsFirst = smwsPairsFirst(workAll, limitPairs);
const used = new Set(smwsFirst.usedUnmapped); const used = new Set(smwsFirst.usedUnmapped);
const out = smwsFirst.pairs.slice(); const out = smwsFirst.pairs.slice();
if (out.length >= limitPairs) return out.slice(0, limitPairs); if (out.length >= limitPairs) return out.slice(0, limitPairs);
const seeds = topSuggestions(work, Math.min(150, work.length), "", mappedSkus).filter( // --- Improved general pairing logic (uses same “good” scoring knobs) ---
const seeds = topSuggestions(work, Math.min(220, work.length), "", mappedSkus).filter(
(it) => !used.has(String(it?.sku || "")) (it) => !used.has(String(it?.sku || ""))
); );
const TOKEN_BUCKET_CAP = 500; // Build token buckets over *normalized* names (better hits)
const tokMap = new Map(); const TOKEN_BUCKET_CAP = 700;
const itemTokens = new Map(); const tokMap = new Map(); // token -> items[]
const itemNormName = new Map(); const itemRawToks = new Map(); // sku -> raw tokens
const itemNorm = new Map(); // sku -> norm name
const itemFilt = new Map(); // sku -> filtered tokens (for first-token logic)
for (const it of work) { for (const it of work) {
const toks = Array.from(new Set(tokenizeQuery(it.name || ""))).filter(Boolean).slice(0, 10); const sku = String(it.sku || "");
itemTokens.set(it.sku, toks); if (!sku) continue;
itemNormName.set(it.sku, normSearchText(it.name || ""));
for (const t of toks) { const n = normSearchText(it.name || "");
const raw = tokenizeQuery(n);
const filt = filterSimTokens(raw);
itemNorm.set(sku, n);
itemRawToks.set(sku, raw);
itemFilt.set(sku, filt);
// bucket using a handful of filtered tokens (higher signal)
for (const t of filt.slice(0, 12)) {
let arr = tokMap.get(t); let arr = tokMap.get(t);
if (!arr) tokMap.set(t, (arr = [])); if (!arr) tokMap.set(t, (arr = []));
if (arr.length < TOKEN_BUCKET_CAP) arr.push(it); if (arr.length < TOKEN_BUCKET_CAP) arr.push(it);
@ -389,18 +392,25 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
} }
const bestByPair = new Map(); const bestByPair = new Map();
const MAX_CAND_TOTAL = 250; const MAX_CAND_TOTAL = 450;
const MAX_FINE = 10; const MAX_CHEAP = 40;
const MAX_FINE = 18;
for (const a of seeds) { for (const a of seeds) {
const aSku = String(a.sku || ""); const aSku = String(a.sku || "");
if (!aSku || used.has(aSku)) continue; if (!aSku || used.has(aSku)) continue;
const aToks = itemTokens.get(aSku) || []; const aNorm = itemNorm.get(aSku) || normSearchText(a.name || "");
if (!aToks.length) continue; const aRaw = itemRawToks.get(aSku) || tokenizeQuery(aNorm);
const aFilt = itemFilt.get(aSku) || filterSimTokens(aRaw);
if (!aFilt.length) continue;
const aBrand = aFilt[0] || "";
const aAge = extractAgeFromText(aNorm);
// Gather candidates from token buckets
const cand = new Map(); const cand = new Map();
for (const t of aToks) { for (const t of aFilt.slice(0, 10)) {
const arr = tokMap.get(t); const arr = tokMap.get(t);
if (!arr) continue; if (!arr) continue;
@ -421,29 +431,85 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
} }
if (!cand.size) continue; if (!cand.size) continue;
const aNameN = itemNormName.get(aSku) || ""; // Cheap score stage (fastSimilarity + containment + size + age + first-token mismatch penalty)
const cheap = []; const cheap = [];
for (const b of cand.values()) { for (const b of cand.values()) {
const bSku = String(b.sku || ""); const bSku = String(b.sku || "");
const bToks = itemTokens.get(bSku) || []; const bNorm = itemNorm.get(bSku) || normSearchText(b.name || "");
const bNameN = itemNormName.get(bSku) || ""; const bRaw = itemRawToks.get(bSku) || tokenizeQuery(bNorm);
const s = fastSimilarityScore(aToks, bToks, aNameN, bNameN); const bFilt = itemFilt.get(bSku) || filterSimTokens(bRaw);
if (s > 0) cheap.push({ b, s }); if (!bFilt.length) continue;
const contain = tokenContainmentScore(aRaw, bRaw);
const bBrand = bFilt[0] || "";
const firstMatch = aBrand && bBrand && aBrand === bBrand;
let s = fastSimilarityScore(aRaw, bRaw, aNorm, bNorm);
if (s <= 0) s = 0.01 + 0.25 * contain;
if (!firstMatch) {
const smallN = Math.min(aFilt.length || 0, bFilt.length || 0);
let mult = 0.10 + 0.95 * contain;
if (smallN <= 3 && contain < 0.78) mult *= 0.22;
s *= Math.min(1.0, mult);
} }
if (typeof sizePenaltyFn === "function") s *= sizePenaltyFn(aSku, bSku);
const bAge = extractAgeFromText(bNorm);
if (aAge && bAge) {
if (aAge === bAge) s *= 1.6;
else s *= 0.22;
}
if (String(aSku).startsWith("u:") || String(bSku).startsWith("u:")) s *= 1.06;
if (s > 0) cheap.push({ b, s, bNorm, bRaw, bFilt, contain, firstMatch, bAge });
}
if (!cheap.length) continue; if (!cheap.length) continue;
cheap.sort((x, y) => y.s - x.s); cheap.sort((x, y) => y.s - x.s);
// Fine stage (expensive similarityScore + same penalties again)
let bestB = null; let bestB = null;
let bestS = 0; let bestS = 0;
for (const x of cheap.slice(0, MAX_FINE)) { for (const x of cheap.slice(0, MAX_FINE)) {
const s = similarityScore(a.name || "", x.b.name || ""); const b = x.b;
const bSku = String(b.sku || "");
let s = similarityScore(a.name || "", b.name || "");
if (s <= 0) continue;
// first-token mismatch soft penalty
if (!x.firstMatch) {
const smallN = Math.min(aFilt.length || 0, (x.bFilt || []).length || 0);
let mult = 0.10 + 0.95 * x.contain;
if (smallN <= 3 && x.contain < 0.78) mult *= 0.22;
s *= Math.min(1.0, mult);
if (s <= 0) continue;
}
if (typeof sizePenaltyFn === "function") {
s *= sizePenaltyFn(aSku, bSku);
if (s <= 0) continue;
}
if (aAge && x.bAge) {
if (aAge === x.bAge) s *= 2.0;
else s *= 0.15;
}
if (String(aSku).startsWith("u:") || String(bSku).startsWith("u:")) s *= 1.10;
if (s > bestS) { if (s > bestS) {
bestS = s; bestS = s;
bestB = x.b; bestB = b;
} }
} }
if (!bestB || bestS < 0.6) continue; // Threshold (slightly lower than before, because we now punish mismatches more intelligently)
if (!bestB || bestS < 0.50) continue;
const bSku = String(bestB.sku || ""); const bSku = String(bestB.sku || "");
if (!bSku || used.has(bSku)) continue; if (!bSku || used.has(bSku)) continue;
@ -456,10 +522,11 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
const pairs = Array.from(bestByPair.values()); const pairs = Array.from(bestByPair.values());
pairs.sort((x, y) => y.score - x.score); pairs.sort((x, y) => y.score - x.score);
// ---- light randomness inside a top band (same behavior as before) ----
const need = Math.max(0, limitPairs - out.length); const need = Math.max(0, limitPairs - out.length);
if (!need) return out.slice(0, limitPairs); if (!need) return out.slice(0, limitPairs);
const TOP_BAND = Math.min(600, pairs.length); const TOP_BAND = Math.min(700, pairs.length);
const JITTER = 0.08; const JITTER = 0.08;
const band = pairs.slice(0, TOP_BAND).map((p) => { const band = pairs.slice(0, TOP_BAND).map((p) => {
@ -474,6 +541,7 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
if (!aSku || !bSku || aSku === bSku) return false; if (!aSku || !bSku || aSku === bSku) return false;
if (used.has(aSku) || used.has(bSku)) return false; if (used.has(aSku) || used.has(bSku)) return false;
if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) return false; if (typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku)) return false;
if (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku)) return false;
used.add(aSku); used.add(aSku);
used.add(bSku); used.add(bSku);
@ -494,8 +562,13 @@ export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnore
} }
return out.slice(0, limitPairs); return out.slice(0, limitPairs);
} }
function fnv1a32u(str) {
function fnv1a32u(str) {
let h = 0x811c9dc5; let h = 0x811c9dc5;
str = String(str || ""); str = String(str || "");
for (let i = 0; i < str.length; i++) { for (let i = 0; i < str.length; i++) {

View file

@ -139,7 +139,30 @@ export async function renderSkuLinker($app) {
return String(rules.canonicalSku(aSku)) === String(rules.canonicalSku(bSku)); return String(rules.canonicalSku(aSku)) === String(rules.canonicalSku(bSku));
} }
const initialPairs = computeInitialPairsFast(allAgg, mappedSkus, 28, isIgnoredPair, sameStoreCanon); let initialPairs = null;
/**
 * Lazily produce the initial suggestion pairs, computing them at most once.
 *
 * Returns null without computing anything while either side is pinned or
 * while shouldReloadAfterLink is set (the URL query-param preselect flow).
 * Otherwise the result of computeInitialPairsFast is stored in initialPairs
 * and that same value is handed back on every later call.
 *
 * @returns {?Array} the cached pair list, or null when suggestions are suppressed
 */
function getInitialPairsIfNeeded() {
  // A pinned side or the preselect flow means no initial pairs are wanted.
  const suppressed = pinnedL || pinnedR || shouldReloadAfterLink;
  if (suppressed) return null;

  // First eligible call pays for the computation; later calls reuse it.
  if (!initialPairs) {
    const PAIR_LIMIT = 28;
    initialPairs = computeInitialPairsFast(
      allAgg,
      mappedSkus,
      PAIR_LIMIT,
      isIgnoredPair,
      sameStoreCanon,
      sizePenaltyForPair // forwarded as the sizePenaltyFn argument
    );
  }

  return initialPairs;
}
let pinnedL = null; let pinnedL = null;
let pinnedR = null; let pinnedR = null;
@ -221,8 +244,9 @@ export async function renderSkuLinker($app) {
sameGroup sameGroup
); );
if (initialPairs && initialPairs.length) { const pairs = getInitialPairsIfNeeded();
const list = side === "L" ? initialPairs.map((p) => p.a) : initialPairs.map((p) => p.b); if (pairs && pairs.length) {
const list = side === "L" ? pairs.map((p) => p.a) : pairs.map((p) => p.b);
return list.filter( return list.filter(
(it) => (it) =>
it && it &&