mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-03-25 09:25:51 +00:00
link sku
This commit is contained in:
parent
1e4b24d391
commit
7b341c2e07
7 changed files with 1071 additions and 1184 deletions
91
viz/app/linker/canonical_pref.js
Normal file
91
viz/app/linker/canonical_pref.js
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
// viz/app/linker/canonical_pref.js
|
||||
import { keySkuForRow } from "../sku.js";
|
||||
|
||||
/**
 * Reports whether a key has the shape of a "real" SKU: exactly six digits
 * once surrounding whitespace is removed.
 *
 * @param {*} skuKey - Candidate key; falsy values are treated as "".
 * @returns {boolean} True only for a six-digit key.
 */
function isRealSkuKey(skuKey) {
  return /^\d{6}$/.test(String(skuKey || "").trim());
}
|
||||
|
||||
/**
 * Reports whether a key is a "soft" key, i.e. it starts with "upc:" or "id:".
 * The key is not trimmed before the prefix check.
 *
 * @param {*} k - Candidate key; falsy values are treated as "".
 * @returns {boolean}
 */
function isSoftSkuKey(k) {
  const key = String(k || "");
  return ["upc:", "id:"].some((prefix) => key.startsWith(prefix));
}
|
||||
|
||||
/**
 * Reports whether a key is an "unknown" placeholder key, i.e. it starts with
 * "u:" after trimming.
 *
 * @param {*} k - Candidate key; falsy values are treated as "".
 * @returns {boolean}
 */
function isUnknownSkuKey2(k) {
  const key = String(k || "").trim();
  return key.startsWith("u:");
}
|
||||
|
||||
/**
 * Reports whether a store label contains one of the substrings this module
 * treats as identifying a "BC" store (presumably British Columbia retailers;
 * confirm against the store list). Matching is case-insensitive.
 *
 * @param {*} label - Store label; falsy values are treated as "".
 * @returns {boolean}
 */
function isBCStoreLabel(label) {
  const haystack = String(label || "").toLowerCase();
  const markers = ["bcl", "strath", "gull", "legacy", "tudor", "vessel", "vintagespirits"];
  return markers.some((marker) => haystack.includes(marker));
}
|
||||
|
||||
/**
 * Reports whether any row whose SKU key equals `skuKey` carries a BC store label.
 *
 * @param {Iterable<object>} allRows - Rows to scan; each row's key comes from keySkuForRow.
 * @param {string} skuKey - Key to look for (compared with strict equality).
 * @returns {boolean} True on the first matching row with a BC label, else false.
 */
function skuIsBC(allRows, skuKey) {
  for (const row of allRows) {
    if (keySkuForRow(row) !== skuKey) continue;
    // storeLabel wins over store when both are present.
    if (isBCStoreLabel(String(row.storeLabel || row.store || ""))) return true;
  }
  return false;
}
|
||||
|
||||
/**
 * Reports whether a store label looks like an "AB" store: it contains
 * "alberta", "calgary" or "edmonton", or has "ab" as a standalone word.
 * Matching is case-insensitive.
 *
 * @param {*} label - Store label; falsy values are treated as "".
 * @returns {boolean}
 */
function isABStoreLabel(label) {
  const haystack = String(label || "").toLowerCase();
  const cityOrProvince = ["alberta", "calgary", "edmonton"].some((word) => haystack.includes(word));
  // "ab" must be a whole word so labels such as "cabernet" do not match.
  return cityOrProvince || /\bab\b/.test(haystack);
}
|
||||
|
||||
/**
 * Reports whether any row whose SKU key equals `skuKey` carries an AB store label.
 *
 * @param {Iterable<object>} allRows - Rows to scan; each row's key comes from keySkuForRow.
 * @param {string} skuKey - Key to look for (compared with strict equality).
 * @returns {boolean} True on the first matching row with an AB label, else false.
 */
function skuIsAB(allRows, skuKey) {
  for (const row of allRows) {
    if (keySkuForRow(row) !== skuKey) continue;
    // storeLabel wins over store when both are present.
    if (isABStoreLabel(String(row.storeLabel || row.store || ""))) return true;
  }
  return false;
}
|
||||
|
||||
/**
 * Scores how suitable a SKU key is as the canonical key of a group.
 *
 * Base tiers: six-digit "real" key 1000, soft ("upc:" / "id:") key 200,
 * any other key that is not an unknown ("u:") key 100, unknown key -1000.
 * A key seen at an AB store gains 25; a key seen at a BC store loses 10.
 *
 * @param {Iterable<object>} allRows - Rows used to look up which stores carry the key.
 * @param {*} skuKey - Key to score; falsy values are treated as "".
 * @returns {number} Higher is better.
 */
function scoreCanonical(allRows, skuKey) {
  const key = String(skuKey || "");

  let base;
  if (isRealSkuKey(key)) base = 1000;
  else if (isSoftSkuKey(key)) base = 200;
  else if (!isUnknownSkuKey2(key)) base = 100;
  else base = -1000;

  const abBonus = skuIsAB(allRows, key) ? 25 : 0;
  const bcPenalty = skuIsBC(allRows, key) ? 10 : 0;
  return base + abBonus - bcPenalty;
}
|
||||
|
||||
/**
 * Picks the SKU key that should act as canonical among `skuKeys`.
 *
 * Keys are trimmed and blank ones skipped. The key with the highest
 * scoreCanonical value wins; ties go to the lexicographically smaller key so
 * the result does not depend on input order.
 *
 * @param {Iterable<object>} allRows - Rows passed through to scoreCanonical.
 * @param {Iterable<*>} skuKeys - Candidate keys; a nullish value is treated as empty.
 * @returns {string} The preferred key, or "" when there is no usable candidate.
 */
export function pickPreferredCanonical(allRows, skuKeys) {
  let best = "";
  let bestScore = -Infinity;

  for (const raw of skuKeys || []) {
    const key = String(raw || "").trim();
    if (!key) continue;

    const score = scoreCanonical(allRows, key);
    const wins = score > bestScore || (score === bestScore && key < best);
    if (wins) {
      bestScore = score;
      best = key;
    }
  }
  return best;
}
|
||||
269
viz/app/linker/similarity.js
Normal file
269
viz/app/linker/similarity.js
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
// viz/app/linker/similarity.js
|
||||
import { tokenizeQuery, normSearchText } from "../sku.js";
|
||||
|
||||
// Ultra-common / low-signal tokens in bottle names. filterSimTokens drops
// these (unless the token is numeric) so they cannot inflate similarity.
const SIM_STOP_TOKENS = new Set([
  // articles, conjunctions, prepositions
  "the", "a", "an", "and", "of", "to", "in", "for", "with",
  // age wording
  "year", "years", "yr", "yrs", "old",
  // generic category / bottling vocabulary
  "whisky", "whiskey", "scotch", "single", "malt", "cask", "finish",
  "edition", "release", "batch", "strength", "abv", "proof",
  "anniversary",
]);
|
||||
|
||||
// The name must mention "smws" as a whole word (any case) ...
const SMWS_WORD_RE = /\bsmws\b/i;
// ... and contain a dotted code of 1-3 digits, a dot, then 1-4 digits (e.g. "12.34").
const SMWS_CODE_RE = /\b(\d{1,3}\.\d{1,4})\b/;

/**
 * Extracts the dotted bottling code from a name that mentions "SMWS".
 *
 * @param {*} name - Product name; falsy values are treated as "".
 * @returns {string} The first dotted code found (e.g. "12.34"), or "" when
 *   the name does not mention SMWS or contains no such code.
 */
export function smwsKeyFromName(name) {
  const text = String(name || "");
  if (!SMWS_WORD_RE.test(text)) return "";
  const found = SMWS_CODE_RE.exec(text);
  return found ? found[1] : "";
}
|
||||
|
||||
// Digits followed by an English ordinal suffix, e.g. "12th", "1st".
const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;

/**
 * Normalises a numeric-looking token to its digit string.
 *
 * @param {*} t - Token; falsy values are treated as "". Trimmed and lower-cased first.
 * @returns {string} The digits for a pure-digit token ("12") or an ordinal
 *   ("12th" -> "12"); "" for anything else, including decimals such as "12.5".
 */
export function numKey(t) {
  const token = String(t || "").trim().toLowerCase();
  if (!token) return "";
  if (/^\d+$/.test(token)) return token;
  const ordinal = ORDINAL_RE.exec(token);
  return ordinal ? ordinal[1] : "";
}
|
||||
|
||||
/**
 * Reports whether a token is numeric in the sense of numKey, i.e. pure digits
 * or digits with an ordinal suffix.
 *
 * @param {*} t - Token to test.
 * @returns {boolean}
 */
function isNumberToken(t) {
  return numKey(t) !== "";
}
|
||||
|
||||
/**
 * Extracts an age statement (1-2 digits) from a product name.
 *
 * Two forms are recognised, in priority order anywhere in the text:
 *   1. "<n> yr|yrs|year|years", optionally preceded by "aged";
 *   2. "<n> yo".
 *
 * @param {*} normName - Name text (callers pass normalised text); falsy values are treated as "".
 * @returns {string} The age with leading zeros removed ("08" -> "8"), or "" when none is found.
 */
export function extractAgeFromText(normName) {
  const text = String(normName || "");
  if (!text) return "";

  const agePatterns = [
    /\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i,
    /\b(\d{1,2})\s*yo\b/i,
  ];

  for (const pattern of agePatterns) {
    const found = text.match(pattern);
    if (found && found[1]) return String(Number.parseInt(found[1], 10));
  }
  return "";
}
|
||||
|
||||
// Variants folded onto one token before stop-word filtering.
const SIM_EQUIV = new Map([
  ["years", "yr"],
  ["year", "yr"],
  ["yrs", "yr"],
  ["yr", "yr"],
  ["whiskey", "whisky"],
  ["whisky", "whisky"],
  ["bourbon", "bourbon"],
]);

// Standalone volume-unit tokens; always dropped.
const VOL_UNIT = new Set(["ml", "l", "cl", "oz", "liter", "liters", "litre", "litres"]);
const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i; // 700ml, 1.14l
const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/; // 46%, 40.0%
const PLAIN_NUMBER_RE = /^\d+(?:\.\d+)?$/; // 700, 1.14
const HAS_ALNUM_RE = /[a-z0-9]/i;

/**
 * Reduces raw name tokens to the ones that carry signal for similarity.
 *
 * Per token, in order: trim and lower-case; drop empty or punctuation-only
 * tokens; drop inline volumes ("700ml") and percentages ("46%"); fold
 * variants via SIM_EQUIV; normalise ordinals to digits via numKey; drop
 * volume units, "abv" and "proof"; drop a number immediately followed by a
 * volume unit (and skip that unit); drop non-numeric stop tokens; finally
 * de-duplicate while keeping first-seen order.
 *
 * @param {*} tokens - Array of raw tokens; anything that is not an array is treated as empty.
 * @returns {string[]} Filtered, unique, lower-cased tokens in original order.
 */
export function filterSimTokens(tokens) {
  const arr = Array.isArray(tokens) ? tokens : [];
  const out = [];
  const seen = new Set();

  for (let i = 0; i < arr.length; i++) {
    let t = String(arr[i] || "").trim().toLowerCase();
    if (!t) continue;
    if (!HAS_ALNUM_RE.test(t)) continue;
    if (VOL_INLINE_RE.test(t) || PCT_INLINE_RE.test(t)) continue;

    t = SIM_EQUIV.get(t) || t;

    const digits = numKey(t);
    if (digits) t = digits;

    if (VOL_UNIT.has(t) || t === "abv" || t === "proof") continue;

    if (PLAIN_NUMBER_RE.test(t)) {
      // "700 ml" split across two tokens: drop the number and consume the unit.
      const next = String(arr[i + 1] || "").trim().toLowerCase();
      if (VOL_UNIT.has(SIM_EQUIV.get(next) || next)) {
        i++;
        continue;
      }
    }

    if (!isNumberToken(t) && SIM_STOP_TOKENS.has(t)) continue;

    if (seen.has(t)) continue;
    seen.add(t);
    out.push(t);
  }

  return out;
}
|
||||
|
||||
/**
 * Penalises a pair of names that both contain numbers but share none.
 *
 * @param {Array<*>} aTokens - Tokens of the first name (nullish is treated as empty).
 * @param {Array<*>} bTokens - Tokens of the second name (nullish is treated as empty).
 * @returns {number} 1.0 when either side has no numeric token or at least one
 *   number is shared; 0.28 when both have numbers and none match.
 */
export function numberMismatchPenalty(aTokens, bTokens) {
  const numbersIn = (tokens) => new Set((tokens || []).map((t) => numKey(t)).filter(Boolean));

  const aNums = numbersIn(aTokens);
  const bNums = numbersIn(bTokens);
  if (!aNums.size || !bNums.size) return 1.0;

  for (const n of aNums) {
    if (bNums.has(n)) return 1.0;
  }
  return 0.28;
}
|
||||
|
||||
/**
 * Computes the Levenshtein edit distance between two strings, compared by
 * UTF-16 code unit, using a single rolling row of the DP table.
 *
 * @param {*} a - First string; falsy values are treated as "".
 * @param {*} b - Second string; falsy values are treated as "".
 * @returns {number} Minimum number of single-unit insertions, deletions and substitutions.
 */
export function levenshtein(a, b) {
  const src = String(a || "");
  const dst = String(b || "");
  if (!src.length) return dst.length;
  if (!dst.length) return src.length;

  // row[j] holds the distance between the processed prefix of src and dst[0..j).
  const row = Array.from({ length: dst.length + 1 }, (_, j) => j);

  for (let i = 1; i <= src.length; i++) {
    const ch = src.charCodeAt(i - 1);
    let diag = row[0]; // value of the previous row at column j - 1
    row[0] = i;
    for (let j = 1; j <= dst.length; j++) {
      const above = row[j];
      const substitute = diag + (ch === dst.charCodeAt(j - 1) ? 0 : 1);
      row[j] = Math.min(above + 1, row[j - 1] + 1, substitute);
      diag = above;
    }
  }
  return row[dst.length];
}
|
||||
|
||||
/**
 * Scores how much two token lists overlap after filterSimTokens.
 *
 * With `hit` = number of tokens of the smaller set found in the larger one,
 * recall = hit / |smaller| and precision = hit / |larger|; the result is
 * their harmonic mean (F1).
 *
 * @param {Array<*>} aTokens - Raw tokens of the first name (nullish is treated as empty).
 * @param {Array<*>} bTokens - Raw tokens of the second name (nullish is treated as empty).
 * @returns {number} A value in [0, 1]; 0 when either filtered list is empty.
 */
export function tokenContainmentScore(aTokens, bTokens) {
  const aFiltered = filterSimTokens(aTokens || []);
  const bFiltered = filterSimTokens(bTokens || []);
  if (!aFiltered.length || !bFiltered.length) return 0;

  const aSet = new Set(aFiltered);
  const bSet = new Set(bFiltered);
  const [small, big] = aSet.size <= bSet.size ? [aSet, bSet] : [bSet, aSet];

  let hit = 0;
  for (const token of small) {
    if (big.has(token)) hit++;
  }

  const recall = hit / Math.max(1, small.size);
  const precision = hit / Math.max(1, big.size);
  // The epsilon keeps 0 / 0 from becoming NaN when there is no overlap.
  return (2 * precision * recall) / Math.max(1e-9, precision + recall);
}
|
||||
|
||||
/**
 * Full (slower) similarity score between two product names.
 *
 * Combines: whether the first filtered token matches (weight 3.0), overlap of
 * the remaining tokens (weight 2.2, gated), and Levenshtein similarity of the
 * normalised names. The sum is scaled by the number-mismatch penalty, then by
 * an age multiplier (x2.2 on equal ages, x0.18 on different ages), then by
 * (1 + 0.9 * containment).
 *
 * @param {*} aName - First product name (normalised via normSearchText).
 * @param {*} bName - Second product name (normalised via normSearchText).
 * @returns {number} Non-negative score; 0 when either name normalises to
 *   nothing or has no tokens left after filtering. Not bounded to [0, 1].
 */
export function similarityScore(aName, bName) {
  const a = normSearchText(aName);
  const b = normSearchText(bName);
  if (!a || !b) return 0;

  const aToksRaw = tokenizeQuery(a);
  const bToksRaw = tokenizeQuery(b);
  const aToks = filterSimTokens(aToksRaw);
  const bToks = filterSimTokens(bToksRaw);
  if (!aToks.length || !bToks.length) return 0;

  const contain = tokenContainmentScore(aToksRaw, bToksRaw);

  // Filtered tokens are never empty, so plain equality is enough here.
  const firstMatch = aToks[0] === bToks[0] ? 1 : 0;

  // Overlap of everything after the first token.
  const aTail = new Set(aToks.slice(1));
  const bTail = new Set(bToks.slice(1));
  let shared = 0;
  for (const token of aTail) {
    if (bTail.has(token)) shared++;
  }
  const overlapTail = shared / Math.max(1, aTail.size, bTail.size);

  const levSim = 1 - levenshtein(a, b) / Math.max(1, a.length, b.length);

  // Without a first-token match, tail overlap only counts as far as containment supports it.
  let gate = firstMatch ? 1.0 : Math.min(0.80, 0.06 + 0.95 * contain);
  const smallN = Math.min(aToks.length, bToks.length);
  if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;

  const levWeight = firstMatch ? 1.0 : 0.10 + 0.70 * contain;
  let score =
    numberMismatchPenalty(aToks, bToks) *
    (firstMatch * 3.0 + overlapTail * 2.2 * gate + levSim * levWeight);

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);
  if (aAge && bAge) score *= aAge === bAge ? 2.2 : 0.18;

  return score * (1 + 0.9 * contain);
}
|
||||
|
||||
/**
 * Cheap similarity score for pre-tokenised names (no edit distance).
 *
 * Combines: whether the first filtered token matches (weight 2.4), overlap of
 * the remaining tokens (weight 2.0, gated) and a 0.2 bonus when the first
 * tokens match and the first 10 characters of both normalised names are
 * identical. The sum is scaled by the number-mismatch penalty, an age
 * multiplier (x2.0 equal, x0.2 different) and (1 + 0.9 * containment).
 *
 * @param {Array<*>} aTokens - Raw tokens of the first name (nullish is treated as empty).
 * @param {Array<*>} bTokens - Raw tokens of the second name (nullish is treated as empty).
 * @param {*} aNormName - Normalised first name; used for age extraction and the prefix bonus.
 * @param {*} bNormName - Normalised second name.
 * @returns {number} Non-negative score; 0 when either side has no tokens after filtering.
 */
export function fastSimilarityScore(aTokens, bTokens, aNormName, bNormName) {
  const aTokensRaw = aTokens || [];
  const bTokensRaw = bTokens || [];

  const aTokF = filterSimTokens(aTokensRaw);
  const bTokF = filterSimTokens(bTokensRaw);
  if (!aTokF.length || !bTokF.length) return 0;

  const a = String(aNormName || "");
  const b = String(bNormName || "");

  const contain = tokenContainmentScore(aTokensRaw, bTokensRaw);

  // Filtered tokens are never empty, so plain equality is enough here.
  const firstMatch = aTokF[0] === bTokF[0] ? 1 : 0;

  // Overlap of everything after the first token.
  const aTail = aTokF.slice(1);
  const bTailSet = new Set(bTokF.slice(1));
  let shared = 0;
  for (const token of aTail) {
    if (bTailSet.has(token)) shared++;
  }
  const overlapTail = shared / Math.max(1, aTail.length, bTokF.length - 1);

  const aPrefix = a.slice(0, 10);
  const bPrefix = b.slice(0, 10);
  const pref = firstMatch && aPrefix && bPrefix && aPrefix === bPrefix ? 0.2 : 0;

  // Without a first-token match, tail overlap only counts as far as containment supports it.
  let gate = firstMatch ? 1.0 : Math.min(0.80, 0.06 + 0.95 * contain);
  const smallN = Math.min(aTokF.length, bTokF.length);
  if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;

  let score = numberMismatchPenalty(aTokF, bTokF) * (firstMatch * 2.4 + overlapTail * 2.0 * gate + pref);

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);
  if (aAge && bAge) score *= aAge === bAge ? 2.0 : 0.2;

  return score * (1 + 0.9 * contain);
}
|
||||
108
viz/app/linker/size.js
Normal file
108
viz/app/linker/size.js
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
// viz/app/linker/size.js
|
||||
import { keySkuForRow } from "../sku.js";
|
||||
|
||||
// Two bottle sizes count as the same when they differ by at most this many millilitres.
const SIZE_TOLERANCE_ML = 8;
|
||||
|
||||
/**
 * Finds bottle sizes mentioned in free text and returns them in millilitres.
 *
 * Recognises a number (optionally with a decimal part) followed by ml, cl, l,
 * litre(s) or liter(s), case-insensitively. Values are rounded to whole
 * millilitres; only sizes between 50 ml and 5000 ml inclusive are kept.
 *
 * @param {*} text - Text to scan; falsy values are treated as "".
 * @returns {number[]} Distinct sizes in ml, in order of first appearance.
 */
export function parseSizesMlFromText(text) {
  const haystack = String(text || "").toLowerCase();
  if (!haystack) return [];

  const ML_PER_UNIT = { ml: 1, cl: 10 }; // every other accepted unit is a litre spelling
  const sizeRe = /\b(\d+(?:\.\d+)?)\s*(ml|cl|l|litre|litres|liter|liters)\b/g;

  const found = new Set();
  for (const [, amount, unit] of haystack.matchAll(sizeRe)) {
    const value = Number.parseFloat(amount);
    if (!Number.isFinite(value) || value <= 0) continue;

    const ml = Math.round(value * (ML_PER_UNIT[unit] ?? 1000));
    if (ml >= 50 && ml <= 5000) found.add(ml);
  }
  return Array.from(found);
}
|
||||
|
||||
/**
 * Reports whether two size sets share at least one size within tolerance.
 *
 * @param {Set<number>|null|undefined} aSet - Sizes (ml) of the first item.
 * @param {Set<number>|null|undefined} bSet - Sizes (ml) of the second item.
 * @param {number} [toleranceMl=SIZE_TOLERANCE_ML] - Largest difference, in ml, still counted as equal.
 * @returns {boolean} False when either set is missing or empty.
 */
function sizeSetsMatch(aSet, bSet, toleranceMl = SIZE_TOLERANCE_ML) {
  if (!aSet?.size || !bSet?.size) return false;
  for (const a of aSet) {
    for (const b of bSet) {
      if (Math.abs(a - b) <= toleranceMl) return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Score multiplier reflecting whether two items could be the same bottle size.
 *
 * @param {Set<number>|null|undefined} aSet - Sizes (ml) known for the first item.
 * @param {Set<number>|null|undefined} bSet - Sizes (ml) known for the second item.
 * @returns {number} 1.0 when either side has no known size or the sets match
 *   within tolerance; 0.08 when both have sizes and none match.
 */
export function sizePenalty(aSet, bSet) {
  const bothKnown = Boolean(aSet?.size && bSet?.size);
  if (!bothKnown) return 1.0;
  return sizeSetsMatch(aSet, bSet) ? 1.0 : 0.08;
}
|
||||
|
||||
/**
 * Builds size caches and returns a function (aSku, bSku) => penalty.
 * This keeps linker_page.js clean and makes cache rebuild explicit when rules change.
 *
 * Sizes are parsed from row names (name, then title, then productName) and
 * from aggregate names, collected per SKU, then merged per canonical SKU for
 * every SKU present in `allAgg`.
 *
 * The canonical key is `rules.canonicalSku(sku) || sku` both when building and
 * when looking up, so a SKU without a canonical mapping still finds the sizes
 * cached under its own key.
 *
 * @param {object} args
 * @param {Iterable<object>} args.allRows - Raw rows; rows flagged `removed` are skipped.
 * @param {Iterable<object>} args.allAgg - Aggregated items with `sku` and `name`.
 * @param {{canonicalSku: function(string): string}} args.rules - Provides the canonical SKU lookup.
 * @returns {function(*, *): number} Returns sizePenalty of the two canonical size sets.
 */
export function buildSizePenaltyForPair({ allRows, allAgg, rules }) {
  const skuSizes = new Map(); // skuKey -> Set<int ml>
  const canonSizes = new Map(); // canon -> Set<int ml>
  const NO_SIZES = new Set(); // shared read-only result for cache misses

  const setFor = (cache, key) => {
    let set = cache.get(key);
    if (!set) cache.set(key, (set = new Set()));
    return set;
  };

  // Single definition of "canonical key", shared by cache build and lookup.
  const canonOf = (sku) => {
    const key = String(sku || "").trim();
    return key ? String(rules.canonicalSku(key) || key) : "";
  };

  const recordSizes = (skuKey, text) => {
    const sizes = parseSizesMlFromText(text);
    if (!sizes.length) return;
    const set = setFor(skuSizes, skuKey);
    for (const ml of sizes) set.add(ml);
  };

  for (const row of allRows ?? []) {
    if (!row || row.removed) continue;
    const skuKey = String(keySkuForRow(row) || "").trim();
    if (!skuKey) continue;
    recordSizes(skuKey, row.name || row.title || row.productName || "");
  }

  for (const item of allAgg ?? []) {
    const skuKey = String(item?.sku || "").trim();
    if (!skuKey || !item?.name) continue;
    recordSizes(skuKey, item.name);
  }

  // Fold per-SKU sizes into their canonical group. Runs after both passes
  // above so each group sees sizes from rows and aggregates alike.
  for (const item of allAgg ?? []) {
    const skuKey = String(item?.sku || "").trim();
    if (!skuKey) continue;

    const canonSet = setFor(canonSizes, canonOf(skuKey));
    const own = skuSizes.get(skuKey);
    if (own) for (const ml of own) canonSet.add(ml);
  }

  return function sizePenaltyForPair(aSku, bSku) {
    const aCanon = canonOf(aSku);
    const bCanon = canonOf(bSku);
    const A = (aCanon && canonSizes.get(aCanon)) || NO_SIZES;
    const B = (bCanon && canonSizes.get(bCanon)) || NO_SIZES;
    return sizePenalty(A, B);
  };
}
|
||||
43
viz/app/linker/store_cache.js
Normal file
43
viz/app/linker/store_cache.js
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
// viz/app/linker/store_cache.js
|
||||
|
||||
/**
 * Resolves a SKU key to its canonical key.
 *
 * @param {{canonicalSku: function(string): string}} rules - Provides the canonical lookup.
 * @param {*} skuKey - SKU key; trimmed, falsy values are treated as "".
 * @returns {string} "" for a blank key; otherwise `rules.canonicalSku(key)`,
 *   falling back to the trimmed key itself when that result is falsy.
 */
function canonKeyForSku(rules, skuKey) {
  const key = String(skuKey || "").trim();
  return key ? String(rules.canonicalSku(key) || key) : "";
}
|
||||
|
||||
/**
 * Builds a lookup from canonical SKU to the union of store labels carrying
 * any SKU of that canonical group.
 *
 * @param {Iterable<object>} allAgg - Aggregated items with `sku` and an optional
 *   `stores` collection exposing `.size` and iteration (e.g. a Set).
 * @param {{canonicalSku: function(string): string}} rules - Provides the canonical lookup.
 * @returns {Map<string, Set<*>>} canonSku -> Set of store labels. Every item
 *   with a non-blank SKU gets an entry, even when it lists no stores.
 */
export function buildCanonStoreCache(allAgg, rules) {
  const storesByCanon = new Map();

  for (const item of allAgg) {
    if (!item) continue;

    const skuKey = String(item.sku || "").trim();
    if (!skuKey) continue;

    const canon = String(rules.canonicalSku(skuKey) || skuKey);
    if (!storesByCanon.has(canon)) storesByCanon.set(canon, new Set());

    const { stores } = item;
    if (!stores || !stores.size) continue;

    const bucket = storesByCanon.get(canon);
    for (const store of stores) bucket.add(store);
  }

  return storesByCanon;
}
|
||||
|
||||
/**
 * Looks up the store labels cached for the canonical group of a SKU.
 *
 * @param {{canonicalSku: function(string): string}} rules - Provides the canonical lookup.
 * @param {Map<string, Set<*>>} canonStoreCache - Cache built by buildCanonStoreCache.
 * @param {*} skuKey - SKU key to resolve.
 * @returns {Set<*>} The cached set, or a fresh empty set for a blank key or a cache miss.
 */
function canonStoresForSku(rules, canonStoreCache, skuKey) {
  const canon = canonKeyForSku(rules, skuKey);
  if (!canon) return new Set();
  return canonStoreCache.get(canon) || new Set();
}
|
||||
|
||||
/**
 * Creates a predicate telling whether two SKUs' canonical groups share a store.
 *
 * @param {{canonicalSku: function(string): string}} rules - Provides the canonical lookup.
 * @param {Map<string, Set<*>>} canonStoreCache - Cache built by buildCanonStoreCache.
 * @returns {function(*, *): boolean} sameStoreCanon(aSku, bSku): true when the
 *   two store sets intersect; false when either is empty or they are disjoint.
 */
export function makeSameStoreCanonFn(rules, canonStoreCache) {
  return function sameStoreCanon(aSku, bSku) {
    const aStores = canonStoresForSku(rules, canonStoreCache, String(aSku || ""));
    const bStores = canonStoresForSku(rules, canonStoreCache, String(bSku || ""));
    if (!aStores.size || !bStores.size) return false;

    for (const store of aStores) {
      if (bStores.has(store)) return true;
    }
    return false;
  };
}
|
||||
|
||||
458
viz/app/linker/suggestions.js
Normal file
458
viz/app/linker/suggestions.js
Normal file
|
|
@ -0,0 +1,458 @@
|
|||
// viz/app/linker/suggestions.js
|
||||
import { tokenizeQuery, normSearchText } from "../sku.js";
|
||||
import {
|
||||
smwsKeyFromName,
|
||||
extractAgeFromText,
|
||||
filterSimTokens,
|
||||
tokenContainmentScore,
|
||||
fastSimilarityScore,
|
||||
similarityScore,
|
||||
} from "./similarity.js";
|
||||
|
||||
/* ---------------- Randomization helpers ---------------- */
|
||||
|
||||
/**
 * Creates a small seeded pseudo-random generator (mulberry32-style mixing).
 * Not suitable for anything security-sensitive.
 *
 * @param {number} seed - Seed; converted to an unsigned 32-bit integer.
 * @returns {function(): number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    // Keep the state inside 32 bits. An ever-growing double would lose its
    // low bits once it passes 2^53 and degrade the sequence.
    state = (state + 0x6d2b79f5) | 0;
    let x = Math.imul(state ^ (state >>> 15), 1 | state);
    x ^= x + Math.imul(x ^ (x >>> 7), 61 | x);
    return ((x ^ (x >>> 14)) >>> 0) / 4294967296;
  };
}
|
||||
|
||||
/**
 * Shuffles an array in place with the Fisher-Yates algorithm.
 *
 * @param {Array<*>} arr - Array to shuffle; mutated.
 * @param {function(): number} rnd - Source of random numbers in [0, 1).
 * @returns {Array<*>} The same array instance, for chaining.
 */
function shuffleInPlace(arr, rnd) {
  for (let last = arr.length - 1; last > 0; last--) {
    const pick = (rnd() * (last + 1)) | 0; // integer in [0, last]
    [arr[last], arr[pick]] = [arr[pick], arr[last]];
  }
  return arr;
}
|
||||
|
||||
/* ---------------- Suggestion helpers ---------------- */
|
||||
|
||||
/**
 * Returns the `limit` most "complete" items when there is no name to match on.
 *
 * Score per item: 2 per store + 1.2 if it has a price + 1.0 if it has a name
 * + 0.6 if its SKU is an unknown ("u:") key. Items in `mappedSkus` and the
 * item pinned on the other side are excluded.
 *
 * A price counts only when `cheapestPriceNum` is neither null nor undefined,
 * matching the price checks elsewhere in this module.
 *
 * @param {Iterable<object>} allAgg - Aggregated items (null entries are skipped).
 * @param {number} limit - Maximum number of items to return.
 * @param {*} otherPinnedSku - SKU pinned on the other side, if any.
 * @param {{has: function(string): boolean}|null|undefined} mappedSkus - SKUs already linked.
 * @returns {object[]} Items in descending score order.
 */
export function topSuggestions(allAgg, limit, otherPinnedSku, mappedSkus) {
  const scored = [];

  for (const it of allAgg) {
    if (!it) continue;

    const sku = String(it.sku);
    if (mappedSkus && mappedSkus.has(sku)) continue;
    if (otherPinnedSku && sku === String(otherPinnedSku)) continue;

    const stores = it.stores ? it.stores.size : 0;
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0;

    scored.push({ it, s: stores * 2 + hasPrice * 1.2 + hasName * 1.0 + unknown * 0.6 });
  }

  scored.sort((a, b) => b.s - a.s);
  return scored.slice(0, limit).map((entry) => entry.it);
}
|
||||
|
||||
/**
 * Recommends items whose names are similar to the pinned item.
 * (Original note: same behavior guarantees as the comment in linker_page.js.)
 *
 * Pipeline:
 *   1. Without a pinned name, defer to topSuggestions.
 *   2. Cheap pass over at most MAX_SCAN items using fastSimilarityScore,
 *      keeping the best MAX_CHEAP_KEEP. Items sharing the pinned item's SMWS
 *      code get a very large score so they survive this pass.
 *   3. Fine pass over the best MAX_FINE survivors using similarityScore.
 *   4. If the fine pass yields nothing, fall back to a completeness ranking
 *      of the first 250 eligible items.
 *
 * Candidates are excluded when they have no SKU, are the pinned item or the
 * item pinned on the other side, or when sameStoreFn / isIgnoredPairFn /
 * sameGroupFn (each optional) returns truthy for the pair.
 * `mappedSkus` is only consulted on the topSuggestions path.
 *
 * @param {Iterable<object>} allAgg - Aggregated items.
 * @param {object|null} pinned - Pinned item (`sku`, `name`).
 * @param {number} limit - Maximum number of items to return.
 * @param {*} otherPinnedSku - SKU pinned on the other side, if any.
 * @param {{has: function(string): boolean}|null|undefined} mappedSkus - SKUs already linked.
 * @param {function(string, string): boolean} [isIgnoredPairFn]
 * @param {function(string, string): number} [sizePenaltyFn] - Score multiplier for the pair.
 * @param {function(string, string): boolean} [sameStoreFn]
 * @param {function(string, string): boolean} [sameGroupFn]
 * @returns {object[]} Recommended items, best first.
 */
export function recommendSimilar(
  allAgg,
  pinned,
  limit,
  otherPinnedSku,
  mappedSkus,
  isIgnoredPairFn,
  sizePenaltyFn,
  sameStoreFn,
  sameGroupFn
) {
  if (!pinned || !pinned.name) return topSuggestions(allAgg, limit, otherPinnedSku, mappedSkus);

  const MAX_SCAN = 5000;
  const MAX_CHEAP_KEEP = 320;
  const MAX_FINE = 70;
  const MAX_FALLBACK = 250;

  const pinnedSku = String(pinned.sku || "");
  const otherSku = otherPinnedSku ? String(otherPinnedSku) : "";
  const base = String(pinned.name || "");

  const pinNorm = normSearchText(pinned.name || "");
  const pinRawToks = tokenizeQuery(pinNorm);
  const pinToks = filterSimTokens(pinRawToks);
  const pinBrand = pinToks[0] || "";
  const pinAge = extractAgeFromText(pinNorm);
  const pinnedSmws = smwsKeyFromName(pinned.name || "");

  // Shared eligibility filter for the scan and the fallback.
  const isExcluded = (itSku) =>
    Boolean(
      !itSku ||
        itSku === pinnedSku ||
        (otherSku && itSku === otherSku) ||
        (typeof sameStoreFn === "function" && sameStoreFn(pinnedSku, itSku)) ||
        (typeof isIgnoredPairFn === "function" && isIgnoredPairFn(pinnedSku, itSku)) ||
        (typeof sameGroupFn === "function" && sameGroupFn(pinnedSku, itSku))
    );

  // Damping applied when the first (brand-like) tokens differ: the score
  // survives only as far as token containment supports it.
  const brandMismatchMultiplier = (itToks, contain) => {
    const smallN = Math.min(pinToks.length || 0, itToks.length || 0);
    let mult = 0.10 + 0.95 * contain;
    if (smallN <= 3 && contain < 0.78) mult *= 0.22;
    return Math.min(1.0, mult);
  };

  const involvesUnknown = (itSku) => pinnedSku.startsWith("u:") || itSku.startsWith("u:");

  // Keeps `arr` at no more than k entries, dropping the lowest scores.
  const pushTopK = (arr, item, k) => {
    arr.push(item);
    if (arr.length > k) {
      arr.sort((a, b) => b.s - a.s);
      arr.length = k;
    }
  };

  /* ---- cheap pass ---- */
  const cheap = [];
  let scanned = 0;

  for (const it of allAgg) {
    if (!it) continue;
    if (scanned >= MAX_SCAN) break; // examine at most MAX_SCAN non-null items
    scanned++;

    const itSku = String(it.sku || "");
    if (isExcluded(itSku)) continue;

    if (pinnedSmws) {
      const code = smwsKeyFromName(it.name || "");
      if (code && code === pinnedSmws) {
        const stores = it.stores ? it.stores.size : 0;
        const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
        pushTopK(cheap, { it, s: 1e9 + stores * 10 + hasPrice, itNorm: "", itRawToks: null }, MAX_CHEAP_KEEP);
        continue;
      }
    }

    const itNorm = normSearchText(it.name || "");
    if (!itNorm) continue;

    const itRawToks = tokenizeQuery(itNorm);
    const itToks = filterSimTokens(itRawToks);
    if (!itToks.length) continue;

    const firstMatch = pinBrand && itToks[0] && pinBrand === itToks[0];
    const contain = tokenContainmentScore(pinRawToks, itRawToks);

    let s0 = fastSimilarityScore(pinRawToks, itRawToks, pinNorm, itNorm);
    // Give zero-scored items a small containment-based floor so they still rank.
    if (s0 <= 0) s0 = 0.01 + 0.25 * contain;

    if (!firstMatch) s0 *= brandMismatchMultiplier(itToks, contain);
    if (typeof sizePenaltyFn === "function") s0 *= sizePenaltyFn(pinnedSku, itSku);

    const itAge = extractAgeFromText(itNorm);
    if (pinAge && itAge) s0 *= pinAge === itAge ? 1.6 : 0.22;

    if (involvesUnknown(itSku)) s0 *= 1.08;

    pushTopK(cheap, { it, s: s0, itNorm, itRawToks }, MAX_CHEAP_KEEP);
  }

  cheap.sort((a, b) => b.s - a.s);

  /* ---- fine pass ---- */
  const fine = [];
  for (const x of cheap.slice(0, MAX_FINE)) {
    const it = x.it;
    const itSku = String(it.sku || "");

    let s = similarityScore(base, it.name || "");
    if (s <= 0) continue;

    // SMWS fast-path entries carry no cached text, so recompute it here.
    const itNorm = x.itNorm || normSearchText(it.name || "");
    const itRawToks = x.itRawToks || tokenizeQuery(itNorm);
    const itToks = filterSimTokens(itRawToks);
    const firstMatch = pinBrand && itToks[0] && pinBrand === itToks[0];
    const contain = tokenContainmentScore(pinRawToks, itRawToks);

    if (!firstMatch) {
      s *= brandMismatchMultiplier(itToks, contain);
      if (s <= 0) continue;
    }

    if (typeof sizePenaltyFn === "function") {
      s *= sizePenaltyFn(pinnedSku, itSku);
      if (s <= 0) continue;
    }

    const itAge = extractAgeFromText(itNorm);
    if (pinAge && itAge) s *= pinAge === itAge ? 2.0 : 0.15;

    if (involvesUnknown(itSku)) s *= 1.12;

    if (s > 0) fine.push({ it, s });
  }

  fine.sort((a, b) => b.s - a.s);
  const out = fine.slice(0, limit).map((x) => x.it);
  if (out.length) return out;

  /* ---- fallback: completeness ranking ---- */
  const fallback = [];
  for (const it of allAgg) {
    if (!it) continue;
    const itSku = String(it.sku || "");
    if (isExcluded(itSku)) continue;

    const stores = it.stores ? it.stores.size : 0;
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    fallback.push({ it, s: stores * 2 + hasPrice * 1.2 + hasName * 1.0 });
    if (fallback.length >= MAX_FALLBACK) break;
  }

  fallback.sort((a, b) => b.s - a.s);
  return fallback.slice(0, limit).map((x) => x.it);
}
|
||||
|
||||
/**
 * Proposes an initial list of candidate pairs to link, in randomised order.
 *
 * Steps:
 *   1. Shuffle the items with a freshly seeded generator and keep at most
 *      WORK_CAP of them.
 *   2. SMWS pass: group items by SMWS code and pair each group's anchor (its
 *      best-ranked mapped item, else its best-ranked unmapped item) with the
 *      group's unmapped items. These pairs come first.
 *   3. Similarity pass: for up to 150 seed items, gather candidates sharing a
 *      token, score them with fastSimilarityScore, re-score the best MAX_FINE
 *      with similarityScore, and keep the best partner if it scores >= 0.6.
 *   4. Jitter the top band of pairs slightly so repeated calls vary, then
 *      take pairs greedily so that no SKU is used twice.
 *
 * @param {Array<object>} allAgg - Aggregated items (falsy entries are dropped).
 * @param {{has: function(string): boolean}|null|undefined} mappedSkus - SKUs already linked.
 * @param {number} limitPairs - Maximum number of pairs to return.
 * @param {function(string, string): boolean} [isIgnoredPairFn]
 * @param {function(string, string): boolean} [sameStoreFn]
 * @returns {Array<{a: object, b: object, score: number}>} Candidate pairs
 *   (SMWS pairs additionally carry `aIsMapped`).
 */
export function computeInitialPairsFast(allAgg, mappedSkus, limitPairs, isIgnoredPairFn, sameStoreFn) {
  const WORK_CAP = 5000;
  const TOKEN_BUCKET_CAP = 500;
  const MAX_CAND_TOTAL = 250;
  const MAX_FINE = 10;
  const MIN_PAIR_SCORE = 0.6;
  const JITTER = 0.08;

  const skuOf = (it) => String(it.sku || "");
  const isSameStore = (aSku, bSku) => typeof sameStoreFn === "function" && sameStoreFn(aSku, bSku);
  const isIgnored = (aSku, bSku) => typeof isIgnoredPairFn === "function" && isIgnoredPairFn(aSku, bSku);

  const itemsAll = allAgg.filter((it) => !!it);

  const seed = (Date.now() ^ ((Math.random() * 1e9) | 0)) >>> 0;
  const rnd = mulberry32(seed);
  const itemsShuf = shuffleInPlace(itemsAll.slice(), rnd);

  const workAll = itemsShuf.length > WORK_CAP ? itemsShuf.slice(0, WORK_CAP) : itemsShuf;
  const work = workAll.filter((it) => !(mappedSkus && mappedSkus.has(String(it.sku))));

  function itemRank(it) {
    const stores = it.stores ? it.stores.size : 0;
    const hasPrice = it.cheapestPriceNum != null ? 1 : 0;
    const hasName = it.name ? 1 : 0;
    const unknown = String(it.sku || "").startsWith("u:") ? 1 : 0;
    return stores * 3 + hasPrice * 2 + hasName * 0.5 + unknown * 0.25;
  }

  function smwsPairsFirst(workArr, limit) {
    const ANCHOR_REUSE_CAP = 6;
    const BUCKET_CAP = 80;

    const buckets = new Map(); // code -> items[]
    for (const it of workArr) {
      if (!it || !skuOf(it)) continue;
      const code = smwsKeyFromName(it.name || "");
      if (!code) continue;

      let arr = buckets.get(code);
      if (!arr) buckets.set(code, (arr = []));
      arr.push(it);
    }

    const candPairs = [];
    for (const bucket of buckets.values()) {
      if (!bucket || bucket.length < 2) continue;

      const ranked = bucket
        .slice()
        .sort((a, b) => itemRank(b) - itemRank(a))
        .slice(0, BUCKET_CAP);

      const mapped = [];
      const unmapped = [];
      for (const it of ranked) {
        if (mappedSkus && mappedSkus.has(skuOf(it))) mapped.push(it);
        else unmapped.push(it);
      }

      // `ranked` is sorted best-first and the split keeps that order, so the
      // first entry of the chosen pool is its best-ranked item.
      const anchor = (mapped.length ? mapped : unmapped)[0];
      if (!anchor) continue;

      const aSku = skuOf(anchor);
      for (const u of unmapped) {
        const bSku = skuOf(u);
        if (!aSku || !bSku || aSku === bSku) continue;
        if (isSameStore(aSku, bSku)) continue;
        if (isIgnored(aSku, bSku)) continue;

        candPairs.push({
          a: anchor,
          b: u,
          score: 1e9 + itemRank(anchor) + itemRank(u),
          aIsMapped: mappedSkus && mappedSkus.has(aSku),
        });
      }
    }

    candPairs.sort((x, y) => y.score - x.score);

    const usedUnmapped = new Set();
    const anchorUse = new Map();
    const picked = [];

    for (const p of candPairs) {
      const aSku = skuOf(p.a);
      const bSku = skuOf(p.b);
      if (!aSku || !bSku) continue;
      if (usedUnmapped.has(bSku)) continue;

      const uses = anchorUse.get(aSku) || 0;
      if (uses >= ANCHOR_REUSE_CAP) continue;

      usedUnmapped.add(bSku);
      anchorUse.set(aSku, uses + 1);
      picked.push(p);

      if (picked.length >= limit) break;
    }

    return { pairs: picked, usedUnmapped };
  }

  /* ---- SMWS pairs first ---- */
  const smwsFirst = smwsPairsFirst(workAll, limitPairs);
  const used = new Set(smwsFirst.usedUnmapped);
  const out = smwsFirst.pairs.slice();

  if (out.length >= limitPairs) return out.slice(0, limitPairs);

  /* ---- token index over unmapped work items ---- */
  const seeds = topSuggestions(work, Math.min(150, work.length), "", mappedSkus).filter(
    (it) => !used.has(String(it?.sku || ""))
  );

  const tokMap = new Map(); // token -> items[] (capped per token)
  const itemTokens = new Map(); // sku string -> tokens
  const itemNormName = new Map(); // sku string -> normalised name

  for (const it of work) {
    // Keys are string SKUs because every lookup below uses String(sku).
    const key = skuOf(it);
    const toks = Array.from(new Set(tokenizeQuery(it.name || ""))).filter(Boolean).slice(0, 10);
    itemTokens.set(key, toks);
    itemNormName.set(key, normSearchText(it.name || ""));

    for (const t of toks) {
      let arr = tokMap.get(t);
      if (!arr) tokMap.set(t, (arr = []));
      if (arr.length < TOKEN_BUCKET_CAP) arr.push(it);
    }
  }

  /* ---- best partner per seed ---- */
  const bestByPair = new Map();

  for (const a of seeds) {
    const aSku = skuOf(a);
    if (!aSku || used.has(aSku)) continue;

    const aToks = itemTokens.get(aSku) || [];
    if (!aToks.length) continue;

    const cand = new Map();
    for (const t of aToks) {
      const arr = tokMap.get(t);
      if (!arr) continue;

      for (let i = 0; i < arr.length && cand.size < MAX_CAND_TOTAL; i++) {
        const b = arr[i];
        if (!b) continue;
        const bSku = skuOf(b);
        if (!bSku || bSku === aSku) continue;
        if (used.has(bSku)) continue;
        if (mappedSkus && mappedSkus.has(bSku)) continue;

        if (isIgnored(aSku, bSku)) continue;
        if (isSameStore(aSku, bSku)) continue;

        cand.set(bSku, b);
      }
      if (cand.size >= MAX_CAND_TOTAL) break;
    }
    if (!cand.size) continue;

    const aNameN = itemNormName.get(aSku) || "";
    const cheap = [];
    for (const b of cand.values()) {
      const bSku = skuOf(b);
      const s = fastSimilarityScore(aToks, itemTokens.get(bSku) || [], aNameN, itemNormName.get(bSku) || "");
      if (s > 0) cheap.push({ b, s });
    }
    if (!cheap.length) continue;
    cheap.sort((x, y) => y.s - x.s);

    let bestB = null;
    let bestS = 0;
    for (const x of cheap.slice(0, MAX_FINE)) {
      const s = similarityScore(a.name || "", x.b.name || "");
      if (s > bestS) {
        bestS = s;
        bestB = x.b;
      }
    }

    if (!bestB || bestS < MIN_PAIR_SCORE) continue;

    const bSku = skuOf(bestB);
    if (!bSku || used.has(bSku)) continue;

    // Order-independent key so (a, b) and (b, a) collapse into one pair.
    const key = aSku < bSku ? `${aSku}|${bSku}` : `${bSku}|${aSku}`;
    const prev = bestByPair.get(key);
    if (!prev || bestS > prev.score) bestByPair.set(key, { a, b: bestB, score: bestS });
  }

  const pairs = Array.from(bestByPair.values());
  pairs.sort((x, y) => y.score - x.score);

  const need = Math.max(0, limitPairs - out.length);
  if (!need) return out.slice(0, limitPairs);

  /* ---- jittered greedy selection ---- */
  const TOP_BAND = Math.min(600, pairs.length);

  const band = pairs.slice(0, TOP_BAND).map((p) => {
    const jitter = (rnd() - 0.5) * JITTER;
    return { ...p, _rank: p.score * (1 + jitter) };
  });
  band.sort((a, b) => b._rank - a._rank);

  function tryTake(p) {
    const aSku = skuOf(p.a);
    const bSku = skuOf(p.b);
    if (!aSku || !bSku || aSku === bSku) return false;
    if (used.has(aSku) || used.has(bSku)) return false;
    if (isSameStore(aSku, bSku)) return false;

    used.add(aSku);
    used.add(bSku);
    out.push({ a: p.a, b: p.b, score: p.score });
    return true;
  }

  for (const p of band) {
    if (out.length >= limitPairs) break;
    tryTake(p);
  }

  for (let i = TOP_BAND; i < pairs.length && out.length < limitPairs; i++) {
    tryTake(pairs[i]);
  }

  return out.slice(0, limitPairs);
}
|
||||
47
viz/app/linker/url_map.js
Normal file
47
viz/app/linker/url_map.js
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
// viz/app/linker/url_map.js
|
||||
import { keySkuForRow } from "../sku.js";
|
||||
|
||||
/**
 * Heuristic quality score for a row's URL; used to choose between several
 * URLs recorded for the same SKU and store.
 *
 * Score = URL length, +50 when it contains a "product/<digits>/" segment,
 * +10 when it contains a run of 8 or more letters, digits or hyphens.
 *
 * @param {{url?: *}|null|undefined} r - Row-like object carrying `url`.
 * @returns {number} -1 when the URL is missing or blank, otherwise the score.
 */
function urlQuality(r) {
  const url = String(r?.url || "").trim();
  if (!url) return -1;

  const productBonus = /\bproduct\/\d+\//.test(url) ? 50 : 0;
  const slugBonus = /[a-z0-9-]{8,}/i.test(url) ? 10 : 0;
  return url.length + productBonus + slugBonus;
}
|
||||
|
||||
/**
 * Builds a two-level lookup: skuKey -> (storeLabel -> best URL).
 *
 * Rows flagged `removed`, rows without a SKU key, and rows lacking a store
 * label or URL are skipped. When several URLs exist for the same SKU and
 * store, the one with the higher urlQuality wins; ties go to the
 * lexicographically smaller URL so the result does not depend on row order.
 *
 * @param {Iterable<object>} allRows - Rows with `url` and `storeLabel` / `store`.
 * @returns {Map<string, Map<string, string>>}
 */
export function buildUrlBySkuStore(allRows) {
  const urlBySkuStore = new Map();

  for (const row of allRows) {
    if (!row || row.removed) continue;

    const skuKey = String(keySkuForRow(row) || "").trim();
    if (!skuKey) continue;

    const storeLabel = String(row.storeLabel || row.store || "").trim();
    const url = String(row.url || "").trim();
    if (!storeLabel || !url) continue;

    let byStore = urlBySkuStore.get(skuKey);
    if (!byStore) urlBySkuStore.set(skuKey, (byStore = new Map()));

    const prevUrl = byStore.get(storeLabel);
    if (!prevUrl) {
      byStore.set(storeLabel, url);
      continue;
    }

    const prevScore = urlQuality({ url: prevUrl });
    const nextScore = urlQuality(row);
    const replace = nextScore > prevScore || (nextScore === prevScore && url < prevUrl);
    if (replace) byStore.set(storeLabel, url);
  }

  return urlBySkuStore;
}
|
||||
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue