#!/usr/bin/env node
"use strict";

/*
 * Rank-discrepancy report.
 *
 * Reads two ranked listing reports ("AB" and "BC") plus an optional sku_links
 * metadata file, finds SKUs whose rank differs between the two reports, and
 * keeps only those for which a similarly named SKU exists that is neither
 * already linked to it nor explicitly ignored. For every SKU kept, one URL
 * (`base` + URL-encoded SKU) is printed to stdout; all diagnostics go to
 * stderr.
 *
 * Flags:
 *   --ab <file>          AB report            (default reports/common_listings_ab_top1000.json)
 *   --bc <file>          BC report            (default reports/common_listings_bc_top1000.json)
 *   --meta <file>        sku_links metadata   (default data/sku_links.json)
 *   --top <n>            max URLs to emit     (default 50, must be > 0)
 *   --min <n>            minimum rank discrepancy (default 1)
 *   --min-score <x>      minimum similarityScore  (default 9.0)
 *   --min-contain <x>    minimum token containment (default 0.75)
 *   --include-missing    also consider SKUs present in only one report
 *   --no-cross-group     search candidates in both reports, not just the opposite one
 *   --base <url>         URL prefix for emitted links
 *   --debug, --debug-n <n>, --debug-payload, --debug-best, --dump-scores
 */

const fs = require("fs");
const path = require("path");

/* ---------------- IO ---------------- */

/**
 * Read and parse a JSON file.
 *
 * @param {string} p - Path of the file to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} If the file cannot be read or is not valid JSON; the message
 *   names the file and the underlying error is attached as `cause`.
 */
function readJson(p) {
  let text;
  try {
    text = fs.readFileSync(p, "utf8");
  } catch (err) {
    throw new Error(`Cannot read ${p}: ${err.message}`, { cause: err });
  }
  try {
    return JSON.parse(text);
  } catch (err) {
    throw new Error(`Invalid JSON in ${p}: ${err.message}`, { cause: err });
  }
}

/**
 * Parse a numeric flag value, keeping the fallback when the value is blank,
 * not a number, or rejected by `accept`.
 *
 * Unlike `Number(raw) || fallback`, an explicit `0` is honored.
 *
 * @param {string} raw - Raw command-line value.
 * @param {number} fallback - Value to keep when `raw` is unusable.
 * @param {(n: number) => boolean} [accept] - Extra validity check.
 * @returns {number}
 */
function parseNumberFlag(raw, fallback, accept = () => true) {
  // Number("  ") is 0, so reject blank input explicitly.
  if (String(raw).trim() === "") return fallback;
  const n = Number(raw);
  return !Number.isNaN(n) && accept(n) ? n : fallback;
}

/**
 * Parse command-line arguments into an options object.
 *
 * Unknown arguments are ignored. A value flag that is not followed by a
 * (non-empty) value is ignored as well.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Options with defaults applied.
 */
function parseArgs(argv) {
  const out = {
    ab: "reports/common_listings_ab_top1000.json",
    bc: "reports/common_listings_bc_top1000.json",
    meta: "data/sku_links.json",
    top: 50,
    minDiscrep: 1,
    includeMissing: false,
    // similarityScore is NOT 0..1.
    minScore: 9.0,
    minContain: 0.75,
    // only consider suggestions from the opposite list (AB->BC or BC->AB)
    requireCrossGroup: true,
    base: "http://127.0.0.1:8080/#/link/?left=",
    debug: false,
    debugN: 25,
    debugPayload: false,
    debugBest: false,
    dumpScores: false,
  };

  const isPositive = (n) => n > 0;
  const isNonNegative = (n) => n >= 0;

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    const hasValue = Boolean(argv[i + 1]);

    if (a === "--ab" && hasValue) out.ab = argv[++i];
    else if (a === "--bc" && hasValue) out.bc = argv[++i];
    else if (a === "--meta" && hasValue) out.meta = argv[++i];
    else if (a === "--top" && hasValue) out.top = parseNumberFlag(argv[++i], out.top, isPositive);
    else if (a === "--min" && hasValue) out.minDiscrep = parseNumberFlag(argv[++i], out.minDiscrep, isNonNegative);
    else if (a === "--min-score" && hasValue) out.minScore = parseNumberFlag(argv[++i], out.minScore);
    else if (a === "--min-contain" && hasValue) out.minContain = parseNumberFlag(argv[++i], out.minContain);
    else if (a === "--include-missing") out.includeMissing = true;
    else if (a === "--base" && hasValue) out.base = String(argv[++i]);
    else if (a === "--no-cross-group") out.requireCrossGroup = false;
    else if (a === "--debug") out.debug = true;
    else if (a === "--debug-n" && hasValue) out.debugN = parseNumberFlag(argv[++i], out.debugN, isNonNegative);
    else if (a === "--debug-payload") out.debugPayload = true;
    else if (a === "--debug-best") out.debugBest = true;
    else if (a === "--dump-scores") out.dumpScores = true;
  }

  return out;
}

/* ---------------- row extraction ---------------- */

/**
 * Locate the row array inside a report payload.
 *
 * Accepts either a bare array or an object carrying the array under one of
 * several known property paths; returns `[]` when none is found.
 *
 * @param {*} payload - Parsed report JSON.
 * @returns {Array} The rows, in payload order.
 */
function extractRows(payload) {
  if (Array.isArray(payload)) return payload;
  const candidates = [
    payload?.rows,
    payload?.data?.rows,
    payload?.data,
    payload?.items,
    payload?.list,
    payload?.results,
  ];
  for (const x of candidates) if (Array.isArray(x)) return x;
  return [];
}

/**
 * Extract the SKU key of a row as a string ("" when the row has none).
 *
 * @param {object} r - Report row.
 * @returns {string}
 */
function rowKey(r) {
  const k = r?.canonSku ?? r?.sku ?? r?.canon ?? r?.id ?? r?.key;
  return k ? String(k) : "";
}

/**
 * Index report rows by SKU key, recording each row's 1-based position as its rank.
 *
 * Rows without a key are skipped; when a key repeats, the later row wins.
 *
 * @param {*} payload - Parsed report JSON.
 * @returns {{map: Map<string, {rank: number, row: object}>, rowsLen: number, rows: Array}}
 */
function buildRankMap(payload) {
  const rows = extractRows(payload);
  const map = new Map();
  for (let i = 0; i < rows.length; i++) {
    const r = rows[i];
    const k = rowKey(r);
    if (!k) continue;
    map.set(String(k), { rank: i + 1, row: r });
  }
  return { map, rowsLen: rows.length, rows };
}

/**
 * Pick a display name for a row.
 *
 * Preference order: `representative.name`, `cheapest.name`, then the first
 * non-blank string among a list of common name-like properties.
 *
 * @param {object} row - Report row.
 * @returns {string} Trimmed name, or "" when none is available.
 */
function pickName(row) {
  if (!row) return "";

  const repName = row?.representative?.name;
  if (typeof repName === "string" && repName.trim()) return repName.trim();

  const cheapName = row?.cheapest?.name;
  if (typeof cheapName === "string" && cheapName.trim()) return cheapName.trim();

  const direct = ["name", "title", "productName", "displayName", "itemName", "label", "desc", "description"];
  for (const k of direct) {
    const v = row[k];
    if (typeof v === "string" && v.trim()) return v.trim();
  }
  return "";
}

/* ---------------- sku_links union-find grouping + ignores ---------------- */

/**
 * Normalize a SKU key: `id:<1-6 digits>` becomes the digits zero-padded to
 * six characters; anything else is returned trimmed.
 *
 * @param {*} k - Raw SKU key.
 * @returns {string}
 */
function normalizeImplicitSkuKey(k) {
  const s = String(k || "").trim();
  const m = s.match(/^id:(\d{1,6})$/i);
  if (m) return String(m[1]).padStart(6, "0");
  return s;
}

/**
 * Build an order-independent key for a pair of SKUs.
 *
 * @param {*} a - First SKU.
 * @param {*} b - Second SKU.
 * @returns {string} `"<smaller>|<larger>"` of the normalized SKUs, or "" if either is empty.
 */
function canonicalPairKey(a, b) {
  const x = normalizeImplicitSkuKey(a);
  const y = normalizeImplicitSkuKey(b);
  if (!x || !y) return "";
  return x < y ? `${x}|${y}` : `${y}|${x}`;
}

/**
 * Collect the canonical pair keys of all entries in `meta.ignores`.
 *
 * Each entry may name its two SKUs as `skuA`/`skuB`, `a`/`b` or `left`/`right`.
 *
 * @param {object} meta - Parsed sku_links metadata.
 * @returns {Set<string>} Set of `canonicalPairKey` values.
 */
function buildIgnoreSet(meta) {
  const ignores = Array.isArray(meta?.ignores) ? meta.ignores : [];
  const s = new Set();
  for (const x of ignores) {
    const a = String(x?.skuA || x?.a || x?.left || "").trim();
    const b = String(x?.skuB || x?.b || x?.right || "").trim();
    const k = canonicalPairKey(a, b);
    if (k) s.add(k);
  }
  return s;
}

/** Disjoint-set (union-find) over string keys, with path compression and union by rank. */
class DSU {
  constructor() {
    this.parent = new Map();
    this.rank = new Map();
  }

  /** Register `x` as its own singleton set if it is not known yet. */
  _add(x) {
    if (!this.parent.has(x)) {
      this.parent.set(x, x);
      this.rank.set(x, 0);
    }
  }

  /**
   * Return the root of the set containing `x`, adding `x` if it is new.
   *
   * @param {*} x - Key (stringified and trimmed).
   * @returns {string} Root key, or "" for an empty key.
   */
  find(x) {
    x = String(x || "").trim();
    if (!x) return "";
    this._add(x);
    let p = this.parent.get(x);
    if (p !== x) {
      p = this.find(p);
      this.parent.set(x, p); // path compression
    }
    return p;
  }

  /**
   * Merge the sets containing `a` and `b`. Empty or identical keys are ignored.
   *
   * @param {*} a - First key.
   * @param {*} b - Second key.
   */
  union(a, b) {
    a = String(a || "").trim();
    b = String(b || "").trim();
    if (!a || !b || a === b) return;

    const ra = this.find(a);
    const rb = this.find(b);
    if (!ra || !rb || ra === rb) return;

    const rka = this.rank.get(ra) || 0;
    const rkb = this.rank.get(rb) || 0;
    if (rka < rkb) this.parent.set(ra, rb);
    else if (rkb < rka) this.parent.set(rb, ra);
    else {
      this.parent.set(rb, ra);
      this.rank.set(ra, rka + 1);
    }
  }
}

/**
 * Ordering used to choose a group's representative SKU.
 *
 * Keys starting with "u:" sort after all others; two all-digit keys compare
 * numerically; everything else compares as plain strings.
 *
 * @param {*} a - First SKU.
 * @param {*} b - Second SKU.
 * @returns {number} -1, 0 or 1.
 */
function compareSku(a, b) {
  a = String(a || "").trim();
  b = String(b || "").trim();
  if (a === b) return 0;

  const aUnknown = a.startsWith("u:");
  const bUnknown = b.startsWith("u:");
  if (aUnknown !== bUnknown) return aUnknown ? 1 : -1;

  const aNum = /^\d+$/.test(a);
  const bNum = /^\d+$/.test(b);
  if (aNum && bNum) {
    const na = Number(a);
    const nb = Number(b);
    if (Number.isFinite(na) && Number.isFinite(nb) && na !== nb) return na < nb ? -1 : 1;
  }
  return a < b ? -1 : 1;
}

/**
 * Build a function mapping any SKU to the representative of its link group.
 *
 * Groups are the connected components of `meta.links` (`fromSku`/`toSku`
 * pairs); the representative is the smallest member under `compareSku`.
 * SKUs that appear in no link map to their normalized form.
 *
 * @param {object} meta - Parsed sku_links metadata.
 * @returns {(sku: *) => string}
 */
function buildCanonicalSkuFnFromMeta(meta) {
  const links = Array.isArray(meta?.links) ? meta.links : [];
  if (!links.length) return (sku) => normalizeImplicitSkuKey(sku);

  const dsu = new DSU();
  const all = new Set();
  for (const x of links) {
    const a = normalizeImplicitSkuKey(x?.fromSku);
    const b = normalizeImplicitSkuKey(x?.toSku);
    if (!a || !b || a === b) continue;
    all.add(a);
    all.add(b);
    dsu.union(a, b);
  }

  const groupsByRoot = new Map();
  for (const s of all) {
    const r = dsu.find(s);
    if (!r) continue;
    let set = groupsByRoot.get(r);
    if (!set) groupsByRoot.set(r, (set = new Set()));
    set.add(s);
  }

  const repByRoot = new Map();
  for (const [root, members] of groupsByRoot.entries()) {
    const arr = Array.from(members);
    arr.sort(compareSku);
    repByRoot.set(root, arr[0] || root);
  }

  const canonBySku = new Map();
  for (const [root, members] of groupsByRoot.entries()) {
    const rep = repByRoot.get(root) || root;
    for (const s of members) canonBySku.set(s, rep);
    canonBySku.set(rep, rep);
  }

  return (sku) => {
    const s = normalizeImplicitSkuKey(sku);
    return canonBySku.get(s) || s;
  };
}

/* ---------------- similarity (same math as viz/app/linker/similarity.js) ---------------- */

/**
 * Lowercase a string and collapse every run of non-alphanumeric characters
 * into a single space.
 *
 * @param {*} s - Input text.
 * @returns {string}
 */
function normSearchText(s) {
  return String(s ?? "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Split text into normalized tokens.
 *
 * @param {*} q - Input text.
 * @returns {string[]}
 */
function tokenizeQuery(q) {
  const n = normSearchText(q);
  return n ? n.split(" ").filter(Boolean) : [];
}

/** Tokens that carry no identity and are dropped before comparing names. */
const SIM_STOP_TOKENS = new Set([
  "the",
  "a",
  "an",
  "and",
  "of",
  "to",
  "in",
  "for",
  "with",
  "year",
  "years",
  "yr",
  "yrs",
  "old",
  "whisky",
  "whiskey",
  "scotch",
  "single",
  "malt",
  "cask",
  "finish",
  "edition",
  "release",
  "batch",
  "strength",
  "abv",
  "proof",
  "anniversary",
]);

const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;

/** Spelling variants folded onto one token before comparison. */
const SIM_EQUIV = new Map([
  ["years", "yr"],
  ["year", "yr"],
  ["yrs", "yr"],
  ["yr", "yr"],
  ["whiskey", "whisky"],
  ["whisky", "whisky"],
  ["bourbon", "bourbon"],
]);

/** Volume unit tokens; these (and a number directly before them) are dropped. */
const VOL_UNIT = new Set(["ml", "l", "cl", "oz", "liter", "liters", "litre", "litres"]);
const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i;
const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/;
const PLAIN_NUMBER_RE = /^\d+(?:\.\d+)?$/;
const HAS_ALNUM_RE = /[a-z0-9]/i;

/**
 * Return the digits of an integer or ordinal token ("12", "12th" -> "12"),
 * or "" for anything else.
 *
 * @param {*} t - Token.
 * @returns {string}
 */
function numKey(t) {
  const s = String(t || "")
    .trim()
    .toLowerCase();
  if (!s) return "";
  if (/^\d+$/.test(s)) return s;
  const m = s.match(ORDINAL_RE);
  return m ? m[1] : "";
}

/**
 * Extract an age statement ("12 yr", "aged 12 years", "12yo") from a name.
 *
 * @param {string} normName - Normalized name text.
 * @returns {string} The age as a decimal string without leading zeros, or "".
 */
function extractAgeFromText(normName) {
  const s = String(normName || "");
  if (!s) return "";

  const m = s.match(/\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i);
  if (m && m[1]) return String(parseInt(m[1], 10));

  const m2 = s.match(/\b(\d{1,2})\s*yo\b/i);
  if (m2 && m2[1]) return String(parseInt(m2[1], 10));

  return "";
}

/**
 * Reduce raw tokens to the de-duplicated tokens used for similarity.
 *
 * Drops volume and percentage tokens, unit words, a number immediately
 * followed by a volume unit, and stop words; folds equivalent spellings and
 * ordinals ("12th" -> "12"). First-occurrence order is preserved.
 *
 * @param {string[]} tokens - Raw tokens.
 * @returns {string[]}
 */
function filterSimTokens(tokens) {
  const out = [];
  const seen = new Set();
  const arr = Array.isArray(tokens) ? tokens : [];

  for (let i = 0; i < arr.length; i++) {
    let t = String(arr[i] || "")
      .trim()
      .toLowerCase();
    if (!t) continue;
    if (!HAS_ALNUM_RE.test(t)) continue;
    if (VOL_INLINE_RE.test(t)) continue;
    if (PCT_INLINE_RE.test(t)) continue;

    t = SIM_EQUIV.get(t) || t;
    const nk = numKey(t);
    if (nk) t = nk;

    if (VOL_UNIT.has(t) || t === "abv" || t === "proof") continue;

    if (PLAIN_NUMBER_RE.test(t)) {
      const next = String(arr[i + 1] || "")
        .trim()
        .toLowerCase();
      const nextNorm = SIM_EQUIV.get(next) || next;
      if (VOL_UNIT.has(nextNorm)) {
        i++; // skip the unit token together with its number
        continue;
      }
    }

    if (!numKey(t) && SIM_STOP_TOKENS.has(t)) continue;
    if (seen.has(t)) continue;
    seen.add(t);
    out.push(t);
  }
  return out;
}

/**
 * F1-style overlap between two token lists after `filterSimTokens`.
 *
 * Recall is measured against the smaller token set and precision against
 * the larger one.
 *
 * @param {string[]} aTokens - Raw tokens of the first name.
 * @param {string[]} bTokens - Raw tokens of the second name.
 * @returns {number} Value in [0, 1]; 0 when either side has no usable tokens.
 */
function tokenContainmentScore(aTokens, bTokens) {
  const A = filterSimTokens(aTokens || []);
  const B = filterSimTokens(bTokens || []);
  if (!A.length || !B.length) return 0;

  const aSet = new Set(A);
  const bSet = new Set(B);
  const small = aSet.size <= bSet.size ? aSet : bSet;
  const big = aSet.size <= bSet.size ? bSet : aSet;

  let hit = 0;
  for (const t of small) if (big.has(t)) hit++;

  const recall = hit / Math.max(1, small.size);
  const precision = hit / Math.max(1, big.size);
  const f1 = (2 * precision * recall) / Math.max(1e-9, precision + recall);
  return f1;
}

/**
 * Levenshtein edit distance between two strings (single-row DP).
 *
 * @param {*} a - First string.
 * @param {*} b - Second string.
 * @returns {number}
 */
function levenshtein(a, b) {
  a = String(a || "");
  b = String(b || "");
  const n = a.length;
  const m = b.length;
  if (!n) return m;
  if (!m) return n;

  const dp = new Array(m + 1);
  for (let j = 0; j <= m; j++) dp[j] = j;

  for (let i = 1; i <= n; i++) {
    let prev = dp[0]; // value of the previous row at column j - 1
    dp[0] = i;
    const ca = a.charCodeAt(i - 1);
    for (let j = 1; j <= m; j++) {
      const tmp = dp[j];
      const cost = ca === b.charCodeAt(j - 1) ? 0 : 1;
      dp[j] = Math.min(dp[j] + 1, dp[j - 1] + 1, prev + cost);
      prev = tmp;
    }
  }
  return dp[m];
}

/**
 * Multiplier penalizing names whose numeric tokens are disjoint.
 *
 * @param {string[]} aTokens - Tokens of the first name.
 * @param {string[]} bTokens - Tokens of the second name.
 * @returns {number} 1.0 when either side has no numbers or they share one; otherwise 0.28.
 */
function numberMismatchPenalty(aTokens, bTokens) {
  const aNums = new Set((aTokens || []).map(numKey).filter(Boolean));
  const bNums = new Set((bTokens || []).map(numKey).filter(Boolean));
  if (!aNums.size || !bNums.size) return 1.0;
  for (const n of aNums) if (bNums.has(n)) return 1.0;
  return 0.28;
}

/**
 * Heuristic similarity between two product names.
 *
 * Combines first-token match, overlap of the remaining tokens, Levenshtein
 * similarity and token containment, then scales by number and age
 * agreement. The result is NOT normalized to 0..1.
 *
 * @param {*} aName - First name.
 * @param {*} bName - Second name.
 * @returns {number} Score >= 0; 0 when either name has no usable tokens.
 */
function similarityScore(aName, bName) {
  const a = normSearchText(aName);
  const b = normSearchText(bName);
  if (!a || !b) return 0;

  const aAge = extractAgeFromText(a);
  const bAge = extractAgeFromText(b);
  const ageBoth = !!(aAge && bAge);
  const ageMatch = ageBoth && aAge === bAge;
  const ageMismatch = ageBoth && aAge !== bAge;

  const aToksRaw = tokenizeQuery(a);
  const bToksRaw = tokenizeQuery(b);
  const aToks = filterSimTokens(aToksRaw);
  const bToks = filterSimTokens(bToksRaw);
  if (!aToks.length || !bToks.length) return 0;

  const contain = tokenContainmentScore(aToksRaw, bToksRaw);

  const aFirst = aToks[0] || "";
  const bFirst = bToks[0] || "";
  const firstMatch = aFirst && bFirst && aFirst === bFirst ? 1 : 0;

  const A = new Set(aToks.slice(1));
  const B = new Set(bToks.slice(1));
  let inter = 0;
  for (const w of A) if (B.has(w)) inter++;
  const denom = Math.max(1, Math.max(A.size, B.size));
  const overlapTail = inter / denom;

  const d = levenshtein(a, b);
  const maxLen = Math.max(1, Math.max(a.length, b.length));
  const levSim = 1 - d / maxLen;

  let gate = firstMatch ? 1.0 : Math.min(0.8, 0.06 + 0.95 * contain);
  const smallN = Math.min(aToks.length, bToks.length);
  if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;

  const numGate = numberMismatchPenalty(aToks, bToks);

  let s =
    numGate *
    (firstMatch * 3.0 + overlapTail * 2.2 * gate + levSim * (firstMatch ? 1.0 : 0.1 + 0.7 * contain));

  if (ageMatch) s *= 2.2;
  else if (ageMismatch) s *= 0.18;

  s *= 1 + 0.9 * contain;
  return s;
}

/* ---------------- debug helpers ---------------- */

/** Print diagnostics to stderr so stdout carries only the emitted URLs. */
function eprintln(...args) {
  console.error(...args);
}

/**
 * Shorten a string to at most `n` characters, ending with "…" when cut.
 *
 * @param {*} s - Input text.
 * @param {number} n - Maximum length.
 * @returns {string}
 */
function truncate(s, n) {
  s = String(s || "");
  return s.length <= n ? s : s.slice(0, n - 1) + "…";
}

/* ---------------- main ---------------- */

/**
 * Entry point: load the reports, rank SKUs by discrepancy, keep those with a
 * sufficiently similar unlinked candidate, and print one URL per kept SKU.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  const repoRoot = process.cwd();

  const resolvePath = (p) => (path.isAbsolute(p) ? p : path.join(repoRoot, p));
  const abPath = resolvePath(args.ab);
  const bcPath = resolvePath(args.bc);
  const metaPath = args.meta ? resolvePath(args.meta) : "";

  const ab = readJson(abPath);
  const bc = readJson(bcPath);
  const meta = metaPath ? readJson(metaPath) : null;

  const canonicalSku = meta ? buildCanonicalSkuFnFromMeta(meta) : (sku) => normalizeImplicitSkuKey(sku);
  const ignoreSet = meta ? buildIgnoreSet(meta) : new Set();

  const isIgnoredPair = (a, b) => {
    const k = canonicalPairKey(a, b);
    return k ? ignoreSet.has(k) : false;
  };

  const abBuilt = buildRankMap(ab);
  const bcBuilt = buildRankMap(bc);
  const abMap = abBuilt.map;
  const bcMap = bcBuilt.map;

  // SKU pools for “cross group” matching
  const abSkus = new Set(abMap.keys());
  const bcSkus = new Set(bcMap.keys());

  // union SKU -> row (for name lookup)
  const rowBySku = new Map();
  for (const m of [abMap, bcMap]) {
    for (const [canonSku, v] of m.entries()) {
      if (!rowBySku.has(canonSku)) rowBySku.set(canonSku, v.row);
    }
  }

  // Names and their raw tokens are computed once per SKU rather than once
  // per compared pair.
  const allSkus = Array.from(rowBySku.keys());
  const allNames = new Map();
  const tokensBySku = new Map();
  let namedCount = 0;
  for (const sku of allSkus) {
    const n = pickName(rowBySku.get(sku));
    allNames.set(sku, n);
    tokensBySku.set(sku, tokenizeQuery(n));
    if (n) namedCount++;
  }

  // Only needed (and only built) when candidates may come from either report.
  const everySku = args.requireCrossGroup ? null : new Set(allSkus);
  const poolFor = (skuA) => {
    if (!args.requireCrossGroup) return everySku;
    return abSkus.has(skuA) ? bcSkus : abSkus;
  };

  /**
   * Score every eligible candidate for `skuA`.
   *
   * A candidate is skipped when it is `skuA` itself, already in the same
   * link group, an ignored pair, unnamed, or below `minContain`.
   *
   * @param {string} skuA - SKU to find candidates for.
   * @returns {{scored: Array<{skuB: string, s: number, contain: number, nameB: string}>, sawIgnored: boolean}}
   */
  function scoreCandidates(skuA) {
    const nameA = allNames.get(skuA) || "";
    const tokensA = tokensBySku.get(skuA) || [];
    const groupA = canonicalSku(skuA);
    const scored = [];
    let sawIgnored = false;

    for (const skuB of poolFor(skuA)) {
      if (skuB === skuA) continue;
      if (canonicalSku(skuB) === groupA) continue;
      if (isIgnoredPair(skuA, skuB)) {
        // critical: ignored pairs must NOT satisfy the requirement
        sawIgnored = true;
        continue;
      }

      const nameB = allNames.get(skuB) || "";
      if (!nameB) continue;

      const contain = tokenContainmentScore(tokensA, tokensBySku.get(skuB) || []);
      if (contain < args.minContain) continue;

      scored.push({ skuB, s: similarityScore(nameA, nameB), contain, nameB });
    }
    return { scored, sawIgnored };
  }

  if (args.debug) {
    eprintln("[rank_discrepency] inputs:", {
      abPath,
      bcPath,
      metaPath: metaPath || "(none)",
      linkCount: Array.isArray(meta?.links) ? meta.links.length : 0,
      ignoreCount: Array.isArray(meta?.ignores) ? meta.ignores.length : 0,
      ignoreSetSize: ignoreSet.size,
      minDiscrep: args.minDiscrep,
      minScore: args.minScore,
      minContain: args.minContain,
      requireCrossGroup: args.requireCrossGroup,
      top: args.top,
      includeMissing: args.includeMissing,
    });
    eprintln("[rank_discrepency] extracted rows:", {
      abRows: abBuilt.rowsLen,
      bcRows: bcBuilt.rowsLen,
      abKeys: abMap.size,
      bcKeys: bcMap.size,
    });
    eprintln("[rank_discrepency] name coverage:", {
      totalSkus: allSkus.length,
      named: namedCount,
      unnamed: allSkus.length - namedCount,
    });
  }

  if (args.debugPayload) {
    const ab0 = abBuilt.rows[0];
    const bc0 = bcBuilt.rows[0];
    eprintln("[rank_discrepency] sample AB rep.name:", truncate(ab0?.representative?.name || "", 120));
    eprintln("[rank_discrepency] sample BC rep.name:", truncate(bc0?.representative?.name || "", 120));
  }

  const keys = new Set([...abMap.keys(), ...bcMap.keys()]);
  const diffs = [];
  for (const canonSku of keys) {
    const a = abMap.get(canonSku);
    const b = bcMap.get(canonSku);
    if (!args.includeMissing && (!a || !b)) continue;

    const rankAB = a ? a.rank : null;
    const rankBC = b ? b.rank : null;

    // A SKU missing from one report counts as an infinite discrepancy.
    const discrep = rankAB !== null && rankBC !== null ? Math.abs(rankAB - rankBC) : Infinity;
    if (discrep !== Infinity && discrep < args.minDiscrep) continue;

    diffs.push({ canonSku, discrep, rankAB, rankBC, sumRank: (rankAB ?? 1e9) + (rankBC ?? 1e9) });
  }

  // Largest discrepancy first; ties go to the better combined rank, then SKU.
  diffs.sort((x, y) => {
    if (y.discrep !== x.discrep) return y.discrep - x.discrep;
    if (x.sumRank !== y.sumRank) return x.sumRank - y.sumRank;
    return String(x.canonSku).localeCompare(String(y.canonSku));
  });

  if (args.debug) {
    eprintln("[rank_discrepency] diffs:", { unionKeys: keys.size, diffsAfterMin: diffs.length });
    eprintln(
      "[rank_discrepency] top discrep sample:",
      diffs.slice(0, 5).map((d) => ({
        sku: d.canonSku,
        discrep: d.discrep,
        rankAB: d.rankAB,
        rankBC: d.rankBC,
        name: truncate(allNames.get(String(d.canonSku)) || "", 80),
      })),
    );
  }

  if (args.debugBest && diffs.length) {
    const skuA = String(diffs[0].canonSku);
    const nameA = allNames.get(skuA) || "";
    const { scored } = scoreCandidates(skuA);
    scored.sort((a, b) => b.s - a.s);

    eprintln("[rank_discrepency] debug-best for first discrep:", {
      skuA,
      side: abSkus.has(skuA) ? "AB" : "BC",
      nameA: truncate(nameA, 120),
      minContain: args.minContain,
      top5: scored
        .slice(0, 5)
        .map((x) => ({ sku: x.skuB, score: x.s, contain: x.contain, name: truncate(x.nameB, 120) })),
    });
  }

  const filtered = [];
  const debugLines = [];

  for (const d of diffs) {
    const skuA = String(d.canonSku);
    const nameA = allNames.get(skuA) || "";
    if (!nameA) continue;

    const { scored, sawIgnored } = scoreCandidates(skuA);

    // First candidate with the strictly highest positive score wins.
    let winner = null;
    for (const c of scored) {
      if (c.s > (winner ? winner.s : 0)) winner = c;
    }
    const best = winner ? winner.s : 0;
    const bestSku = winner ? winner.skuB : "";
    const bestName = winner ? winner.nameB : "";
    const bestContain = winner ? winner.contain : 0;

    const pass = Boolean(bestSku) && best >= args.minScore;

    if (args.debug && debugLines.length < args.debugN) {
      let bestSide = "";
      if (bestSku) bestSide = abSkus.has(bestSku) ? "AB" : "BC";
      debugLines.push({
        sku: skuA,
        side: abSkus.has(skuA) ? "AB" : "BC",
        discrep: d.discrep,
        rankAB: d.rankAB,
        rankBC: d.rankBC,
        nameA: truncate(nameA, 52),
        best,
        bestContain,
        bestSku,
        bestSide,
        bestName: truncate(bestName, 52),
        sawIgnoredPairs: sawIgnored,
        pass,
      });
    }

    if (!pass) continue;

    filtered.push({ ...d, best, bestSku, bestName, bestContain });
    if (filtered.length >= args.top) break;
  }

  if (args.debug) {
    eprintln("[rank_discrepency] filter results:", {
      filtered: filtered.length,
      minScore: args.minScore,
      minContain: args.minContain,
      requireCrossGroup: args.requireCrossGroup,
      minDiscrep: args.minDiscrep,
    });
    eprintln("[rank_discrepency] debug sample (first N checked):");
    for (const x of debugLines) eprintln("  ", x);
  }

  for (const d of filtered) {
    if (args.dumpScores) {
      eprintln(
        "[rank_discrepency] emit",
        JSON.stringify({
          sku: d.canonSku,
          discrep: d.discrep,
          rankAB: d.rankAB,
          rankBC: d.rankBC,
          best: d.best,
          bestContain: d.bestContain,
          bestSku: d.bestSku,
          bestName: truncate(d.bestName, 120),
        }),
      );
    }
    console.log(args.base + encodeURIComponent(String(d.canonSku)));
  }
}

main();