diff --git a/tools/rank_discrepency.js b/tools/rank_discrepency.js
index cd925cb..239b87d 100644
--- a/tools/rank_discrepency.js
+++ b/tools/rank_discrepency.js
@@ -2,12 +2,16 @@
 "use strict";
 
 /*
-  Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists.
+  Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists,
+  but ONLY when there exists another *different* listing (not in same linked group)
+  with a reasonably high similarity score by name.
 
   Usage:
-    node scripts/rank_discrepency_links.js \
+    node tools/rank_discrepency.js \
       --ab reports/common_listings_ab_top1000.json \
       --bc reports/common_listings_bc_top1000.json \
+      --meta viz/data/sku_meta.json \
+      --min-score 0.75 \
       --top 50 \
       --base "http://127.0.0.1:8080/#/link/?left="
 
@@ -18,6 +22,8 @@
 const fs = require("fs");
 const path = require("path");
 
+/* ---------------- IO ---------------- */
+
 function readJson(p) {
   return JSON.parse(fs.readFileSync(p, "utf8"));
 }
@@ -26,17 +32,25 @@ function parseArgs(argv) {
   const out = {
     ab: "reports/common_listings_ab_top1000.json",
     bc: "reports/common_listings_bc_top1000.json",
+    meta: "", // optional sku_meta containing {links:[{fromSku,toSku}], ignores:...}
     top: 50,
     minDiscrep: 1,
     includeMissing: false,
     base: "http://127.0.0.1:8080/#/link/?left=",
+    minScore: 0.75, // minimum similarityScore(); that score is a raw value, not capped at 1
   };
   for (let i = 0; i < argv.length; i++) {
     const a = argv[i];
     if (a === "--ab" && argv[i + 1]) out.ab = argv[++i];
     else if (a === "--bc" && argv[i + 1]) out.bc = argv[++i];
+    else if (a === "--meta" && argv[i + 1]) out.meta = argv[++i];
     else if (a === "--top" && argv[i + 1]) out.top = Number(argv[++i]) || out.top;
     else if (a === "--min" && argv[i + 1]) out.minDiscrep = Number(argv[++i]) || out.minDiscrep;
+    else if (a === "--min-score" && argv[i + 1]) {
+      // `Number(x) || default` would silently replace an explicit 0 with the default.
+      const v = Number(argv[++i]);
+      if (Number.isFinite(v)) out.minScore = v;
+    }
     else if (a === "--include-missing") out.includeMissing = true;
     else if (a === "--base" && argv[i + 1]) out.base = String(argv[++i] || out.base);
   }
@@ -55,19 +69,394 @@ function buildRankMap(payload) {
   return map;
 }
 
+// First truthy of row.name / title / productName / displayName as a string; "" if none or row is missing.
+function pickName(row) {
+  if (!row) return "";
+  return String(row.name || row.title || row.productName || row.displayName || "");
+}
+
+/* ---------------- sku_meta grouping (optional) ---------------- */
+
+// Trim a SKU key; "id:" followed by 1-6 digits (case-insensitive) becomes those digits zero-padded to 6.
+// Any other key is returned trimmed but otherwise unchanged.
+function normalizeImplicitSkuKey(k) {
+  const s = String(k || "").trim();
+  const m = s.match(/^id:(\d{1,6})$/i);
+  if (m) return String(m[1]).padStart(6, "0");
+  return s;
+}
+
+// Union-find over SKU strings with path compression and union by rank.
+class DSU {
+  constructor() {
+    this.parent = new Map();
+    this.rank = new Map();
+  }
+  _add(x) {
+    if (!this.parent.has(x)) {
+      this.parent.set(x, x);
+      this.rank.set(x, 0);
+    }
+  }
+  // Root of x's set ("" for blank input); registers x as its own set if unseen.
+  find(x) {
+    x = String(x || "").trim();
+    if (!x) return "";
+    this._add(x);
+    let p = this.parent.get(x);
+    if (p !== x) {
+      p = this.find(p);
+      this.parent.set(x, p);
+    }
+    return p;
+  }
+  // Merge the sets containing a and b; blank or identical keys are ignored.
+  union(a, b) {
+    a = String(a || "").trim();
+    b = String(b || "").trim();
+    if (!a || !b || a === b) return;
+    const ra = this.find(a);
+    const rb = this.find(b);
+    if (!ra || !rb || ra === rb) return;
+
+    const rka = this.rank.get(ra) || 0;
+    const rkb = this.rank.get(rb) || 0;
+
+    if (rka < rkb) this.parent.set(ra, rb);
+    else if (rkb < rka) this.parent.set(rb, ra);
+    else {
+      this.parent.set(rb, ra);
+      this.rank.set(ra, rka + 1);
+    }
+  }
+}
+
+// Choose a stable representative (good enough for filtering “same-linked”)
+// Order: keys not starting with "u:" sort first; two all-digit keys compare numerically; otherwise plain string order.
+function compareSku(a, b) {
+  a = String(a || "").trim();
+  b = String(b || "").trim();
+  if (a === b) return 0;
+
+  const aUnknown = a.startsWith("u:");
+  const bUnknown = b.startsWith("u:");
+  if (aUnknown !== bUnknown) return aUnknown ? 1 : -1;
+
+  const aNum = /^\d+$/.test(a);
+  const bNum = /^\d+$/.test(b);
+  if (aNum && bNum) {
+    const na = Number(a), nb = Number(b);
+    if (Number.isFinite(na) && Number.isFinite(nb) && na !== nb) return na < nb ? -1 : 1;
+  }
+  return a < b ? -1 : 1;
+}
+
+// Build a (sku) => group-representative function from meta.links ({fromSku, toSku} pairs).
+// Linked SKUs share one representative (first by compareSku); unlinked SKUs map to their normalized key.
+function buildCanonicalSkuFnFromMeta(meta) {
+  const links = Array.isArray(meta?.links) ? meta.links : [];
+  if (!links.length) return (sku) => normalizeImplicitSkuKey(sku);
+
+  const dsu = new DSU();
+  const all = new Set();
+
+  for (const x of links) {
+    const a = normalizeImplicitSkuKey(x?.fromSku);
+    const b = normalizeImplicitSkuKey(x?.toSku);
+    if (!a || !b || a === b) continue;
+    all.add(a);
+    all.add(b);
+    dsu.union(a, b);
+  }
+
+  // root -> members
+  const groupsByRoot = new Map();
+  for (const s of all) {
+    const r = dsu.find(s);
+    if (!r) continue;
+    let set = groupsByRoot.get(r);
+    if (!set) groupsByRoot.set(r, (set = new Set()));
+    set.add(s);
+  }
+
+  // root -> representative
+  const repByRoot = new Map();
+  for (const [root, members] of groupsByRoot.entries()) {
+    const arr = Array.from(members);
+    arr.sort(compareSku);
+    repByRoot.set(root, arr[0] || root);
+  }
+
+  // sku -> rep
+  const canonBySku = new Map();
+  for (const [root, members] of groupsByRoot.entries()) {
+    const rep = repByRoot.get(root) || root;
+    for (const s of members) canonBySku.set(s, rep);
+    canonBySku.set(rep, rep);
+  }
+
+  return (sku) => {
+    const s = normalizeImplicitSkuKey(sku);
+    return canonBySku.get(s) || s;
+  };
+}
+
+/* ---------------- similarity (copied from viz/app) ---------------- */
+
+// Normalize for search: lowercase, punctuation -> space, collapse spaces
+function normSearchText(s) {
+  return String(s ?? "")
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+}
+
+// Split normalized text into tokens ([] when the normalized text is empty).
+function tokenizeQuery(q) {
+  const n = normSearchText(q);
+  return n ? n.split(" ").filter(Boolean) : [];
+}
+
+// Tokens that filterSimTokens drops before names are compared.
+const SIM_STOP_TOKENS = new Set([
+  "the","a","an","and","of","to","in","for","with",
+  "year","years","yr","yrs","old",
+  "whisky","whiskey","scotch","single","malt","cask","finish","edition","release","batch","strength","abv","proof",
+  "anniversary",
+]);
+
+const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;
+
+// Digits of an all-digit or ordinal token ("12", "12th" -> "12"); "" for anything else.
+function numKey(t) {
+  const s = String(t || "").trim().toLowerCase();
+  if (!s) return "";
+  if (/^\d+$/.test(s)) return s;
+  const m = s.match(ORDINAL_RE);
+  return m ? m[1] : "";
+}
+
+// Age statement ("12 yr", "aged 12 years", "12 yo") as a decimal string without leading zeros; "" if none.
+function extractAgeFromText(normName) {
+  const s = String(normName || "");
+  if (!s) return "";
+
+  const m = s.match(/\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i);
+  if (m && m[1]) return String(parseInt(m[1], 10));
+
+  const m2 = s.match(/\b(\d{1,2})\s*yo\b/i);
+  if (m2 && m2[1]) return String(parseInt(m2[1], 10));
+
+  return "";
+}
+
+// Clean tokens for comparison: drop volumes, percentages, units and stop tokens, map synonyms,
+// reduce ordinals to their digits, and de-duplicate while keeping first-seen order.
+function filterSimTokens(tokens) {
+  const out = [];
+  const seen = new Set();
+
+  const SIM_EQUIV = new Map([
+    ["years", "yr"],
+    ["year", "yr"],
+    ["yrs", "yr"],
+    ["yr", "yr"],
+    ["whiskey", "whisky"],
+    ["whisky", "whisky"],
+    ["bourbon", "bourbon"],
+  ]);
+
+  const VOL_UNIT = new Set(["ml","l","cl","oz","liter","liters","litre","litres"]);
+  const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i;
+  const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/;
+
+  const arr = Array.isArray(tokens) ? tokens : [];
+
+  for (let i = 0; i < arr.length; i++) {
+    const raw = arr[i];
+    let t = String(raw || "").trim().toLowerCase();
+    if (!t) continue;
+
+    if (!/[a-z0-9]/i.test(t)) continue;
+
+    if (VOL_INLINE_RE.test(t)) continue;
+    if (PCT_INLINE_RE.test(t)) continue;
+
+    t = SIM_EQUIV.get(t) || t;
+
+    const nk = numKey(t);
+    if (nk) t = nk;
+
+    if (VOL_UNIT.has(t) || t === "abv" || t === "proof") continue;
+
+    if (/^\d+(?:\.\d+)?$/.test(t)) {
+      const next = String(arr[i + 1] || "").trim().toLowerCase();
+      const nextNorm = SIM_EQUIV.get(next) || next;
+      if (VOL_UNIT.has(nextNorm)) {
+        i++;
+        continue;
+      }
+    }
+
+    if (!numKey(t) && SIM_STOP_TOKENS.has(t)) continue;
+
+    if (seen.has(t)) continue;
+    seen.add(t);
+    out.push(t);
+  }
+
+  return out;
+}
+
+// F1 of the filtered-token overlap (hits counted from the smaller set); 0 if either side filters to nothing.
+function tokenContainmentScore(aTokens, bTokens) {
+  const A = filterSimTokens(aTokens || []);
+  const B = filterSimTokens(bTokens || []);
+  if (!A.length || !B.length) return 0;
+
+  const aSet = new Set(A);
+  const bSet = new Set(B);
+
+  const small = aSet.size <= bSet.size ? aSet : bSet;
+  const big = aSet.size <= bSet.size ? bSet : aSet;
+
+  let hit = 0;
+  for (const t of small) if (big.has(t)) hit++;
+
+  const recall = hit / Math.max(1, small.size);
+  const precision = hit / Math.max(1, big.size);
+  const f1 = (2 * precision * recall) / Math.max(1e-9, precision + recall);
+
+  return f1;
+}
+
+// Edit distance between two strings, computed with a single reusable row.
+function levenshtein(a, b) {
+  a = String(a || "");
+  b = String(b || "");
+  const n = a.length, m = b.length;
+  if (!n) return m;
+  if (!m) return n;
+
+  const dp = new Array(m + 1);
+  for (let j = 0; j <= m; j++) dp[j] = j;
+
+  for (let i = 1; i <= n; i++) {
+    let prev = dp[0];
+    dp[0] = i;
+    const ca = a.charCodeAt(i - 1);
+    for (let j = 1; j <= m; j++) {
+      const tmp = dp[j];
+      const cost = ca === b.charCodeAt(j - 1) ? 0 : 1;
+      dp[j] = Math.min(dp[j] + 1, dp[j - 1] + 1, prev + cost);
+      prev = tmp;
+    }
+  }
+  return dp[m];
+}
+
+// 0.28 when both sides contain numeric tokens but share none; 1.0 otherwise.
+function numberMismatchPenalty(aTokens, bTokens) {
+  const aNums = new Set((aTokens || []).map(numKey).filter(Boolean));
+  const bNums = new Set((bTokens || []).map(numKey).filter(Boolean));
+  if (!aNums.size || !bNums.size) return 1.0;
+  for (const n of aNums) if (bNums.has(n)) return 1.0;
+  return 0.28;
+}
+
+// Same structure/weights as viz/app/linker/similarity.js
+// Returns a non-negative raw score (it can exceed 1); 0 when either name has no usable tokens.
+function similarityScore(aName, bName) {
+  const a = normSearchText(aName);
+  const b = normSearchText(bName);
+  if (!a || !b) return 0;
+
+  const aAge = extractAgeFromText(a);
+  const bAge = extractAgeFromText(b);
+  const ageBoth = !!(aAge && bAge);
+  const ageMatch = ageBoth && aAge === bAge;
+  const ageMismatch = ageBoth && aAge !== bAge;
+
+  const aToksRaw = tokenizeQuery(a);
+  const bToksRaw = tokenizeQuery(b);
+
+  const aToks = filterSimTokens(aToksRaw);
+  const bToks = filterSimTokens(bToksRaw);
+  if (!aToks.length || !bToks.length) return 0;
+
+  const contain = tokenContainmentScore(aToksRaw, bToksRaw);
+
+  const aFirst = aToks[0] || "";
+  const bFirst = bToks[0] || "";
+  const firstMatch = aFirst && bFirst && aFirst === bFirst ? 1 : 0;
+
+  const A = new Set(aToks.slice(1));
+  const B = new Set(bToks.slice(1));
+  let inter = 0;
+  for (const w of A) if (B.has(w)) inter++;
+  const denom = Math.max(1, Math.max(A.size, B.size));
+  const overlapTail = inter / denom;
+
+  const d = levenshtein(a, b);
+  const maxLen = Math.max(1, Math.max(a.length, b.length));
+  const levSim = 1 - d / maxLen;
+
+  let gate = firstMatch ? 1.0 : Math.min(0.80, 0.06 + 0.95 * contain);
+
+  const smallN = Math.min(aToks.length, bToks.length);
+  if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;
+
+  const numGate = numberMismatchPenalty(aToks, bToks);
+
+  let s =
+    numGate *
+    (firstMatch * 3.0 +
+      overlapTail * 2.2 * gate +
+      levSim * (firstMatch ? 1.0 : (0.10 + 0.70 * contain)));
+
+  if (ageMatch) s *= 2.2;
+  else if (ageMismatch) s *= 0.18;
+
+  s *= 1 + 0.9 * contain;
+
+  return s;
+}
+
+/* ---------------- main logic ---------------- */
+
 function main() {
   const args = parseArgs(process.argv.slice(2));
   const repoRoot = process.cwd();
 
   const abPath = path.isAbsolute(args.ab) ? args.ab : path.join(repoRoot, args.ab);
   const bcPath = path.isAbsolute(args.bc) ? args.bc : path.join(repoRoot, args.bc);
+  const metaPath = args.meta
+    ? (path.isAbsolute(args.meta) ? args.meta : path.join(repoRoot, args.meta))
+    : "";
 
   const ab = readJson(abPath);
   const bc = readJson(bcPath);
 
+  const canonicalSku = metaPath
+    ? buildCanonicalSkuFnFromMeta(readJson(metaPath))
+    : (sku) => normalizeImplicitSkuKey(sku);
+
   const abMap = buildRankMap(ab);
   const bcMap = buildRankMap(bc);
 
+  // Build a flat pool of candidates from AB+BC (unique by canonSku)
+  const rowBySku = new Map();
+  for (const m of [abMap, bcMap]) {
+    for (const [canonSku, v] of m.entries()) {
+      if (!rowBySku.has(canonSku)) rowBySku.set(canonSku, v.row);
+    }
+  }
+
+  const allSkus = Array.from(rowBySku.keys());
+  const allNames = new Map();
+  for (const sku of allSkus) allNames.set(sku, pickName(rowBySku.get(sku)));
+
   const keys = new Set([...abMap.keys(), ...bcMap.keys()]);
   const diffs = [];
 
@@ -88,7 +477,6 @@ function main() {
     diffs.push({
       canonSku,
       discrep,
-      // tie-breakers
       sumRank: (rankAB ?? 1e9) + (rankBC ?? 1e9),
     });
   }
@@ -99,13 +487,34 @@ function main() {
     return String(x.canonSku).localeCompare(String(y.canonSku));
   });
 
-  const top = diffs.slice(0, args.top);
+  // Keep only discrepancies that have a high-scoring "other" candidate not in same linked group
+  const filtered = [];
+  for (const d of diffs) {
+    const skuA = String(d.canonSku);
+    const nameA = allNames.get(skuA) || pickName(abMap.get(skuA)?.row) || pickName(bcMap.get(skuA)?.row);
+    if (!nameA) continue;
 
-  for (const d of top) {
-    // examples:
-    // 884096 -> left=884096
-    // id:1049355 -> left=id%3A1049355
-    // u:bb504a62 -> left=u%3Abb504a62
+    const groupA = canonicalSku(skuA);
+
+    let best = 0;
+    for (const skuB of allSkus) {
+      if (skuB === skuA) continue;
+
+      // not same-linked group
+      if (canonicalSku(skuB) === groupA) continue;
+
+      const nameB = allNames.get(skuB) || "";
+      if (!nameB) continue;
+
+      const s = similarityScore(nameA, nameB);
+      if (s > best) best = s;
+    }
+
+    if (best >= args.minScore) filtered.push(d);
+    if (filtered.length >= args.top) break;
+  }
+
+  for (const d of filtered) {
     console.log(args.base + encodeURIComponent(d.canonSku));
   }
 }