UX Improvements

2026-04-27 15:07:43 +00:00 · 2026-02-03 09:52:27 -08:00 · 2026-02-03 09:52:27 -08:00 · b2a4afa890
commit b2a4afa890
parent 37796cbcbc
1 changed file with 394 additions and 8 deletions
--- a/tools/rank_discrepency.js
+++ b/tools/rank_discrepency.js
@ -2,12 +2,16 @@
 "use strict";

 /*
-  Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists.
+  Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists,
+  but ONLY when there exists another *different* listing (not in same linked group)
+  with a reasonably high similarity score by name.

  Usage:
-    node scripts/rank_discrepency_links.js \
+    node tools/rank_discrepency.js \
      --ab reports/common_listings_ab_top1000.json \
      --bc reports/common_listings_bc_top1000.json \
+      --meta viz/data/sku_meta.json \
+      --min-score 0.75 \
      --top 50 \
      --base "http://127.0.0.1:8080/#/link/?left="

@ -18,6 +22,8 @@
 const fs = require("fs");
 const path = require("path");

+/* ---------------- IO ---------------- */
+
+// Read the file at path `p` as UTF-8 text and return the parsed JSON value.
+// Errors from fs.readFileSync (e.g. missing file) or JSON.parse (invalid JSON)
+// are not caught here and propagate to the caller.
 function readJson(p) {
  return JSON.parse(fs.readFileSync(p, "utf8"));
 }
@ -26,17 +32,21 @@ function parseArgs(argv) {
  const out = {
    ab: "reports/common_listings_ab_top1000.json",
    bc: "reports/common_listings_bc_top1000.json",
+    meta: "", // optional sku_meta containing {links:[{fromSku,toSku}], ignores:...}
    top: 50,
    minDiscrep: 1,
    includeMissing: false,
    base: "http://127.0.0.1:8080/#/link/?left=",
+    minScore: 0.75, // similarity threshold for "reasonably high"
  };
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--ab" && argv[i + 1]) out.ab = argv[++i];
    else if (a === "--bc" && argv[i + 1]) out.bc = argv[++i];
+    else if (a === "--meta" && argv[i + 1]) out.meta = argv[++i];
    else if (a === "--top" && argv[i + 1]) out.top = Number(argv[++i]) || out.top;
    else if (a === "--min" && argv[i + 1]) out.minDiscrep = Number(argv[++i]) || out.minDiscrep;
+    else if (a === "--min-score" && argv[i + 1]) out.minScore = Number(argv[++i]) || out.minScore;
    else if (a === "--include-missing") out.includeMissing = true;
    else if (a === "--base" && argv[i + 1]) out.base = String(argv[++i] || out.base);
  }
@ -55,19 +65,375 @@ function buildRankMap(payload) {
  return map;
 }

+/**
+ * Pick a display name from a listing row.
+ *
+ * @param {object|null|undefined} row - Row that may carry `name`, `title`,
+ *   `productName` or `displayName`.
+ * @returns {string} The first truthy field in that order, stringified; ""
+ *   when `row` is falsy or none of the fields is truthy.
+ */
+function pickName(row) {
+  if (!row) return "";
+  const candidates = [row.name, row.title, row.productName, row.displayName];
+  const chosen = candidates.find(Boolean);
+  return String(chosen || "");
+}
+
+/* ---------------- sku_meta grouping (optional) ---------------- */
+
+/**
+ * Normalize a SKU key: trim it and rewrite implicit ids of the form
+ * "id:<1-6 digits>" (prefix matched case-insensitively) to just the digits,
+ * left-padded with zeros to 6 characters. Any other key is returned trimmed
+ * and otherwise unchanged.
+ *
+ * @param {*} k - SKU key; falsy values become "".
+ * @returns {string} Normalized key.
+ */
+function normalizeImplicitSkuKey(k) {
+  const key = String(k || "").trim();
+  const implicit = /^id:(\d{1,6})$/i.exec(key);
+  return implicit ? implicit[1].padStart(6, "0") : key;
+}
+
+/**
+ * Disjoint-set (union-find) over string keys, using union by rank and path
+ * compression. Keys are stringified and trimmed; empty keys are ignored.
+ */
+class DSU {
+  constructor() {
+    this.parent = new Map();
+    this.rank = new Map();
+  }
+
+  // Register `x` as its own singleton set if it has not been seen yet.
+  _add(x) {
+    if (this.parent.has(x)) return;
+    this.parent.set(x, x);
+    this.rank.set(x, 0);
+  }
+
+  // Return the root of the set containing `x` ("" for an empty key) and
+  // re-point every node on the walked path directly at that root.
+  find(x) {
+    const key = String(x || "").trim();
+    if (!key) return "";
+    this._add(key);
+
+    let root = key;
+    while (this.parent.get(root) !== root) root = this.parent.get(root);
+
+    let node = key;
+    while (node !== root) {
+      const next = this.parent.get(node);
+      this.parent.set(node, root);
+      node = next;
+    }
+    return root;
+  }
+
+  // Merge the sets containing `a` and `b`; no-op for empty or equal keys.
+  union(a, b) {
+    const left = String(a || "").trim();
+    const right = String(b || "").trim();
+    if (!left || !right || left === right) return;
+
+    const rootLeft = this.find(left);
+    const rootRight = this.find(right);
+    if (!rootLeft || !rootRight || rootLeft === rootRight) return;
+
+    const rankLeft = this.rank.get(rootLeft) || 0;
+    const rankRight = this.rank.get(rootRight) || 0;
+
+    if (rankLeft < rankRight) {
+      this.parent.set(rootLeft, rootRight);
+      return;
+    }
+    // Equal or higher rank: the left root absorbs the right one.
+    this.parent.set(rootRight, rootLeft);
+    if (rankLeft === rankRight) this.rank.set(rootLeft, rankLeft + 1);
+  }
+}
+
+/**
+ * Comparator used to pick a stable representative SKU for a linked group
+ * (good enough for filtering "same-linked" listings).
+ *
+ * Ordering rules, applied to the trimmed string forms:
+ *   1. equal keys compare as 0;
+ *   2. keys starting with "u:" sort after all other keys;
+ *   3. two all-digit keys compare numerically when their values differ;
+ *   4. otherwise plain string comparison.
+ *
+ * @param {*} a - First SKU.
+ * @param {*} b - Second SKU.
+ * @returns {number} -1, 0 or 1.
+ */
+function compareSku(a, b) {
+  const left = String(a || "").trim();
+  const right = String(b || "").trim();
+  if (left === right) return 0;
+
+  const leftUnknown = left.startsWith("u:");
+  if (leftUnknown !== right.startsWith("u:")) return leftUnknown ? 1 : -1;
+
+  const digitsOnly = /^\d+$/;
+  if (digitsOnly.test(left) && digitsOnly.test(right)) {
+    // A non-finite difference means at least one value overflowed; fall back to text order.
+    const delta = Number(left) - Number(right);
+    if (Number.isFinite(delta) && delta !== 0) return delta < 0 ? -1 : 1;
+  }
+  return left < right ? -1 : 1;
+}
+
+/**
+ * Build a function that maps a SKU to the representative of its linked group.
+ *
+ * Groups are the connected components of `meta.links` (pairs of
+ * `fromSku` / `toSku`, each normalized with normalizeImplicitSkuKey). The
+ * representative of a group is its first member under compareSku. SKUs that
+ * appear in no link map to their own normalized key.
+ *
+ * @param {object|null|undefined} meta - Parsed sku_meta; only `links` is read.
+ * @returns {function(*): string} Canonicalizer for a single SKU.
+ */
+function buildCanonicalSkuFnFromMeta(meta) {
+  const links = Array.isArray(meta?.links) ? meta.links : [];
+  if (links.length === 0) return (sku) => normalizeImplicitSkuKey(sku);
+
+  const dsu = new DSU();
+  const linked = new Set();
+
+  for (const link of links) {
+    const from = normalizeImplicitSkuKey(link?.fromSku);
+    const to = normalizeImplicitSkuKey(link?.toSku);
+    if (!from || !to || from === to) continue;
+    linked.add(from).add(to);
+    dsu.union(from, to);
+  }
+
+  // root -> members, in first-seen order
+  const membersByRoot = new Map();
+  for (const sku of linked) {
+    const root = dsu.find(sku);
+    if (!root) continue;
+    if (!membersByRoot.has(root)) membersByRoot.set(root, []);
+    membersByRoot.get(root).push(sku);
+  }
+
+  // sku -> representative of its group
+  const canonBySku = new Map();
+  for (const [root, members] of membersByRoot) {
+    const representative = [...members].sort(compareSku)[0] || root;
+    for (const sku of members) canonBySku.set(sku, representative);
+    canonBySku.set(representative, representative);
+  }
+
+  return (sku) => {
+    const key = normalizeImplicitSkuKey(sku);
+    return canonBySku.get(key) || key;
+  };
+}
+
+/* ---------------- similarity (copied from viz/app) ---------------- */
+
+/**
+ * Normalize text for search/similarity: lowercase it, turn every run of
+ * characters outside [a-z0-9] into a single space, collapse whitespace and
+ * trim.
+ *
+ * @param {*} s - Input; null/undefined become "".
+ * @returns {string} Normalized text made of [a-z0-9] words separated by single spaces.
+ */
+function normSearchText(s) {
+  const lowered = String(s ?? "").toLowerCase();
+  const spaced = lowered.replace(/[^a-z0-9]+/g, " ");
+  return spaced.replace(/\s+/g, " ").trim();
+}
+
+/**
+ * Split a query into normalized tokens.
+ *
+ * @param {*} q - Raw query text; normalized with normSearchText first.
+ * @returns {string[]} Non-empty tokens, or [] when nothing is left after normalization.
+ */
+function tokenizeQuery(q) {
+  const normalized = normSearchText(q);
+  if (!normalized) return [];
+  return normalized.split(" ").filter((tok) => tok.length > 0);
+}
+
+// Tokens that filterSimTokens() drops before names are compared.
+const SIM_STOP_TOKENS = new Set([
+  // articles / prepositions / conjunctions
+  "the","a","an","and","of","to","in","for","with",
+  // age wording
+  "year","years","yr","yrs","old",
+  // generic whisky descriptors
+  "whisky","whiskey","scotch","single","malt","cask","finish","edition","release","batch","strength","abv","proof",
+  "anniversary",
+]);
+
+// Matches ordinals such as "1st", "12th"; group 1 is the digits.
+const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;
+
+/**
+ * Return the numeric key of a token.
+ *
+ * @param {*} t - Token; it is stringified, trimmed and lowercased first.
+ * @returns {string} The token itself when it is all digits, the digit part
+ *   when it is an ordinal ("12th" -> "12"), otherwise "".
+ */
+function numKey(t) {
+  const tok = String(t || "").trim().toLowerCase();
+  if (tok === "") return "";
+  if (/^\d+$/.test(tok)) return tok;
+  const ordinal = ORDINAL_RE.exec(tok);
+  return ordinal === null ? "" : ordinal[1];
+}
+
+/**
+ * Extract an age statement (1-2 digits) from a name.
+ *
+ * Recognized forms, tried in this order: "<n> yr|yrs|year|years" (optionally
+ * preceded by "aged"), then "<n> yo". Leading zeros are removed.
+ *
+ * @param {*} normName - Name text; falsy values are treated as "".
+ * @returns {string} The age as a decimal string, or "" when none is found.
+ */
+function extractAgeFromText(normName) {
+  const text = String(normName || "");
+  if (!text) return "";
+
+  const agePatterns = [
+    /\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i,
+    /\b(\d{1,2})\s*yo\b/i,
+  ];
+
+  for (const pattern of agePatterns) {
+    const hit = text.match(pattern);
+    if (hit && hit[1]) return String(parseInt(hit[1], 10));
+  }
+
+  return "";
+}
+
+/**
+ * Reduce a token list to the distinct tokens used for similarity scoring.
+ *
+ * Per token, in order: trim/lowercase; drop tokens with no [a-z0-9]; drop
+ * inline volumes ("750ml") and inline percentages ("43%"); map spelling
+ * variants through SIM_EQUIV; collapse ordinals to their digits via numKey();
+ * drop volume units, "abv" and "proof"; drop a number together with a
+ * following volume unit ("750 ml"); drop non-numeric tokens found in
+ * SIM_STOP_TOKENS; drop duplicates.
+ *
+ * @param {string[]} tokens - Raw tokens; a non-array is treated as empty.
+ * @returns {string[]} Kept tokens in first-seen order, without duplicates.
+ */
+function filterSimTokens(tokens) {
+  const out = [];
+  const seen = new Set();
+
+  // Spelling variants folded to one canonical token before the checks below.
+  const SIM_EQUIV = new Map([
+    ["years", "yr"],
+    ["year", "yr"],
+    ["yrs", "yr"],
+    ["yr", "yr"],
+    ["whiskey", "whisky"],
+    ["whisky", "whisky"],
+    ["bourbon", "bourbon"],
+  ]);
+
+  const VOL_UNIT = new Set(["ml","l","cl","oz","liter","liters","litre","litres"]);
+  // Number fused with its unit, e.g. "750ml" / "43%".
+  const VOL_INLINE_RE = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i;
+  const PCT_INLINE_RE = /^\d+(?:\.\d+)?%$/;
+
+  const arr = Array.isArray(tokens) ? tokens : [];
+
+  for (let i = 0; i < arr.length; i++) {
+    const raw = arr[i];
+    let t = String(raw || "").trim().toLowerCase();
+    if (!t) continue;
+
+    // Tokens without any letter or digit carry no signal.
+    if (!/[a-z0-9]/i.test(t)) continue;
+
+    if (VOL_INLINE_RE.test(t)) continue;
+    if (PCT_INLINE_RE.test(t)) continue;
+
+    t = SIM_EQUIV.get(t) || t;
+
+    // "12th" -> "12" so ordinals and plain numbers compare equal.
+    const nk = numKey(t);
+    if (nk) t = nk;
+
+    if (VOL_UNIT.has(t) || t === "abv" || t === "proof") continue;
+
+    // A bare number followed by a volume unit ("750 ml"): skip both tokens.
+    if (/^\d+(?:\.\d+)?$/.test(t)) {
+      const next = String(arr[i + 1] || "").trim().toLowerCase();
+      const nextNorm = SIM_EQUIV.get(next) || next;
+      if (VOL_UNIT.has(nextNorm)) {
+        i++;
+        continue;
+      }
+    }
+
+    // Numeric tokens are never treated as stop words.
+    if (!numKey(t) && SIM_STOP_TOKENS.has(t)) continue;
+
+    if (seen.has(t)) continue;
+    seen.add(t);
+    out.push(t);
+  }
+
+  return out;
+}
+
+/**
+ * Score how well the smaller filtered token set is covered by the larger one.
+ *
+ * Both inputs are passed through filterSimTokens. With `hit` = number of
+ * shared tokens, the result is the F1 combination of hit / |smaller set| and
+ * hit / |larger set|.
+ *
+ * @param {string[]} aTokens - Tokens of the first name (unfiltered is fine).
+ * @param {string[]} bTokens - Tokens of the second name.
+ * @returns {number} Value in [0, 1]; 0 when either side has no tokens left.
+ */
+function tokenContainmentScore(aTokens, bTokens) {
+  const left = new Set(filterSimTokens(aTokens || []));
+  const right = new Set(filterSimTokens(bTokens || []));
+  if (left.size === 0 || right.size === 0) return 0;
+
+  const [small, big] = left.size <= right.size ? [left, right] : [right, left];
+
+  let hit = 0;
+  for (const tok of small) {
+    if (big.has(tok)) hit += 1;
+  }
+
+  const recall = hit / Math.max(1, small.size);
+  const precision = hit / Math.max(1, big.size);
+
+  return (2 * precision * recall) / Math.max(1e-9, precision + recall);
+}
+
+/**
+ * Levenshtein edit distance between two strings, compared per UTF-16 code
+ * unit, computed with a single rolling row.
+ *
+ * @param {*} a - First string; falsy values are treated as "".
+ * @param {*} b - Second string; falsy values are treated as "".
+ * @returns {number} Minimum number of single-unit insertions, deletions and
+ *   substitutions turning `a` into `b`.
+ */
+function levenshtein(a, b) {
+  const src = String(a || "");
+  const dst = String(b || "");
+  if (src.length === 0) return dst.length;
+  if (dst.length === 0) return src.length;
+
+  // row[j] = distance between the current prefix of src and dst.slice(0, j)
+  const row = Array.from({ length: dst.length + 1 }, (_, j) => j);
+
+  for (let i = 1; i <= src.length; i++) {
+    let diagonal = row[0];
+    row[0] = i;
+    const code = src.charCodeAt(i - 1);
+    for (let j = 1; j <= dst.length; j++) {
+      const above = row[j];
+      const substitution = diagonal + (code === dst.charCodeAt(j - 1) ? 0 : 1);
+      row[j] = Math.min(above + 1, row[j - 1] + 1, substitution);
+      diagonal = above;
+    }
+  }
+  return row[dst.length];
+}
+
+/**
+ * Multiplier that penalizes names whose numbers disagree.
+ *
+ * @param {string[]} aTokens - Tokens of the first name.
+ * @param {string[]} bTokens - Tokens of the second name.
+ * @returns {number} 1.0 when either side has no numeric token (per numKey)
+ *   or when the sides share at least one; 0.28 when both have numbers but
+ *   none in common.
+ */
+function numberMismatchPenalty(aTokens, bTokens) {
+  const numbersOf = (tokens) => new Set((tokens || []).map(numKey).filter(Boolean));
+
+  const aNums = numbersOf(aTokens);
+  const bNums = numbersOf(bTokens);
+  if (aNums.size === 0 || bNums.size === 0) return 1.0;
+
+  const sharesNumber = [...aNums].some((n) => bNums.has(n));
+  return sharesNumber ? 1.0 : 0.28;
+}
+
+// Same structure/weights as viz/app/linker/similarity.js
+/**
+ * Heuristic name-similarity score for two listing names.
+ *
+ * Combines: whether the first filtered tokens are equal, the overlap of the
+ * remaining tokens, a Levenshtein similarity of the normalized strings, a
+ * penalty when both names contain numbers but share none, and a boost or
+ * penalty when both names carry an age statement that matches or differs.
+ *
+ * The result is NOT normalized to [0, 1]: a first-token match alone
+ * contributes 3.0 before the multipliers are applied, so scores well above 1
+ * are normal.
+ *
+ * @param {*} aName - First name; normalized with normSearchText.
+ * @param {*} bName - Second name; normalized with normSearchText.
+ * @returns {number} Non-negative score; 0 when either name normalizes to ""
+ *   or has no tokens left after filterSimTokens.
+ */
+function similarityScore(aName, bName) {
+  const a = normSearchText(aName);
+  const b = normSearchText(bName);
+  if (!a || !b) return 0;
+
+  // Age only influences the score when both names state one.
+  const aAge = extractAgeFromText(a);
+  const bAge = extractAgeFromText(b);
+  const ageBoth = !!(aAge && bAge);
+  const ageMatch = ageBoth && aAge === bAge;
+  const ageMismatch = ageBoth && aAge !== bAge;
+
+  const aToksRaw = tokenizeQuery(a);
+  const bToksRaw = tokenizeQuery(b);
+
+  const aToks = filterSimTokens(aToksRaw);
+  const bToks = filterSimTokens(bToksRaw);
+  if (!aToks.length || !bToks.length) return 0;
+
+  // tokenContainmentScore filters the raw tokens itself.
+  const contain = tokenContainmentScore(aToksRaw, bToksRaw);
+
+  const aFirst = aToks[0] || "";
+  const bFirst = bToks[0] || "";
+  const firstMatch = aFirst && bFirst && aFirst === bFirst ? 1 : 0;
+
+  // Overlap of everything after the first token, relative to the larger tail.
+  const A = new Set(aToks.slice(1));
+  const B = new Set(bToks.slice(1));
+  let inter = 0;
+  for (const w of A) if (B.has(w)) inter++;
+  const denom = Math.max(1, Math.max(A.size, B.size));
+  const overlapTail = inter / denom;
+
+  // Edit distance scaled by the longer normalized string: 1 means identical.
+  const d = levenshtein(a, b);
+  const maxLen = Math.max(1, Math.max(a.length, b.length));
+  const levSim = 1 - d / maxLen;
+
+  // Without a first-token match the tail-overlap weight grows with
+  // containment but is capped at 0.80.
+  let gate = firstMatch ? 1.0 : Math.min(0.80, 0.06 + 0.95 * contain);
+
+  // Short names (<= 3 filtered tokens on the shorter side) with weak
+  // containment and no first-token match are damped further.
+  const smallN = Math.min(aToks.length, bToks.length);
+  if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;
+
+  const numGate = numberMismatchPenalty(aToks, bToks);
+
+  let s =
+    numGate *
+    (firstMatch * 3.0 +
+      overlapTail * 2.2 * gate +
+      levSim * (firstMatch ? 1.0 : (0.10 + 0.70 * contain)));
+
+  // Matching ages boost the score; conflicting ages strongly damp it.
+  if (ageMatch) s *= 2.2;
+  else if (ageMismatch) s *= 0.18;
+
+  s *= 1 + 0.9 * contain;
+
+  return s;
+}
+
+/* ---------------- main logic ---------------- */
+
 function main() {
  const args = parseArgs(process.argv.slice(2));
  const repoRoot = process.cwd();

  const abPath = path.isAbsolute(args.ab) ? args.ab : path.join(repoRoot, args.ab);
  const bcPath = path.isAbsolute(args.bc) ? args.bc : path.join(repoRoot, args.bc);
+  const metaPath = args.meta
+    ? (path.isAbsolute(args.meta) ? args.meta : path.join(repoRoot, args.meta))
+    : "";

  const ab = readJson(abPath);
  const bc = readJson(bcPath);

+  const canonicalSku = metaPath
+    ? buildCanonicalSkuFnFromMeta(readJson(metaPath))
+    : (sku) => normalizeImplicitSkuKey(sku);
+
  const abMap = buildRankMap(ab);
  const bcMap = buildRankMap(bc);

+  // Build a flat pool of candidates from AB+BC (unique by canonSku)
+  const rowBySku = new Map();
+  for (const m of [abMap, bcMap]) {
+    for (const [canonSku, v] of m.entries()) {
+      if (!rowBySku.has(canonSku)) rowBySku.set(canonSku, v.row);
+    }
+  }
+
+  const allSkus = Array.from(rowBySku.keys());
+  const allNames = new Map();
+  for (const sku of allSkus) allNames.set(sku, pickName(rowBySku.get(sku)));
+
  const keys = new Set([...abMap.keys(), ...bcMap.keys()]);
  const diffs = [];

@ -88,7 +454,6 @@ function main() {
    diffs.push({
      canonSku,
      discrep,
-      // tie-breakers
      sumRank: (rankAB ?? 1e9) + (rankBC ?? 1e9),
    });
  }
@ -99,13 +464,34 @@ function main() {
    return String(x.canonSku).localeCompare(String(y.canonSku));
  });

-  const top = diffs.slice(0, args.top);
+  // Keep only discrepancies that have a high-scoring "other" candidate not in same linked group
+  const filtered = [];
+  for (const d of diffs) {
+    const skuA = String(d.canonSku);
+    const nameA = allNames.get(skuA) || pickName(abMap.get(skuA)?.row) || pickName(bcMap.get(skuA)?.row);
+    if (!nameA) continue;

-  for (const d of top) {
-    // examples:
-    // 884096 -> left=884096
-    // id:1049355 -> left=id%3A1049355
-    // u:bb504a62 -> left=u%3Abb504a62
+    const groupA = canonicalSku(skuA);
+
+    let best = 0;
+    for (const skuB of allSkus) {
+      if (skuB === skuA) continue;
+
+      // not same-linked group
+      if (canonicalSku(skuB) === groupA) continue;
+
+      const nameB = allNames.get(skuB) || "";
+      if (!nameB) continue;
+
+      const s = similarityScore(nameA, nameB);
+      if (s > best) best = s;
+    }
+
+    if (best >= args.minScore) filtered.push(d);
+    if (filtered.length >= args.top) break;
+  }
+
+  for (const d of filtered) {
    console.log(args.base + encodeURIComponent(d.canonSku));
  }
 }