UX Improvements

This commit is contained in:
Brennan Wilkes (Text Groove) 2026-02-03 09:52:27 -08:00
parent 37796cbcbc
commit b2a4afa890

View file

@ -2,12 +2,16 @@
"use strict";
/*
Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists.
Print local link URLs for SKUs with largest rank discrepancy between AB and BC lists,
but ONLY when there exists another *different* listing (not in same linked group)
with a reasonably high similarity score by name.
Usage:
node scripts/rank_discrepency_links.js \
--ab reports/common_listings_ab_top1000.json \
--bc reports/common_listings_bc_top1000.json \
--meta viz/data/sku_meta.json \
--min-score 0.75 \
--top 50 \
--base "http://127.0.0.1:8080/#/link/?left="
@ -18,6 +22,8 @@
const fs = require("fs");
const path = require("path");
/* ---------------- IO ---------------- */
/**
 * Read a UTF-8 text file and parse its contents as JSON.
 *
 * @param {string} p - Path of the file to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} Whatever fs.readFileSync throws when the file cannot be read.
 * @throws {SyntaxError} When the contents are not valid JSON. The message names
 *   the offending file and the original parser error is attached as `cause`.
 */
function readJson(p) {
  const text = fs.readFileSync(p, "utf8");
  try {
    return JSON.parse(text);
  } catch (err) {
    // A bare parser message ("Unexpected token ...") does not say which input
    // file is malformed, so re-throw the same error type with the path included.
    throw new SyntaxError(`Invalid JSON in ${p}: ${err.message}`, { cause: err });
  }
}
@ -26,17 +32,21 @@ function parseArgs(argv) {
const out = {
ab: "reports/common_listings_ab_top1000.json",
bc: "reports/common_listings_bc_top1000.json",
meta: "", // optional sku_meta containing {links:[{fromSku,toSku}], ignores:...}
top: 50,
minDiscrep: 1,
includeMissing: false,
base: "http://127.0.0.1:8080/#/link/?left=",
minScore: 0.75, // similarity threshold for "reasonably high"
};
for (let i = 0; i < argv.length; i++) {
const a = argv[i];
if (a === "--ab" && argv[i + 1]) out.ab = argv[++i];
else if (a === "--bc" && argv[i + 1]) out.bc = argv[++i];
else if (a === "--meta" && argv[i + 1]) out.meta = argv[++i];
else if (a === "--top" && argv[i + 1]) out.top = Number(argv[++i]) || out.top;
else if (a === "--min" && argv[i + 1]) out.minDiscrep = Number(argv[++i]) || out.minDiscrep;
else if (a === "--min-score" && argv[i + 1]) out.minScore = Number(argv[++i]) || out.minScore;
else if (a === "--include-missing") out.includeMissing = true;
else if (a === "--base" && argv[i + 1]) out.base = String(argv[++i] || out.base);
}
@ -55,19 +65,375 @@ function buildRankMap(payload) {
return map;
}
/**
 * Return a listing row's name as a string.
 *
 * The fields name, title, productName and displayName are checked in that
 * order and the first truthy one wins.
 *
 * @param {object} row - Listing row; may be null/undefined.
 * @returns {string} The chosen field converted to a string, or "" when the row
 *   is falsy or none of the fields is truthy.
 */
function pickName(row) {
  if (!row) return "";
  for (const field of ["name", "title", "productName", "displayName"]) {
    const value = row[field];
    if (value) return String(value);
  }
  return "";
}
/* ---------------- sku_meta grouping (optional) ---------------- */
/**
 * Normalize a SKU key.
 *
 * The input is stringified (falsy values become "") and trimmed. A key of the
 * form "id:" followed by 1-6 digits (case-insensitive prefix) is reduced to
 * those digits, left-padded with zeros to six characters. Anything else is
 * returned trimmed but otherwise unchanged.
 *
 * @param {*} k - Raw SKU key.
 * @returns {string} The normalized key.
 */
function normalizeImplicitSkuKey(k) {
  const trimmed = String(k || "").trim();
  const idMatch = /^id:(\d{1,6})$/i.exec(trimmed);
  return idMatch ? String(idMatch[1]).padStart(6, "0") : trimmed;
}
/**
 * Disjoint-set (union-find) structure over string keys.
 *
 * `parent` maps each known key to its parent (a root maps to itself) and
 * `rank` holds the rank used to decide which root is attached to which.
 * Keys are stringified and trimmed by find() and union(); empty keys are ignored.
 */
class DSU {
  constructor() {
    this.parent = new Map();
    this.rank = new Map();
  }

  /** Register x as its own singleton set unless it is already known. */
  _add(x) {
    if (this.parent.has(x)) return;
    this.parent.set(x, x);
    this.rank.set(x, 0);
  }

  /**
   * Return the root of the set containing x, registering x if it is new.
   * Every node visited on the way up is re-pointed directly at the root.
   *
   * @param {*} x - Key to look up.
   * @returns {string} The root key, or "" when x is empty after trimming.
   */
  find(x) {
    const key = String(x || "").trim();
    if (!key) return "";
    this._add(key);

    // First pass: walk up to the root.
    let root = key;
    while (this.parent.get(root) !== root) root = this.parent.get(root);

    // Second pass: point every node on the path straight at the root.
    let node = key;
    while (node !== root) {
      const above = this.parent.get(node);
      this.parent.set(node, root);
      node = above;
    }
    return root;
  }

  /**
   * Merge the sets containing a and b. Does nothing when either key is empty
   * or both keys are already in the same set. The lower-ranked root is attached
   * under the higher-ranked one; on a tie b's root goes under a's root and
   * a's root gains one rank.
   */
  union(a, b) {
    const left = String(a || "").trim();
    const right = String(b || "").trim();
    if (!left || !right || left === right) return;

    const rootLeft = this.find(left);
    const rootRight = this.find(right);
    if (!rootLeft || !rootRight || rootLeft === rootRight) return;

    const rankLeft = this.rank.get(rootLeft) || 0;
    const rankRight = this.rank.get(rootRight) || 0;
    if (rankLeft < rankRight) {
      this.parent.set(rootLeft, rootRight);
      return;
    }
    this.parent.set(rootRight, rootLeft);
    if (rankLeft === rankRight) this.rank.set(rootLeft, rankLeft + 1);
  }
}
/**
 * Comparator used to pick a stable representative SKU from a linked group.
 *
 * Ordering rules, applied to the trimmed string forms:
 *  1. identical keys compare equal (0);
 *  2. a key starting with "u:" sorts after one that does not;
 *  3. two all-digit keys with different numeric values compare numerically;
 *  4. everything else falls back to plain `<` string comparison.
 *
 * @param {*} a - First SKU.
 * @param {*} b - Second SKU.
 * @returns {number} -1, 0 or 1.
 */
function compareSku(a, b) {
  const left = String(a || "").trim();
  const right = String(b || "").trim();
  if (left === right) return 0;

  const leftUnknown = left.startsWith("u:");
  const rightUnknown = right.startsWith("u:");
  if (leftUnknown && !rightUnknown) return 1;
  if (!leftUnknown && rightUnknown) return -1;

  const digitsOnly = /^\d+$/;
  if (digitsOnly.test(left) && digitsOnly.test(right)) {
    const leftValue = Number(left);
    const rightValue = Number(right);
    const comparable = Number.isFinite(leftValue) && Number.isFinite(rightValue);
    if (comparable && leftValue !== rightValue) return leftValue < rightValue ? -1 : 1;
  }

  return left < right ? -1 : 1;
}
/**
 * Build a function that maps a SKU to the representative of its linked group.
 *
 * `meta.links` is read as an array of `{fromSku, toSku}` pairs. SKUs connected
 * through any chain of links form one group, and the group's representative is
 * the member that sorts first under compareSku. SKUs that appear in no link
 * (and every SKU when there are no links at all) simply map to their
 * normalized key.
 *
 * @param {object} meta - Parsed metadata; only `meta.links` is consulted.
 * @returns {function(*): string} Mapper from a raw SKU to its canonical SKU.
 */
function buildCanonicalSkuFnFromMeta(meta) {
  const links = Array.isArray(meta?.links) ? meta.links : [];
  if (links.length === 0) return (sku) => normalizeImplicitSkuKey(sku);

  // Union the two endpoints of every usable link.
  const dsu = new DSU();
  const linkedSkus = new Set();
  for (const link of links) {
    const from = normalizeImplicitSkuKey(link?.fromSku);
    const to = normalizeImplicitSkuKey(link?.toSku);
    if (!from || !to || from === to) continue;
    linkedSkus.add(from);
    linkedSkus.add(to);
    dsu.union(from, to);
  }

  // Bucket the linked SKUs by the root of their set.
  const membersByRoot = new Map();
  for (const sku of linkedSkus) {
    const root = dsu.find(sku);
    if (!root) continue;
    if (!membersByRoot.has(root)) membersByRoot.set(root, []);
    membersByRoot.get(root).push(sku);
  }

  // Pick each group's representative and record it for every member.
  const canonBySku = new Map();
  for (const [root, members] of membersByRoot) {
    const representative = [...members].sort(compareSku)[0] || root;
    for (const sku of members) canonBySku.set(sku, representative);
    canonBySku.set(representative, representative);
  }

  return (sku) => {
    const key = normalizeImplicitSkuKey(sku);
    return canonBySku.get(key) || key;
  };
}
/* ---------------- similarity (copied from viz/app) ---------------- */
/**
 * Normalize text for matching: lowercase it, treat every run of characters
 * outside a-z / 0-9 as a separator, and join the remaining pieces with single
 * spaces (no leading or trailing space).
 *
 * @param {*} s - Input text; null/undefined are treated as "".
 * @returns {string} The normalized text.
 */
function normSearchText(s) {
  const lowered = String(s ?? "").toLowerCase();
  const words = lowered.split(/[^a-z0-9]+/).filter(Boolean);
  return words.join(" ");
}
/**
 * Split a query into normalized tokens.
 *
 * @param {*} q - Raw query text.
 * @returns {string[]} Non-empty tokens of the normalized text; [] when the
 *   normalized text is empty.
 */
function tokenizeQuery(q) {
  const normalized = normSearchText(q);
  if (!normalized) return [];
  return normalized.split(" ").filter((piece) => Boolean(piece));
}
// Tokens that filterSimTokens drops before names are compared.
// The grouping below is only for readability; the Set's contents and
// insertion order are what matter.
const SIM_STOP_TOKENS = (() => {
  const fillerWords = ["the", "a", "an", "and", "of", "to", "in", "for", "with"];
  const ageWords = ["year", "years", "yr", "yrs", "old"];
  const productWords = [
    "whisky", "whiskey", "scotch", "single", "malt", "cask", "finish",
    "edition", "release", "batch", "strength", "abv", "proof",
  ];
  const otherWords = ["anniversary"];
  return new Set([...fillerWords, ...ageWords, ...productWords, ...otherWords]);
})();
// Matches an ordinal such as "12th" or "1st"; group 1 is the digit part.
const ORDINAL_RE = /^(\d+)(st|nd|rd|th)$/i;

/**
 * Return the numeric key of a token.
 *
 * @param {*} t - Token; stringified, trimmed and lowercased first (falsy -> "").
 * @returns {string} The token itself when it is all digits, the digit part
 *   when it is an ordinal (e.g. "12th" -> "12"), otherwise "".
 */
function numKey(t) {
  const token = String(t || "").trim().toLowerCase();
  if (token === "") return "";
  if (/^\d+$/.test(token)) return token;
  const ordinal = ORDINAL_RE.exec(token);
  return ordinal === null ? "" : ordinal[1];
}
/**
 * Pull an age statement out of a name.
 *
 * Two patterns are tried in order: a 1-2 digit number followed by
 * yr / yrs / year / years (optionally preceded by "aged"), then a 1-2 digit
 * number followed by "yo". The number is returned in base-10 form without
 * leading zeros.
 *
 * @param {*} normName - Name text (callers in this file pass normalized text).
 * @returns {string} The age as a string, or "" when no pattern matches.
 */
function extractAgeFromText(normName) {
  const text = String(normName || "");
  if (!text) return "";

  const agePatterns = [
    /\b(?:aged\s*)?(\d{1,2})\s*(?:yr|yrs|year|years)\b/i,
    /\b(\d{1,2})\s*yo\b/i,
  ];
  for (const pattern of agePatterns) {
    const hit = text.match(pattern);
    if (hit && hit[1]) return String(parseInt(hit[1], 10));
  }
  return "";
}
/**
 * Reduce a token list to the tokens that count for name similarity.
 *
 * Each token is lowercased and trimmed, then:
 *  - dropped if empty, lacking any letter/digit, or an inline volume ("750ml")
 *    or percentage ("43%");
 *  - mapped through a small equivalence table (e.g. "years" -> "yr",
 *    "whiskey" -> "whisky");
 *  - replaced by its numeric key when it is a number or ordinal ("12th" -> "12");
 *  - dropped if it is a volume unit, "abv" or "proof";
 *  - dropped together with the following token when it is a number directly
 *    followed by a volume unit ("750", "ml");
 *  - dropped if it is a non-numeric stop token;
 *  - dropped if already emitted.
 *
 * @param {string[]} tokens - Raw tokens; a non-array is treated as [].
 * @returns {string[]} Surviving tokens, unique, in first-seen order.
 */
function filterSimTokens(tokens) {
  const EQUIVALENTS = new Map([
    ["years", "yr"],
    ["year", "yr"],
    ["yrs", "yr"],
    ["yr", "yr"],
    ["whiskey", "whisky"],
    ["whisky", "whisky"],
    ["bourbon", "bourbon"],
  ]);
  const VOLUME_UNITS = new Set(["ml", "l", "cl", "oz", "liter", "liters", "litre", "litres"]);
  const INLINE_VOLUME = /^\d+(?:\.\d+)?(?:ml|l|cl|oz)$/i;
  const INLINE_PERCENT = /^\d+(?:\.\d+)?%$/;
  const PLAIN_NUMBER = /^\d+(?:\.\d+)?$/;

  const lower = (value) => String(value || "").trim().toLowerCase();
  const canonical = (word) => EQUIVALENTS.get(word) || word;

  const input = Array.isArray(tokens) ? tokens : [];
  // A Set keeps first-seen order and handles de-duplication.
  const kept = new Set();

  for (let idx = 0; idx < input.length; idx += 1) {
    const word = lower(input[idx]);
    if (!word || !/[a-z0-9]/i.test(word)) continue;
    if (INLINE_VOLUME.test(word) || INLINE_PERCENT.test(word)) continue;

    const mapped = canonical(word);
    const token = numKey(mapped) || mapped;
    if (VOLUME_UNITS.has(token) || token === "abv" || token === "proof") continue;

    // "<number> <unit>" is a bottle size: skip the number and consume the unit too.
    if (PLAIN_NUMBER.test(token) && VOLUME_UNITS.has(canonical(lower(input[idx + 1])))) {
      idx += 1;
      continue;
    }

    if (SIM_STOP_TOKENS.has(token) && !numKey(token)) continue;
    kept.add(token);
  }

  return [...kept];
}
/**
 * Score how well two token lists overlap after similarity filtering.
 *
 * With `hit` = number of tokens of the smaller filtered set that also occur in
 * the larger one, the result is the F1 combination of hit/|small| and
 * hit/|big|.
 *
 * @param {string[]} aTokens - Tokens of the first name (filtered internally).
 * @param {string[]} bTokens - Tokens of the second name (filtered internally).
 * @returns {number} The F1 value; 0 when either filtered list is empty or
 *   nothing is shared.
 */
function tokenContainmentScore(aTokens, bTokens) {
  const left = new Set(filterSimTokens(aTokens || []));
  const right = new Set(filterSimTokens(bTokens || []));
  if (left.size === 0 || right.size === 0) return 0;

  const [small, big] = left.size <= right.size ? [left, right] : [right, left];

  let shared = 0;
  small.forEach((token) => {
    if (big.has(token)) shared += 1;
  });

  const recall = shared / Math.max(1, small.size);
  const precision = shared / Math.max(1, big.size);
  return (2 * precision * recall) / Math.max(1e-9, precision + recall);
}
/**
 * Levenshtein edit distance between two strings, compared per UTF-16 code unit
 * with unit cost for insert, delete and substitute.
 *
 * @param {*} a - First string (falsy values are treated as "").
 * @param {*} b - Second string (falsy values are treated as "").
 * @returns {number} The number of single-unit edits turning a into b.
 */
function levenshtein(a, b) {
  const source = String(a || "");
  const target = String(b || "");
  if (source.length === 0) return target.length;
  if (target.length === 0) return source.length;

  const width = target.length + 1;
  // Two rows of the DP table are enough; they are swapped after each source character.
  let previousRow = Array.from({ length: width }, (_, col) => col);
  let currentRow = new Array(width);

  for (let row = 1; row <= source.length; row += 1) {
    const sourceCode = source.charCodeAt(row - 1);
    currentRow[0] = row;
    for (let col = 1; col < width; col += 1) {
      const substitution = previousRow[col - 1] + (sourceCode === target.charCodeAt(col - 1) ? 0 : 1);
      const deletion = previousRow[col] + 1;
      const insertion = currentRow[col - 1] + 1;
      currentRow[col] = Math.min(deletion, insertion, substitution);
    }
    [previousRow, currentRow] = [currentRow, previousRow];
  }

  return previousRow[target.length];
}
/**
 * Multiplier that penalizes names whose numeric tokens disagree.
 *
 * @param {string[]} aTokens - Tokens of the first name.
 * @param {string[]} bTokens - Tokens of the second name.
 * @returns {number} 1.0 when either side has no numeric token or when at least
 *   one numeric key is shared; 0.28 when both sides have numbers but none match.
 */
function numberMismatchPenalty(aTokens, bTokens) {
  const numbersOf = (tokens) => new Set((tokens || []).map(numKey).filter(Boolean));
  const left = numbersOf(aTokens);
  const right = numbersOf(bTokens);
  if (left.size === 0 || right.size === 0) return 1.0;

  const sharesNumber = [...left].some((num) => right.has(num));
  return sharesNumber ? 1.0 : 0.28;
}
// Same structure/weights as viz/app/linker/similarity.js
/**
 * Heuristic similarity score between two product names.
 *
 * The score is a weighted sum of three signals (first-token match, overlap of
 * the remaining tokens, and character-level Levenshtein similarity), scaled by
 * a number-mismatch penalty, an age match/mismatch factor and a token
 * containment boost. It is not normalized to [0, 1]: a first-token match alone
 * contributes 3.0 before the multipliers are applied.
 *
 * @param {*} aName - First name.
 * @param {*} bName - Second name.
 * @returns {number} The score; 0 when either name normalizes to "" or has no
 *   tokens left after similarity filtering.
 */
function similarityScore(aName, bName) {
const a = normSearchText(aName);
const b = normSearchText(bName);
if (!a || !b) return 0;
// Age statements: only compared when both names carry one.
const aAge = extractAgeFromText(a);
const bAge = extractAgeFromText(b);
const ageBoth = !!(aAge && bAge);
const ageMatch = ageBoth && aAge === bAge;
const ageMismatch = ageBoth && aAge !== bAge;
const aToksRaw = tokenizeQuery(a);
const bToksRaw = tokenizeQuery(b);
const aToks = filterSimTokens(aToksRaw);
const bToks = filterSimTokens(bToksRaw);
if (!aToks.length || !bToks.length) return 0;
// tokenContainmentScore filters the tokens itself, so it receives the raw lists.
const contain = tokenContainmentScore(aToksRaw, bToksRaw);
// Signal 1: do the first filtered tokens agree? (1 or 0)
const aFirst = aToks[0] || "";
const bFirst = bToks[0] || "";
const firstMatch = aFirst && bFirst && aFirst === bFirst ? 1 : 0;
// Signal 2: shared tokens after the first one, relative to the larger tail.
const A = new Set(aToks.slice(1));
const B = new Set(bToks.slice(1));
let inter = 0;
for (const w of A) if (B.has(w)) inter++;
const denom = Math.max(1, Math.max(A.size, B.size));
const overlapTail = inter / denom;
// Signal 3: edit distance on the normalized strings, scaled by the longer length.
const d = levenshtein(a, b);
const maxLen = Math.max(1, Math.max(a.length, b.length));
const levSim = 1 - d / maxLen;
// Without a first-token match the tail overlap is damped: capped at 0.80 and
// tied to containment, and cut further (x0.18) when the shorter name has at
// most 3 tokens and containment is below 0.78.
let gate = firstMatch ? 1.0 : Math.min(0.80, 0.06 + 0.95 * contain);
const smallN = Math.min(aToks.length, bToks.length);
if (!firstMatch && smallN <= 3 && contain < 0.78) gate *= 0.18;
// 0.28 when both names contain numbers but share none, otherwise 1.0.
const numGate = numberMismatchPenalty(aToks, bToks);
let s =
numGate *
(firstMatch * 3.0 +
overlapTail * 2.2 * gate +
levSim * (firstMatch ? 1.0 : (0.10 + 0.70 * contain)));
// Matching ages boost the score; conflicting ages suppress it.
if (ageMatch) s *= 2.2;
else if (ageMismatch) s *= 0.18;
// Final boost of up to 1.9x, proportional to token containment.
s *= 1 + 0.9 * contain;
return s;
}
/* ---------------- main logic ---------------- */
function main() {
const args = parseArgs(process.argv.slice(2));
const repoRoot = process.cwd();
const abPath = path.isAbsolute(args.ab) ? args.ab : path.join(repoRoot, args.ab);
const bcPath = path.isAbsolute(args.bc) ? args.bc : path.join(repoRoot, args.bc);
const metaPath = args.meta
? (path.isAbsolute(args.meta) ? args.meta : path.join(repoRoot, args.meta))
: "";
const ab = readJson(abPath);
const bc = readJson(bcPath);
const canonicalSku = metaPath
? buildCanonicalSkuFnFromMeta(readJson(metaPath))
: (sku) => normalizeImplicitSkuKey(sku);
const abMap = buildRankMap(ab);
const bcMap = buildRankMap(bc);
// Build a flat pool of candidates from AB+BC (unique by canonSku)
const rowBySku = new Map();
for (const m of [abMap, bcMap]) {
for (const [canonSku, v] of m.entries()) {
if (!rowBySku.has(canonSku)) rowBySku.set(canonSku, v.row);
}
}
const allSkus = Array.from(rowBySku.keys());
const allNames = new Map();
for (const sku of allSkus) allNames.set(sku, pickName(rowBySku.get(sku)));
const keys = new Set([...abMap.keys(), ...bcMap.keys()]);
const diffs = [];
@ -88,7 +454,6 @@ function main() {
diffs.push({
canonSku,
discrep,
// tie-breakers
sumRank: (rankAB ?? 1e9) + (rankBC ?? 1e9),
});
}
@ -99,13 +464,34 @@ function main() {
return String(x.canonSku).localeCompare(String(y.canonSku));
});
const top = diffs.slice(0, args.top);
// Keep only discrepancies that have a high-scoring "other" candidate not in same linked group
const filtered = [];
for (const d of diffs) {
const skuA = String(d.canonSku);
const nameA = allNames.get(skuA) || pickName(abMap.get(skuA)?.row) || pickName(bcMap.get(skuA)?.row);
if (!nameA) continue;
for (const d of top) {
// examples:
// 884096 -> left=884096
// id:1049355 -> left=id%3A1049355
// u:bb504a62 -> left=u%3Abb504a62
const groupA = canonicalSku(skuA);
let best = 0;
for (const skuB of allSkus) {
if (skuB === skuA) continue;
// not same-linked group
if (canonicalSku(skuB) === groupA) continue;
const nameB = allNames.get(skuB) || "";
if (!nameB) continue;
const s = similarityScore(nameA, nameB);
if (s > best) best = s;
}
if (best >= args.minScore) filtered.push(d);
if (filtered.length >= args.top) break;
}
for (const d of filtered) {
console.log(args.base + encodeURIComponent(d.canonSku));
}
}