mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-04-27 15:07:43 +00:00
feat: Better SKUs for CC gull tudor and BCL
This commit is contained in:
parent
6be8e87733
commit
f19c1404fa
5 changed files with 735 additions and 238 deletions
|
|
@ -80,7 +80,6 @@ function bclIsInStock(src) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function bclNormalizeAbsUrl(raw) {
|
function bclNormalizeAbsUrl(raw) {
|
||||||
const s = String(raw || "").trim();
|
const s = String(raw || "").trim();
|
||||||
if (!s) return "";
|
if (!s) return "";
|
||||||
|
|
@ -141,7 +140,17 @@ function bclHitToItem(hit) {
|
||||||
const regular = asNumber(src.regularPrice);
|
const regular = asNumber(src.regularPrice);
|
||||||
const price = cad(Number.isFinite(current) ? current : regular);
|
const price = cad(Number.isFinite(current) ? current : regular);
|
||||||
|
|
||||||
const sku = normalizeCspc(url);
|
// SKU key:
|
||||||
|
// - Keep CSPC 6-digit when present (rare for BCL, but safe)
|
||||||
|
// - Otherwise upgrade to an explicit soft key: id:<digits>
|
||||||
|
//
|
||||||
|
// ✅ PATCH: handle tiny SKUs too (3/4/5-digit) by forcing id:<digits>
|
||||||
|
// only fall back to raw (NOT u:) if it’s genuinely non-numeric.
|
||||||
|
let sku = normalizeCspc(skuRaw);
|
||||||
|
if (!sku) {
|
||||||
|
const m = skuRaw.match(/^\d{1,6}$/); // BCL product IDs like 141, 596, 984, 117, etc.
|
||||||
|
sku = m ? `id:${m[0]}` : `id:${skuRaw}`;
|
||||||
|
}
|
||||||
|
|
||||||
const inStock = bclIsInStock(src);
|
const inStock = bclIsInStock(src);
|
||||||
if (!inStock) return null;
|
if (!inStock) return null;
|
||||||
|
|
@ -155,8 +164,6 @@ function bclHitToItem(hit) {
|
||||||
return { name, price, url, sku, img };
|
return { name, price, url, sku, img };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async function bclFetchBrowsePage(ctx, page1, size) {
|
async function bclFetchBrowsePage(ctx, page1, size) {
|
||||||
const type = ctx.cat.bclType; // e.g. "rum" or "whisky / whiskey"
|
const type = ctx.cat.bclType; // e.g. "rum" or "whisky / whiskey"
|
||||||
const category = "spirits";
|
const category = "spirits";
|
||||||
|
|
@ -293,11 +300,12 @@ async function scanCategoryBCLAjax(ctx, prevDb, report) {
|
||||||
newCount: newItems.length,
|
newCount: newItems.length,
|
||||||
updatedCount: updatedItems.length,
|
updatedCount: updatedItems.length,
|
||||||
removedCount: removedItems.length,
|
removedCount: removedItems.length,
|
||||||
restoredCount: removedItems.length,
|
restoredCount: restoredItems.length,
|
||||||
elapsedMs: elapsed,
|
elapsedMs: elapsed,
|
||||||
});
|
});
|
||||||
report.totals.newCount += newItems.length;
|
report.totals.newCount += newItems.length;
|
||||||
report.totals.updatedCount += updatedItems.length;
|
report.totals.updatedCount += updatedItems.length;
|
||||||
|
report.totals.updatedCount += updatedItems.length;
|
||||||
report.totals.removedCount += removedItems.length;
|
report.totals.removedCount += removedItems.length;
|
||||||
report.totals.restoredCount += restoredItems.length;
|
report.totals.restoredCount += restoredItems.length;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ const { setTimeout: sleep } = require("timers/promises");
|
||||||
|
|
||||||
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
|
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
|
||||||
const { sanitizeName } = require("../utils/text");
|
const { sanitizeName } = require("../utils/text");
|
||||||
const { normalizeCspc } = require("../utils/sku");
|
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
|
||||||
const { makePageUrlShopifyQueryPage } = require("../utils/url");
|
const { makePageUrlShopifyQueryPage } = require("../utils/url");
|
||||||
|
|
||||||
const { mergeDiscoveredIntoDb } = require("../tracker/merge");
|
const { mergeDiscoveredIntoDb } = require("../tracker/merge");
|
||||||
|
|
@ -33,7 +33,9 @@ function canonicalizeCraftProductUrl(raw) {
|
||||||
function extractShopifyCardPrice(block) {
|
function extractShopifyCardPrice(block) {
|
||||||
const b = String(block || "");
|
const b = String(block || "");
|
||||||
const dollars = (txt) =>
|
const dollars = (txt) =>
|
||||||
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
|
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) =>
|
||||||
|
m[0].replace(/\s+/g, "")
|
||||||
|
);
|
||||||
|
|
||||||
const saleRegion = b.split(/sale price/i)[1] || "";
|
const saleRegion = b.split(/sale price/i)[1] || "";
|
||||||
const saleD = dollars(saleRegion);
|
const saleD = dollars(saleRegion);
|
||||||
|
|
@ -50,8 +52,14 @@ function extractShopifyCardPrice(block) {
|
||||||
function parseProductsCraftCellars(html, ctx) {
|
function parseProductsCraftCellars(html, ctx) {
|
||||||
const s = String(html || "");
|
const s = String(html || "");
|
||||||
|
|
||||||
const g1 = s.match(/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
|
const g1 =
|
||||||
const g2 = s.match(/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
|
s.match(
|
||||||
|
/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i
|
||||||
|
)?.[0] || "";
|
||||||
|
const g2 =
|
||||||
|
s.match(
|
||||||
|
/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i
|
||||||
|
)?.[0] || "";
|
||||||
|
|
||||||
const gridCandidate = g1.length > g2.length ? g1 : g2;
|
const gridCandidate = g1.length > g2.length ? g1 : g2;
|
||||||
const grid = /\/products\//i.test(gridCandidate) ? gridCandidate : s;
|
const grid = /\/products\//i.test(gridCandidate) ? gridCandidate : s;
|
||||||
|
|
@ -63,18 +71,24 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
||||||
const s = String(html || "");
|
const s = String(html || "");
|
||||||
const items = [];
|
const items = [];
|
||||||
|
|
||||||
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map((m) => m[0]);
|
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map(
|
||||||
|
(m) => m[0]
|
||||||
|
);
|
||||||
if (blocks.length < 5) {
|
if (blocks.length < 5) {
|
||||||
blocks = [...s.matchAll(/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi)].map(
|
blocks = [
|
||||||
(m) => m[0]
|
...s.matchAll(
|
||||||
);
|
/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi
|
||||||
|
),
|
||||||
|
].map((m) => m[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
const base = `https://${(ctx && ctx.store && ctx.store.host) || "craftcellars.ca"}/`;
|
const base = `https://${(ctx && ctx.store && ctx.store.host) || "craftcellars.ca"}/`;
|
||||||
|
|
||||||
for (const block of blocks) {
|
for (const block of blocks) {
|
||||||
const href =
|
const href =
|
||||||
block.match(/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i)?.[1] ||
|
block.match(
|
||||||
|
/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i
|
||||||
|
)?.[1] ||
|
||||||
block.match(/href=["']([^"']*\/products\/[^"']+)["']/i)?.[1];
|
block.match(/href=["']([^"']*\/products\/[^"']+)["']/i)?.[1];
|
||||||
if (!href) continue;
|
if (!href) continue;
|
||||||
|
|
||||||
|
|
@ -87,9 +101,15 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
||||||
url = canonicalizeCraftProductUrl(url);
|
url = canonicalizeCraftProductUrl(url);
|
||||||
|
|
||||||
const nameHtml =
|
const nameHtml =
|
||||||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i)?.[1] ||
|
block.match(
|
||||||
block.match(/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
|
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i
|
||||||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
|
)?.[1] ||
|
||||||
|
block.match(
|
||||||
|
/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
|
||||||
|
)?.[1] ||
|
||||||
|
block.match(
|
||||||
|
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i
|
||||||
|
)?.[1];
|
||||||
|
|
||||||
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
|
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
|
||||||
if (!name) continue;
|
if (!name) continue;
|
||||||
|
|
@ -108,37 +128,58 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
||||||
/**
 * Format a Shopify price value as a dollar string with exactly two decimals.
 *
 * Every character other than digits and "." is stripped before parsing, so
 * "$1,234.5" and "1234.50" both come out as "$1,234.50".
 *
 * @param {*} s - Raw price value; it is stringified, and null/undefined are
 *   treated as blank.
 * @returns {string} "$<amount>" using en-US digit grouping, or "" when the
 *   input contains no parseable number.
 */
function usdFromShopifyPriceStr(s) {
  const cleaned = String(s ?? "").replace(/[^0-9.]/g, "");

  // Number("") is 0, so without this guard a blank or non-numeric input
  // ("", "N/A") would be reported as a real "$0.00" price instead of "".
  if (!cleaned) return "";

  const n = Number(cleaned);
  if (!Number.isFinite(n)) return "";

  return `$${n.toLocaleString("en-US", {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
  })}`;
}
|
||||||
|
|
||||||
/**
 * Choose a numeric config value, or a fallback when it is not usable.
 *
 * @param {*} v - Candidate value; accepted only if it is a finite number
 *   (numeric strings, NaN and Infinity are rejected).
 * @param {*} fallback - Returned unchanged when `v` is rejected.
 * @returns {*} `v` when it is a finite number, otherwise `fallback`.
 */
function cfgNum(v, fallback) {
  if (Number.isFinite(v)) {
    return v;
  }
  return fallback;
}
|
||||||
|
|
||||||
|
/* ---------- NEW: product page SKU extractor ---------- */
|
||||||
|
/**
 * Extract the SKU shown on a Craft Cellars product page.
 *
 * Three patterns are tried in priority order and the first one that matches
 * wins:
 *   1. `<strong>SKU:</strong><span>VALUE</span>`
 *   2. `SKU:</strong><span>VALUE</span>` (opening tag not required)
 *   3. plain text `SKU: VALUE` (letters, digits, "-", "_", "/", spaces)
 *
 * @param {*} html - Product page markup; falsy values are treated as "".
 * @returns {*} The result of `normalizeCspc` applied to the captured text
 *   (after `decodeHtml` + `stripTags`), or to "" when nothing matched.
 */
function extractCraftSkuFromProductPageHtml(html) {
  const page = String(html || "");

  const patterns = [
    /<strong>\s*SKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*([A-Za-z0-9][A-Za-z0-9\-_/ ]{0,40})/i,
  ];

  let captured = "";
  for (const pattern of patterns) {
    const hit = page.match(pattern);
    if (!hit) continue;
    // First matching pattern decides the outcome; later ones are not tried.
    if (hit[1]) captured = stripTags(decodeHtml(hit[1]));
    break;
  }

  return normalizeCspc(captured);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Craft Cellars:
|
* Craft Cellars:
|
||||||
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
|
* - HTML listing with ?filter.v.availability=1 is the allowlist
|
||||||
* - Shopify products.json is used only to enrich SKU (and optionally price) for those allowed URLs
|
* - products.json enriches SKU/price
|
||||||
|
* - product page HTML is final SKU fallback
|
||||||
*/
|
*/
|
||||||
async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
|
|
||||||
// Strongly prefer "slow and steady" to avoid 429s.
|
const perPageDelayMs =
|
||||||
// Use per-category knobs if present; otherwise default conservative.
|
Math.max(
|
||||||
const perPageDelayMs = Math.max(
|
0,
|
||||||
0,
|
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0))
|
||||||
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
|
) || 0;
|
||||||
) || 0;
|
|
||||||
|
|
||||||
const perJsonPageDelayMs = Math.max(
|
const perJsonPageDelayMs = Math.max(
|
||||||
0,
|
0,
|
||||||
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
|
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
|
||||||
);
|
);
|
||||||
|
|
||||||
// 1) HTML scan: allowlist of in-stock listing URLs
|
const htmlMap = new Map();
|
||||||
const htmlMap = new Map(); // url -> {name, price, url, img}
|
|
||||||
|
const maxPages =
|
||||||
|
ctx.config.maxPages === null
|
||||||
|
? 200
|
||||||
|
: Math.min(ctx.config.maxPages, 200);
|
||||||
|
|
||||||
const maxPages = ctx.config.maxPages === null ? 200 : Math.min(ctx.config.maxPages, 200);
|
|
||||||
let htmlPagesFetched = 0;
|
let htmlPagesFetched = 0;
|
||||||
let emptyStreak = 0;
|
let emptyStreak = 0;
|
||||||
|
|
||||||
|
|
@ -146,7 +187,11 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
|
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
|
||||||
|
|
||||||
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
|
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
|
||||||
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
|
const { text: html } = await ctx.http.fetchTextWithRetry(
|
||||||
|
pageUrl,
|
||||||
|
`craft:html:${ctx.cat.key}:p${p}`,
|
||||||
|
ctx.store.ua
|
||||||
|
);
|
||||||
htmlPagesFetched++;
|
htmlPagesFetched++;
|
||||||
|
|
||||||
if (craftCellarsIsEmptyListingPage(html)) break;
|
if (craftCellarsIsEmptyListingPage(html)) break;
|
||||||
|
|
@ -162,22 +207,30 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
for (const it of items) {
|
for (const it of items) {
|
||||||
const url = canonicalizeCraftProductUrl(it.url);
|
const url = canonicalizeCraftProductUrl(it.url);
|
||||||
if (!url) continue;
|
if (!url) continue;
|
||||||
htmlMap.set(url, { name: it.name || "", price: it.price || "", url, img: it.img || "" });
|
htmlMap.set(url, {
|
||||||
|
name: it.name || "",
|
||||||
|
price: it.price || "",
|
||||||
|
url,
|
||||||
|
img: it.img || "",
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If HTML returns nothing, don't let JSON invent a category
|
|
||||||
if (!htmlMap.size) {
|
if (!htmlMap.size) {
|
||||||
ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
|
ctx.logger.warn(
|
||||||
|
`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing JSON-only discovery`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
|
const jsonMap = new Map();
|
||||||
const jsonMap = new Map(); // url -> { sku, price, img }
|
|
||||||
|
|
||||||
if (htmlMap.size) {
|
if (htmlMap.size) {
|
||||||
const start = new URL(ctx.cat.startUrl);
|
const start = new URL(ctx.cat.startUrl);
|
||||||
const m = start.pathname.match(/^\/collections\/([^/]+)/i);
|
const m = start.pathname.match(/^\/collections\/([^/]+)/i);
|
||||||
if (!m) throw new Error(`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`);
|
if (!m)
|
||||||
|
throw new Error(
|
||||||
|
`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`
|
||||||
|
);
|
||||||
const collectionHandle = m[1];
|
const collectionHandle = m[1];
|
||||||
|
|
||||||
const limit = 250;
|
const limit = 250;
|
||||||
|
|
@ -185,12 +238,19 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
let jsonPagesFetched = 0;
|
let jsonPagesFetched = 0;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
|
if (jsonPage > 1 && perJsonPageDelayMs > 0)
|
||||||
|
await sleep(perJsonPageDelayMs);
|
||||||
|
|
||||||
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
|
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
|
||||||
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
|
const r = await ctx.http.fetchJsonWithRetry(
|
||||||
|
url,
|
||||||
|
`craft:coljson:${ctx.cat.key}:p${jsonPage}`,
|
||||||
|
ctx.store.ua
|
||||||
|
);
|
||||||
|
|
||||||
const products = Array.isArray(r?.json?.products) ? r.json.products : [];
|
const products = Array.isArray(r?.json?.products)
|
||||||
|
? r.json.products
|
||||||
|
: [];
|
||||||
jsonPagesFetched++;
|
jsonPagesFetched++;
|
||||||
|
|
||||||
if (!products.length) break;
|
if (!products.length) break;
|
||||||
|
|
@ -199,73 +259,116 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
const handle = String(p?.handle || "");
|
const handle = String(p?.handle || "");
|
||||||
if (!handle) continue;
|
if (!handle) continue;
|
||||||
|
|
||||||
const prodUrl = canonicalizeCraftProductUrl(`https://${ctx.store.host}/products/${handle}`);
|
const prodUrl = canonicalizeCraftProductUrl(
|
||||||
|
`https://${ctx.store.host}/products/${handle}`
|
||||||
// Only enrich if it's on the HTML allowlist
|
);
|
||||||
if (!htmlMap.has(prodUrl)) continue;
|
if (!htmlMap.has(prodUrl)) continue;
|
||||||
|
|
||||||
const variants = Array.isArray(p?.variants) ? p.variants : [];
|
const variants = Array.isArray(p?.variants) ? p.variants : [];
|
||||||
const v = variants.find((x) => x && x.available === true) || variants[0] || null;
|
const v =
|
||||||
|
variants.find((x) => x && x.available === true) ||
|
||||||
|
variants[0] ||
|
||||||
|
null;
|
||||||
|
|
||||||
const sku = normalizeCspc(v?.sku || "");
|
const sku = normalizeCspc(v?.sku || "");
|
||||||
const price = v?.price ? usdFromShopifyPriceStr(v.price) : "";
|
const price = v?.price ? usdFromShopifyPriceStr(v.price) : "";
|
||||||
|
|
||||||
// Product image (best effort)
|
|
||||||
let img = "";
|
let img = "";
|
||||||
const images = Array.isArray(p?.images) ? p.images : [];
|
const images = Array.isArray(p?.images) ? p.images : [];
|
||||||
if (images[0]) {
|
if (images[0]) {
|
||||||
if (typeof images[0] === "string") img = images[0];
|
img =
|
||||||
else img = String(images[0]?.src || images[0]?.url || "");
|
typeof images[0] === "string"
|
||||||
|
? images[0]
|
||||||
|
: String(images[0]?.src || images[0]?.url || "");
|
||||||
}
|
}
|
||||||
if (!img && p?.image) img = String(p.image?.src || p.image?.url || p.image || "");
|
if (!img && p?.image)
|
||||||
|
img = String(p.image?.src || p.image?.url || p.image || "");
|
||||||
img = String(img || "").trim();
|
img = String(img || "").trim();
|
||||||
if (img.startsWith("//")) img = `https:${img}`;
|
if (img.startsWith("//")) img = `https:${img}`;
|
||||||
if (img && !/^https?:\/\//i.test(img)) {
|
|
||||||
try {
|
|
||||||
img = new URL(img, `https://${ctx.store.host}/`).toString();
|
|
||||||
} catch {
|
|
||||||
// keep as-is
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
jsonMap.set(prodUrl, { sku, price, img });
|
jsonMap.set(prodUrl, { sku, price, img });
|
||||||
}
|
}
|
||||||
|
|
||||||
if (products.length < limit) break;
|
if (products.length < limit) break;
|
||||||
jsonPage++;
|
if (++jsonPage > 200) break;
|
||||||
if (jsonPage > 200) break; // safety
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`);
|
ctx.logger.ok(
|
||||||
} else {
|
`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`
|
||||||
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=0`);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3) Final discovered: HTML allowlist, enriched by JSON
|
|
||||||
const discovered = new Map();
|
const discovered = new Map();
|
||||||
for (const [url, it] of htmlMap.entries()) {
|
for (const [url, it] of htmlMap.entries()) {
|
||||||
const j = jsonMap.get(url);
|
const j = jsonMap.get(url);
|
||||||
|
const prev = prevDb?.byUrl?.get(url) || null;
|
||||||
|
|
||||||
discovered.set(url, {
|
discovered.set(url, {
|
||||||
name: it.name || "",
|
name: it.name,
|
||||||
// Prefer JSON price (normalized) when present, else keep HTML price (already formatted)
|
|
||||||
price: j?.price || it.price || "",
|
price: j?.price || it.price || "",
|
||||||
url,
|
url,
|
||||||
sku: j?.sku || "",
|
// reuse cached SKU unless we found something better this run
|
||||||
img: j?.img || it.img || "",
|
sku: pickBetterSku(j?.sku || "", prev?.sku || ""),
|
||||||
|
// reuse cached image if we didn't find one
|
||||||
|
img: (j?.img || it.img || prev?.img || ""),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
|
/* ---------- NEW: product page SKU fallback (cached; only when needed) ---------- */
|
||||||
|
const perProductSkuDelayMs = Math.max(
|
||||||
|
0,
|
||||||
|
cfgNum(
|
||||||
|
ctx?.cat?.skuPageDelayMs,
|
||||||
|
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
let skuPagesFetched = 0;
|
||||||
|
|
||||||
|
for (const it of discovered.values()) {
|
||||||
|
// only hit product pages when missing/synthetic
|
||||||
|
if (!needsSkuDetail(it.sku)) continue;
|
||||||
|
|
||||||
|
if (perProductSkuDelayMs > 0) await sleep(perProductSkuDelayMs);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const { text } = await ctx.http.fetchTextWithRetry(
|
||||||
|
it.url,
|
||||||
|
`craft:prodpage:${ctx.cat.key}:${Buffer.from(it.url)
|
||||||
|
.toString("base64")
|
||||||
|
.slice(0, 24)}`,
|
||||||
|
ctx.store.ua
|
||||||
|
);
|
||||||
|
skuPagesFetched++;
|
||||||
|
|
||||||
|
const sku = extractCraftSkuFromProductPageHtml(text);
|
||||||
|
if (sku) it.sku = sku;
|
||||||
|
} catch {
|
||||||
|
/* best effort */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx.logger.ok(
|
||||||
|
`${ctx.catPrefixOut} | SKU fallback pages=${skuPagesFetched}`
|
||||||
|
);
|
||||||
|
|
||||||
|
ctx.logger.ok(
|
||||||
|
`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`
|
||||||
|
);
|
||||||
|
|
||||||
|
const {
|
||||||
|
merged,
|
||||||
|
newItems,
|
||||||
|
updatedItems,
|
||||||
|
removedItems,
|
||||||
|
restoredItems,
|
||||||
|
} = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||||
storeLabel: ctx.store.name,
|
storeLabel: ctx.store.name,
|
||||||
});
|
});
|
||||||
|
|
||||||
const dbObj = buildDbObject(ctx, merged);
|
const dbObj = buildDbObject(ctx, merged);
|
||||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||||
|
|
||||||
ctx.logger.ok(`${ctx.catPrefixOut} | DB saved: ${ctx.logger.dim(ctx.dbFile)} (${dbObj.count} items)`);
|
|
||||||
|
|
||||||
const elapsed = Date.now() - t0;
|
const elapsed = Date.now() - t0;
|
||||||
|
|
||||||
report.categories.push({
|
report.categories.push({
|
||||||
|
|
@ -287,7 +390,15 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||||
report.totals.removedCount += removedItems.length;
|
report.totals.removedCount += removedItems.length;
|
||||||
report.totals.restoredCount += restoredItems.length;
|
report.totals.restoredCount += restoredItems.length;
|
||||||
|
|
||||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
addCategoryResultToReport(
|
||||||
|
report,
|
||||||
|
ctx.store.name,
|
||||||
|
ctx.cat.label,
|
||||||
|
newItems,
|
||||||
|
updatedItems,
|
||||||
|
removedItems,
|
||||||
|
restoredItems
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function createStore(defaultUa) {
|
function createStore(defaultUa) {
|
||||||
|
|
@ -297,10 +408,8 @@ function createStore(defaultUa) {
|
||||||
host: "craftcellars.ca",
|
host: "craftcellars.ca",
|
||||||
ua: defaultUa,
|
ua: defaultUa,
|
||||||
|
|
||||||
// ✅ Custom scan (HTML allowlist + JSON enrichment)
|
|
||||||
scanCategory: scanCategoryCraftCellars,
|
scanCategory: scanCategoryCraftCellars,
|
||||||
|
|
||||||
// Keep HTML parser for debugging
|
|
||||||
parseProducts: parseProductsCraftCellars,
|
parseProducts: parseProductsCraftCellars,
|
||||||
makePageUrl: makePageUrlShopifyQueryPage,
|
makePageUrl: makePageUrlShopifyQueryPage,
|
||||||
isEmptyListingPage: craftCellarsIsEmptyListingPage,
|
isEmptyListingPage: craftCellarsIsEmptyListingPage,
|
||||||
|
|
@ -309,69 +418,22 @@ function createStore(defaultUa) {
|
||||||
{
|
{
|
||||||
key: "whisky",
|
key: "whisky",
|
||||||
label: "Whisky",
|
label: "Whisky",
|
||||||
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
|
startUrl:
|
||||||
|
"https://craftcellars.ca/collections/whisky?filter.v.availability=1",
|
||||||
// slow-and-safe defaults (override globally if you want)
|
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
pageConcurrency: 1,
|
||||||
pageStaggerMs: 10000,
|
pageStaggerMs: 10000,
|
||||||
discoveryDelayMs: 10000,
|
discoveryDelayMs: 10000,
|
||||||
|
skuPageDelayMs: 12000,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
key: "rum",
|
key: "rum",
|
||||||
label: "Rum",
|
label: "Rum",
|
||||||
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
|
startUrl:
|
||||||
|
"https://craftcellars.ca/collections/rum?filter.v.availability=1",
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
|
||||||
pageStaggerMs: 10000,
|
|
||||||
discoveryDelayMs: 10000,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: "single-malt-scotch",
|
|
||||||
label: "Single Malt Scotch",
|
|
||||||
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
|
|
||||||
|
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
|
||||||
pageStaggerMs: 10000,
|
|
||||||
discoveryDelayMs: 10000,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: "other-scotch-styles",
|
|
||||||
label: "Other Scotch Styles",
|
|
||||||
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
|
|
||||||
|
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
|
||||||
pageStaggerMs: 10000,
|
|
||||||
discoveryDelayMs: 10000,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: "single-grain-scotch",
|
|
||||||
label: "Single Grain Scotch",
|
|
||||||
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
|
|
||||||
|
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
|
||||||
pageStaggerMs: 10000,
|
|
||||||
discoveryDelayMs: 10000,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: "blended-malt-scotch",
|
|
||||||
label: "Blended Malt Scotch",
|
|
||||||
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
|
|
||||||
|
|
||||||
discoveryStartPage: 3,
|
|
||||||
discoveryStep: 2,
|
|
||||||
pageConcurrency: 1,
|
pageConcurrency: 1,
|
||||||
pageStaggerMs: 10000,
|
pageStaggerMs: 10000,
|
||||||
discoveryDelayMs: 10000,
|
discoveryDelayMs: 10000,
|
||||||
|
skuPageDelayMs: 12000,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
|
// src/stores/gull.js
|
||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
|
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
|
||||||
const { normalizeCspc } = require("../utils/sku");
|
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
|
||||||
const { makePageUrl } = require("../utils/url");
|
const { makePageUrl } = require("../utils/url");
|
||||||
|
|
||||||
function looksInStock(block) {
|
function looksInStock(block) {
|
||||||
|
|
@ -45,6 +46,133 @@ function extractGullPriceFromBlock(block) {
|
||||||
return `$${chosen.toFixed(2)}`;
|
return `$${chosen.toFixed(2)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize a raw Gull SKU candidate into a stable SKU key.
 *
 * Gull SKUs are often NOT 6 digits (e.g. 67424). Non-6-digit numeric SKUs are
 * represented as `id:<digits>` so they are not handed to `normalizeCspc`,
 * which (per the original note here) would turn them into a `u:` hash key.
 *
 * @param {*} raw - SKU text from a tile/page, an already-prefixed key, or the
 *   product URL used as a last-resort fallback by the listing parser.
 * @returns {*} `raw` unchanged when already prefixed with `id:` / `u:`;
 *   `normalizeCspc(digits)` for a 6-digit number; `id:<digits>` for any other
 *   3-10 digit number; otherwise `normalizeCspc` of the cleaned text.
 */
function normalizeGullSku(raw) {
  const s = cleanText(decodeHtml(String(raw || ""))).trim();

  // Already in a stable prefixed form.
  if (/^(id:|u:)/i.test(s)) return s;

  // The listing parser falls back to the product URL when no SKU is found.
  // Digits inside a URL slug (e.g. ".../some-rum-2019-release/") are not a
  // SKU, so never mine them: the URL must go to normalizeCspc so it becomes a
  // generated key that SKU hydration can later replace with the real one.
  const looksLikeUrl =
    /^(?:[a-z][a-z0-9+.-]*:)?\/\//i.test(s) || s.startsWith("/");

  // Numeric SKU text (from page / tile).
  const digits = looksLikeUrl ? "" : (s.match(/\b(\d{3,10})\b/)?.[1] || "");
  if (digits) {
    if (digits.length === 6) return normalizeCspc(digits);
    return `id:${digits}`;
  }

  // Fall back to the shared normalizer (may yield u:...).
  return normalizeCspc(s);
}
|
||||||
|
|
||||||
|
/**
 * Tell whether a SKU is a generated `u:<hex>` key, i.e. the kind produced
 * when the product URL had to stand in for a real SKU.
 *
 * Existing DB entries use 8 hex characters, so anything from 8 to 128 hex
 * characters after the `u:` prefix is accepted (case-insensitive).
 *
 * @param {*} sku - SKU value to inspect; falsy values are treated as "".
 * @returns {boolean} True only for a complete `u:<8-128 hex chars>` string.
 */
function isGeneratedUrlSku(sku) {
  const generatedKey = /^u:[0-9a-f]{8,128}$/i;
  return generatedKey.test(String(sku || ""));
}
|
||||||
|
|
||||||
|
/**
 * Extract the SKU from Gull product page HTML.
 *
 * Looks first for a 3-10 digit number inside an element such as
 * `<span class="sku">67424</span>`, then for "SKU: 67424" text anywhere in
 * the page.
 *
 * @param {*} html - Product page markup; falsy values are treated as "".
 * @returns {*} `normalizeGullSku(<digits>)` for the first pattern that
 *   matches, or "" when neither does.
 */
function extractGullSkuFromProductPage(html) {
  const page = String(html || "");

  // Most reliable source: the dedicated sku <span>.
  const fromSpan =
    /<span\b[^>]*class=["'][^"']*\bsku\b[^"']*["'][^>]*>\s*([0-9]{3,10})\s*<\/span>/i.exec(
      page
    )?.[1];
  if (fromSpan) return normalizeGullSku(fromSpan);

  // Fallback: a plain "SKU: 67424" label.
  const fromLabel = /\bSKU:\s*([0-9]{3,10})\b/i.exec(page)?.[1];
  return fromLabel ? normalizeGullSku(fromLabel) : "";
}
|
||||||
|
|
||||||
|
/**
 * Create a serial limiter: tasks run one at a time, in submission order, with
 * at least `minIntervalMs` between the start of one task and the next.
 *
 * @param {number} minIntervalMs - Minimum gap between task starts, in ms.
 * @returns {function(function(): *): Promise<*>} `schedule(fn)`, which queues
 *   `fn` and returns a promise that settles with `fn`'s result or rejects with
 *   its error.
 */
function createMinIntervalLimiter(minIntervalMs) {
  let lastStart = 0;
  let tail = Promise.resolve();

  return function schedule(fn) {
    const run = tail.then(async () => {
      const waitMs = Math.max(0, lastStart + minIntervalMs - Date.now());
      if (waitMs) await new Promise((r) => setTimeout(r, waitMs));
      lastStart = Date.now();
      return fn();
    });

    // The caller still receives the failure through `run`. The queue itself
    // must carry on after a failed task: chaining onto a rejected promise
    // would make every later schedule() call reject with this task's error
    // without ever invoking its own fn.
    tail = run.catch(() => {});

    return run;
  };
}
|
||||||
|
|
||||||
|
/**
 * Convert a Retry-After header value into a delay in milliseconds.
 *
 * Accepts both forms of the header: delay-seconds ("120") and an HTTP-date
 * ("Wed, 21 Oct 2015 07:28:00 GMT"). Surrounding whitespace is ignored.
 *
 * @param {*} value - Raw header value (may be null/undefined).
 * @returns {number|null} Non-negative delay in ms, or null when the header is
 *   absent or unparseable.
 */
function parseRetryAfterMs(value) {
  const text = String(value ?? "").trim();
  if (!text) return null;

  if (/^\d+$/.test(text)) return Number.parseInt(text, 10) * 1000;

  const retryAt = Date.parse(text);
  // A date already in the past means "retry now".
  return Number.isFinite(retryAt) ? Math.max(0, retryAt - Date.now()) : null;
}

/**
 * Fetch a URL as text, retrying when the server answers HTTP 429.
 *
 * @param {string} url - URL to fetch.
 * @param {object} opts
 * @param {function} opts.fetchFn - fetch-like function, called as
 *   `fetchFn(url, { headers })`; its result must expose `status`, `ok`,
 *   `text()` and optionally `headers.get(name)`.
 * @param {object} [opts.headers] - Request headers passed through to fetchFn.
 * @param {number} [opts.maxRetries=2] - Retries allowed after the first 429.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} "HTTP <status> fetching <url>" for any non-OK, non-429
 *   response, or for a 429 once the retries are exhausted.
 */
async function fetchWith429Backoff(url, { fetchFn, headers, maxRetries = 2 }) {
  for (let attempt = 0; ; attempt++) {
    const res = await fetchFn(url, { headers });

    if (res.status !== 429) {
      if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${url}`);
      return await res.text();
    }

    if (attempt >= maxRetries) throw new Error(`HTTP 429 fetching ${url}`);

    // Respect Retry-After if present (seconds or HTTP-date); otherwise use a
    // progressive backoff of 15s, 30s, 45s, ...
    const retryAfter =
      typeof res.headers?.get === "function"
        ? res.headers.get("retry-after")
        : null;
    const waitMs = parseRetryAfterMs(retryAfter) ?? 15000 * (attempt + 1);

    await new Promise((r) => setTimeout(r, waitMs));
  }
}
|
||||||
|
|
||||||
|
/**
 * Fill in real SKUs for Gull items whose SKU is only a generated `u:` key.
 *
 * Items are processed strictly one after another, and product-page fetches
 * are spaced by `minIntervalMs` to avoid Gull 429s. For each item:
 *   - if `prevDb` already holds a SKU for the same URL that does not need
 *     more detail, it is merged in via `pickBetterSku` and no fetch happens;
 *   - otherwise the product page is fetched only when the current SKU is a
 *     generated `u:` key, and any SKU found there is merged in.
 *
 * Items are mutated in place. A failed fetch rejects the whole call.
 *
 * @param {Array<object>} items - Items with `url` and `sku`; entries that are
 *   falsy or have no `url` are skipped.
 * @param {object} [opts]
 * @param {function} opts.fetchFn - fetch-like function (required).
 * @param {string} [opts.ua] - User-Agent header; defaults to "Mozilla/5.0".
 * @param {number} [opts.minIntervalMs=12000] - Minimum ms between fetch starts.
 * @param {number} [opts.maxRetries=2] - 429 retries per product page.
 * @param {object} [opts.prevDb] - Previous DB; only `prevDb.byUrl.get(url)` is
 *   read here.
 * @returns {Promise<Array<object>>} The same `items` value that was passed in.
 * @throws {Error} When `opts.fetchFn` is missing.
 */
async function hydrateGullSkus(
  items,
  { fetchFn, ua, minIntervalMs = 12000, maxRetries = 2, prevDb } = {}
) {
  if (!fetchFn) throw new Error("hydrateGullSkus requires opts.fetchFn");

  const headers = {
    "user-agent": ua || "Mozilla/5.0",
    accept: "text/html,application/xhtml+xml",
  };
  const schedule = createMinIntervalLimiter(minIntervalMs);

  for (const item of items || []) {
    if (!item?.url) continue;

    // Reuse a good SKU cached in the previous DB instead of fetching.
    const cached = prevDb?.byUrl?.get(item.url) || null;
    const cachedSkuIsGood = Boolean(cached?.sku) && !needsSkuDetail(cached.sku);
    if (cachedSkuIsGood) {
      item.sku = pickBetterSku(item.sku, cached.sku);
      continue;
    }

    // Only generated u:... keys justify a product-page request.
    if (isGeneratedUrlSku(item.sku)) {
      const pageHtml = await schedule(() =>
        fetchWith429Backoff(item.url, { fetchFn, headers, maxRetries })
      );

      const pageSku = extractGullSkuFromProductPage(pageHtml);
      if (pageSku) item.sku = pickBetterSku(pageSku, item.sku);
    }
  }

  return items;
}
|
||||||
|
|
||||||
function parseProductsGull(html, ctx) {
|
function parseProductsGull(html, ctx) {
|
||||||
const s = String(html || "");
|
const s = String(html || "");
|
||||||
const items = [];
|
const items = [];
|
||||||
|
|
@ -82,11 +210,12 @@ function parseProductsGull(html, ctx) {
|
||||||
|
|
||||||
const price = extractGullPriceFromBlock(block);
|
const price = extractGullPriceFromBlock(block);
|
||||||
|
|
||||||
const sku = normalizeCspc(
|
const skuRaw =
|
||||||
block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
|
block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
|
||||||
block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
|
block.match(/\bSKU\b[^0-9]{0,30}(\d{3,10})\b/i)?.[1] ||
|
||||||
url
|
url; // OK fallback; hydrateGullSkus will only re-fetch when this becomes u:...
|
||||||
);
|
|
||||||
|
const sku = normalizeGullSku(skuRaw);
|
||||||
|
|
||||||
const img = extractFirstImgUrl(block, base);
|
const img = extractFirstImgUrl(block, base);
|
||||||
|
|
||||||
|
|
@ -98,7 +227,6 @@ function parseProductsGull(html, ctx) {
|
||||||
return [...uniq.values()];
|
return [...uniq.values()];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function createStore(defaultUa) {
|
function createStore(defaultUa) {
|
||||||
return {
|
return {
|
||||||
key: "gull",
|
key: "gull",
|
||||||
|
|
@ -106,12 +234,19 @@ function createStore(defaultUa) {
|
||||||
host: "gullliquorstore.com",
|
host: "gullliquorstore.com",
|
||||||
ua: defaultUa,
|
ua: defaultUa,
|
||||||
parseProducts: parseProductsGull,
|
parseProducts: parseProductsGull,
|
||||||
|
|
||||||
|
// Optional hook callers can use to post-process items:
|
||||||
|
// only hits product pages when sku is u:...
|
||||||
|
hydrateSkus: hydrateGullSkus,
|
||||||
|
productPageMinIntervalMs: 12000, // slow by default; Gull is strict
|
||||||
|
|
||||||
makePageUrl, // enables /page/N/ paging
|
makePageUrl, // enables /page/N/ paging
|
||||||
categories: [
|
categories: [
|
||||||
{
|
{
|
||||||
key: "whisky",
|
key: "whisky",
|
||||||
label: "Whisky",
|
label: "Whisky",
|
||||||
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
|
startUrl:
|
||||||
|
"https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
|
||||||
discoveryStartPage: 3,
|
discoveryStartPage: 3,
|
||||||
discoveryStep: 2,
|
discoveryStep: 2,
|
||||||
pageConcurrency: 1,
|
pageConcurrency: 1,
|
||||||
|
|
@ -121,7 +256,8 @@ function createStore(defaultUa) {
|
||||||
{
|
{
|
||||||
key: "rum",
|
key: "rum",
|
||||||
label: "Rum",
|
label: "Rum",
|
||||||
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
|
startUrl:
|
||||||
|
"https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
|
||||||
discoveryStartPage: 3,
|
discoveryStartPage: 3,
|
||||||
discoveryStep: 2,
|
discoveryStep: 2,
|
||||||
pageConcurrency: 1,
|
pageConcurrency: 1,
|
||||||
|
|
@ -132,4 +268,11 @@ function createStore(defaultUa) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { createStore, parseProductsGull };
|
module.exports = {
|
||||||
|
createStore,
|
||||||
|
parseProductsGull,
|
||||||
|
hydrateGullSkus,
|
||||||
|
extractGullSkuFromProductPage,
|
||||||
|
isGeneratedUrlSku,
|
||||||
|
normalizeGullSku,
|
||||||
|
};
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const { cleanText } = require("../utils/html");
|
const { cleanText } = require("../utils/html");
|
||||||
const { normalizeCspc } = require("../utils/sku");
|
const { normalizeCspc, pickBetterSku } = require("../utils/sku");
|
||||||
const { humanBytes } = require("../utils/bytes");
|
const { humanBytes } = require("../utils/bytes");
|
||||||
const { padLeft, padRight } = require("../utils/string");
|
const { padLeft, padRight } = require("../utils/string");
|
||||||
|
|
||||||
|
|
@ -67,6 +67,34 @@ function normalizeAbsUrl(raw) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Report whether a SKU is missing or a synthetic `u:` key and therefore
 * eligible for repair.
 *
 * @param {*} sku - Candidate SKU; falsy values are treated as empty.
 * @returns {boolean} True for empty/blank values and for strings starting with
 *   `u:` (case-insensitive, after trimming).
 */
function isSyntheticSku(sku) {
  const text = String(sku || "").trim();
  if (text === "") return true;
  return /^u:/i.test(text);
}
|
||||||
|
|
||||||
|
/**
 * Normalize a raw Tudor House SKU into the key stored on items.
 *
 * Rules, applied in order to the trimmed string:
 *  - empty                      -> ""
 *  - already `id:` / `u:` key   -> returned unchanged
 *  - shorter than 6 characters  -> `id:<sku>` (namespaced to reduce collisions)
 *  - all digits, 6 or more      -> returned unchanged
 *  - anything else              -> normalizeCspc(sku), or the raw string when
 *                                  that returns an empty value
 *
 * Purely numeric SKUs are deliberately never passed to normalizeCspc.
 *
 * @param {*} rawSku - SKU as read from the API or product page.
 * @returns {string} Normalized SKU key, or "" when there is none.
 */
function normalizeTudorSku(rawSku) {
  const value = String(rawSku || "").trim();
  if (value === "") return "";

  // Keys that already carry a namespace pass straight through.
  if (/^(?:id|u):/i.test(value)) return value;

  // Short SKUs (numeric or alphanumeric) are namespaced.
  if (value.length < 6) return `id:${value}`;

  // Long numeric SKUs are kept verbatim.
  if (/^\d+$/.test(value)) return value;

  return normalizeCspc(value) || value;
}
|
||||||
|
|
||||||
function tudorProductUrl(ctx, slug) {
|
function tudorProductUrl(ctx, slug) {
|
||||||
// Site URLs look like: /TUDOR_HOUSE_0/product/spirits/<subcat>/<slug>
|
// Site URLs look like: /TUDOR_HOUSE_0/product/spirits/<subcat>/<slug>
|
||||||
const root = ctx?.cat?.tudorRootSlug || "spirits";
|
const root = ctx?.cat?.tudorRootSlug || "spirits";
|
||||||
|
|
@ -82,33 +110,23 @@ function tudorPickVariant(p) {
|
||||||
return inStock || vs[0] || null;
|
return inStock || vs[0] || null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function tudorItemFromProduct(p, ctx) {
|
function pickAnySkuFromProduct(p) {
|
||||||
if (!p) return null;
|
const vs = Array.isArray(p?.variants) ? p.variants : [];
|
||||||
|
for (const v of vs) {
|
||||||
const name = cleanText(p?.name || "");
|
const s = String(v?.sku || "").trim();
|
||||||
const slug = String(p?.slug || "").trim();
|
if (s) return s;
|
||||||
if (!name || !slug) return null;
|
}
|
||||||
|
return "";
|
||||||
const v = tudorPickVariant(p);
|
|
||||||
if (v && Number(v?.quantity) <= 0) return null; // only keep in-stock
|
|
||||||
|
|
||||||
const url = tudorProductUrl(ctx, slug);
|
|
||||||
|
|
||||||
const price = money(v?.price ?? p?.priceFrom ?? p?.priceTo);
|
|
||||||
const sku = normalizeCspc(v?.sku || "");
|
|
||||||
const img = normalizeAbsUrl(
|
|
||||||
firstNonEmptyStr(
|
|
||||||
v?.image,
|
|
||||||
p?.gulpImages,
|
|
||||||
p?.posImages,
|
|
||||||
p?.customImages,
|
|
||||||
p?.imageIds
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
return { name, price, url, sku, img };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Choose the variant to represent a product: the first one whose `quantity`
 * is greater than zero, otherwise the first variant, otherwise null.
 *
 * @param {object} p - Product; `p.variants` is used only when it is an array.
 * @returns {object|null} The chosen variant, or null when there are none.
 */
function pickInStockVariantWithFallback(p) {
  const variants = Array.isArray(p?.variants) ? p.variants : [];

  for (const variant of variants) {
    if (Number(variant?.quantity) > 0) return variant;
  }

  return variants[0] || null;
}
|
||||||
|
|
||||||
|
/* ---------------- GraphQL ---------------- */
|
||||||
|
|
||||||
async function tudorGql(ctx, label, query, variables) {
|
async function tudorGql(ctx, label, query, variables) {
|
||||||
return await ctx.http.fetchJsonWithRetry(GQL_URL, label, ctx.store.ua, {
|
return await ctx.http.fetchJsonWithRetry(GQL_URL, label, ctx.store.ua, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
|
|
@ -122,15 +140,7 @@ async function tudorGql(ctx, label, query, variables) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function pickConnection(json) {
|
/* ---------------- GQL queries ---------------- */
|
||||||
const data = json?.data;
|
|
||||||
if (!data || typeof data !== "object") return null;
|
|
||||||
for (const v of Object.values(data)) {
|
|
||||||
if (v && typeof v === "object" && Array.isArray(v.items)) return v;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
const PRODUCTS_QUERY = `
|
const PRODUCTS_QUERY = `
|
||||||
query(
|
query(
|
||||||
|
|
@ -170,15 +180,14 @@ const PRODUCTS_QUERY = `
|
||||||
isStaffPick: $isStaffPick,
|
isStaffPick: $isStaffPick,
|
||||||
pageCursor: $pageCursor,
|
pageCursor: $pageCursor,
|
||||||
pageLimit: $pageLimit,
|
pageLimit: $pageLimit,
|
||||||
pointsMin: $pointsMin,
|
sortBy: $sortBy,
|
||||||
|
sortOrder: $sortOrder,
|
||||||
priceMin: $priceMin,
|
priceMin: $priceMin,
|
||||||
priceMax: $priceMax,
|
priceMax: $priceMax,
|
||||||
quantityMin: $quantityMin,
|
quantityMin: $quantityMin,
|
||||||
regions: $regions,
|
regions: $regions,
|
||||||
brandValue: $brandValue,
|
brandValue: $brandValue,
|
||||||
searchValue: $searchValue,
|
searchValue: $searchValue,
|
||||||
sortOrder: $sortOrder,
|
|
||||||
sortBy: $sortBy,
|
|
||||||
storeId: $storeId,
|
storeId: $storeId,
|
||||||
) {
|
) {
|
||||||
items {
|
items {
|
||||||
|
|
@ -199,6 +208,31 @@ const PRODUCTS_QUERY = `
|
||||||
}
|
}
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
// ONLY for limited image supplementation (within a small budget)
|
||||||
|
const PRODUCTS_BY_SKU_QUERY = `
|
||||||
|
query(
|
||||||
|
$sku: String!,
|
||||||
|
$storeId: String
|
||||||
|
) {
|
||||||
|
productsBySku(
|
||||||
|
sku: $sku,
|
||||||
|
storeId: $storeId
|
||||||
|
) {
|
||||||
|
items {
|
||||||
|
id
|
||||||
|
slug
|
||||||
|
imageIds
|
||||||
|
posImages
|
||||||
|
customImages
|
||||||
|
gulpImages
|
||||||
|
variants { id image price quantity sku deposit }
|
||||||
|
}
|
||||||
|
nextPageCursor
|
||||||
|
totalCount
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`;
|
||||||
|
|
||||||
async function fetchProductsPage(ctx, cursor) {
|
async function fetchProductsPage(ctx, cursor) {
|
||||||
const vars = {
|
const vars = {
|
||||||
storeId: STORE_ID,
|
storeId: STORE_ID,
|
||||||
|
|
@ -224,78 +258,291 @@ async function fetchProductsPage(ctx, cursor) {
|
||||||
return r.json.data.products;
|
return r.json.data.products;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------------- GQL bySku helper (image-only within budget) ---------------- */
|
||||||
|
|
||||||
|
/**
 * Look up a single product by SKU through the `productsBySku` GraphQL query.
 *
 * Results, including misses (null), are memoized per SKU on
 * `ctx._tudorSkuCache`, so each SKU is queried at most once per ctx.
 *
 * @param {object} ctx - Scan context; `ctx.cat.key` is used in the request label.
 * @param {*} sku - SKU to look up; blank values return null without a request.
 * @returns {Promise<object|null>} First item of the response, or null when the
 *   status is not 200 or no items came back.
 */
async function fetchProductBySku(ctx, sku) {
  const key = String(sku || "").trim();
  if (!key) return null;

  if (!ctx._tudorSkuCache) ctx._tudorSkuCache = new Map();
  const cache = ctx._tudorSkuCache;
  if (cache.has(key)) return cache.get(key);

  const label = `tudor:gql:bySku:${ctx.cat.key}:${key}`;
  const res = await tudorGql(ctx, label, PRODUCTS_BY_SKU_QUERY, {
    sku: key,
    storeId: STORE_ID,
  });

  const hits = res?.status === 200 ? res?.json?.data?.productsBySku?.items : null;
  const product = hits?.length ? hits[0] || null : null;

  // Misses are cached as well so the same SKU is not queried again.
  cache.set(key, product);
  return product;
}
|
||||||
|
|
||||||
|
/**
 * Try to find an image for an item by looking its product up by SKU.
 *
 * @param {object} ctx - Scan context forwarded to fetchProductBySku.
 * @param {string} skuProbe - Raw SKU to query with.
 * @returns {Promise<{img: string}|null>} `{ img }` when a usable image URL was
 *   found; null when the product is missing or has no image.
 */
async function supplementImageFromSku(ctx, skuProbe) {
  const product = await fetchProductBySku(ctx, skuProbe);
  if (!product) return null;

  const variant = pickInStockVariantWithFallback(product);

  // Candidates in priority order: variant image first, then the product-level image fields.
  const candidate = firstNonEmptyStr(
    variant?.image,
    product?.gulpImages,
    product?.posImages,
    product?.customImages,
    product?.imageIds
  );
  const img = normalizeAbsUrl(candidate);

  if (!img) return null;
  return { img };
}
|
||||||
|
|
||||||
|
/* ---------------- HTML product page fallback (SKU + optional image) ---------------- */
|
||||||
|
|
||||||
|
// Budgets (per category run). Override via ctx.config.tudorHtmlBudget / ctx.config.tudorGqlBudget.
|
||||||
|
const DETAIL_HTML_BUDGET_DEFAULT = 200;
|
||||||
|
const DETAIL_GQL_BUDGET_DEFAULT = 10;
|
||||||
|
|
||||||
|
/**
 * Extract a SKU from a Tudor House product page.
 *
 * Tried in order:
 *  1. visible text such as `<div class="sku ...">SKU: 67433</div>`
 *  2. any `SKU: <token>` occurrence
 *  3. the first `"sku":"..."` pair in embedded JSON
 *
 * @param {*} html - Page markup; falsy values are treated as "".
 * @returns {string} The trimmed SKU, or "" when nothing matched.
 */
function parseSkuFromHtml(html) {
  const markup = String(html || "");

  const visiblePatterns = [
    />\s*SKU:\s*([A-Za-z0-9._-]+)\s*</i,
    /\bSKU:\s*([A-Za-z0-9._-]+)\b/i,
  ];
  for (const pattern of visiblePatterns) {
    const hit = markup.match(pattern);
    if (hit) {
      // A hit always captures at least one non-space character.
      return String(hit[1]).trim();
    }
  }

  const embedded = markup.match(/"sku"\s*:\s*"([^"]+)"/i);
  if (embedded && embedded[1]) return String(embedded[1]).trim();
  return "";
}
|
||||||
|
|
||||||
|
/**
 * Extract a social-preview image URL from page markup.
 *
 * Prefers `og:image` and falls back to `twitter:image`. HTML attribute order
 * is not significant, so both `property="og:image" content="..."` and
 * `content="..." property="og:image"` are recognized (likewise for the
 * twitter `name=` form).
 *
 * @param {*} html - Page markup; falsy values are treated as "".
 * @returns {string} The trimmed `content` value of the first matching tag,
 *   or "" when no tag matched.
 */
function parseOgImageFromHtml(html) {
  const s = String(html || "");

  // Ordered by preference. `[^>]*` keeps both attributes inside one tag.
  const patterns = [
    /property=["']og:image["'][^>]*content=["']([^"']+)["']/i,
    /content=["']([^"']+)["'][^>]*property=["']og:image["']/i,
    /name=["']twitter:image["'][^>]*content=["']([^"']+)["']/i,
    /content=["']([^"']+)["'][^>]*name=["']twitter:image["']/i,
  ];

  for (const re of patterns) {
    const m = s.match(re);
    if (m) return String(m[1] || "").trim();
  }
  return "";
}
|
||||||
|
|
||||||
|
/**
 * Fetch a product page as text through whichever helper `ctx.http` offers.
 *
 * `ctx.http.fetchTextWithRetry` is preferred and its result is returned as-is.
 * Otherwise `ctx.http.fetchWithRetry` is used and its `text`/`body`/`data`
 * payload is coerced to a string.
 *
 * @param {object} ctx - Scan context; reads `ctx.http` and `ctx.store.ua`.
 * @param {string} label - Request label forwarded to the HTTP helper.
 * @param {string} url - Page URL.
 * @returns {Promise<object>} The helper's result, or
 *   `{ status, text, bytes, ms }` on the generic fallback path.
 * @throws {Error} When `ctx.http` provides neither fetch method.
 */
async function tudorFetchHtml(ctx, label, url) {
  const http = ctx?.http;

  // Built on demand so nothing is evaluated when no fetch method exists.
  const requestOptions = () => ({
    method: "GET",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      Referer: `${BASE}/`,
    },
  });

  // Preferred path: going through ctx.http keeps pacing/throttling in effect.
  if (http?.fetchTextWithRetry) {
    return await http.fetchTextWithRetry(url, label, ctx.store.ua, requestOptions());
  }

  // Best-effort fallback for wrappers that only expose a generic fetch.
  if (http?.fetchWithRetry) {
    const res = await http.fetchWithRetry(url, label, ctx.store.ua, requestOptions());
    const payload = res?.text ?? res?.body ?? res?.data ?? "";

    let text = "";
    if (typeof payload === "string") {
      text = payload;
    } else if (Buffer.isBuffer(payload)) {
      text = payload.toString("utf8");
    } else if (payload && typeof payload === "object" && typeof payload.toString === "function") {
      text = payload.toString();
    }

    return { status: res?.status, text, bytes: res?.bytes, ms: res?.ms };
  }

  throw new Error("No HTML fetch method available on ctx.http (need fetchTextWithRetry or fetchWithRetry).");
}
|
||||||
|
|
||||||
|
/**
 * Read SKU and image details from a product's HTML page.
 *
 * The outcome for each URL, including failures (null), is memoized on
 * `ctx._tudorHtmlCache`, so a page is requested at most once per ctx.
 * Fetch/parse errors are swallowed on purpose: this is a best-effort repair.
 *
 * @param {object} ctx - Scan context; `ctx.cat.key` is used in the request label.
 * @param {string} url - Product page URL (also the cache key).
 * @returns {Promise<{sku: string, img: string}|null>} Parsed details, or null
 *   when the request failed, was not a 200, or returned no text.
 */
async function tudorDetailFromProductPage(ctx, url) {
  if (!ctx._tudorHtmlCache) ctx._tudorHtmlCache = new Map();
  const cache = ctx._tudorHtmlCache;
  if (cache.has(url)) return cache.get(url);

  const loadDetail = async () => {
    try {
      const res = await tudorFetchHtml(ctx, `tudor:html:${ctx.cat.key}`, url);
      const usable = res?.status === 200 && typeof res?.text === "string" && res.text.length;
      if (!usable) return null;

      return {
        sku: normalizeTudorSku(parseSkuFromHtml(res.text)),
        img: normalizeAbsUrl(parseOgImageFromHtml(res.text)),
      };
    } catch {
      // Any failure simply means "no detail available".
      return null;
    }
  };

  const detail = await loadDetail();
  cache.set(url, detail);
  return detail;
}
|
||||||
|
|
||||||
|
/* ---------------- item builder (fast, no extra calls) ---------------- */
|
||||||
|
|
||||||
|
/**
 * Build an item from a product record using only data already in hand
 * (no network calls).
 *
 * @param {object} p - Product record from the products query.
 * @param {object} ctx - Scan context, forwarded to tudorProductUrl.
 * @returns {{name: string, price: *, url: string, sku: string, img: string, _skuProbe: string}|null}
 *   The item, or null when the product has no name/slug or its picked variant
 *   has a quantity of zero or less. `_skuProbe` keeps the raw (un-normalized) SKU.
 */
function tudorItemFromProductFast(p, ctx) {
  if (!p) return null;

  const name = cleanText(p?.name || "");
  const slug = String(p?.slug || "").trim();
  if (!(name && slug)) return null;

  // Only in-stock products are kept.
  const variant = tudorPickVariant(p);
  const outOfStock = Boolean(variant) && Number(variant?.quantity) <= 0;
  if (outOfStock) return null;

  const url = tudorProductUrl(ctx, slug);
  const price = money(variant?.price ?? p?.priceFrom ?? p?.priceTo);

  // Prefer the picked variant's SKU; otherwise take any variant's SKU.
  const variantSku = String(variant?.sku || "").trim();
  const skuProbe = variantSku || pickAnySkuFromProduct(p);
  const sku = normalizeTudorSku(skuProbe);

  const imgCandidate = firstNonEmptyStr(
    variant?.image,
    p?.gulpImages,
    p?.posImages,
    p?.customImages,
    p?.imageIds
  );
  const img = normalizeAbsUrl(imgCandidate);

  return { name, price, url, sku, img, _skuProbe: skuProbe };
}
|
||||||
|
|
||||||
|
/* ---------------- repair (second pass, budgeted) ---------------- */
|
||||||
|
|
||||||
|
/**
 * Second-pass repair of one item: try to obtain a real SKU and an image.
 *
 * Steps:
 *  1. If the SKU is missing or synthetic, read the product's HTML page and
 *     adopt its SKU (when non-synthetic) and its image (when the item has none).
 *  2. If the image is still missing and the item carries a `_skuProbe`, do a
 *     productsBySku lookup for an image.
 *  3. If the SKU is still missing or synthetic, fall back to
 *     `normalizeCspc(it.url)` or "".
 *
 * @param {object} ctx - Scan context forwarded to the detail helpers.
 * @param {object} it - Item to repair; mutated in place.
 * @returns {Promise<object>} The same `it` object.
 */
async function tudorRepairItem(ctx, it) {
  // 1) HTML product page for a real SKU (and possibly an image).
  if (isSyntheticSku(it.sku)) {
    const detail = await tudorDetailFromProductPage(ctx, it.url);
    const foundRealSku = detail?.sku && !isSyntheticSku(detail.sku);
    if (foundRealSku) it.sku = detail.sku;
    if (!it.img && detail?.img) it.img = detail.img;
  }

  // 2) Limited productsBySku lookup, only when an image is still missing.
  const probe = it.img ? "" : String(it._skuProbe || "").trim();
  if (probe) {
    const extra = await supplementImageFromSku(ctx, probe);
    if (extra?.img) it.img = extra.img;
  }

  // 3) Final fallback ONLY after repair attempts (stability).
  if (isSyntheticSku(it.sku)) {
    it.sku = normalizeCspc(it.url) || "";
  }

  return it;
}
|
||||||
|
|
||||||
/* ---------------- scanner ---------------- */
|
/* ---------------- scanner ---------------- */
|
||||||
|
|
||||||
async function scanCategoryTudor(ctx, prevDb, report) {
|
async function scanCategoryTudor(ctx, prevDb, report) {
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
const discovered = new Map();
|
const discovered = new Map();
|
||||||
|
|
||||||
const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500);
|
const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500);
|
||||||
let cursor = null;
|
let cursor = null;
|
||||||
let done = 0;
|
let done = 0;
|
||||||
|
|
||||||
for (let page = 1; page <= maxPages; page++) {
|
const needsDetail = [];
|
||||||
const tPage = Date.now();
|
|
||||||
|
for (let page = 1; page <= maxPages; page++) {
|
||||||
const prod = await fetchProductsPage(ctx, cursor);
|
const tPage = Date.now();
|
||||||
const arr = Array.isArray(prod?.items) ? prod.items : [];
|
|
||||||
|
const prod = await fetchProductsPage(ctx, cursor);
|
||||||
let kept = 0;
|
const arr = Array.isArray(prod?.items) ? prod.items : [];
|
||||||
for (const p of arr) {
|
|
||||||
const it = tudorItemFromProduct(p, ctx);
|
let kept = 0;
|
||||||
if (!it) continue;
|
for (const p of arr) {
|
||||||
discovered.set(it.url, it);
|
const it = tudorItemFromProductFast(p, ctx);
|
||||||
kept++;
|
if (!it) continue;
|
||||||
|
|
||||||
|
// NEW: seed from cached DB to avoid repeating detail HTML
|
||||||
|
const prev = prevDb?.byUrl?.get(it.url) || null;
|
||||||
|
if (prev) {
|
||||||
|
it.sku = pickBetterSku(it.sku, prev.sku);
|
||||||
|
if (!it.img && prev.img) it.img = prev.img;
|
||||||
}
|
}
|
||||||
|
|
||||||
done++;
|
// queue only; do not do detail calls inline
|
||||||
|
if (isSyntheticSku(it.sku) || !it.img) needsDetail.push(it);
|
||||||
const ms = Date.now() - tPage;
|
|
||||||
ctx.logger.ok(
|
discovered.set(it.url, it);
|
||||||
`${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft(
|
kept++;
|
||||||
kept,
|
|
||||||
3
|
|
||||||
)} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}`
|
|
||||||
);
|
|
||||||
|
|
||||||
cursor = prod?.nextPageCursor || null;
|
|
||||||
if (!cursor || !arr.length) break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products: ${discovered.size}`);
|
done++;
|
||||||
|
|
||||||
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
const ms = Date.now() - tPage;
|
||||||
storeLabel: ctx.store.name,
|
ctx.logger.ok(
|
||||||
});
|
`${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft(
|
||||||
|
kept,
|
||||||
const dbObj = buildDbObject(ctx, merged);
|
3
|
||||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
)} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}`
|
||||||
|
);
|
||||||
const elapsed = Date.now() - t0;
|
|
||||||
|
cursor = prod?.nextPageCursor || null;
|
||||||
report.categories.push({
|
if (!cursor || !arr.length) break;
|
||||||
store: ctx.store.name,
|
|
||||||
label: ctx.cat.label,
|
|
||||||
key: ctx.cat.key,
|
|
||||||
dbFile: ctx.dbFile,
|
|
||||||
scannedPages: done,
|
|
||||||
discoveredUnique: discovered.size,
|
|
||||||
newCount: newItems.length,
|
|
||||||
updatedCount: updatedItems.length,
|
|
||||||
removedCount: removedItems.length,
|
|
||||||
restoredCount: restoredItems.length,
|
|
||||||
elapsedMs: elapsed,
|
|
||||||
});
|
|
||||||
|
|
||||||
report.totals.newCount += newItems.length;
|
|
||||||
report.totals.updatedCount += updatedItems.length;
|
|
||||||
report.totals.removedCount += removedItems.length;
|
|
||||||
report.totals.restoredCount += restoredItems.length;
|
|
||||||
|
|
||||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// second pass: repair with budgets
|
||||||
|
const htmlBudget = Number.isFinite(ctx.config.tudorHtmlBudget)
|
||||||
|
? ctx.config.tudorHtmlBudget
|
||||||
|
: DETAIL_HTML_BUDGET_DEFAULT;
|
||||||
|
|
||||||
|
const gqlBudget = Number.isFinite(ctx.config.tudorGqlBudget)
|
||||||
|
? ctx.config.tudorGqlBudget
|
||||||
|
: DETAIL_GQL_BUDGET_DEFAULT;
|
||||||
|
|
||||||
|
let htmlUsed = 0;
|
||||||
|
let gqlUsed = 0;
|
||||||
|
|
||||||
|
for (const it of needsDetail) {
|
||||||
|
const wantsHtml = isSyntheticSku(it.sku);
|
||||||
|
const wantsGql = !it.img && String(it._skuProbe || "").trim();
|
||||||
|
|
||||||
|
// enforce caps
|
||||||
|
if (wantsHtml && htmlUsed >= htmlBudget && (!wantsGql || gqlUsed >= gqlBudget)) continue;
|
||||||
|
if (wantsGql && gqlUsed >= gqlBudget && (!wantsHtml || htmlUsed >= htmlBudget)) continue;
|
||||||
|
|
||||||
|
// count budgets pessimistically
|
||||||
|
if (wantsHtml) htmlUsed++;
|
||||||
|
if (wantsGql) gqlUsed++;
|
||||||
|
|
||||||
|
await tudorRepairItem(ctx, it);
|
||||||
|
discovered.set(it.url, it);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx.logger.ok(
|
||||||
|
`${ctx.catPrefixOut} | Unique products: ${discovered.size} | detail(html=${htmlUsed}/${htmlBudget}, gql=${gqlUsed}/${gqlBudget})`
|
||||||
|
);
|
||||||
|
|
||||||
|
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||||
|
storeLabel: ctx.store.name,
|
||||||
|
});
|
||||||
|
|
||||||
|
const dbObj = buildDbObject(ctx, merged);
|
||||||
|
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||||
|
|
||||||
|
const elapsed = Date.now() - t0;
|
||||||
|
|
||||||
|
report.categories.push({
|
||||||
|
store: ctx.store.name,
|
||||||
|
label: ctx.cat.label,
|
||||||
|
key: ctx.cat.key,
|
||||||
|
dbFile: ctx.dbFile,
|
||||||
|
scannedPages: done,
|
||||||
|
discoveredUnique: discovered.size,
|
||||||
|
newCount: newItems.length,
|
||||||
|
updatedCount: updatedItems.length,
|
||||||
|
removedCount: removedItems.length,
|
||||||
|
restoredCount: restoredItems.length,
|
||||||
|
elapsedMs: elapsed,
|
||||||
|
});
|
||||||
|
|
||||||
|
report.totals.newCount += newItems.length;
|
||||||
|
report.totals.updatedCount += updatedItems.length;
|
||||||
|
report.totals.removedCount += removedItems.length;
|
||||||
|
report.totals.restoredCount += restoredItems.length;
|
||||||
|
|
||||||
|
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------------- store ---------------- */
|
/* ---------------- store ---------------- */
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,8 +20,9 @@ function normalizeUpcDigits(v) {
|
||||||
return m ? m[1] : "";
|
return m ? m[1] : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CHANGE: allow 1-11 digits so BCL 3-digit ids like id:141 are preserved
|
||||||
function normalizeIdDigits(v) {
|
function normalizeIdDigits(v) {
|
||||||
const m = String(v ?? "").match(/\b(\d{4,11})\b/);
|
const m = String(v ?? "").match(/\b(\d{1,11})\b/);
|
||||||
return m ? m[1] : "";
|
return m ? m[1] : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -33,6 +34,35 @@ function makeSyntheticSkuKey({ storeLabel, url }) {
|
||||||
return `u:${fnv1a32(`${store}|${u}`)}`;
|
return `u:${fnv1a32(`${store}|${u}`)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------------- NEW: SKU quality helpers ---------------- */
|
||||||
|
|
||||||
|
/**
 * Rank how trustworthy a SKU key is (higher is better).
 *
 *  - 0: missing, or a synthetic `u:` key
 *  - 3: accepted by normalizeCspc (6-digit CSPC)
 *  - 2: `upc:` or `id:` key
 *  - 1: any other explicit, non-synthetic string
 *
 * @param {*} v - SKU value; null/undefined are treated as "".
 * @returns {number} Quality score from 0 to 3.
 */
function skuQuality(v) {
  const key = String(v ?? "").trim();

  // Missing and synthetic keys share the lowest rank.
  if (key === "" || /^u:/i.test(key)) return 0;

  if (normalizeCspc(key)) return 3;

  return /^(?:upc|id):/i.test(key) ? 2 : 1;
}
|
||||||
|
|
||||||
|
/**
 * Choose between a newly seen SKU and the one already stored.
 *
 * The SKU with the higher skuQuality wins. On a tie the existing value is
 * kept for stability, unless it is empty, in which case the new one is used.
 *
 * @param {*} newSku - Freshly discovered SKU.
 * @param {*} oldSku - Previously stored SKU.
 * @returns {string} The trimmed winner ("" when both are empty).
 */
function pickBetterSku(newSku, oldSku) {
  const incoming = String(newSku ?? "").trim();
  const existing = String(oldSku ?? "").trim();

  const delta = skuQuality(incoming) - skuQuality(existing);
  if (delta > 0) return incoming;
  if (delta < 0) return existing;

  return existing || incoming;
}
|
||||||
|
|
||||||
|
/**
 * Decide whether a product page should be fetched to find a real SKU.
 *
 * @param {*} sku - Current SKU; null/undefined are treated as "".
 * @returns {boolean} True when the SKU is blank or a synthetic `u:` key.
 */
function needsSkuDetail(sku) {
  const key = String(sku ?? "").trim();
  if (key === "") return true;
  return /^u:/i.test(key);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Behavior:
|
* Behavior:
|
||||||
* - CSPC 6-digit => "123456"
|
* - CSPC 6-digit => "123456"
|
||||||
|
|
@ -63,4 +93,11 @@ function normalizeSkuKey(v, { storeLabel, url } = {}) {
|
||||||
return syn || "";
|
return syn || "";
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { normalizeCspc, normalizeSkuKey, makeSyntheticSkuKey };
|
module.exports = {
|
||||||
|
normalizeCspc,
|
||||||
|
normalizeSkuKey,
|
||||||
|
makeSyntheticSkuKey,
|
||||||
|
skuQuality,
|
||||||
|
pickBetterSku,
|
||||||
|
needsSkuDetail,
|
||||||
|
};
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue