mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-03-25 09:25:51 +00:00
feat: Better SKUs for CC gull tudor and BCL
This commit is contained in:
parent
6be8e87733
commit
f19c1404fa
5 changed files with 735 additions and 238 deletions
|
|
@ -80,7 +80,6 @@ function bclIsInStock(src) {
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
function bclNormalizeAbsUrl(raw) {
|
||||
const s = String(raw || "").trim();
|
||||
if (!s) return "";
|
||||
|
|
@ -141,7 +140,17 @@ function bclHitToItem(hit) {
|
|||
const regular = asNumber(src.regularPrice);
|
||||
const price = cad(Number.isFinite(current) ? current : regular);
|
||||
|
||||
const sku = normalizeCspc(url);
|
||||
// SKU key:
|
||||
// - Keep CSPC 6-digit when present (rare for BCL, but safe)
|
||||
// - Otherwise upgrade to an explicit soft key: id:<digits>
|
||||
//
|
||||
// ✅ PATCH: handle tiny SKUs too (3/4/5-digit) by forcing id:<digits>
|
||||
// only fall back to id:<raw> (NOT u:) if it’s genuinely non-numeric.
|
||||
let sku = normalizeCspc(skuRaw);
|
||||
if (!sku) {
|
||||
const m = skuRaw.match(/^\d{1,6}$/); // BCL product IDs like 141, 596, 984, 117, etc.
|
||||
sku = m ? `id:${m[0]}` : `id:${skuRaw}`;
|
||||
}
|
||||
|
||||
const inStock = bclIsInStock(src);
|
||||
if (!inStock) return null;
|
||||
|
|
@ -155,8 +164,6 @@ function bclHitToItem(hit) {
|
|||
return { name, price, url, sku, img };
|
||||
}
|
||||
|
||||
|
||||
|
||||
async function bclFetchBrowsePage(ctx, page1, size) {
|
||||
const type = ctx.cat.bclType; // e.g. "rum" or "whisky / whiskey"
|
||||
const category = "spirits";
|
||||
|
|
@ -293,11 +300,12 @@ async function scanCategoryBCLAjax(ctx, prevDb, report) {
|
|||
newCount: newItems.length,
|
||||
updatedCount: updatedItems.length,
|
||||
removedCount: removedItems.length,
|
||||
restoredCount: removedItems.length,
|
||||
restoredCount: restoredItems.length,
|
||||
elapsedMs: elapsed,
|
||||
});
|
||||
report.totals.newCount += newItems.length;
|
||||
report.totals.updatedCount += updatedItems.length;
|
||||
report.totals.updatedCount += updatedItems.length;
|
||||
report.totals.removedCount += removedItems.length;
|
||||
report.totals.restoredCount += restoredItems.length;
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ const { setTimeout: sleep } = require("timers/promises");
|
|||
|
||||
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
|
||||
const { sanitizeName } = require("../utils/text");
|
||||
const { normalizeCspc } = require("../utils/sku");
|
||||
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
|
||||
const { makePageUrlShopifyQueryPage } = require("../utils/url");
|
||||
|
||||
const { mergeDiscoveredIntoDb } = require("../tracker/merge");
|
||||
|
|
@ -33,7 +33,9 @@ function canonicalizeCraftProductUrl(raw) {
|
|||
function extractShopifyCardPrice(block) {
|
||||
const b = String(block || "");
|
||||
const dollars = (txt) =>
|
||||
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
|
||||
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) =>
|
||||
m[0].replace(/\s+/g, "")
|
||||
);
|
||||
|
||||
const saleRegion = b.split(/sale price/i)[1] || "";
|
||||
const saleD = dollars(saleRegion);
|
||||
|
|
@ -50,8 +52,14 @@ function extractShopifyCardPrice(block) {
|
|||
function parseProductsCraftCellars(html, ctx) {
|
||||
const s = String(html || "");
|
||||
|
||||
const g1 = s.match(/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
|
||||
const g2 = s.match(/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
|
||||
const g1 =
|
||||
s.match(
|
||||
/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i
|
||||
)?.[0] || "";
|
||||
const g2 =
|
||||
s.match(
|
||||
/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i
|
||||
)?.[0] || "";
|
||||
|
||||
const gridCandidate = g1.length > g2.length ? g1 : g2;
|
||||
const grid = /\/products\//i.test(gridCandidate) ? gridCandidate : s;
|
||||
|
|
@ -63,18 +71,24 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
|||
const s = String(html || "");
|
||||
const items = [];
|
||||
|
||||
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map((m) => m[0]);
|
||||
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map(
|
||||
(m) => m[0]
|
||||
);
|
||||
if (blocks.length < 5) {
|
||||
blocks = [...s.matchAll(/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi)].map(
|
||||
(m) => m[0]
|
||||
);
|
||||
blocks = [
|
||||
...s.matchAll(
|
||||
/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi
|
||||
),
|
||||
].map((m) => m[0]);
|
||||
}
|
||||
|
||||
const base = `https://${(ctx && ctx.store && ctx.store.host) || "craftcellars.ca"}/`;
|
||||
|
||||
for (const block of blocks) {
|
||||
const href =
|
||||
block.match(/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i)?.[1] ||
|
||||
block.match(
|
||||
/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i
|
||||
)?.[1] ||
|
||||
block.match(/href=["']([^"']*\/products\/[^"']+)["']/i)?.[1];
|
||||
if (!href) continue;
|
||||
|
||||
|
|
@ -87,9 +101,15 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
|||
url = canonicalizeCraftProductUrl(url);
|
||||
|
||||
const nameHtml =
|
||||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i)?.[1] ||
|
||||
block.match(/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
|
||||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
|
||||
block.match(
|
||||
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i
|
||||
)?.[1] ||
|
||||
block.match(
|
||||
/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
|
||||
)?.[1] ||
|
||||
block.match(
|
||||
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i
|
||||
)?.[1];
|
||||
|
||||
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
|
||||
if (!name) continue;
|
||||
|
|
@ -108,37 +128,58 @@ function parseProductsCraftCellarsInner(html, ctx) {
|
|||
/**
 * Format a Shopify price value as a dollar string with thousands separators
 * and exactly two decimals (e.g. "1234.5" -> "$1,234.50").
 *
 * Everything except digits and dots is stripped before parsing.
 *
 * @param {string|number|null|undefined} s - Raw price, e.g. "1234.50".
 * @returns {string} Formatted price, or "" when no number can be parsed.
 */
function usdFromShopifyPriceStr(s) {
  const numeric = String(s ?? "").replace(/[^0-9.]/g, "");

  // Number("") is 0, so empty/garbage input has to be rejected explicitly;
  // otherwise a missing price would be reported as "$0.00".
  if (!numeric) return "";

  const n = Number(numeric);
  if (!Number.isFinite(n)) return "";

  return `$${n.toLocaleString("en-US", {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
  })}`;
}
|
||||
|
||||
/**
 * Pick a numeric config value, falling back when it is not a finite number.
 *
 * Number.isFinite does not coerce, so numeric strings, null, undefined, NaN
 * and +/-Infinity all yield `fallback`.
 *
 * @param {*} v - Candidate config value.
 * @param {*} fallback - Value returned when `v` is not a finite number.
 * @returns {*} `v` when finite, otherwise `fallback`.
 */
function cfgNum(v, fallback) {
  if (Number.isFinite(v)) return v;
  return fallback;
}
|
||||
|
||||
/* ---------- NEW: product page SKU extractor ---------- */
|
||||
/**
 * Pull the SKU out of a Craft Cellars product page and normalize it.
 *
 * Patterns are tried in order; the first one that matches anywhere in the
 * page wins:
 *   1. <strong>SKU:</strong><span>VALUE</span>
 *   2. SKU:</strong><span>VALUE</span> (looser opening)
 *   3. plain "SKU: VALUE" text
 *
 * @param {string} html - Product page HTML (nullish is treated as "").
 * @returns {string} Result of normalizeCspc() on the captured value, or ""
 *   when no SKU text is found.
 */
function extractCraftSkuFromProductPageHtml(html) {
  const s = String(html || "");

  const patterns = [
    /<strong>\s*SKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*([A-Za-z0-9][A-Za-z0-9\-_/ ]{0,40})/i,
  ];

  let captured = "";
  for (const re of patterns) {
    const m = s.match(re);
    if (m) {
      captured = m[1] || "";
      break;
    }
  }

  const raw = captured ? stripTags(decodeHtml(captured)) : "";

  // Nothing found: report "no SKU" explicitly rather than asking the
  // normalizer to interpret an empty string.
  if (!raw) return "";

  return normalizeCspc(raw);
}
|
||||
|
||||
/**
|
||||
* Craft Cellars:
|
||||
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
|
||||
* - Shopify products.json is used only to enrich SKU (and optionally price) for those allowed URLs
|
||||
* - HTML listing with ?filter.v.availability=1 is the allowlist
|
||||
* - products.json enriches SKU/price
|
||||
* - product page HTML is final SKU fallback
|
||||
*/
|
||||
async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
||||
const t0 = Date.now();
|
||||
|
||||
// Strongly prefer "slow and steady" to avoid 429s.
|
||||
// Use per-category knobs if present; otherwise default conservative.
|
||||
const perPageDelayMs = Math.max(
|
||||
0,
|
||||
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
|
||||
) || 0;
|
||||
const perPageDelayMs =
|
||||
Math.max(
|
||||
0,
|
||||
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0))
|
||||
) || 0;
|
||||
|
||||
const perJsonPageDelayMs = Math.max(
|
||||
0,
|
||||
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
|
||||
);
|
||||
|
||||
// 1) HTML scan: allowlist of in-stock listing URLs
|
||||
const htmlMap = new Map(); // url -> {name, price, url, img}
|
||||
const htmlMap = new Map();
|
||||
|
||||
const maxPages =
|
||||
ctx.config.maxPages === null
|
||||
? 200
|
||||
: Math.min(ctx.config.maxPages, 200);
|
||||
|
||||
const maxPages = ctx.config.maxPages === null ? 200 : Math.min(ctx.config.maxPages, 200);
|
||||
let htmlPagesFetched = 0;
|
||||
let emptyStreak = 0;
|
||||
|
||||
|
|
@ -146,7 +187,11 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
|||
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
|
||||
|
||||
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
|
||||
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
|
||||
const { text: html } = await ctx.http.fetchTextWithRetry(
|
||||
pageUrl,
|
||||
`craft:html:${ctx.cat.key}:p${p}`,
|
||||
ctx.store.ua
|
||||
);
|
||||
htmlPagesFetched++;
|
||||
|
||||
if (craftCellarsIsEmptyListingPage(html)) break;
|
||||
|
|
@ -162,22 +207,30 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
|||
for (const it of items) {
|
||||
const url = canonicalizeCraftProductUrl(it.url);
|
||||
if (!url) continue;
|
||||
htmlMap.set(url, { name: it.name || "", price: it.price || "", url, img: it.img || "" });
|
||||
htmlMap.set(url, {
|
||||
name: it.name || "",
|
||||
price: it.price || "",
|
||||
url,
|
||||
img: it.img || "",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If HTML returns nothing, don't let JSON invent a category
|
||||
if (!htmlMap.size) {
|
||||
ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
|
||||
ctx.logger.warn(
|
||||
`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing JSON-only discovery`
|
||||
);
|
||||
}
|
||||
|
||||
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
|
||||
const jsonMap = new Map(); // url -> { sku, price, img }
|
||||
const jsonMap = new Map();
|
||||
|
||||
if (htmlMap.size) {
|
||||
const start = new URL(ctx.cat.startUrl);
|
||||
const m = start.pathname.match(/^\/collections\/([^/]+)/i);
|
||||
if (!m) throw new Error(`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`);
|
||||
if (!m)
|
||||
throw new Error(
|
||||
`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`
|
||||
);
|
||||
const collectionHandle = m[1];
|
||||
|
||||
const limit = 250;
|
||||
|
|
@ -185,12 +238,19 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
|||
let jsonPagesFetched = 0;
|
||||
|
||||
while (true) {
|
||||
if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
|
||||
if (jsonPage > 1 && perJsonPageDelayMs > 0)
|
||||
await sleep(perJsonPageDelayMs);
|
||||
|
||||
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
|
||||
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
|
||||
const r = await ctx.http.fetchJsonWithRetry(
|
||||
url,
|
||||
`craft:coljson:${ctx.cat.key}:p${jsonPage}`,
|
||||
ctx.store.ua
|
||||
);
|
||||
|
||||
const products = Array.isArray(r?.json?.products) ? r.json.products : [];
|
||||
const products = Array.isArray(r?.json?.products)
|
||||
? r.json.products
|
||||
: [];
|
||||
jsonPagesFetched++;
|
||||
|
||||
if (!products.length) break;
|
||||
|
|
@ -199,73 +259,116 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
|||
const handle = String(p?.handle || "");
|
||||
if (!handle) continue;
|
||||
|
||||
const prodUrl = canonicalizeCraftProductUrl(`https://${ctx.store.host}/products/${handle}`);
|
||||
|
||||
// Only enrich if it's on the HTML allowlist
|
||||
const prodUrl = canonicalizeCraftProductUrl(
|
||||
`https://${ctx.store.host}/products/${handle}`
|
||||
);
|
||||
if (!htmlMap.has(prodUrl)) continue;
|
||||
|
||||
const variants = Array.isArray(p?.variants) ? p.variants : [];
|
||||
const v = variants.find((x) => x && x.available === true) || variants[0] || null;
|
||||
const v =
|
||||
variants.find((x) => x && x.available === true) ||
|
||||
variants[0] ||
|
||||
null;
|
||||
|
||||
const sku = normalizeCspc(v?.sku || "");
|
||||
const price = v?.price ? usdFromShopifyPriceStr(v.price) : "";
|
||||
|
||||
// Product image (best effort)
|
||||
let img = "";
|
||||
const images = Array.isArray(p?.images) ? p.images : [];
|
||||
if (images[0]) {
|
||||
if (typeof images[0] === "string") img = images[0];
|
||||
else img = String(images[0]?.src || images[0]?.url || "");
|
||||
img =
|
||||
typeof images[0] === "string"
|
||||
? images[0]
|
||||
: String(images[0]?.src || images[0]?.url || "");
|
||||
}
|
||||
if (!img && p?.image) img = String(p.image?.src || p.image?.url || p.image || "");
|
||||
if (!img && p?.image)
|
||||
img = String(p.image?.src || p.image?.url || p.image || "");
|
||||
img = String(img || "").trim();
|
||||
if (img.startsWith("//")) img = `https:${img}`;
|
||||
if (img && !/^https?:\/\//i.test(img)) {
|
||||
try {
|
||||
img = new URL(img, `https://${ctx.store.host}/`).toString();
|
||||
} catch {
|
||||
// keep as-is
|
||||
}
|
||||
}
|
||||
|
||||
jsonMap.set(prodUrl, { sku, price, img });
|
||||
}
|
||||
|
||||
if (products.length < limit) break;
|
||||
jsonPage++;
|
||||
if (jsonPage > 200) break; // safety
|
||||
if (++jsonPage > 200) break;
|
||||
}
|
||||
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`);
|
||||
} else {
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=0`);
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`
|
||||
);
|
||||
}
|
||||
|
||||
// 3) Final discovered: HTML allowlist, enriched by JSON
|
||||
const discovered = new Map();
|
||||
for (const [url, it] of htmlMap.entries()) {
|
||||
const j = jsonMap.get(url);
|
||||
const prev = prevDb?.byUrl?.get(url) || null;
|
||||
|
||||
discovered.set(url, {
|
||||
name: it.name || "",
|
||||
// Prefer JSON price (normalized) when present, else keep HTML price (already formatted)
|
||||
name: it.name,
|
||||
price: j?.price || it.price || "",
|
||||
url,
|
||||
sku: j?.sku || "",
|
||||
img: j?.img || it.img || "",
|
||||
// reuse cached SKU unless we found something better this run
|
||||
sku: pickBetterSku(j?.sku || "", prev?.sku || ""),
|
||||
// reuse cached image if we didn't find one
|
||||
img: (j?.img || it.img || prev?.img || ""),
|
||||
});
|
||||
}
|
||||
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
|
||||
/* ---------- NEW: product page SKU fallback (cached; only when needed) ---------- */
|
||||
const perProductSkuDelayMs = Math.max(
|
||||
0,
|
||||
cfgNum(
|
||||
ctx?.cat?.skuPageDelayMs,
|
||||
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
|
||||
)
|
||||
);
|
||||
|
||||
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||
let skuPagesFetched = 0;
|
||||
|
||||
for (const it of discovered.values()) {
|
||||
// only hit product pages when missing/synthetic
|
||||
if (!needsSkuDetail(it.sku)) continue;
|
||||
|
||||
if (perProductSkuDelayMs > 0) await sleep(perProductSkuDelayMs);
|
||||
|
||||
try {
|
||||
const { text } = await ctx.http.fetchTextWithRetry(
|
||||
it.url,
|
||||
`craft:prodpage:${ctx.cat.key}:${Buffer.from(it.url)
|
||||
.toString("base64")
|
||||
.slice(0, 24)}`,
|
||||
ctx.store.ua
|
||||
);
|
||||
skuPagesFetched++;
|
||||
|
||||
const sku = extractCraftSkuFromProductPageHtml(text);
|
||||
if (sku) it.sku = sku;
|
||||
} catch {
|
||||
/* best effort */
|
||||
}
|
||||
}
|
||||
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | SKU fallback pages=${skuPagesFetched}`
|
||||
);
|
||||
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`
|
||||
);
|
||||
|
||||
const {
|
||||
merged,
|
||||
newItems,
|
||||
updatedItems,
|
||||
removedItems,
|
||||
restoredItems,
|
||||
} = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||
storeLabel: ctx.store.name,
|
||||
});
|
||||
|
||||
const dbObj = buildDbObject(ctx, merged);
|
||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | DB saved: ${ctx.logger.dim(ctx.dbFile)} (${dbObj.count} items)`);
|
||||
|
||||
const elapsed = Date.now() - t0;
|
||||
|
||||
report.categories.push({
|
||||
|
|
@ -287,7 +390,15 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
|
|||
report.totals.removedCount += removedItems.length;
|
||||
report.totals.restoredCount += restoredItems.length;
|
||||
|
||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
||||
addCategoryResultToReport(
|
||||
report,
|
||||
ctx.store.name,
|
||||
ctx.cat.label,
|
||||
newItems,
|
||||
updatedItems,
|
||||
removedItems,
|
||||
restoredItems
|
||||
);
|
||||
}
|
||||
|
||||
function createStore(defaultUa) {
|
||||
|
|
@ -297,10 +408,8 @@ function createStore(defaultUa) {
|
|||
host: "craftcellars.ca",
|
||||
ua: defaultUa,
|
||||
|
||||
// ✅ Custom scan (HTML allowlist + JSON enrichment)
|
||||
scanCategory: scanCategoryCraftCellars,
|
||||
|
||||
// Keep HTML parser for debugging
|
||||
parseProducts: parseProductsCraftCellars,
|
||||
makePageUrl: makePageUrlShopifyQueryPage,
|
||||
isEmptyListingPage: craftCellarsIsEmptyListingPage,
|
||||
|
|
@ -309,69 +418,22 @@ function createStore(defaultUa) {
|
|||
{
|
||||
key: "whisky",
|
||||
label: "Whisky",
|
||||
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
|
||||
|
||||
// slow-and-safe defaults (override globally if you want)
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
startUrl:
|
||||
"https://craftcellars.ca/collections/whisky?filter.v.availability=1",
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
skuPageDelayMs: 12000,
|
||||
},
|
||||
{
|
||||
key: "rum",
|
||||
label: "Rum",
|
||||
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
|
||||
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
},
|
||||
{
|
||||
key: "single-malt-scotch",
|
||||
label: "Single Malt Scotch",
|
||||
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
|
||||
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
},
|
||||
{
|
||||
key: "other-scotch-styles",
|
||||
label: "Other Scotch Styles",
|
||||
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
|
||||
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
},
|
||||
{
|
||||
key: "single-grain-scotch",
|
||||
label: "Single Grain Scotch",
|
||||
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
|
||||
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
},
|
||||
{
|
||||
key: "blended-malt-scotch",
|
||||
label: "Blended Malt Scotch",
|
||||
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
|
||||
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
startUrl:
|
||||
"https://craftcellars.ca/collections/rum?filter.v.availability=1",
|
||||
pageConcurrency: 1,
|
||||
pageStaggerMs: 10000,
|
||||
discoveryDelayMs: 10000,
|
||||
skuPageDelayMs: 12000,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
// src/stores/gull.js
|
||||
"use strict";
|
||||
|
||||
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
|
||||
const { normalizeCspc } = require("../utils/sku");
|
||||
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
|
||||
const { makePageUrl } = require("../utils/url");
|
||||
|
||||
function looksInStock(block) {
|
||||
|
|
@ -45,6 +46,133 @@ function extractGullPriceFromBlock(block) {
|
|||
return `$${chosen.toFixed(2)}`;
|
||||
}
|
||||
|
||||
// Gull SKUs are often NOT 6 digits (e.g. 67424).
|
||||
// If it's not 6 digits, represent as id:<digits> to avoid normalizeCspc turning it into u:SHA.
|
||||
/**
 * Normalize a Gull SKU candidate into a stable key.
 *
 * - Values already prefixed with "id:" or "u:" are returned unchanged.
 * - A 6-digit number is passed through normalizeCspc().
 * - Any other 3-10 digit number becomes "id:<digits>".
 * - Everything else (including the product-URL fallback) goes through
 *   normalizeCspc(), which may produce a generated "u:..." key.
 *
 * @param {string} raw - SKU text from a tile/page, or the product URL used
 *   as a last-resort fallback.
 * @returns {string} Normalized SKU key.
 */
function normalizeGullSku(raw) {
  const s = cleanText(decodeHtml(String(raw || ""))).trim();

  // already in a stable prefixed form
  if (/^(id:|u:)/i.test(s)) return s;

  // The caller falls back to the product URL when no SKU is on the tile.
  // Digits inside a URL slug (e.g. ".../old-forester-1920/") are not a SKU;
  // mining them would yield a bogus "id:" key and stop the item from being
  // recognised as needing product-page hydration.
  const looksLikeUrl = s.includes("://") || s.startsWith("/");

  const digits = looksLikeUrl ? "" : s.match(/\b(\d{3,10})\b/)?.[1] || "";
  if (digits) {
    return digits.length === 6 ? normalizeCspc(digits) : `id:${digits}`;
  }

  // fall back to the shared normalizer (may yield u:...)
  return normalizeCspc(s);
}
|
||||
|
||||
// When we fall back to normalizeCspc(url), we may end up with a generated u:XXXXXXXX.
|
||||
/**
 * Tell whether a SKU is a generated "u:<hex>" key.
 *
 * Accepts 8 to 128 hex characters after the "u:" prefix, case-insensitively.
 *
 * @param {string} sku - SKU key to inspect (nullish is treated as "").
 * @returns {boolean} True when the key has the generated u:<hex> shape.
 */
function isGeneratedUrlSku(sku) {
  const GENERATED_KEY = /^u:[0-9a-f]{8,128}$/i;
  return GENERATED_KEY.test(String(sku || ""));
}
|
||||
|
||||
// Extract SKU from Gull product page HTML.
|
||||
/**
 * Find the numeric SKU on a Gull product page.
 *
 * Tries, in order:
 *   1. an element like <span class="sku">67424</span>
 *   2. literal "SKU: 67424" text
 *
 * @param {string} html - Product page HTML (nullish is treated as "").
 * @returns {string} normalizeGullSku() of the 3-10 digit value found, or ""
 *   when neither form is present.
 */
function extractGullSkuFromProductPage(html) {
  const s = String(html || "");

  const fromSpan = s.match(
    /<span\b[^>]*class=["'][^"']*\bsku\b[^"']*["'][^>]*>\s*([0-9]{3,10})\s*<\/span>/i
  )?.[1];
  if (fromSpan) return normalizeGullSku(fromSpan);

  const fromLabel = s.match(/\bSKU:\s*([0-9]{3,10})\b/i)?.[1];
  return fromLabel ? normalizeGullSku(fromLabel) : "";
}
|
||||
|
||||
// Serial limiter: ensures at least minIntervalMs between request starts.
|
||||
/**
 * Build a scheduler that runs tasks one after another, waiting so that task
 * starts are at least `minIntervalMs` apart.
 *
 * @param {number} minIntervalMs - Minimum gap between task starts, in ms.
 * @returns {(fn: () => any) => Promise<any>} schedule(fn): queues `fn` and
 *   resolves/rejects with its outcome.
 */
function createMinIntervalLimiter(minIntervalMs) {
  let lastStart = 0;
  let chain = Promise.resolve();

  return async function schedule(fn) {
    const run = chain.then(async () => {
      const waitMs = Math.max(0, lastStart + minIntervalMs - Date.now());
      if (waitMs > 0) await new Promise((r) => setTimeout(r, waitMs));
      lastStart = Date.now();
      return fn();
    });

    // Keep the queue alive after a failure: the caller still sees the
    // rejection through `run`, but later tasks must not inherit it (a rejected
    // `chain` would make every subsequent schedule() reject without running).
    chain = run.catch(() => {});

    return run;
  };
}
|
||||
|
||||
/**
 * Fetch a URL as text, retrying on HTTP 429.
 *
 * On 429 the wait is taken from the Retry-After header when present (either
 * delay-seconds or an HTTP-date); otherwise it grows progressively:
 * 15s, 30s, 45s, ...
 *
 * @param {string} url - URL to fetch.
 * @param {object} opts
 * @param {Function} opts.fetchFn - Called as fetchFn(url, { headers }); must
 *   resolve to an object with `status`, `ok`, `text()` and optionally
 *   `headers.get(name)`.
 * @param {object} [opts.headers] - Request headers passed through to fetchFn.
 * @param {number} [opts.maxRetries=2] - Number of retries allowed after 429s.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} "HTTP <status> fetching <url>" for non-OK responses, or for
 *   a 429 once retries are exhausted.
 */
async function fetchWith429Backoff(url, { fetchFn, headers, maxRetries = 2 }) {
  for (let attempt = 0; ; attempt++) {
    const res = await fetchFn(url, { headers });

    if (res.status !== 429) {
      if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${url}`);
      return await res.text();
    }

    if (attempt >= maxRetries) throw new Error(`HTTP 429 fetching ${url}`);

    const ra =
      typeof res.headers?.get === "function"
        ? res.headers.get("retry-after")
        : null;
    const header = String(ra ?? "").trim();

    // Default: progressive backoff.
    let waitMs = 15 * (attempt + 1) * 1000;

    if (/^\d+$/.test(header)) {
      // delay-seconds form
      waitMs = Number.parseInt(header, 10) * 1000;
    } else if (/^[A-Za-z]{3,9},? /.test(header)) {
      // HTTP-date form (starts with a day name); wait until that instant.
      const at = Date.parse(header);
      if (Number.isFinite(at)) waitMs = Math.max(0, at - Date.now());
    }

    await new Promise((r) => setTimeout(r, waitMs));
  }
}
|
||||
|
||||
/**
 * Replace generated "u:..." SKUs with real ones, mutating `items` in place.
 *
 * For each item with a URL:
 *   - if prevDb already holds a SKU for that URL that does not need detail
 *     (per needsSkuDetail), reuse it via pickBetterSku and skip the fetch;
 *   - otherwise, only when the item's SKU is a generated u:<hex> key, fetch
 *     the product page and take the SKU found there.
 *
 * Page fetches are awaited one at a time through a min-interval limiter, with
 * 429 backoff. A fetch failure propagates to the caller.
 *
 * @param {Array<object>} items - Items with `url` and `sku`; falsy entries
 *   and entries without a URL are skipped.
 * @param {object} [opts]
 * @param {Function} opts.fetchFn - Fetch implementation (required).
 * @param {string} [opts.ua] - User-Agent; defaults to "Mozilla/5.0".
 * @param {number} [opts.minIntervalMs=12000] - Minimum gap between fetches.
 * @param {number} [opts.maxRetries=2] - 429 retries per page.
 * @param {object} [opts.prevDb] - Previous DB; `prevDb.byUrl.get(url)` is
 *   consulted for a cached SKU.
 * @returns {Promise<Array<object>>} The same `items` reference.
 * @throws {Error} When `opts.fetchFn` is missing.
 */
async function hydrateGullSkus(
  items,
  { fetchFn, ua, minIntervalMs = 12000, maxRetries = 2, prevDb } = {}
) {
  if (!fetchFn) throw new Error("hydrateGullSkus requires opts.fetchFn");

  const schedule = createMinIntervalLimiter(minIntervalMs);

  const headers = {
    "user-agent": ua || "Mozilla/5.0",
    accept: "text/html,application/xhtml+xml",
  };

  for (const it of items || []) {
    if (!it || !it.url) continue;

    // A good SKU cached for this URL makes the page fetch unnecessary.
    const cached = prevDb?.byUrl?.get(it.url) || null;
    if (cached?.sku && !needsSkuDetail(cached.sku)) {
      it.sku = pickBetterSku(it.sku, cached.sku);
      continue;
    }

    // Only generated u:<hex> keys justify hitting the product page.
    if (!isGeneratedUrlSku(it.sku)) continue;

    const html = await schedule(() =>
      fetchWith429Backoff(it.url, { fetchFn, headers, maxRetries })
    );

    const realSku = extractGullSkuFromProductPage(html);
    if (realSku) it.sku = pickBetterSku(realSku, it.sku);
  }

  return items;
}
|
||||
|
||||
function parseProductsGull(html, ctx) {
|
||||
const s = String(html || "");
|
||||
const items = [];
|
||||
|
|
@ -82,11 +210,12 @@ function parseProductsGull(html, ctx) {
|
|||
|
||||
const price = extractGullPriceFromBlock(block);
|
||||
|
||||
const sku = normalizeCspc(
|
||||
const skuRaw =
|
||||
block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
|
||||
block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
|
||||
url
|
||||
);
|
||||
block.match(/\bSKU\b[^0-9]{0,30}(\d{3,10})\b/i)?.[1] ||
|
||||
url; // OK fallback; hydrateGullSkus will only re-fetch when this becomes u:...
|
||||
|
||||
const sku = normalizeGullSku(skuRaw);
|
||||
|
||||
const img = extractFirstImgUrl(block, base);
|
||||
|
||||
|
|
@ -98,7 +227,6 @@ function parseProductsGull(html, ctx) {
|
|||
return [...uniq.values()];
|
||||
}
|
||||
|
||||
|
||||
function createStore(defaultUa) {
|
||||
return {
|
||||
key: "gull",
|
||||
|
|
@ -106,12 +234,19 @@ function createStore(defaultUa) {
|
|||
host: "gullliquorstore.com",
|
||||
ua: defaultUa,
|
||||
parseProducts: parseProductsGull,
|
||||
|
||||
// Optional hook callers can use to post-process items:
|
||||
// only hits product pages when sku is u:...
|
||||
hydrateSkus: hydrateGullSkus,
|
||||
productPageMinIntervalMs: 12000, // slow by default; Gull is strict
|
||||
|
||||
makePageUrl, // enables /page/N/ paging
|
||||
categories: [
|
||||
{
|
||||
key: "whisky",
|
||||
label: "Whisky",
|
||||
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
|
||||
startUrl:
|
||||
"https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
|
|
@ -121,7 +256,8 @@ function createStore(defaultUa) {
|
|||
{
|
||||
key: "rum",
|
||||
label: "Rum",
|
||||
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
|
||||
startUrl:
|
||||
"https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
|
||||
discoveryStartPage: 3,
|
||||
discoveryStep: 2,
|
||||
pageConcurrency: 1,
|
||||
|
|
@ -132,4 +268,11 @@ function createStore(defaultUa) {
|
|||
};
|
||||
}
|
||||
|
||||
module.exports = { createStore, parseProductsGull };
|
||||
module.exports = {
|
||||
createStore,
|
||||
parseProductsGull,
|
||||
hydrateGullSkus,
|
||||
extractGullSkuFromProductPage,
|
||||
isGeneratedUrlSku,
|
||||
normalizeGullSku,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"use strict";
|
||||
|
||||
const { cleanText } = require("../utils/html");
|
||||
const { normalizeCspc } = require("../utils/sku");
|
||||
const { normalizeCspc, pickBetterSku } = require("../utils/sku");
|
||||
const { humanBytes } = require("../utils/bytes");
|
||||
const { padLeft, padRight } = require("../utils/string");
|
||||
|
||||
|
|
@ -67,6 +67,34 @@ function normalizeAbsUrl(raw) {
|
|||
}
|
||||
}
|
||||
|
||||
// Treat u:* as synthetic (URL-hash fallback) and eligible for repair.
|
||||
/**
 * Tell whether a SKU is missing or synthetic.
 *
 * @param {string} sku - SKU key (nullish is treated as "").
 * @returns {boolean} True when the trimmed value is empty or starts with
 *   "u:" (case-insensitive).
 */
function isSyntheticSku(sku) {
  const s = String(sku || "").trim();
  if (!s) return true;
  return /^u:/i.test(s);
}
|
||||
|
||||
/**
 * Normalize a Tudor House SKU into a stable key.
 *
 * - "" (after trimming)           -> ""
 * - already "id:..." or "u:..."   -> returned unchanged
 * - all digits, fewer than 6      -> "id:<digits>" (namespaced)
 * - all digits, 6 or more         -> returned unchanged
 * - any other value under 6 chars -> "id:<value>" (namespaced)
 * - anything else                 -> normalizeCspc(value), or the raw value
 *                                    when that returns something falsy
 *
 * Numeric SKUs deliberately bypass normalizeCspc.
 *
 * @param {string} rawSku - SKU as delivered by the API.
 * @returns {string} Normalized SKU key.
 */
function normalizeTudorSku(rawSku) {
  const s = String(rawSku || "").trim();
  if (!s) return "";

  // Already namespaced: leave untouched.
  if (/^id:/i.test(s) || /^u:/i.test(s)) return s;

  const isNumeric = /^\d+$/.test(s);
  if (isNumeric) return s.length < 6 ? `id:${s}` : s;

  // Short non-numeric SKU: namespace it.
  if (s.length < 6) return `id:${s}`;

  return normalizeCspc(s) || s;
}
|
||||
|
||||
function tudorProductUrl(ctx, slug) {
|
||||
// Site URLs look like: /TUDOR_HOUSE_0/product/spirits/<subcat>/<slug>
|
||||
const root = ctx?.cat?.tudorRootSlug || "spirits";
|
||||
|
|
@ -82,33 +110,23 @@ function tudorPickVariant(p) {
|
|||
return inStock || vs[0] || null;
|
||||
}
|
||||
|
||||
function tudorItemFromProduct(p, ctx) {
|
||||
if (!p) return null;
|
||||
|
||||
const name = cleanText(p?.name || "");
|
||||
const slug = String(p?.slug || "").trim();
|
||||
if (!name || !slug) return null;
|
||||
|
||||
const v = tudorPickVariant(p);
|
||||
if (v && Number(v?.quantity) <= 0) return null; // only keep in-stock
|
||||
|
||||
const url = tudorProductUrl(ctx, slug);
|
||||
|
||||
const price = money(v?.price ?? p?.priceFrom ?? p?.priceTo);
|
||||
const sku = normalizeCspc(v?.sku || "");
|
||||
const img = normalizeAbsUrl(
|
||||
firstNonEmptyStr(
|
||||
v?.image,
|
||||
p?.gulpImages,
|
||||
p?.posImages,
|
||||
p?.customImages,
|
||||
p?.imageIds
|
||||
)
|
||||
);
|
||||
|
||||
return { name, price, url, sku, img };
|
||||
/**
 * Return the first non-empty variant SKU of a product.
 *
 * @param {object} p - Product; `p.variants` is used when it is an array.
 * @returns {string} Trimmed SKU of the first variant that has one, else "".
 */
function pickAnySkuFromProduct(p) {
  const variants = Array.isArray(p?.variants) ? p.variants : [];

  for (const variant of variants) {
    const sku = String(variant?.sku || "").trim();
    if (sku) return sku;
  }

  return "";
}
|
||||
|
||||
/**
 * Choose a variant, preferring one that is in stock.
 *
 * @param {object} p - Product; `p.variants` is used when it is an array.
 * @returns {object|null} The first variant whose numeric `quantity` is > 0;
 *   otherwise the first variant; otherwise null.
 */
function pickInStockVariantWithFallback(p) {
  const variants = Array.isArray(p?.variants) ? p.variants : [];
  const available = variants.find((v) => Number(v?.quantity) > 0);
  return available || variants[0] || null;
}
|
||||
|
||||
/* ---------------- GraphQL ---------------- */
|
||||
|
||||
async function tudorGql(ctx, label, query, variables) {
|
||||
return await ctx.http.fetchJsonWithRetry(GQL_URL, label, ctx.store.ua, {
|
||||
method: "POST",
|
||||
|
|
@ -122,15 +140,7 @@ async function tudorGql(ctx, label, query, variables) {
|
|||
});
|
||||
}
|
||||
|
||||
function pickConnection(json) {
|
||||
const data = json?.data;
|
||||
if (!data || typeof data !== "object") return null;
|
||||
for (const v of Object.values(data)) {
|
||||
if (v && typeof v === "object" && Array.isArray(v.items)) return v;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/* ---------------- GQL queries ---------------- */
|
||||
|
||||
const PRODUCTS_QUERY = `
|
||||
query(
|
||||
|
|
@ -170,15 +180,14 @@ const PRODUCTS_QUERY = `
|
|||
isStaffPick: $isStaffPick,
|
||||
pageCursor: $pageCursor,
|
||||
pageLimit: $pageLimit,
|
||||
pointsMin: $pointsMin,
|
||||
sortBy: $sortBy,
|
||||
sortOrder: $sortOrder,
|
||||
priceMin: $priceMin,
|
||||
priceMax: $priceMax,
|
||||
quantityMin: $quantityMin,
|
||||
regions: $regions,
|
||||
brandValue: $brandValue,
|
||||
searchValue: $searchValue,
|
||||
sortOrder: $sortOrder,
|
||||
sortBy: $sortBy,
|
||||
storeId: $storeId,
|
||||
) {
|
||||
items {
|
||||
|
|
@ -199,6 +208,31 @@ const PRODUCTS_QUERY = `
|
|||
}
|
||||
`;
|
||||
|
||||
// ONLY for limited image supplementation (within a small budget).
// GraphQL document sent by fetchProductBySku with the variables { sku, storeId }.
// The caller reads data.productsBySku.items[0]; supplementImageFromSku then uses
// its image fields (variants.image, gulpImages, posImages, customImages, imageIds)
// to fill in a product image that the listing query did not provide.
|
||||
const PRODUCTS_BY_SKU_QUERY = `
|
||||
query(
|
||||
$sku: String!,
|
||||
$storeId: String
|
||||
) {
|
||||
productsBySku(
|
||||
sku: $sku,
|
||||
storeId: $storeId
|
||||
) {
|
||||
items {
|
||||
id
|
||||
slug
|
||||
imageIds
|
||||
posImages
|
||||
customImages
|
||||
gulpImages
|
||||
variants { id image price quantity sku deposit }
|
||||
}
|
||||
nextPageCursor
|
||||
totalCount
|
||||
}
|
||||
}
|
||||
`;
|
||||
|
||||
async function fetchProductsPage(ctx, cursor) {
|
||||
const vars = {
|
||||
storeId: STORE_ID,
|
||||
|
|
@ -224,78 +258,291 @@ async function fetchProductsPage(ctx, cursor) {
|
|||
return r.json.data.products;
|
||||
}
|
||||
|
||||
/* ---------------- GQL bySku helper (image-only within budget) ---------------- */
|
||||
|
||||
/**
 * Look up a single product by SKU through the productsBySku GraphQL query.
 *
 * Results (including misses, stored as null) are memoised on
 * `ctx._tudorSkuCache`, so each distinct SKU is requested at most once per ctx.
 *
 * @param {object} ctx - Scan context; uses `ctx.cat.key` for the request label.
 * @param {string} sku - SKU to look up; blank values short-circuit to null.
 * @returns {Promise<object|null>} First item of the response, or null when the
 *   SKU is blank, the status is not 200, or no items came back.
 */
async function fetchProductBySku(ctx, sku) {
  const key = String(sku || "").trim();
  if (key === "") return null;

  const cache = ctx._tudorSkuCache || (ctx._tudorSkuCache = new Map());
  if (cache.has(key)) return cache.get(key);

  const label = `tudor:gql:bySku:${ctx.cat.key}:${key}`;
  const resp = await tudorGql(ctx, label, PRODUCTS_BY_SKU_QUERY, {
    sku: key,
    storeId: STORE_ID,
  });

  const items = resp?.status === 200 ? resp?.json?.data?.productsBySku?.items : undefined;
  const product = items?.length ? items[0] || null : null;

  // Cache misses too, so a SKU with no match is not re-queried.
  cache.set(key, product);
  return product;
}
|
||||
|
||||
/**
 * Try to obtain an image URL for a product via a productsBySku lookup.
 *
 * @param {object} ctx - Scan context, forwarded to fetchProductBySku.
 * @param {string} skuProbe - Raw SKU used to find the product.
 * @returns {Promise<{img: string}|null>} `{ img }` when a normalized image URL
 *   was found; null when the product is missing or has no usable image.
 */
async function supplementImageFromSku(ctx, skuProbe) {
  const product = await fetchProductBySku(ctx, skuProbe);
  if (!product) return null;

  const variant = pickInStockVariantWithFallback(product);

  // Variant image first, then the product-level image fields in this order.
  const rawImg = firstNonEmptyStr(
    variant?.image,
    product?.gulpImages,
    product?.posImages,
    product?.customImages,
    product?.imageIds
  );
  const img = normalizeAbsUrl(rawImg);

  if (!img) return null;
  return { img };
}
|
||||
|
||||
/* ---------------- HTML product page fallback (SKU + optional image) ---------------- */
|
||||
|
||||
// Budgets (per category run). Override via ctx.config.tudorHtmlBudget / ctx.config.tudorGqlBudget.
// Each default is used only when the matching ctx.config value is not a finite number.
|
||||
// Default budget for second-pass items that want an HTML product-page lookup
// (items whose SKU is still synthetic).
const DETAIL_HTML_BUDGET_DEFAULT = 200;
|
||||
// Default budget for second-pass items that want a productsBySku image lookup
// (items with no image but a non-empty SKU probe).
const DETAIL_GQL_BUDGET_DEFAULT = 10;
|
||||
|
||||
/**
 * Pull a SKU out of a product page's HTML.
 *
 * Patterns are tried in order and the first capture wins:
 *   1. Visible text node, e.g. <div class="sku ...">SKU: 67433</div>
 *   2. Any "SKU: <token>" occurrence in the markup
 *   3. A `"sku":"..."` pair, as found in the embedded preloaded JSON
 *
 * @param {string} html - Raw page HTML (null/undefined is treated as empty).
 * @returns {string} Trimmed SKU, or "" when nothing matched.
 */
function parseSkuFromHtml(html) {
  const text = String(html || "");

  const patterns = [
    />\s*SKU:\s*([A-Za-z0-9._-]+)\s*</i,
    /\bSKU:\s*([A-Za-z0-9._-]+)\b/i,
    /"sku"\s*:\s*"([^"]+)"/i,
  ];

  for (const pattern of patterns) {
    const found = text.match(pattern);
    if (found && found[1]) return String(found[1]).trim();
  }

  return "";
}
|
||||
|
||||
/**
 * Extract the social-preview image URL from a page's <meta> tags.
 *
 * Prefers `og:image`; falls back to `twitter:image`. Each <meta> tag is parsed
 * attribute-by-attribute, so the result does not depend on whether `content`
 * appears before or after `property`/`name` inside the tag.
 *
 * @param {string} html - Raw page HTML (null/undefined is treated as empty).
 * @returns {string} Trimmed image URL, or "" when neither tag has a non-blank
 *   quoted `content` value.
 */
function parseOgImageFromHtml(html) {
  const s = String(html || "");
  let twitterImage = "";

  for (const tag of s.match(/<meta\b[^>]*>/gi) || []) {
    // Collect quoted attributes; a Map avoids clashes with Object.prototype keys.
    const attrs = new Map();
    for (const m of tag.matchAll(/([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
      const key = m[1].toLowerCase();
      if (!attrs.has(key)) attrs.set(key, m[2] ?? m[3] ?? "");
    }

    // Attribute values are HTML-escaped; "&amp;" is the escape that matters in URLs.
    const content = String(attrs.get("content") || "")
      .replace(/&amp;/gi, "&")
      .trim();
    if (!content) continue; // a blank og:image must not block the twitter fallback

    const kind = String(attrs.get("property") || attrs.get("name") || "")
      .trim()
      .toLowerCase();

    if (kind === "og:image") return content;
    if (kind === "twitter:image" && !twitterImage) twitterImage = content;
  }

  return twitterImage;
}
|
||||
|
||||
/**
 * Fetch a product page as text through the ctx.http wrapper.
 *
 * Uses `fetchTextWithRetry` when the wrapper offers it (its result is returned
 * as-is). Otherwise falls back to a generic `fetchWithRetry` and converts the
 * response body to a string.
 *
 * @param {object} ctx - Scan context; needs `ctx.http` and `ctx.store.ua`.
 * @param {string} label - Label passed through to the HTTP wrapper.
 * @param {string} url - Page URL to fetch.
 * @returns {Promise<object>} The wrapper's own result, or for the fallback path
 *   `{ status, text, bytes, ms }`.
 * @throws {Error} When ctx.http has neither fetch method.
 */
async function tudorFetchHtml(ctx, label, url) {
  const http = ctx?.http;

  // Same request options for both paths; built lazily, right at the call site.
  const requestOptions = () => ({
    method: "GET",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      Referer: `${BASE}/`,
    },
  });

  // Preferred path: go through ctx.http so pacing/throttle is respected.
  if (http?.fetchTextWithRetry) {
    return await http.fetchTextWithRetry(url, label, ctx.store.ua, requestOptions());
  }

  // Best-effort fallback for wrappers that only expose a generic fetchWithRetry.
  if (http?.fetchWithRetry) {
    const res = await http.fetchWithRetry(url, label, ctx.store.ua, requestOptions());

    const raw = res?.text ?? res?.body ?? res?.data ?? "";
    let text = "";
    if (typeof raw === "string") {
      text = raw;
    } else if (Buffer.isBuffer(raw)) {
      text = raw.toString("utf8");
    } else if (raw && typeof raw === "object" && typeof raw.toString === "function") {
      text = raw.toString();
    }

    return { status: res?.status, text, bytes: res?.bytes, ms: res?.ms };
  }

  throw new Error("No HTML fetch method available on ctx.http (need fetchTextWithRetry or fetchWithRetry).");
}
|
||||
|
||||
/**
 * Scrape SKU and preview image from a product's HTML page.
 *
 * Outcomes are memoised per URL on `ctx._tudorHtmlCache`; failures are cached
 * as null, so a URL is fetched at most once per ctx. Any error raised while
 * fetching or parsing is treated as "no detail" rather than propagated.
 *
 * @param {object} ctx - Scan context; uses `ctx.cat.key` for the request label.
 * @param {string} url - Product page URL (also the cache key).
 * @returns {Promise<{sku: *, img: *}|null>} Normalized `{ sku, img }` on a 200
 *   response with a non-empty body, otherwise null.
 */
async function tudorDetailFromProductPage(ctx, url) {
  const cache = ctx._tudorHtmlCache || (ctx._tudorHtmlCache = new Map());
  if (cache.has(url)) return cache.get(url);

  let detail = null;
  try {
    const res = await tudorFetchHtml(ctx, `tudor:html:${ctx.cat.key}`, url);
    const html = res?.text;
    const usable = res?.status === 200 && typeof html === "string" && html.length > 0;

    if (usable) {
      detail = {
        sku: normalizeTudorSku(parseSkuFromHtml(html)),
        img: normalizeAbsUrl(parseOgImageFromHtml(html)),
      };
    }
  } catch {
    // Best effort: a failed fetch/parse just means no detail for this URL.
    detail = null;
  }

  cache.set(url, detail);
  return detail;
}
|
||||
|
||||
/* ---------------- item builder (fast, no extra calls) ---------------- */
|
||||
|
||||
/**
 * Build a listing item from a GraphQL product using only data already in hand
 * (no additional requests).
 *
 * @param {object} p - Product from the products query.
 * @param {object} ctx - Scan context, forwarded to tudorProductUrl.
 * @returns {object|null} `{ name, price, url, sku, img, _skuProbe }`, or null
 *   when the product is missing, lacks a name/slug, or the picked variant has
 *   a quantity of zero or less. `_skuProbe` holds the raw (un-normalized) SKU.
 */
function tudorItemFromProductFast(p, ctx) {
  if (!p) return null;

  const title = cleanText(p?.name || "");
  const productSlug = String(p?.slug || "").trim();
  if (!(title && productSlug)) return null;

  const variant = tudorPickVariant(p);
  const soldOut = Boolean(variant) && Number(variant?.quantity) <= 0;
  if (soldOut) return null; // only keep in-stock

  const productUrl = tudorProductUrl(ctx, productSlug);
  const price = money(variant?.price ?? p?.priceFrom ?? p?.priceTo);

  // Prefer the picked variant's SKU; otherwise take any variant SKU on the product.
  const variantSku = String(variant?.sku || "").trim();
  const skuProbe = variantSku || pickAnySkuFromProduct(p);
  const sku = normalizeTudorSku(skuProbe);

  const rawImg = firstNonEmptyStr(
    variant?.image,
    p?.gulpImages,
    p?.posImages,
    p?.customImages,
    p?.imageIds
  );
  const img = normalizeAbsUrl(rawImg);

  return { name: title, price, url: productUrl, sku, img, _skuProbe: skuProbe };
}
|
||||
|
||||
/* ---------------- repair (second pass, budgeted) ---------------- */
|
||||
|
||||
/**
 * Second-pass repair of a listing item; mutates and returns `it`.
 *
 * Steps, in order:
 *   1. Synthetic SKU -> read the HTML product page; adopt its SKU when that one
 *      is not synthetic, and its image when the item has none.
 *   2. Still no image -> productsBySku lookup using the item's `_skuProbe`.
 *   3. SKU still synthetic -> fall back to normalizeCspc(it.url), or "".
 *
 * @param {object} ctx - Scan context passed to the lookup helpers.
 * @param {object} it - Item with `sku`, `img`, `url` and optional `_skuProbe`.
 * @returns {Promise<object>} The same item object.
 */
async function tudorRepairItem(ctx, it) {
  // 1) Missing or synthetic SKU -> HTML product page (fastest path to real SKU)
  const page = isSyntheticSku(it.sku) ? await tudorDetailFromProductPage(ctx, it.url) : null;
  if (page?.sku && !isSyntheticSku(page.sku)) it.sku = page.sku;
  if (!it.img && page?.img) it.img = page.img;

  // 2) Missing image -> if we have a sku probe, do limited productsBySku
  const probe = it.img ? "" : String(it._skuProbe || "").trim();
  if (probe) {
    const extra = await supplementImageFromSku(ctx, probe);
    if (extra?.img) it.img = extra.img;
  }

  // Final fallback ONLY after repair attempts (stability)
  if (isSyntheticSku(it.sku)) {
    it.sku = normalizeCspc(it.url) || "";
  }

  return it;
}
|
||||
|
||||
/* ---------------- scanner ---------------- */
|
||||
|
||||
async function scanCategoryTudor(ctx, prevDb, report) {
|
||||
const t0 = Date.now();
|
||||
const discovered = new Map();
|
||||
|
||||
const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500);
|
||||
let cursor = null;
|
||||
let done = 0;
|
||||
|
||||
for (let page = 1; page <= maxPages; page++) {
|
||||
const tPage = Date.now();
|
||||
|
||||
const prod = await fetchProductsPage(ctx, cursor);
|
||||
const arr = Array.isArray(prod?.items) ? prod.items : [];
|
||||
|
||||
let kept = 0;
|
||||
for (const p of arr) {
|
||||
const it = tudorItemFromProduct(p, ctx);
|
||||
if (!it) continue;
|
||||
discovered.set(it.url, it);
|
||||
kept++;
|
||||
const t0 = Date.now();
|
||||
const discovered = new Map();
|
||||
|
||||
const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500);
|
||||
let cursor = null;
|
||||
let done = 0;
|
||||
|
||||
const needsDetail = [];
|
||||
|
||||
for (let page = 1; page <= maxPages; page++) {
|
||||
const tPage = Date.now();
|
||||
|
||||
const prod = await fetchProductsPage(ctx, cursor);
|
||||
const arr = Array.isArray(prod?.items) ? prod.items : [];
|
||||
|
||||
let kept = 0;
|
||||
for (const p of arr) {
|
||||
const it = tudorItemFromProductFast(p, ctx);
|
||||
if (!it) continue;
|
||||
|
||||
// NEW: seed from cached DB to avoid repeating detail HTML
|
||||
const prev = prevDb?.byUrl?.get(it.url) || null;
|
||||
if (prev) {
|
||||
it.sku = pickBetterSku(it.sku, prev.sku);
|
||||
if (!it.img && prev.img) it.img = prev.img;
|
||||
}
|
||||
|
||||
done++;
|
||||
|
||||
const ms = Date.now() - tPage;
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft(
|
||||
kept,
|
||||
3
|
||||
)} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}`
|
||||
);
|
||||
|
||||
cursor = prod?.nextPageCursor || null;
|
||||
if (!cursor || !arr.length) break;
|
||||
|
||||
// queue only; do not do detail calls inline
|
||||
if (isSyntheticSku(it.sku) || !it.img) needsDetail.push(it);
|
||||
|
||||
discovered.set(it.url, it);
|
||||
kept++;
|
||||
}
|
||||
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products: ${discovered.size}`);
|
||||
|
||||
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||
storeLabel: ctx.store.name,
|
||||
});
|
||||
|
||||
const dbObj = buildDbObject(ctx, merged);
|
||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||
|
||||
const elapsed = Date.now() - t0;
|
||||
|
||||
report.categories.push({
|
||||
store: ctx.store.name,
|
||||
label: ctx.cat.label,
|
||||
key: ctx.cat.key,
|
||||
dbFile: ctx.dbFile,
|
||||
scannedPages: done,
|
||||
discoveredUnique: discovered.size,
|
||||
newCount: newItems.length,
|
||||
updatedCount: updatedItems.length,
|
||||
removedCount: removedItems.length,
|
||||
restoredCount: restoredItems.length,
|
||||
elapsedMs: elapsed,
|
||||
});
|
||||
|
||||
report.totals.newCount += newItems.length;
|
||||
report.totals.updatedCount += updatedItems.length;
|
||||
report.totals.removedCount += removedItems.length;
|
||||
report.totals.restoredCount += restoredItems.length;
|
||||
|
||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
||||
|
||||
done++;
|
||||
|
||||
const ms = Date.now() - tPage;
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft(
|
||||
kept,
|
||||
3
|
||||
)} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}`
|
||||
);
|
||||
|
||||
cursor = prod?.nextPageCursor || null;
|
||||
if (!cursor || !arr.length) break;
|
||||
}
|
||||
|
||||
|
||||
// second pass: repair with budgets
|
||||
const htmlBudget = Number.isFinite(ctx.config.tudorHtmlBudget)
|
||||
? ctx.config.tudorHtmlBudget
|
||||
: DETAIL_HTML_BUDGET_DEFAULT;
|
||||
|
||||
const gqlBudget = Number.isFinite(ctx.config.tudorGqlBudget)
|
||||
? ctx.config.tudorGqlBudget
|
||||
: DETAIL_GQL_BUDGET_DEFAULT;
|
||||
|
||||
let htmlUsed = 0;
|
||||
let gqlUsed = 0;
|
||||
|
||||
for (const it of needsDetail) {
|
||||
const wantsHtml = isSyntheticSku(it.sku);
|
||||
const wantsGql = !it.img && String(it._skuProbe || "").trim();
|
||||
|
||||
// enforce caps
|
||||
if (wantsHtml && htmlUsed >= htmlBudget && (!wantsGql || gqlUsed >= gqlBudget)) continue;
|
||||
if (wantsGql && gqlUsed >= gqlBudget && (!wantsHtml || htmlUsed >= htmlBudget)) continue;
|
||||
|
||||
// count budgets pessimistically
|
||||
if (wantsHtml) htmlUsed++;
|
||||
if (wantsGql) gqlUsed++;
|
||||
|
||||
await tudorRepairItem(ctx, it);
|
||||
discovered.set(it.url, it);
|
||||
}
|
||||
|
||||
ctx.logger.ok(
|
||||
`${ctx.catPrefixOut} | Unique products: ${discovered.size} | detail(html=${htmlUsed}/${htmlBudget}, gql=${gqlUsed}/${gqlBudget})`
|
||||
);
|
||||
|
||||
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
|
||||
storeLabel: ctx.store.name,
|
||||
});
|
||||
|
||||
const dbObj = buildDbObject(ctx, merged);
|
||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||
|
||||
const elapsed = Date.now() - t0;
|
||||
|
||||
report.categories.push({
|
||||
store: ctx.store.name,
|
||||
label: ctx.cat.label,
|
||||
key: ctx.cat.key,
|
||||
dbFile: ctx.dbFile,
|
||||
scannedPages: done,
|
||||
discoveredUnique: discovered.size,
|
||||
newCount: newItems.length,
|
||||
updatedCount: updatedItems.length,
|
||||
removedCount: removedItems.length,
|
||||
restoredCount: restoredItems.length,
|
||||
elapsedMs: elapsed,
|
||||
});
|
||||
|
||||
report.totals.newCount += newItems.length;
|
||||
report.totals.updatedCount += updatedItems.length;
|
||||
report.totals.removedCount += removedItems.length;
|
||||
report.totals.restoredCount += restoredItems.length;
|
||||
|
||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
||||
}
|
||||
|
||||
/* ---------------- store ---------------- */
|
||||
|
||||
|
|
|
|||
|
|
@ -20,8 +20,9 @@ function normalizeUpcDigits(v) {
|
|||
return m ? m[1] : "";
|
||||
}
|
||||
|
||||
/**
 * Extract the first standalone run of 1-11 digits from a value.
 *
 * The lower bound is 1 (not 4) so that short BCL product ids such as the
 * "141" in "id:141" are preserved. Runs longer than 11 digits do not match.
 *
 * @param {*} v - Value to scan; null/undefined are treated as "".
 * @returns {string} The matched digits, or "" when there is no such run.
 */
function normalizeIdDigits(v) {
  const m = String(v ?? "").match(/\b(\d{1,11})\b/);
  return m ? m[1] : "";
}
|
||||
|
||||
|
|
@ -33,6 +34,35 @@ function makeSyntheticSkuKey({ storeLabel, url }) {
|
|||
return `u:${fnv1a32(`${store}|${u}`)}`;
|
||||
}
|
||||
|
||||
/* ---------------- NEW: SKU quality helpers ---------------- */
|
||||
|
||||
/**
 * Rank how trustworthy a SKU key is.
 *
 *   0 - missing, or synthetic ("u:" prefix)
 *   1 - any other explicit string
 *   2 - "upc:" or "id:" prefixed key
 *   3 - value accepted by normalizeCspc (6-digit CSPC), the best case
 *
 * @param {*} v - SKU value; null/undefined are treated as "".
 * @returns {number} Quality score from 0 to 3.
 */
function skuQuality(v) {
  const key = String(v ?? "").trim();

  const missingOrSynthetic = key === "" || /^u:/i.test(key);
  if (missingOrSynthetic) return 0;

  if (normalizeCspc(key)) return 3;

  const hasExplicitPrefix = /^upc:/i.test(key) || /^id:/i.test(key);
  return hasExplicitPrefix ? 2 : 1;
}
|
||||
|
||||
// Prefer higher quality; on ties keep existing (stable) value
|
||||
/**
 * Choose between a freshly scraped SKU and the previously stored one.
 *
 * The key with the higher skuQuality wins. On a tie the stored key is kept
 * (for stability), unless it is empty, in which case the new key is used.
 *
 * @param {*} newSku - SKU from the current scan.
 * @param {*} oldSku - SKU already on record.
 * @returns {string} The trimmed winning key (may be "").
 */
function pickBetterSku(newSku, oldSku) {
  const incoming = String(newSku ?? "").trim();
  const existing = String(oldSku ?? "").trim();

  const delta = skuQuality(incoming) - skuQuality(existing);
  if (delta > 0) return incoming;
  if (delta < 0) return existing;

  return existing || incoming;
}
|
||||
|
||||
// Only fetch product pages when missing/synthetic
|
||||
/**
 * Tell whether a SKU is absent or synthetic ("u:" prefix), i.e. whether a
 * product-page lookup is still worthwhile.
 *
 * @param {*} sku - SKU value; null/undefined are treated as "".
 * @returns {boolean} true when the trimmed SKU is empty or starts with "u:".
 */
function needsSkuDetail(sku) {
  const key = String(sku ?? "").trim();
  if (key === "") return true;
  return /^u:/i.test(key);
}
|
||||
|
||||
/**
|
||||
* Behavior:
|
||||
* - CSPC 6-digit => "123456"
|
||||
|
|
@ -63,4 +93,11 @@ function normalizeSkuKey(v, { storeLabel, url } = {}) {
|
|||
return syn || "";
|
||||
}
|
||||
|
||||
module.exports = { normalizeCspc, normalizeSkuKey, makeSyntheticSkuKey };
|
||||
module.exports = {
|
||||
normalizeCspc,
|
||||
normalizeSkuKey,
|
||||
makeSyntheticSkuKey,
|
||||
skuQuality,
|
||||
pickBetterSku,
|
||||
needsSkuDetail,
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in a new issue