feat: Better SKUs for CC gull tudor and BCL

This commit is contained in:
Brennan Wilkes (Text Groove) 2026-01-31 14:08:52 -08:00
parent 6be8e87733
commit f19c1404fa
5 changed files with 735 additions and 238 deletions

View file

@ -80,7 +80,6 @@ function bclIsInStock(src) {
return true;
}
function bclNormalizeAbsUrl(raw) {
const s = String(raw || "").trim();
if (!s) return "";
@ -141,7 +140,17 @@ function bclHitToItem(hit) {
const regular = asNumber(src.regularPrice);
const price = cad(Number.isFinite(current) ? current : regular);
const sku = normalizeCspc(url);
// SKU key:
// - Keep CSPC 6-digit when present (rare for BCL, but safe)
// - Otherwise upgrade to an explicit soft key: id:<digits>
//
// ✅ PATCH: handle tiny SKUs too (3/4/5-digit) by forcing id:<digits>;
// only fall back to id:<raw> (NOT u:) if it's genuinely non-numeric.
let sku = normalizeCspc(skuRaw);
if (!sku) {
const m = skuRaw.match(/^\d{1,6}$/); // BCL product IDs like 141, 596, 984, 117, etc.
sku = m ? `id:${m[0]}` : `id:${skuRaw}`;
}
const inStock = bclIsInStock(src);
if (!inStock) return null;
@ -155,8 +164,6 @@ function bclHitToItem(hit) {
return { name, price, url, sku, img };
}
async function bclFetchBrowsePage(ctx, page1, size) {
const type = ctx.cat.bclType; // e.g. "rum" or "whisky / whiskey"
const category = "spirits";
@ -293,11 +300,12 @@ async function scanCategoryBCLAjax(ctx, prevDb, report) {
newCount: newItems.length,
updatedCount: updatedItems.length,
removedCount: removedItems.length,
restoredCount: removedItems.length,
restoredCount: restoredItems.length,
elapsedMs: elapsed,
});
report.totals.newCount += newItems.length;
report.totals.updatedCount += updatedItems.length;
report.totals.updatedCount += updatedItems.length;
report.totals.removedCount += removedItems.length;
report.totals.restoredCount += restoredItems.length;

View file

@ -5,7 +5,7 @@ const { setTimeout: sleep } = require("timers/promises");
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
const { sanitizeName } = require("../utils/text");
const { normalizeCspc } = require("../utils/sku");
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
const { makePageUrlShopifyQueryPage } = require("../utils/url");
const { mergeDiscoveredIntoDb } = require("../tracker/merge");
@ -33,7 +33,9 @@ function canonicalizeCraftProductUrl(raw) {
function extractShopifyCardPrice(block) {
const b = String(block || "");
const dollars = (txt) =>
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) =>
m[0].replace(/\s+/g, "")
);
const saleRegion = b.split(/sale price/i)[1] || "";
const saleD = dollars(saleRegion);
@ -50,8 +52,14 @@ function extractShopifyCardPrice(block) {
function parseProductsCraftCellars(html, ctx) {
const s = String(html || "");
const g1 = s.match(/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
const g2 = s.match(/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || "";
const g1 =
s.match(
/<div\b[^>]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i
)?.[0] || "";
const g2 =
s.match(
/<div\b[^>]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i
)?.[0] || "";
const gridCandidate = g1.length > g2.length ? g1 : g2;
const grid = /\/products\//i.test(gridCandidate) ? gridCandidate : s;
@ -63,18 +71,24 @@ function parseProductsCraftCellarsInner(html, ctx) {
const s = String(html || "");
const items = [];
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map((m) => m[0]);
let blocks = [...s.matchAll(/<li\b[^>]*>[\s\S]*?<\/li>/gi)].map(
(m) => m[0]
);
if (blocks.length < 5) {
blocks = [...s.matchAll(/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi)].map(
(m) => m[0]
);
blocks = [
...s.matchAll(
/<div\b[^>]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi
),
].map((m) => m[0]);
}
const base = `https://${(ctx && ctx.store && ctx.store.host) || "craftcellars.ca"}/`;
for (const block of blocks) {
const href =
block.match(/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i)?.[1] ||
block.match(
/<a\b[^>]*href=["']([^"']*\/products\/[^"']+)["']/i
)?.[1] ||
block.match(/href=["']([^"']*\/products\/[^"']+)["']/i)?.[1];
if (!href) continue;
@ -87,9 +101,15 @@ function parseProductsCraftCellarsInner(html, ctx) {
url = canonicalizeCraftProductUrl(url);
const nameHtml =
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i)?.[1] ||
block.match(/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
block.match(
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i
)?.[1] ||
block.match(
/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
)?.[1] ||
block.match(
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i
)?.[1];
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
if (!name) continue;
@ -108,37 +128,58 @@ function parseProductsCraftCellarsInner(html, ctx) {
/**
 * Format a Shopify price value (e.g. "1234.5" or "$89.99 CAD") as "$1,234.50".
 *
 * @param {string|number|null|undefined} s - Raw price as delivered by Shopify.
 * @returns {string} Dollar string with thousands separators and two decimals,
 *   or "" when the input contains no parseable number.
 */
function usdFromShopifyPriceStr(s) {
  const cleaned = String(s ?? "").replace(/[^0-9.]/g, "");
  // Number("") is 0, so an input without any digit would otherwise be
  // reported as a real "$0.00" price instead of "no price".
  if (!/\d/.test(cleaned)) return "";
  const n = Number(cleaned);
  if (!Number.isFinite(n)) return "";
  return `$${n.toLocaleString("en-US", {
    minimumFractionDigits: 2,
    maximumFractionDigits: 2,
  })}`;
}
/**
 * Return `v` when it is a finite number, otherwise `fallback`.
 * No coercion is performed: numeric strings are not accepted.
 */
function cfgNum(v, fallback) {
  if (Number.isFinite(v)) {
    return v;
  }
  return fallback;
}
/* ---------- NEW: product page SKU extractor ---------- */
/**
 * Pull the SKU out of a Craft Cellars product page.
 *
 * Tries, in order: the exact `<strong>SKU:</strong><span>…</span>` markup,
 * a looser variant of that markup, then any plain "SKU: value" text.
 * The first pattern that matches wins; its capture is tag-stripped,
 * HTML-decoded and passed through normalizeCspc. When nothing matches,
 * normalizeCspc is called with "".
 *
 * @param {string} html - Product page HTML.
 * @returns {*} Whatever normalizeCspc returns for the extracted text.
 */
function extractCraftSkuFromProductPageHtml(html) {
  const page = String(html || "");
  const patterns = [
    /<strong>\s*SKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*<\/strong>\s*<span>\s*([^<]{1,80}?)\s*<\/span>/i,
    /\bSKU:\s*([A-Za-z0-9][A-Za-z0-9\-_/ ]{0,40})/i,
  ];

  let hit = null;
  for (const pattern of patterns) {
    hit = page.match(pattern);
    if (hit) break;
  }

  let raw = "";
  if (hit && hit[1]) {
    raw = stripTags(decodeHtml(hit[1]));
  }
  return normalizeCspc(raw);
}
/**
* Craft Cellars:
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
* - Shopify products.json is used only to enrich SKU (and optionally price) for those allowed URLs
* - HTML listing with ?filter.v.availability=1 is the allowlist
* - products.json enriches SKU/price
* - product page HTML is final SKU fallback
*/
async function scanCategoryCraftCellars(ctx, prevDb, report) {
const t0 = Date.now();
// Strongly prefer "slow and steady" to avoid 429s.
// Use per-category knobs if present; otherwise default conservative.
const perPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
) || 0;
const perPageDelayMs =
Math.max(
0,
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0))
) || 0;
const perJsonPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
);
// 1) HTML scan: allowlist of in-stock listing URLs
const htmlMap = new Map(); // url -> {name, price, url, img}
const htmlMap = new Map();
const maxPages =
ctx.config.maxPages === null
? 200
: Math.min(ctx.config.maxPages, 200);
const maxPages = ctx.config.maxPages === null ? 200 : Math.min(ctx.config.maxPages, 200);
let htmlPagesFetched = 0;
let emptyStreak = 0;
@ -146,7 +187,11 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
const { text: html } = await ctx.http.fetchTextWithRetry(
pageUrl,
`craft:html:${ctx.cat.key}:p${p}`,
ctx.store.ua
);
htmlPagesFetched++;
if (craftCellarsIsEmptyListingPage(html)) break;
@ -162,22 +207,30 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
for (const it of items) {
const url = canonicalizeCraftProductUrl(it.url);
if (!url) continue;
htmlMap.set(url, { name: it.name || "", price: it.price || "", url, img: it.img || "" });
htmlMap.set(url, {
name: it.name || "",
price: it.price || "",
url,
img: it.img || "",
});
}
}
// If HTML returns nothing, don't let JSON invent a category
if (!htmlMap.size) {
ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
ctx.logger.warn(
`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing JSON-only discovery`
);
}
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
const jsonMap = new Map(); // url -> { sku, price, img }
const jsonMap = new Map();
if (htmlMap.size) {
const start = new URL(ctx.cat.startUrl);
const m = start.pathname.match(/^\/collections\/([^/]+)/i);
if (!m) throw new Error(`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`);
if (!m)
throw new Error(
`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`
);
const collectionHandle = m[1];
const limit = 250;
@ -185,12 +238,19 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let jsonPagesFetched = 0;
while (true) {
if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
if (jsonPage > 1 && perJsonPageDelayMs > 0)
await sleep(perJsonPageDelayMs);
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
const r = await ctx.http.fetchJsonWithRetry(
url,
`craft:coljson:${ctx.cat.key}:p${jsonPage}`,
ctx.store.ua
);
const products = Array.isArray(r?.json?.products) ? r.json.products : [];
const products = Array.isArray(r?.json?.products)
? r.json.products
: [];
jsonPagesFetched++;
if (!products.length) break;
@ -199,73 +259,116 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
const handle = String(p?.handle || "");
if (!handle) continue;
const prodUrl = canonicalizeCraftProductUrl(`https://${ctx.store.host}/products/${handle}`);
// Only enrich if it's on the HTML allowlist
const prodUrl = canonicalizeCraftProductUrl(
`https://${ctx.store.host}/products/${handle}`
);
if (!htmlMap.has(prodUrl)) continue;
const variants = Array.isArray(p?.variants) ? p.variants : [];
const v = variants.find((x) => x && x.available === true) || variants[0] || null;
const v =
variants.find((x) => x && x.available === true) ||
variants[0] ||
null;
const sku = normalizeCspc(v?.sku || "");
const price = v?.price ? usdFromShopifyPriceStr(v.price) : "";
// Product image (best effort)
let img = "";
const images = Array.isArray(p?.images) ? p.images : [];
if (images[0]) {
if (typeof images[0] === "string") img = images[0];
else img = String(images[0]?.src || images[0]?.url || "");
img =
typeof images[0] === "string"
? images[0]
: String(images[0]?.src || images[0]?.url || "");
}
if (!img && p?.image) img = String(p.image?.src || p.image?.url || p.image || "");
if (!img && p?.image)
img = String(p.image?.src || p.image?.url || p.image || "");
img = String(img || "").trim();
if (img.startsWith("//")) img = `https:${img}`;
if (img && !/^https?:\/\//i.test(img)) {
try {
img = new URL(img, `https://${ctx.store.host}/`).toString();
} catch {
// keep as-is
}
}
jsonMap.set(prodUrl, { sku, price, img });
}
if (products.length < limit) break;
jsonPage++;
if (jsonPage > 200) break; // safety
if (++jsonPage > 200) break;
}
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`);
} else {
ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=0`);
ctx.logger.ok(
`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`
);
}
// 3) Final discovered: HTML allowlist, enriched by JSON
const discovered = new Map();
for (const [url, it] of htmlMap.entries()) {
const j = jsonMap.get(url);
const prev = prevDb?.byUrl?.get(url) || null;
discovered.set(url, {
name: it.name || "",
// Prefer JSON price (normalized) when present, else keep HTML price (already formatted)
name: it.name,
price: j?.price || it.price || "",
url,
sku: j?.sku || "",
img: j?.img || it.img || "",
// reuse cached SKU unless we found something better this run
sku: pickBetterSku(j?.sku || "", prev?.sku || ""),
// reuse cached image if we didn't find one
img: (j?.img || it.img || prev?.img || ""),
});
}
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
/* ---------- NEW: product page SKU fallback (cached; only when needed) ---------- */
const perProductSkuDelayMs = Math.max(
0,
cfgNum(
ctx?.cat?.skuPageDelayMs,
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
)
);
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
let skuPagesFetched = 0;
for (const it of discovered.values()) {
// only hit product pages when missing/synthetic
if (!needsSkuDetail(it.sku)) continue;
if (perProductSkuDelayMs > 0) await sleep(perProductSkuDelayMs);
try {
const { text } = await ctx.http.fetchTextWithRetry(
it.url,
`craft:prodpage:${ctx.cat.key}:${Buffer.from(it.url)
.toString("base64")
.slice(0, 24)}`,
ctx.store.ua
);
skuPagesFetched++;
const sku = extractCraftSkuFromProductPageHtml(text);
if (sku) it.sku = sku;
} catch {
/* best effort */
}
}
ctx.logger.ok(
`${ctx.catPrefixOut} | SKU fallback pages=${skuPagesFetched}`
);
ctx.logger.ok(
`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`
);
const {
merged,
newItems,
updatedItems,
removedItems,
restoredItems,
} = mergeDiscoveredIntoDb(prevDb, discovered, {
storeLabel: ctx.store.name,
});
const dbObj = buildDbObject(ctx, merged);
writeJsonAtomic(ctx.dbFile, dbObj);
ctx.logger.ok(`${ctx.catPrefixOut} | DB saved: ${ctx.logger.dim(ctx.dbFile)} (${dbObj.count} items)`);
const elapsed = Date.now() - t0;
report.categories.push({
@ -287,7 +390,15 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
report.totals.removedCount += removedItems.length;
report.totals.restoredCount += restoredItems.length;
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
addCategoryResultToReport(
report,
ctx.store.name,
ctx.cat.label,
newItems,
updatedItems,
removedItems,
restoredItems
);
}
function createStore(defaultUa) {
@ -297,10 +408,8 @@ function createStore(defaultUa) {
host: "craftcellars.ca",
ua: defaultUa,
// ✅ Custom scan (HTML allowlist + JSON enrichment)
scanCategory: scanCategoryCraftCellars,
// Keep HTML parser for debugging
parseProducts: parseProductsCraftCellars,
makePageUrl: makePageUrlShopifyQueryPage,
isEmptyListingPage: craftCellarsIsEmptyListingPage,
@ -309,69 +418,22 @@ function createStore(defaultUa) {
{
key: "whisky",
label: "Whisky",
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
// slow-and-safe defaults (override globally if you want)
discoveryStartPage: 3,
discoveryStep: 2,
startUrl:
"https://craftcellars.ca/collections/whisky?filter.v.availability=1",
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
skuPageDelayMs: 12000,
},
{
key: "rum",
label: "Rum",
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "single-malt-scotch",
label: "Single Malt Scotch",
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "other-scotch-styles",
label: "Other Scotch Styles",
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "single-grain-scotch",
label: "Single Grain Scotch",
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "blended-malt-scotch",
label: "Blended Malt Scotch",
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
discoveryStartPage: 3,
discoveryStep: 2,
startUrl:
"https://craftcellars.ca/collections/rum?filter.v.availability=1",
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
skuPageDelayMs: 12000,
},
],
};

View file

@ -1,7 +1,8 @@
// src/stores/gull.js
"use strict";
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
const { normalizeCspc } = require("../utils/sku");
const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku");
const { makePageUrl } = require("../utils/url");
function looksInStock(block) {
@ -45,6 +46,133 @@ function extractGullPriceFromBlock(block) {
return `$${chosen.toFixed(2)}`;
}
/**
 * Normalize a raw Gull SKU candidate into a stable key.
 *
 * Gull SKUs are often NOT 6 digits (e.g. 67424). A non-6-digit numeric SKU is
 * represented as `id:<digits>` so it is not handed to normalizeCspc, which may
 * turn it into a generated `u:` key.
 *
 * @param {*} raw - SKU text from a tile or product page, or a product URL used
 *   as a last-resort fallback by the listing parser.
 * @returns {string} The already-prefixed value, `id:<digits>`, or whatever
 *   normalizeCspc returns.
 */
function normalizeGullSku(raw) {
  const s = cleanText(decodeHtml(String(raw || ""))).trim();

  // already in a stable prefixed form
  if (/^(id:|u:)/i.test(s)) return s;

  // Never mine digits out of a URL fallback: a slug such as
  // ".../old-forester-1920-..." would otherwise become the bogus SKU
  // "id:1920" (colliding across products) and would never reach the
  // generated u: form that hydrateGullSkus knows how to repair.
  const looksLikeUrl = /^(?:https?:)?\/\//i.test(s) || s.startsWith("/");

  if (!looksLikeUrl) {
    // numeric SKU text (from page / tile)
    const digits = s.match(/\b(\d{3,10})\b/)?.[1] || "";
    if (digits) {
      return digits.length === 6 ? normalizeCspc(digits) : `id:${digits}`;
    }
  }

  // fall back to the shared normalizer (may yield u:...)
  return normalizeCspc(s);
}
/**
 * Tell whether a SKU is a generated `u:<hex>` key, i.e. the form produced
 * when the product URL had to be used as the SKU source.
 * Accepts 8 to 128 hex characters after the prefix, case-insensitively.
 *
 * @param {*} sku
 * @returns {boolean}
 */
function isGeneratedUrlSku(sku) {
  const candidate = String(sku || "");
  const generatedPattern = /^u:[0-9a-f]{8,128}$/i;
  return generatedPattern.test(candidate);
}
/**
 * Extract the SKU from Gull product page HTML.
 *
 * Looks first for a `<span class="... sku ...">digits</span>` element and then
 * for plain "SKU: digits" text. The first 3-10 digit capture found is passed
 * to normalizeGullSku.
 *
 * @param {string} html - Product page HTML.
 * @returns {string} Normalized SKU, or "" when neither pattern matches.
 */
function extractGullSkuFromProductPage(html) {
  const page = String(html || "");

  const patterns = [
    // Most reliable: <span class="sku">67424</span>
    /<span\b[^>]*class=["'][^"']*\bsku\b[^"']*["'][^>]*>\s*([0-9]{3,10})\s*<\/span>/i,
    // Fallback: "SKU: 67424" text
    /\bSKU:\s*([0-9]{3,10})\b/i,
  ];

  for (const pattern of patterns) {
    const found = page.match(pattern)?.[1];
    if (found) {
      return normalizeGullSku(found);
    }
  }
  return "";
}
/**
 * Serial limiter: runs scheduled tasks one at a time and ensures at least
 * `minIntervalMs` between task starts.
 *
 * @param {number} minIntervalMs - Minimum spacing between task starts, in ms.
 * @returns {(fn: () => any) => Promise<any>} `schedule(fn)`: queues `fn` and
 *   resolves or rejects with that task's own outcome.
 */
function createMinIntervalLimiter(minIntervalMs) {
  let lastStart = 0;
  // Tail of the queue. It must never be a rejected promise, otherwise every
  // task queued after one failure would be skipped and rejected with the
  // earlier task's error.
  let tail = Promise.resolve();

  return function schedule(fn) {
    const run = tail.then(async () => {
      const waitMs = Math.max(0, lastStart + minIntervalMs - Date.now());
      if (waitMs) await new Promise((resolve) => setTimeout(resolve, waitMs));
      lastStart = Date.now();
      return fn();
    });

    // Keep the queue alive regardless of how this task settles; the caller
    // still observes the real outcome through `run`.
    tail = run.then(
      () => undefined,
      () => undefined
    );

    return run;
  };
}
/**
 * Fetch `url` as text, retrying only on HTTP 429.
 *
 * On a 429 the wait is the numeric `Retry-After` header (seconds) when the
 * response exposes one through `headers.get`, otherwise 15s x the attempt
 * number (15s, 30s, ...).
 *
 * @param {string} url
 * @param {object} opts
 * @param {Function} opts.fetchFn - Called as `fetchFn(url, { headers })`.
 * @param {object} opts.headers - Request headers forwarded to `fetchFn`.
 * @param {number} [opts.maxRetries=2] - Retries allowed after the first 429.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} On any non-OK, non-429 status, or on a 429 once retries
 *   are exhausted.
 */
async function fetchWith429Backoff(url, { fetchFn, headers, maxRetries = 2 }) {
  for (let attempt = 0; ; attempt += 1) {
    const response = await fetchFn(url, { headers });

    if (response.status === 429) {
      if (attempt >= maxRetries) throw new Error(`HTTP 429 fetching ${url}`);

      // Respect Retry-After if present; otherwise progressive backoff.
      const canReadHeaders =
        response.headers && typeof response.headers.get === "function";
      const retryAfter = canReadHeaders ? response.headers.get("retry-after") : null;

      let delaySeconds = 15 * (attempt + 1);
      if (retryAfter && /^\d+$/.test(retryAfter)) {
        delaySeconds = parseInt(retryAfter, 10);
      }

      await new Promise((resolve) => setTimeout(resolve, delaySeconds * 1000));
      continue;
    }

    if (!response.ok) throw new Error(`HTTP ${response.status} fetching ${url}`);
    return await response.text();
  }
}
/**
 * Fill in real SKUs for Gull items, mutating the items in place.
 *
 * For each item with a URL:
 *  - if `prevDb.byUrl` already holds a SKU for that URL that needsSkuDetail()
 *    does not flag, reuse it (via pickBetterSku) and skip the fetch;
 *  - otherwise, only when the current SKU is a generated `u:...` key, fetch
 *    the product page through a serial min-interval limiter and take the SKU
 *    found there.
 *
 * @param {Iterable<object>} items - Items with `url` and `sku`.
 * @param {object} [opts]
 * @param {Function} opts.fetchFn - fetch-like function (required).
 * @param {string} [opts.ua] - User agent; defaults to "Mozilla/5.0".
 * @param {number} [opts.minIntervalMs=12000] - Spacing between page fetches.
 * @param {number} [opts.maxRetries=2] - 429 retries per page.
 * @param {object} [opts.prevDb] - Previous DB; `byUrl.get(url)` is consulted.
 * @returns {Promise<Iterable<object>>} The same `items` value that was passed.
 * @throws {Error} When `fetchFn` is missing, or when a page fetch fails.
 */
async function hydrateGullSkus(
  items,
  { fetchFn, ua, minIntervalMs = 12000, maxRetries = 2, prevDb } = {}
) {
  if (!fetchFn) throw new Error("hydrateGullSkus requires opts.fetchFn");

  const throttled = createMinIntervalLimiter(minIntervalMs);
  const requestHeaders = {
    "user-agent": ua || "Mozilla/5.0",
    accept: "text/html,application/xhtml+xml",
  };

  for (const item of items || []) {
    if (!item || !item.url) continue;

    // A usable SKU cached from a previous run wins and avoids the fetch.
    const cached = prevDb?.byUrl?.get(item.url) || null;
    const cachedIsUsable = Boolean(cached?.sku) && !needsSkuDetail(cached.sku);
    if (cachedIsUsable) {
      item.sku = pickBetterSku(item.sku, cached.sku);
      continue;
    }

    // Only generated u:... keys are worth a product page request.
    if (isGeneratedUrlSku(item.sku)) {
      const page = await throttled(() =>
        fetchWith429Backoff(item.url, {
          fetchFn,
          headers: requestHeaders,
          maxRetries,
        })
      );
      const pageSku = extractGullSkuFromProductPage(page);
      if (pageSku) item.sku = pickBetterSku(pageSku, item.sku);
    }
  }

  return items;
}
function parseProductsGull(html, ctx) {
const s = String(html || "");
const items = [];
@ -82,11 +210,12 @@ function parseProductsGull(html, ctx) {
const price = extractGullPriceFromBlock(block);
const sku = normalizeCspc(
const skuRaw =
block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
url
);
block.match(/\bSKU\b[^0-9]{0,30}(\d{3,10})\b/i)?.[1] ||
url; // OK fallback; hydrateGullSkus will only re-fetch when this becomes u:...
const sku = normalizeGullSku(skuRaw);
const img = extractFirstImgUrl(block, base);
@ -98,7 +227,6 @@ function parseProductsGull(html, ctx) {
return [...uniq.values()];
}
function createStore(defaultUa) {
return {
key: "gull",
@ -106,12 +234,19 @@ function createStore(defaultUa) {
host: "gullliquorstore.com",
ua: defaultUa,
parseProducts: parseProductsGull,
// Optional hook callers can use to post-process items:
// only hits product pages when sku is u:...
hydrateSkus: hydrateGullSkus,
productPageMinIntervalMs: 12000, // slow by default; Gull is strict
makePageUrl, // enables /page/N/ paging
categories: [
{
key: "whisky",
label: "Whisky",
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
startUrl:
"https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
@ -121,7 +256,8 @@ function createStore(defaultUa) {
{
key: "rum",
label: "Rum",
startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
startUrl:
"https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
@ -132,4 +268,11 @@ function createStore(defaultUa) {
};
}
module.exports = { createStore, parseProductsGull };
module.exports = {
createStore,
parseProductsGull,
hydrateGullSkus,
extractGullSkuFromProductPage,
isGeneratedUrlSku,
normalizeGullSku,
};

View file

@ -1,7 +1,7 @@
"use strict";
const { cleanText } = require("../utils/html");
const { normalizeCspc } = require("../utils/sku");
const { normalizeCspc, pickBetterSku } = require("../utils/sku");
const { humanBytes } = require("../utils/bytes");
const { padLeft, padRight } = require("../utils/string");
@ -67,6 +67,34 @@ function normalizeAbsUrl(raw) {
}
}
/**
 * A SKU is "synthetic" (eligible for repair) when it is empty/blank or
 * carries the `u:` prefix, matched case-insensitively.
 *
 * @param {*} sku
 * @returns {boolean}
 */
function isSyntheticSku(sku) {
  const trimmed = String(sku || "").trim();
  if (trimmed === "") {
    return true;
  }
  return /^u:/i.test(trimmed);
}
/**
 * Normalize a raw Tudor House SKU.
 *
 * Rules, in order:
 *  - blank                      -> ""
 *  - already `id:` / `u:`       -> returned unchanged (after trimming)
 *  - shorter than 6 characters  -> `id:<sku>` (numeric or not)
 *  - digits only (6+ digits)    -> returned as-is, deliberately NOT passed
 *                                  through normalizeCspc
 *  - anything else              -> normalizeCspc(sku), or the trimmed raw
 *                                  value when that returns something falsy
 *
 * @param {*} rawSku
 * @returns {string}
 */
function normalizeTudorSku(rawSku) {
  const sku = String(rawSku || "").trim();
  if (sku === "") return "";

  const alreadyPrefixed = /^id:/i.test(sku) || /^u:/i.test(sku);
  if (alreadyPrefixed) return sku;

  // Short SKUs (numeric like 67433, or short alphanumeric) get namespaced.
  if (sku.length < 6) return `id:${sku}`;

  // Long purely numeric SKUs are kept verbatim.
  if (/^\d+$/.test(sku)) return sku;

  return normalizeCspc(sku) || sku;
}
function tudorProductUrl(ctx, slug) {
// Site URLs look like: /TUDOR_HOUSE_0/product/spirits/<subcat>/<slug>
const root = ctx?.cat?.tudorRootSlug || "spirits";
@ -82,33 +110,23 @@ function tudorPickVariant(p) {
return inStock || vs[0] || null;
}
function tudorItemFromProduct(p, ctx) {
if (!p) return null;
const name = cleanText(p?.name || "");
const slug = String(p?.slug || "").trim();
if (!name || !slug) return null;
const v = tudorPickVariant(p);
if (v && Number(v?.quantity) <= 0) return null; // only keep in-stock
const url = tudorProductUrl(ctx, slug);
const price = money(v?.price ?? p?.priceFrom ?? p?.priceTo);
const sku = normalizeCspc(v?.sku || "");
const img = normalizeAbsUrl(
firstNonEmptyStr(
v?.image,
p?.gulpImages,
p?.posImages,
p?.customImages,
p?.imageIds
)
);
return { name, price, url, sku, img };
/**
 * Return the first non-blank variant SKU of a product, trimmed,
 * or "" when the product has no variants or none carries a SKU.
 *
 * @param {object} p - Product; `p.variants` is used when it is an array.
 * @returns {string}
 */
function pickAnySkuFromProduct(p) {
  const variants = Array.isArray(p?.variants) ? p.variants : [];
  const skuOf = (variant) => String(variant?.sku || "").trim();

  const withSku = variants.find((variant) => skuOf(variant) !== "");
  return withSku ? skuOf(withSku) : "";
}
/**
 * Choose the first variant whose numeric `quantity` is above zero; when none
 * is in stock fall back to the first variant, and to null without variants.
 *
 * @param {object} p - Product; `p.variants` is used when it is an array.
 * @returns {object|null}
 */
function pickInStockVariantWithFallback(p) {
  if (!Array.isArray(p?.variants)) return null;

  for (const variant of p.variants) {
    if (Number(variant?.quantity) > 0) return variant;
  }
  return p.variants[0] || null;
}
/* ---------------- GraphQL ---------------- */
async function tudorGql(ctx, label, query, variables) {
return await ctx.http.fetchJsonWithRetry(GQL_URL, label, ctx.store.ua, {
method: "POST",
@ -122,15 +140,7 @@ async function tudorGql(ctx, label, query, variables) {
});
}
/**
 * Find the first value under `json.data` that is an object exposing an
 * `items` array.
 *
 * @param {object} json - Parsed response.
 * @returns {object|null} The matching value, or null when `data` is missing,
 *   not an object, or holds no such value.
 */
function pickConnection(json) {
  const data = json?.data;
  if (!data || typeof data !== "object") return null;

  const hasItems = (value) =>
    Boolean(value) && typeof value === "object" && Array.isArray(value.items);

  return Object.values(data).find(hasItems) || null;
}
/* ---------------- GQL queries ---------------- */
const PRODUCTS_QUERY = `
query(
@ -170,15 +180,14 @@ const PRODUCTS_QUERY = `
isStaffPick: $isStaffPick,
pageCursor: $pageCursor,
pageLimit: $pageLimit,
pointsMin: $pointsMin,
sortBy: $sortBy,
sortOrder: $sortOrder,
priceMin: $priceMin,
priceMax: $priceMax,
quantityMin: $quantityMin,
regions: $regions,
brandValue: $brandValue,
searchValue: $searchValue,
sortOrder: $sortOrder,
sortBy: $sortBy,
storeId: $storeId,
) {
items {
@ -199,6 +208,31 @@ const PRODUCTS_QUERY = `
}
`;
// GraphQL document that looks products up by SKU via `productsBySku`.
// Variables: $sku (required String) and $storeId (optional String).
// It selects the image-bearing fields (imageIds, posImages, customImages,
// gulpImages, per-variant image) plus id/slug and basic variant data.
// ONLY for limited image supplementation (within a small budget)
const PRODUCTS_BY_SKU_QUERY = `
query(
$sku: String!,
$storeId: String
) {
productsBySku(
sku: $sku,
storeId: $storeId
) {
items {
id
slug
imageIds
posImages
customImages
gulpImages
variants { id image price quantity sku deposit }
}
nextPageCursor
totalCount
}
}
`;
async function fetchProductsPage(ctx, cursor) {
const vars = {
storeId: STORE_ID,
@ -224,78 +258,291 @@ async function fetchProductsPage(ctx, cursor) {
return r.json.data.products;
}
/* ---------------- GQL bySku helper (image-only within budget) ---------------- */
/**
 * Look a product up by SKU through the productsBySku GraphQL query.
 *
 * Results, including "not found" (null), are memoized per trimmed SKU in
 * `ctx._tudorSkuCache`, which is created on first use.
 *
 * @param {object} ctx - Scan context; reads `ctx.cat.key` for the request label.
 * @param {*} sku - SKU to look up; blank values short-circuit to null.
 * @returns {Promise<object|null>} First returned item, or null when the
 *   response status is not 200 or no items came back.
 */
async function fetchProductBySku(ctx, sku) {
  const key = String(sku || "").trim();
  if (key === "") return null;

  const cache = ctx._tudorSkuCache || (ctx._tudorSkuCache = new Map());
  if (cache.has(key)) return cache.get(key);

  const label = `tudor:gql:bySku:${ctx.cat.key}:${key}`;
  const response = await tudorGql(ctx, label, PRODUCTS_BY_SKU_QUERY, {
    sku: key,
    storeId: STORE_ID,
  });

  const items =
    response?.status === 200 ? response?.json?.data?.productsBySku?.items : null;
  const product = items?.length ? items[0] || null : null;

  cache.set(key, product);
  return product;
}
/**
 * Try to obtain an image URL for a product by looking it up via its SKU.
 *
 * The image source is the first non-empty of: the chosen variant's image,
 * then the product's gulpImages, posImages, customImages, imageIds.
 *
 * @param {object} ctx - Scan context, forwarded to fetchProductBySku.
 * @param {string} skuProbe - Raw SKU to look up.
 * @returns {Promise<{img: string}|null>} `{ img }` when an image was found,
 *   otherwise null.
 */
async function supplementImageFromSku(ctx, skuProbe) {
  const product = await fetchProductBySku(ctx, skuProbe);
  if (!product) return null;

  const variant = pickInStockVariantWithFallback(product);
  const candidate = firstNonEmptyStr(
    variant?.image,
    product?.gulpImages,
    product?.posImages,
    product?.customImages,
    product?.imageIds
  );
  const img = normalizeAbsUrl(candidate);

  if (!img) return null;
  return { img };
}
/* ---------------- HTML product page fallback (SKU + optional image) ---------------- */
// Budgets (per category run). Override via ctx.config.tudorHtmlBudget / ctx.config.tudorGqlBudget.
const DETAIL_HTML_BUDGET_DEFAULT = 200;
const DETAIL_GQL_BUDGET_DEFAULT = 10;
/**
 * Extract a raw SKU string from product page HTML.
 *
 * Patterns are tried in priority order and the first match wins:
 *  1. visible text directly inside a tag:  `>SKU: 67433<`
 *  2. any "SKU: value" text
 *  3. the first `"sku":"..."` pair in embedded JSON
 *
 * @param {string} html
 * @returns {string} Trimmed SKU text, or "" when nothing matched.
 */
function parseSkuFromHtml(html) {
  const page = String(html || "");

  const patterns = [
    />\s*SKU:\s*([A-Za-z0-9._-]+)\s*</i,
    /\bSKU:\s*([A-Za-z0-9._-]+)\b/i,
    /"sku"\s*:\s*"([^"]+)"/i,
  ];

  for (const pattern of patterns) {
    const hit = page.match(pattern);
    if (hit && hit[1]) {
      return String(hit[1]).trim();
    }
  }
  return "";
}
/**
 * Read the social preview image URL from page HTML: the `og:image` meta tag
 * first, then `twitter:image`. Only tags that list the property/name
 * attribute before `content` are recognized.
 *
 * @param {string} html
 * @returns {string} Trimmed image URL, or "" when neither tag is present.
 */
function parseOgImageFromHtml(html) {
  const page = String(html || "");

  const ogImage = page.match(/property=["']og:image["'][^>]*content=["']([^"']+)["']/i);
  const hit =
    ogImage ||
    page.match(/name=["']twitter:image["'][^>]*content=["']([^"']+)["']/i);

  if (!hit) return "";
  return String(hit[1] || "").trim();
}
/**
 * Fetch a page as HTML through the context's HTTP wrapper.
 *
 * Prefers `ctx.http.fetchTextWithRetry` and returns its result untouched.
 * Otherwise uses `ctx.http.fetchWithRetry` and adapts the response to
 * `{ status, text, bytes, ms }`, taking the body from `text`, `body` or
 * `data` (string, Buffer, or any object with `toString`).
 *
 * @param {object} ctx - Scan context (`ctx.http`, `ctx.store.ua`).
 * @param {string} label - Request label passed to the HTTP wrapper.
 * @param {string} url - Page URL.
 * @returns {Promise<object>} The wrapper's response, or the adapted one.
 * @throws {Error} When `ctx.http` offers neither fetch method.
 */
async function tudorFetchHtml(ctx, label, url) {
  // Built lazily so nothing is evaluated before a fetch method is chosen.
  const requestOptions = () => ({
    method: "GET",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      Referer: `${BASE}/`,
    },
  });

  // Use ctx.http so pacing/throttle is respected.
  if (ctx?.http?.fetchTextWithRetry) {
    return await ctx.http.fetchTextWithRetry(url, label, ctx.store.ua, requestOptions());
  }

  // Best-effort path for wrappers that only expose a generic fetchWithRetry.
  if (ctx?.http?.fetchWithRetry) {
    const res = await ctx.http.fetchWithRetry(url, label, ctx.store.ua, requestOptions());
    const payload = res?.text ?? res?.body ?? res?.data ?? "";

    let text = "";
    if (typeof payload === "string") {
      text = payload;
    } else if (Buffer.isBuffer(payload)) {
      text = payload.toString("utf8");
    } else if (payload && typeof payload === "object" && typeof payload.toString === "function") {
      text = payload.toString();
    }

    return { status: res?.status, text, bytes: res?.bytes, ms: res?.ms };
  }

  throw new Error("No HTML fetch method available on ctx.http (need fetchTextWithRetry or fetchWithRetry).");
}
/**
 * Fetch a product page and extract `{ sku, img }` from its HTML.
 *
 * Results are memoized per URL in `ctx._tudorHtmlCache` (created on first
 * use). Failures are cached too: a fetch error, a non-200 status or an empty
 * body all store and return null.
 *
 * @param {object} ctx - Scan context; `ctx.cat.key` is used in the label.
 * @param {string} url - Product page URL.
 * @returns {Promise<{sku: string, img: string}|null>}
 */
async function tudorDetailFromProductPage(ctx, url) {
  const cache = ctx._tudorHtmlCache || (ctx._tudorHtmlCache = new Map());
  if (cache.has(url)) return cache.get(url);

  const loadDetail = async () => {
    try {
      const res = await tudorFetchHtml(ctx, `tudor:html:${ctx.cat.key}`, url);
      const html = res?.text;
      if (res?.status !== 200 || typeof html !== "string" || !html.length) {
        return null;
      }
      return {
        sku: normalizeTudorSku(parseSkuFromHtml(html)),
        img: normalizeAbsUrl(parseOgImageFromHtml(html)),
      };
    } catch {
      // Best effort: any failure simply means "no detail available".
      return null;
    }
  };

  const detail = await loadDetail();
  cache.set(url, detail);
  return detail;
}
/* ---------------- item builder (fast, no extra calls) ---------------- */
/**
 * Build a listing item from one product of the products query, without any
 * extra network calls.
 *
 * @param {object} p - Product record.
 * @param {object} ctx - Scan context, forwarded to tudorProductUrl.
 * @returns {{name: string, price: *, url: string, sku: string, img: string,
 *   _skuProbe: string}|null} The item, or null when the product lacks a
 *   name/slug or its chosen variant has a quantity of zero or less.
 *   `_skuProbe` carries the raw, un-normalized SKU.
 */
function tudorItemFromProductFast(p, ctx) {
  if (!p) return null;

  const name = cleanText(p?.name || "");
  const slug = String(p?.slug || "").trim();
  if (!(name && slug)) return null;

  const variant = tudorPickVariant(p);
  const outOfStock = Boolean(variant) && Number(variant?.quantity) <= 0;
  if (outOfStock) return null; // only keep in-stock

  const url = tudorProductUrl(ctx, slug);
  const price = money(variant?.price ?? p?.priceFrom ?? p?.priceTo);

  // Prefer the chosen variant's SKU; otherwise take any variant's SKU.
  const variantSku = String(variant?.sku || "").trim();
  const skuRaw = variantSku || pickAnySkuFromProduct(p);
  const sku = normalizeTudorSku(skuRaw);

  const imgSource = firstNonEmptyStr(
    variant?.image,
    p?.gulpImages,
    p?.posImages,
    p?.customImages,
    p?.imageIds
  );
  const img = normalizeAbsUrl(imgSource);

  return { name, price, url, sku, img, _skuProbe: skuRaw };
}
/* ---------------- repair (second pass, budgeted) ---------------- */
/**
 * Second-pass repair of one item, mutating it in place.
 *
 *  1. Missing/synthetic SKU: read the product page; adopt its SKU when that
 *     one is not synthetic, and its image when the item has none.
 *  2. Still no image: look the product up by its raw `_skuProbe`, if any.
 *  3. SKU still synthetic after both attempts: fall back to
 *     normalizeCspc(url), or "" when that yields nothing.
 *
 * @param {object} ctx - Scan context forwarded to the lookup helpers.
 * @param {object} it - Item with `url`, `sku`, `img` and optional `_skuProbe`.
 * @returns {Promise<object>} The same item object.
 */
async function tudorRepairItem(ctx, it) {
  // 1) HTML product page is the primary source for a real SKU.
  if (isSyntheticSku(it.sku)) {
    const detail = await tudorDetailFromProductPage(ctx, it.url);
    const pageSku = detail?.sku;
    if (pageSku && !isSyntheticSku(pageSku)) it.sku = pageSku;
    if (!it.img && detail?.img) it.img = detail.img;
  }

  // 2) Image still missing: limited productsBySku lookup, only with a probe.
  const probe = it.img ? "" : String(it._skuProbe || "").trim();
  if (probe) {
    const extra = await supplementImageFromSku(ctx, probe);
    if (extra?.img) it.img = extra.img;
  }

  // 3) Final fallback ONLY after the repair attempts (stability).
  if (isSyntheticSku(it.sku)) {
    it.sku = normalizeCspc(it.url) || "";
  }

  return it;
}
/* ---------------- scanner ---------------- */
/**
 * Scan one Tudor category in two passes and persist/report the result.
 *
 * Pass 1 pages through `fetchProductsPage` (cursor based, at most 500 pages),
 * builds items with `tudorItemFromProductFast`, seeds SKU/image from the
 * previous DB entry for the same URL, and queues items that still have a
 * synthetic SKU or no image. No detail requests are made inline.
 *
 * Pass 2 runs `tudorRepairItem` over the queue, bounded by the HTML and GQL
 * budgets (`ctx.config.tudorHtmlBudget` / `ctx.config.tudorGqlBudget`, falling
 * back to the module defaults when those are not finite numbers).
 *
 * Finally the discovered map is merged into the previous DB exactly once,
 * written to `ctx.dbFile`, and the counts are appended to `report`.
 *
 * @param {object} ctx - Scan context (config, logger, http, store, cat, dbFile).
 * @param {object} prevDb - Previous DB; `prevDb.byUrl.get(url)` is used when present.
 * @param {object} report - Mutated: `categories` gets one entry, `totals` is incremented.
 * @returns {Promise<void>}
 */
async function scanCategoryTudor(ctx, prevDb, report) {
  const t0 = Date.now();
  const discovered = new Map();
  const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500);
  let cursor = null;
  let done = 0;
  const needsDetail = [];

  // ---- first pass: listing pages only ----
  for (let page = 1; page <= maxPages; page++) {
    const tPage = Date.now();
    const prod = await fetchProductsPage(ctx, cursor);
    const arr = Array.isArray(prod?.items) ? prod.items : [];
    let kept = 0;

    for (const p of arr) {
      const it = tudorItemFromProductFast(p, ctx);
      if (!it) continue;

      // Seed from the cached DB to avoid repeating detail HTML requests.
      const prev = prevDb?.byUrl?.get(it.url) || null;
      if (prev) {
        it.sku = pickBetterSku(it.sku, prev.sku);
        if (!it.img && prev.img) it.img = prev.img;
      }

      // Queue only; detail calls happen in the budgeted second pass.
      if (isSyntheticSku(it.sku) || !it.img) needsDetail.push(it);
      discovered.set(it.url, it);
      kept++;
    }

    done++;
    const ms = Date.now() - tPage;
    ctx.logger.ok(
      `${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft(
        kept,
        3
      )} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}`
    );

    cursor = prod?.nextPageCursor || null;
    if (!cursor || !arr.length) break;
  }

  // ---- second pass: repair with budgets ----
  const htmlBudget = Number.isFinite(ctx.config.tudorHtmlBudget)
    ? ctx.config.tudorHtmlBudget
    : DETAIL_HTML_BUDGET_DEFAULT;
  const gqlBudget = Number.isFinite(ctx.config.tudorGqlBudget)
    ? ctx.config.tudorGqlBudget
    : DETAIL_GQL_BUDGET_DEFAULT;
  let htmlUsed = 0;
  let gqlUsed = 0;

  for (const it of needsDetail) {
    const wantsHtml = isSyntheticSku(it.sku);
    const wantsGql = !it.img && String(it._skuProbe || "").trim();

    // Enforce caps: skip only when every lookup this item wants is exhausted.
    // NOTE(review): an item wanting both lookups proceeds while either budget
    // remains, so a counter can run past its budget — confirm this is intended.
    if (wantsHtml && htmlUsed >= htmlBudget && (!wantsGql || gqlUsed >= gqlBudget)) continue;
    if (wantsGql && gqlUsed >= gqlBudget && (!wantsHtml || htmlUsed >= htmlBudget)) continue;

    // Count budgets pessimistically (before knowing whether a request is made).
    if (wantsHtml) htmlUsed++;
    if (wantsGql) gqlUsed++;

    await tudorRepairItem(ctx, it);
    discovered.set(it.url, it);
  }

  ctx.logger.ok(
    `${ctx.catPrefixOut} | Unique products: ${discovered.size} | detail(html=${htmlUsed}/${htmlBudget}, gql=${gqlUsed}/${gqlBudget})`
  );

  // ---- merge, persist, report (once per category) ----
  const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
    storeLabel: ctx.store.name,
  });
  const dbObj = buildDbObject(ctx, merged);
  writeJsonAtomic(ctx.dbFile, dbObj);

  const elapsed = Date.now() - t0;
  report.categories.push({
    store: ctx.store.name,
    label: ctx.cat.label,
    key: ctx.cat.key,
    dbFile: ctx.dbFile,
    scannedPages: done,
    discoveredUnique: discovered.size,
    newCount: newItems.length,
    updatedCount: updatedItems.length,
    removedCount: removedItems.length,
    restoredCount: restoredItems.length,
    elapsedMs: elapsed,
  });
  report.totals.newCount += newItems.length;
  report.totals.updatedCount += updatedItems.length;
  report.totals.removedCount += removedItems.length;
  report.totals.restoredCount += restoredItems.length;
  addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
}
/* ---------------- store ---------------- */

View file

@ -20,8 +20,9 @@ function normalizeUpcDigits(v) {
return m ? m[1] : "";
}
/**
 * Extract the first standalone run of 1 to 11 digits from a value.
 *
 * Short runs are accepted on purpose so that tiny product ids (for example
 * the "141" in "id:141") are preserved rather than dropped.
 *
 * @param {*} v - Any value; null/undefined are treated as "".
 * @returns {string} The matched digits, or "" when no such run exists.
 */
function normalizeIdDigits(v) {
  // \b on both sides: the digits must not be glued to other word characters,
  // so a run longer than 11 digits yields no match at all.
  const m = String(v ?? "").match(/\b(\d{1,11})\b/);
  return m ? m[1] : "";
}
@ -33,6 +34,35 @@ function makeSyntheticSkuKey({ storeLabel, url }) {
return `u:${fnv1a32(`${store}|${u}`)}`;
}
/* ---------------- NEW: SKU quality helpers ---------------- */
/**
 * Rank a SKU key by how trustworthy it is.
 *
 *   0 - missing, or synthetic ("u:" prefix)
 *   1 - any other explicit, non-synthetic string
 *   2 - "upc:" or "id:" prefixed key
 *   3 - value from which `normalizeCspc` extracts a CSPC (best)
 *
 * @param {*} v - Candidate SKU; null/undefined count as missing.
 * @returns {number} Quality rank in the range 0..3.
 */
function skuQuality(v) {
  const key = String(v ?? "").trim();

  const missingOrSynthetic = key === "" || /^u:/i.test(key);
  if (missingOrSynthetic) return 0;

  // CSPC check comes before the prefix checks, so it wins when both apply.
  if (normalizeCspc(key)) return 3;

  const hasTypedPrefix = /^upc:/i.test(key) || /^id:/i.test(key);
  return hasTypedPrefix ? 2 : 1;
}
/**
 * Choose between a freshly scraped SKU and the previously stored one.
 *
 * The key with the higher `skuQuality` rank wins. When both rank the same,
 * the stored key is kept for stability, unless it is empty, in which case
 * the new key is returned.
 *
 * @param {*} newSku - SKU from the current scan.
 * @param {*} oldSku - SKU already stored for the item.
 * @returns {string} The trimmed winning key ("" when both are empty).
 */
function pickBetterSku(newSku, oldSku) {
  const candidate = String(newSku ?? "").trim();
  const existing = String(oldSku ?? "").trim();

  const candidateRank = skuQuality(candidate);
  const existingRank = skuQuality(existing);

  if (candidateRank === existingRank) {
    // Tie: prefer the value we already had.
    return existing || candidate;
  }
  return candidateRank > existingRank ? candidate : existing;
}
/**
 * Tell whether a product page should be fetched to obtain a SKU.
 *
 * That is the case only when the current key is missing (empty after
 * trimming) or synthetic (starts with "u:", case-insensitive).
 *
 * @param {*} sku - Current SKU key; null/undefined count as missing.
 * @returns {boolean} True when a detail lookup is warranted.
 */
function needsSkuDetail(sku) {
  const key = String(sku ?? "").trim();
  if (key === "") return true;
  return /^u:/i.test(key);
}
/**
* Behavior:
* - CSPC 6-digit => "123456"
@ -63,4 +93,11 @@ function normalizeSkuKey(v, { storeLabel, url } = {}) {
return syn || "";
}
module.exports = { normalizeCspc, normalizeSkuKey, makeSyntheticSkuKey };
module.exports = {
normalizeCspc,
normalizeSkuKey,
makeSyntheticSkuKey,
skuQuality,
pickBetterSku,
needsSkuDetail,
};