diff --git a/src/stores/bcl.js b/src/stores/bcl.js index 92fc840..0f0d0a7 100644 --- a/src/stores/bcl.js +++ b/src/stores/bcl.js @@ -80,7 +80,6 @@ function bclIsInStock(src) { return true; } - function bclNormalizeAbsUrl(raw) { const s = String(raw || "").trim(); if (!s) return ""; @@ -141,7 +140,17 @@ function bclHitToItem(hit) { const regular = asNumber(src.regularPrice); const price = cad(Number.isFinite(current) ? current : regular); - const sku = normalizeCspc(url); + // SKU key: + // - Keep CSPC 6-digit when present (rare for BCL, but safe) + // - Otherwise upgrade to an explicit soft key: id: + // + // ✅ PATCH: handle tiny SKUs too (3/4/5-digit) by forcing id: + // only fall back to raw (NOT u:) if it’s genuinely non-numeric. + let sku = normalizeCspc(skuRaw); + if (!sku) { + const m = skuRaw.match(/^\d{1,6}$/); // BCL product IDs like 141, 596, 984, 117, etc. + sku = m ? `id:${m[0]}` : `id:${skuRaw}`; + } const inStock = bclIsInStock(src); if (!inStock) return null; @@ -155,8 +164,6 @@ function bclHitToItem(hit) { return { name, price, url, sku, img }; } - - async function bclFetchBrowsePage(ctx, page1, size) { const type = ctx.cat.bclType; // e.g. "rum" or "whisky / whiskey" const category = "spirits"; @@ -293,11 +300,12 @@ async function scanCategoryBCLAjax(ctx, prevDb, report) { newCount: newItems.length, updatedCount: updatedItems.length, removedCount: removedItems.length, - restoredCount: removedItems.length, + restoredCount: restoredItems.length, elapsedMs: elapsed, }); report.totals.newCount += newItems.length; report.totals.updatedCount += updatedItems.length; + report.totals.updatedCount += updatedItems.length; report.totals.removedCount += removedItems.length; report.totals.restoredCount += restoredItems.length; diff --git a/src/stores/craftcellars.js b/src/stores/craftcellars.js index 2f0a052..49f45e1 100644 --- a/src/stores/craftcellars.js +++ b/src/stores/craftcellars.js @@ -5,7 +5,7 @@ const { setTimeout: sleep } = require("timers/promises"); const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html"); const { sanitizeName } = require("../utils/text"); -const { normalizeCspc } = require("../utils/sku"); +const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku"); const { makePageUrlShopifyQueryPage } = require("../utils/url"); const { mergeDiscoveredIntoDb } = require("../tracker/merge"); @@ -33,7 +33,9 @@ function canonicalizeCraftProductUrl(raw) { function extractShopifyCardPrice(block) { const b = String(block || ""); const dollars = (txt) => - [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, "")); + [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => + m[0].replace(/\s+/g, "") + ); const saleRegion = b.split(/sale price/i)[1] || ""; const saleD = dollars(saleRegion); @@ -50,8 +52,14 @@ function extractShopifyCardPrice(block) { function parseProductsCraftCellars(html, ctx) { const s = String(html || ""); - const g1 = s.match(/]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || ""; - const g2 = s.match(/]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i)?.[0] || ""; + const g1 = + s.match( + /]*id=["']ProductGridContainer["'][^>]*>[\s\S]*?<\/div>/i + )?.[0] || ""; + const g2 = + s.match( + /]*id=["']product-grid["'][^>]*>[\s\S]*?<\/div>/i + )?.[0] || ""; const gridCandidate = g1.length > g2.length ? g1 : g2; const grid = /\/products\//i.test(gridCandidate) ? gridCandidate : s; @@ -63,18 +71,24 @@ function parseProductsCraftCellarsInner(html, ctx) { const s = String(html || ""); const items = []; - let blocks = [...s.matchAll(/]*>[\s\S]*?<\/li>/gi)].map((m) => m[0]); + let blocks = [...s.matchAll(/]*>[\s\S]*?<\/li>/gi)].map( + (m) => m[0] + ); if (blocks.length < 5) { - blocks = [...s.matchAll(/]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi)].map( - (m) => m[0] - ); + blocks = [ + ...s.matchAll( + /]*class=["'][^"']*\bcard\b[^"']*["'][^>]*>[\s\S]*?<\/div>/gi + ), + ].map((m) => m[0]); } const base = `https://${(ctx && ctx.store && ctx.store.host) || "craftcellars.ca"}/`; for (const block of blocks) { const href = - block.match(/]*href=["']([^"']*\/products\/[^"']+)["']/i)?.[1] || + block.match( + /]*href=["']([^"']*\/products\/[^"']+)["']/i + )?.[1] || block.match(/href=["']([^"']*\/products\/[^"']+)["']/i)?.[1]; if (!href) continue; @@ -87,9 +101,15 @@ function parseProductsCraftCellarsInner(html, ctx) { url = canonicalizeCraftProductUrl(url); const nameHtml = - block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] || - block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1]; + block.match( + /]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i + )?.[1] || + block.match( + /]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i + )?.[1]; const name = sanitizeName(stripTags(decodeHtml(nameHtml || ""))); if (!name) continue; @@ -108,37 +128,58 @@ function parseProductsCraftCellarsInner(html, ctx) { function usdFromShopifyPriceStr(s) { const n = Number(String(s || "").replace(/[^0-9.]/g, "")); if (!Number.isFinite(n)) return ""; - return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`; + return `$${n.toLocaleString("en-US", { + minimumFractionDigits: 2, + maximumFractionDigits: 2, + })}`; } function cfgNum(v, fallback) { return Number.isFinite(v) ? v : fallback; } +/* ---------- NEW: product page SKU extractor ---------- */ +function extractCraftSkuFromProductPageHtml(html) { + const s = String(html || ""); + + const m = + s.match( + /\s*SKU:\s*<\/strong>\s*\s*([^<]{1,80}?)\s*<\/span>/i + ) || + s.match(/\bSKU:\s*<\/strong>\s*\s*([^<]{1,80}?)\s*<\/span>/i) || + s.match(/\bSKU:\s*([A-Za-z0-9][A-Za-z0-9\-_/ ]{0,40})/i); + + const raw = m && m[1] ? stripTags(decodeHtml(m[1])) : ""; + return normalizeCspc(raw); +} + /** * Craft Cellars: - * - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in) - * - Shopify products.json is used only to enrich SKU (and optionally price) for those allowed URLs + * - HTML listing with ?filter.v.availability=1 is the allowlist + * - products.json enriches SKU/price + * - product page HTML is final SKU fallback */ async function scanCategoryCraftCellars(ctx, prevDb, report) { const t0 = Date.now(); - // Strongly prefer "slow and steady" to avoid 429s. - // Use per-category knobs if present; otherwise default conservative. - const perPageDelayMs = Math.max( - 0, - cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0 - ) || 0; + const perPageDelayMs = + Math.max( + 0, + cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) + ) || 0; const perJsonPageDelayMs = Math.max( 0, cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs) ); - // 1) HTML scan: allowlist of in-stock listing URLs - const htmlMap = new Map(); // url -> {name, price, url, img} + const htmlMap = new Map(); + + const maxPages = + ctx.config.maxPages === null + ? 200 + : Math.min(ctx.config.maxPages, 200); - const maxPages = ctx.config.maxPages === null ? 200 : Math.min(ctx.config.maxPages, 200); let htmlPagesFetched = 0; let emptyStreak = 0; @@ -146,7 +187,11 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs); const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p); - const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua); + const { text: html } = await ctx.http.fetchTextWithRetry( + pageUrl, + `craft:html:${ctx.cat.key}:p${p}`, + ctx.store.ua + ); htmlPagesFetched++; if (craftCellarsIsEmptyListingPage(html)) break; @@ -162,22 +207,30 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { for (const it of items) { const url = canonicalizeCraftProductUrl(it.url); if (!url) continue; - htmlMap.set(url, { name: it.name || "", price: it.price || "", url, img: it.img || "" }); + htmlMap.set(url, { + name: it.name || "", + price: it.price || "", + url, + img: it.img || "", + }); } } - // If HTML returns nothing, don't let JSON invent a category if (!htmlMap.size) { - ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`); + ctx.logger.warn( + `${ctx.catPrefixOut} | HTML listing returned 0 items; refusing JSON-only discovery` + ); } - // 2) JSON scan: build SKU index (but do NOT add new URLs from JSON) - const jsonMap = new Map(); // url -> { sku, price, img } + const jsonMap = new Map(); if (htmlMap.size) { const start = new URL(ctx.cat.startUrl); const m = start.pathname.match(/^\/collections\/([^/]+)/i); - if (!m) throw new Error(`CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}`); + if (!m) + throw new Error( + `CraftCellars: couldn't extract collection handle from ${ctx.cat.startUrl}` + ); const collectionHandle = m[1]; const limit = 250; @@ -185,12 +238,19 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { let jsonPagesFetched = 0; while (true) { - if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs); + if (jsonPage > 1 && perJsonPageDelayMs > 0) + await sleep(perJsonPageDelayMs); const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`; - const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua); + const r = await ctx.http.fetchJsonWithRetry( + url, + `craft:coljson:${ctx.cat.key}:p${jsonPage}`, + ctx.store.ua + ); - const products = Array.isArray(r?.json?.products) ? r.json.products : []; + const products = Array.isArray(r?.json?.products) + ? r.json.products + : []; jsonPagesFetched++; if (!products.length) break; @@ -199,73 +259,116 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { const handle = String(p?.handle || ""); if (!handle) continue; - const prodUrl = canonicalizeCraftProductUrl(`https://${ctx.store.host}/products/${handle}`); - - // Only enrich if it's on the HTML allowlist + const prodUrl = canonicalizeCraftProductUrl( + `https://${ctx.store.host}/products/${handle}` + ); if (!htmlMap.has(prodUrl)) continue; const variants = Array.isArray(p?.variants) ? p.variants : []; - const v = variants.find((x) => x && x.available === true) || variants[0] || null; + const v = + variants.find((x) => x && x.available === true) || + variants[0] || + null; const sku = normalizeCspc(v?.sku || ""); const price = v?.price ? usdFromShopifyPriceStr(v.price) : ""; - // Product image (best effort) let img = ""; const images = Array.isArray(p?.images) ? p.images : []; if (images[0]) { - if (typeof images[0] === "string") img = images[0]; - else img = String(images[0]?.src || images[0]?.url || ""); + img = + typeof images[0] === "string" + ? images[0] + : String(images[0]?.src || images[0]?.url || ""); } - if (!img && p?.image) img = String(p.image?.src || p.image?.url || p.image || ""); + if (!img && p?.image) + img = String(p.image?.src || p.image?.url || p.image || ""); img = String(img || "").trim(); if (img.startsWith("//")) img = `https:${img}`; - if (img && !/^https?:\/\//i.test(img)) { - try { - img = new URL(img, `https://${ctx.store.host}/`).toString(); - } catch { - // keep as-is - } - } jsonMap.set(prodUrl, { sku, price, img }); } if (products.length < limit) break; - jsonPage++; - if (jsonPage > 200) break; // safety + if (++jsonPage > 200) break; } - ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}`); - } else { - ctx.logger.ok(`${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=0`); + ctx.logger.ok( + `${ctx.catPrefixOut} | HTML pages=${htmlPagesFetched} JSON pages=${jsonPagesFetched}` + ); } - // 3) Final discovered: HTML allowlist, enriched by JSON const discovered = new Map(); for (const [url, it] of htmlMap.entries()) { const j = jsonMap.get(url); + const prev = prevDb?.byUrl?.get(url) || null; + discovered.set(url, { - name: it.name || "", - // Prefer JSON price (normalized) when present, else keep HTML price (already formatted) + name: it.name, price: j?.price || it.price || "", url, - sku: j?.sku || "", - img: j?.img || it.img || "", + // reuse cached SKU unless we found something better this run + sku: pickBetterSku(j?.sku || "", prev?.sku || ""), + // reuse cached image if we didn't find one + img: (j?.img || it.img || prev?.img || ""), }); } - ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`); + /* ---------- NEW: product page SKU fallback (cached; only when needed) ---------- */ + const perProductSkuDelayMs = Math.max( + 0, + cfgNum( + ctx?.cat?.skuPageDelayMs, + cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs) + ) + ); - const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { + let skuPagesFetched = 0; + + for (const it of discovered.values()) { + // only hit product pages when missing/synthetic + if (!needsSkuDetail(it.sku)) continue; + + if (perProductSkuDelayMs > 0) await sleep(perProductSkuDelayMs); + + try { + const { text } = await ctx.http.fetchTextWithRetry( + it.url, + `craft:prodpage:${ctx.cat.key}:${Buffer.from(it.url) + .toString("base64") + .slice(0, 24)}`, + ctx.store.ua + ); + skuPagesFetched++; + + const sku = extractCraftSkuFromProductPageHtml(text); + if (sku) it.sku = sku; + } catch { + /* best effort */ + } + } + + ctx.logger.ok( + `${ctx.catPrefixOut} | SKU fallback pages=${skuPagesFetched}` + ); + + ctx.logger.ok( + `${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}` + ); + + const { + merged, + newItems, + updatedItems, + removedItems, + restoredItems, + } = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name, }); const dbObj = buildDbObject(ctx, merged); writeJsonAtomic(ctx.dbFile, dbObj); - ctx.logger.ok(`${ctx.catPrefixOut} | DB saved: ${ctx.logger.dim(ctx.dbFile)} (${dbObj.count} items)`); - const elapsed = Date.now() - t0; report.categories.push({ @@ -287,7 +390,15 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { report.totals.removedCount += removedItems.length; report.totals.restoredCount += restoredItems.length; - addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems); + addCategoryResultToReport( + report, + ctx.store.name, + ctx.cat.label, + newItems, + updatedItems, + removedItems, + restoredItems + ); } function createStore(defaultUa) { @@ -297,10 +408,8 @@ function createStore(defaultUa) { host: "craftcellars.ca", ua: defaultUa, - // ✅ Custom scan (HTML allowlist + JSON enrichment) scanCategory: scanCategoryCraftCellars, - // Keep HTML parser for debugging parseProducts: parseProductsCraftCellars, makePageUrl: makePageUrlShopifyQueryPage, isEmptyListingPage: craftCellarsIsEmptyListingPage, @@ -309,69 +418,22 @@ function createStore(defaultUa) { { key: "whisky", label: "Whisky", - startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1", - - // slow-and-safe defaults (override globally if you want) - discoveryStartPage: 3, - discoveryStep: 2, + startUrl: + "https://craftcellars.ca/collections/whisky?filter.v.availability=1", pageConcurrency: 1, pageStaggerMs: 10000, discoveryDelayMs: 10000, + skuPageDelayMs: 12000, }, { key: "rum", label: "Rum", - startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1", - - discoveryStartPage: 3, - discoveryStep: 2, - pageConcurrency: 1, - pageStaggerMs: 10000, - discoveryDelayMs: 10000, - }, - { - key: "single-malt-scotch", - label: "Single Malt Scotch", - startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1", - - discoveryStartPage: 3, - discoveryStep: 2, - pageConcurrency: 1, - pageStaggerMs: 10000, - discoveryDelayMs: 10000, - }, - { - key: "other-scotch-styles", - label: "Other Scotch Styles", - startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1", - - discoveryStartPage: 3, - discoveryStep: 2, - pageConcurrency: 1, - pageStaggerMs: 10000, - discoveryDelayMs: 10000, - }, - { - key: "single-grain-scotch", - label: "Single Grain Scotch", - startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1", - - discoveryStartPage: 3, - discoveryStep: 2, - pageConcurrency: 1, - pageStaggerMs: 10000, - discoveryDelayMs: 10000, - }, - { - key: "blended-malt-scotch", - label: "Blended Malt Scotch", - startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1", - - discoveryStartPage: 3, - discoveryStep: 2, + startUrl: + "https://craftcellars.ca/collections/rum?filter.v.availability=1", pageConcurrency: 1, pageStaggerMs: 10000, discoveryDelayMs: 10000, + skuPageDelayMs: 12000, }, ], }; diff --git a/src/stores/gull.js b/src/stores/gull.js index c5a9c14..deeaedd 100644 --- a/src/stores/gull.js +++ b/src/stores/gull.js @@ -1,7 +1,8 @@ +// src/stores/gull.js "use strict"; const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html"); -const { normalizeCspc } = require("../utils/sku"); +const { normalizeCspc, pickBetterSku, needsSkuDetail } = require("../utils/sku"); const { makePageUrl } = require("../utils/url"); function looksInStock(block) { @@ -45,6 +46,133 @@ function extractGullPriceFromBlock(block) { return `$${chosen.toFixed(2)}`; } +// Gull SKUs are often NOT 6 digits (e.g. 67424). +// If it's not 6 digits, represent as id: to avoid normalizeCspc turning it into u:SHA. +function normalizeGullSku(raw) { + const s = cleanText(decodeHtml(String(raw || ""))).trim(); + + // already in a stable prefixed form + if (/^(id:|u:)/i.test(s)) return s; + + // digits-only SKU (from page / tile) + const digits = s.match(/\b(\d{3,10})\b/)?.[1] || ""; + if (digits) { + if (digits.length === 6) return normalizeCspc(digits); + return `id:${digits}`; + } + + // fall back to existing normalizer (may yield u:...) + return normalizeCspc(s); +} + +// When we fall back to normalizeCspc(url), we may end up with a generated u:XXXXXXXX. +function isGeneratedUrlSku(sku) { + const s = String(sku || ""); + // you have u:8hex in the DB, so accept 8+ + return /^u:[0-9a-f]{8,128}$/i.test(s); +} + +// Extract SKU from Gull product page HTML. +function extractGullSkuFromProductPage(html) { + const s = String(html || ""); + + // Most reliable: 67424 + const m1 = s.match( + /]*class=["'][^"']*\bsku\b[^"']*["'][^>]*>\s*([0-9]{3,10})\s*<\/span>/i + ); + if (m1?.[1]) return normalizeGullSku(m1[1]); + + // Fallback: "SKU: 67424" text + const m2 = s.match(/\bSKU:\s*([0-9]{3,10})\b/i); + if (m2?.[1]) return normalizeGullSku(m2[1]); + + return ""; +} + +// Serial limiter: ensures at least minIntervalMs between request starts. +function createMinIntervalLimiter(minIntervalMs) { + let lastStart = 0; + let chain = Promise.resolve(); + + return async function schedule(fn) { + chain = chain.then(async () => { + const now = Date.now(); + const waitMs = Math.max(0, lastStart + minIntervalMs - now); + if (waitMs) await new Promise((r) => setTimeout(r, waitMs)); + lastStart = Date.now(); + return fn(); + }); + return chain; + }; +} + +async function fetchWith429Backoff(url, { fetchFn, headers, maxRetries = 2 }) { + let attempt = 0; + + while (true) { + const res = await fetchFn(url, { headers }); + + if (res.status !== 429) { + if (!res.ok) throw new Error(`HTTP ${res.status} fetching ${url}`); + return await res.text(); + } + + if (attempt >= maxRetries) throw new Error(`HTTP 429 fetching ${url}`); + + // Respect Retry-After if present; otherwise progressive backoff. + const ra = + res.headers && typeof res.headers.get === "function" + ? res.headers.get("retry-after") + : null; + + const waitSec = ra && /^\d+$/.test(ra) ? parseInt(ra, 10) : 15 * (attempt + 1); + await new Promise((r) => setTimeout(r, waitSec * 1000)); + attempt++; + } +} + +/** + * Only fetches product pages for items whose sku is a generated u:... (from URL fallback). + * Runs serially + slowly to avoid Gull 429s. + * + * NEW: accepts prevDb so we can skip fetch if URL already has a good SKU cached. + */ +async function hydrateGullSkus( + items, + { fetchFn, ua, minIntervalMs = 12000, maxRetries = 2, prevDb } = {} +) { + if (!fetchFn) throw new Error("hydrateGullSkus requires opts.fetchFn"); + + const schedule = createMinIntervalLimiter(minIntervalMs); + + const headers = { + "user-agent": ua || "Mozilla/5.0", + accept: "text/html,application/xhtml+xml", + }; + + for (const it of items || []) { + if (!it || !it.url) continue; + + // NEW: if DB already has a good SKU, reuse it and skip fetch + const prev = prevDb?.byUrl?.get(it.url) || null; + if (prev?.sku && !needsSkuDetail(prev.sku)) { + it.sku = pickBetterSku(it.sku, prev.sku); + continue; + } + + if (!isGeneratedUrlSku(it.sku)) continue; // only where required + + const html = await schedule(() => + fetchWith429Backoff(it.url, { fetchFn, headers, maxRetries }) + ); + + const realSku = extractGullSkuFromProductPage(html); + if (realSku) it.sku = pickBetterSku(realSku, it.sku); + } + + return items; +} + function parseProductsGull(html, ctx) { const s = String(html || ""); const items = []; @@ -82,11 +210,12 @@ function parseProductsGull(html, ctx) { const price = extractGullPriceFromBlock(block); - const sku = normalizeCspc( + const skuRaw = block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] || - block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] || - url - ); + block.match(/\bSKU\b[^0-9]{0,30}(\d{3,10})\b/i)?.[1] || + url; // OK fallback; hydrateGullSkus will only re-fetch when this becomes u:... + + const sku = normalizeGullSku(skuRaw); const img = extractFirstImgUrl(block, base); @@ -98,7 +227,6 @@ function parseProductsGull(html, ctx) { return [...uniq.values()]; } - function createStore(defaultUa) { return { key: "gull", @@ -106,12 +234,19 @@ function createStore(defaultUa) { host: "gullliquorstore.com", ua: defaultUa, parseProducts: parseProductsGull, + + // Optional hook callers can use to post-process items: + // only hits product pages when sku is u:... + hydrateSkus: hydrateGullSkus, + productPageMinIntervalMs: 12000, // slow by default; Gull is strict + makePageUrl, // enables /page/N/ paging categories: [ { key: "whisky", label: "Whisky", - startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky", + startUrl: + "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky", discoveryStartPage: 3, discoveryStep: 2, pageConcurrency: 1, @@ -121,7 +256,8 @@ function createStore(defaultUa) { { key: "rum", label: "Rum", - startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum", + startUrl: + "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum", discoveryStartPage: 3, discoveryStep: 2, pageConcurrency: 1, @@ -132,4 +268,11 @@ function createStore(defaultUa) { }; } -module.exports = { createStore, parseProductsGull }; +module.exports = { + createStore, + parseProductsGull, + hydrateGullSkus, + extractGullSkuFromProductPage, + isGeneratedUrlSku, + normalizeGullSku, +}; diff --git a/src/stores/tudor.js b/src/stores/tudor.js index 1a60fa1..54f41bc 100644 --- a/src/stores/tudor.js +++ b/src/stores/tudor.js @@ -1,7 +1,7 @@ "use strict"; const { cleanText } = require("../utils/html"); -const { normalizeCspc } = require("../utils/sku"); +const { normalizeCspc, pickBetterSku } = require("../utils/sku"); const { humanBytes } = require("../utils/bytes"); const { padLeft, padRight } = require("../utils/string"); @@ -67,6 +67,34 @@ function normalizeAbsUrl(raw) { } } +// Treat u:* as synthetic (URL-hash fallback) and eligible for repair. +function isSyntheticSku(sku) { + const s = String(sku || "").trim(); + return !s || /^u:/i.test(s); +} + +// If SKU is <6 chars, namespace it (per your request) to reduce collisions. +// Also: DO NOT run numeric SKUs through normalizeCspc (some normalizers hash arbitrary strings). +function normalizeTudorSku(rawSku) { + const s = String(rawSku || "").trim(); + if (!s) return ""; + + if (/^id:/i.test(s)) return s; + if (/^u:/i.test(s)) return s; + + // numeric SKU like 67433 + if (/^\d+$/.test(s)) { + return s.length < 6 ? `id:${s}` : s; + } + + // short alnum SKU -> namespace + if (s.length < 6) return `id:${s}`; + + // for other formats, keep your existing normalization + // (if normalizeCspc returns empty, fall back to the raw string) + return normalizeCspc(s) || s; +} + function tudorProductUrl(ctx, slug) { // Site URLs look like: /TUDOR_HOUSE_0/product/spirits// const root = ctx?.cat?.tudorRootSlug || "spirits"; @@ -82,33 +110,23 @@ function tudorPickVariant(p) { return inStock || vs[0] || null; } -function tudorItemFromProduct(p, ctx) { - if (!p) return null; - - const name = cleanText(p?.name || ""); - const slug = String(p?.slug || "").trim(); - if (!name || !slug) return null; - - const v = tudorPickVariant(p); - if (v && Number(v?.quantity) <= 0) return null; // only keep in-stock - - const url = tudorProductUrl(ctx, slug); - - const price = money(v?.price ?? p?.priceFrom ?? p?.priceTo); - const sku = normalizeCspc(v?.sku || ""); - const img = normalizeAbsUrl( - firstNonEmptyStr( - v?.image, - p?.gulpImages, - p?.posImages, - p?.customImages, - p?.imageIds - ) - ); - - return { name, price, url, sku, img }; +function pickAnySkuFromProduct(p) { + const vs = Array.isArray(p?.variants) ? p.variants : []; + for (const v of vs) { + const s = String(v?.sku || "").trim(); + if (s) return s; + } + return ""; } +function pickInStockVariantWithFallback(p) { + const vs = Array.isArray(p?.variants) ? p.variants : []; + const inStock = vs.find((v) => Number(v?.quantity) > 0); + return inStock || vs[0] || null; +} + +/* ---------------- GraphQL ---------------- */ + async function tudorGql(ctx, label, query, variables) { return await ctx.http.fetchJsonWithRetry(GQL_URL, label, ctx.store.ua, { method: "POST", @@ -122,15 +140,7 @@ async function tudorGql(ctx, label, query, variables) { }); } -function pickConnection(json) { - const data = json?.data; - if (!data || typeof data !== "object") return null; - for (const v of Object.values(data)) { - if (v && typeof v === "object" && Array.isArray(v.items)) return v; - } - return null; -} - +/* ---------------- GQL queries ---------------- */ const PRODUCTS_QUERY = ` query( @@ -170,15 +180,14 @@ const PRODUCTS_QUERY = ` isStaffPick: $isStaffPick, pageCursor: $pageCursor, pageLimit: $pageLimit, - pointsMin: $pointsMin, + sortBy: $sortBy, + sortOrder: $sortOrder, priceMin: $priceMin, priceMax: $priceMax, quantityMin: $quantityMin, regions: $regions, brandValue: $brandValue, searchValue: $searchValue, - sortOrder: $sortOrder, - sortBy: $sortBy, storeId: $storeId, ) { items { @@ -199,6 +208,31 @@ const PRODUCTS_QUERY = ` } `; +// ONLY for limited image supplementation (within a small budget) +const PRODUCTS_BY_SKU_QUERY = ` + query( + $sku: String!, + $storeId: String + ) { + productsBySku( + sku: $sku, + storeId: $storeId + ) { + items { + id + slug + imageIds + posImages + customImages + gulpImages + variants { id image price quantity sku deposit } + } + nextPageCursor + totalCount + } + } +`; + async function fetchProductsPage(ctx, cursor) { const vars = { storeId: STORE_ID, @@ -224,78 +258,291 @@ async function fetchProductsPage(ctx, cursor) { return r.json.data.products; } +/* ---------------- GQL bySku helper (image-only within budget) ---------------- */ + +async function fetchProductBySku(ctx, sku) { + const s = String(sku || "").trim(); + if (!s) return null; + + if (!ctx._tudorSkuCache) ctx._tudorSkuCache = new Map(); + if (ctx._tudorSkuCache.has(s)) return ctx._tudorSkuCache.get(s); + + const r = await tudorGql(ctx, `tudor:gql:bySku:${ctx.cat.key}:${s}`, PRODUCTS_BY_SKU_QUERY, { + sku: s, + storeId: STORE_ID, + }); + + let out = null; + if (r?.status === 200 && r?.json?.data?.productsBySku?.items?.length) { + out = r.json.data.productsBySku.items[0] || null; + } + + ctx._tudorSkuCache.set(s, out); + return out; +} + +async function supplementImageFromSku(ctx, skuProbe) { + const prod = await fetchProductBySku(ctx, skuProbe); + if (!prod) return null; + + const v = pickInStockVariantWithFallback(prod); + const img = normalizeAbsUrl( + firstNonEmptyStr(v?.image, prod?.gulpImages, prod?.posImages, prod?.customImages, prod?.imageIds) + ); + + return img ? { img } : null; +} + +/* ---------------- HTML product page fallback (SKU + optional image) ---------------- */ + +// Budgets (per category run). Override via ctx.config.tudorHtmlBudget / ctx.config.tudorGqlBudget. +const DETAIL_HTML_BUDGET_DEFAULT = 200; +const DETAIL_GQL_BUDGET_DEFAULT = 10; + +function parseSkuFromHtml(html) { + const s = String(html || ""); + + // 1) Visible block:
SKU: 67433
+ const m1 = + s.match(/>\s*SKU:\s*([A-Za-z0-9._-]+)\s*]*content=["']([^"']+)["']/i) || + s.match(/name=["']twitter:image["'][^>]*content=["']([^"']+)["']/i); + return m ? String(m[1] || "").trim() : ""; +} + +async function tudorFetchHtml(ctx, label, url) { + // Use ctx.http so pacing/throttle is respected. + if (ctx?.http?.fetchTextWithRetry) { + return await ctx.http.fetchTextWithRetry(url, label, ctx.store.ua, { + method: "GET", + headers: { + Accept: "text/html,application/xhtml+xml", + Referer: `${BASE}/`, + }, + }); + } + + // Best-effort fallback if your wrapper has a generic fetchWithRetry. + if (ctx?.http?.fetchWithRetry) { + const r = await ctx.http.fetchWithRetry(url, label, ctx.store.ua, { + method: "GET", + headers: { + Accept: "text/html,application/xhtml+xml", + Referer: `${BASE}/`, + }, + }); + + const body = r?.text ?? r?.body ?? r?.data ?? ""; + const text = + typeof body === "string" + ? body + : Buffer.isBuffer(body) + ? body.toString("utf8") + : body && typeof body === "object" && typeof body.toString === "function" + ? body.toString() + : ""; + + return { status: r?.status, text, bytes: r?.bytes, ms: r?.ms }; + } + + throw new Error("No HTML fetch method available on ctx.http (need fetchTextWithRetry or fetchWithRetry)."); +} + +async function tudorDetailFromProductPage(ctx, url) { + if (!ctx._tudorHtmlCache) ctx._tudorHtmlCache = new Map(); + if (ctx._tudorHtmlCache.has(url)) return ctx._tudorHtmlCache.get(url); + + let out = null; + try { + const r = await tudorFetchHtml(ctx, `tudor:html:${ctx.cat.key}`, url); + if (r?.status === 200 && typeof r?.text === "string" && r.text.length) { + const rawSku = parseSkuFromHtml(r.text); + const sku = normalizeTudorSku(rawSku); + const img = normalizeAbsUrl(parseOgImageFromHtml(r.text)); + out = { sku, img }; + } + } catch { + out = null; + } + + ctx._tudorHtmlCache.set(url, out); + return out; +} + +/* ---------------- item builder (fast, no extra calls) ---------------- */ + +function tudorItemFromProductFast(p, ctx) { + if (!p) return null; + + const name = cleanText(p?.name || ""); + const slug = String(p?.slug || "").trim(); + if (!name || !slug) return null; + + const v = tudorPickVariant(p); + if (v && Number(v?.quantity) <= 0) return null; // only keep in-stock + + const url = tudorProductUrl(ctx, slug); + const price = money(v?.price ?? p?.priceFrom ?? p?.priceTo); + + const skuRaw = String(v?.sku || "").trim() || pickAnySkuFromProduct(p); + const sku = normalizeTudorSku(skuRaw); + + const img = normalizeAbsUrl( + firstNonEmptyStr(v?.image, p?.gulpImages, p?.posImages, p?.customImages, p?.imageIds) + ); + + return { name, price, url, sku, img, _skuProbe: skuRaw }; +} + +/* ---------------- repair (second pass, budgeted) ---------------- */ + +async function tudorRepairItem(ctx, it) { + // 1) Missing or synthetic SKU -> HTML product page (fastest path to real SKU) + if (isSyntheticSku(it.sku)) { + const d = await tudorDetailFromProductPage(ctx, it.url); + if (d?.sku && !isSyntheticSku(d.sku)) it.sku = d.sku; + if (!it.img && d?.img) it.img = d.img; + } + + // 2) Missing image -> if we have a sku probe, do limited productsBySku + if (!it.img) { + const skuProbe = String(it._skuProbe || "").trim(); + if (skuProbe) { + const supp = await supplementImageFromSku(ctx, skuProbe); + if (supp?.img) it.img = supp.img; + } + } + + // Final fallback ONLY after repair attempts (stability) + if (isSyntheticSku(it.sku)) it.sku = normalizeCspc(it.url) || ""; + + return it; +} /* ---------------- scanner ---------------- */ async function scanCategoryTudor(ctx, prevDb, report) { - const t0 = Date.now(); - const discovered = new Map(); - - const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500); - let cursor = null; - let done = 0; - - for (let page = 1; page <= maxPages; page++) { - const tPage = Date.now(); - - const prod = await fetchProductsPage(ctx, cursor); - const arr = Array.isArray(prod?.items) ? prod.items : []; - - let kept = 0; - for (const p of arr) { - const it = tudorItemFromProduct(p, ctx); - if (!it) continue; - discovered.set(it.url, it); - kept++; + const t0 = Date.now(); + const discovered = new Map(); + + const maxPages = ctx.config.maxPages === null ? 500 : Math.min(ctx.config.maxPages, 500); + let cursor = null; + let done = 0; + + const needsDetail = []; + + for (let page = 1; page <= maxPages; page++) { + const tPage = Date.now(); + + const prod = await fetchProductsPage(ctx, cursor); + const arr = Array.isArray(prod?.items) ? prod.items : []; + + let kept = 0; + for (const p of arr) { + const it = tudorItemFromProductFast(p, ctx); + if (!it) continue; + + // NEW: seed from cached DB to avoid repeating detail HTML + const prev = prevDb?.byUrl?.get(it.url) || null; + if (prev) { + it.sku = pickBetterSku(it.sku, prev.sku); + if (!it.img && prev.img) it.img = prev.img; } - - done++; - - const ms = Date.now() - tPage; - ctx.logger.ok( - `${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft( - kept, - 3 - )} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}` - ); - - cursor = prod?.nextPageCursor || null; - if (!cursor || !arr.length) break; + + // queue only; do not do detail calls inline + if (isSyntheticSku(it.sku) || !it.img) needsDetail.push(it); + + discovered.set(it.url, it); + kept++; } - - ctx.logger.ok(`${ctx.catPrefixOut} | Unique products: ${discovered.size}`); - - const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { - storeLabel: ctx.store.name, - }); - - const dbObj = buildDbObject(ctx, merged); - writeJsonAtomic(ctx.dbFile, dbObj); - - const elapsed = Date.now() - t0; - - report.categories.push({ - store: ctx.store.name, - label: ctx.cat.label, - key: ctx.cat.key, - dbFile: ctx.dbFile, - scannedPages: done, - discoveredUnique: discovered.size, - newCount: newItems.length, - updatedCount: updatedItems.length, - removedCount: removedItems.length, - restoredCount: restoredItems.length, - elapsedMs: elapsed, - }); - - report.totals.newCount += newItems.length; - report.totals.updatedCount += updatedItems.length; - report.totals.removedCount += removedItems.length; - report.totals.restoredCount += restoredItems.length; - - addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems); + + done++; + + const ms = Date.now() - tPage; + ctx.logger.ok( + `${ctx.catPrefixOut} | Page ${pageStr(page, maxPages)} | 200 | items=${padLeft( + kept, + 3 + )} | bytes=${kbStr(0)} | ${padRight(ctx.http.inflightStr(), 11)} | ${secStr(ms)}` + ); + + cursor = prod?.nextPageCursor || null; + if (!cursor || !arr.length) break; } - + + // second pass: repair with budgets + const htmlBudget = Number.isFinite(ctx.config.tudorHtmlBudget) + ? ctx.config.tudorHtmlBudget + : DETAIL_HTML_BUDGET_DEFAULT; + + const gqlBudget = Number.isFinite(ctx.config.tudorGqlBudget) + ? ctx.config.tudorGqlBudget + : DETAIL_GQL_BUDGET_DEFAULT; + + let htmlUsed = 0; + let gqlUsed = 0; + + for (const it of needsDetail) { + const wantsHtml = isSyntheticSku(it.sku); + const wantsGql = !it.img && String(it._skuProbe || "").trim(); + + // enforce caps + if (wantsHtml && htmlUsed >= htmlBudget && (!wantsGql || gqlUsed >= gqlBudget)) continue; + if (wantsGql && gqlUsed >= gqlBudget && (!wantsHtml || htmlUsed >= htmlBudget)) continue; + + // count budgets pessimistically + if (wantsHtml) htmlUsed++; + if (wantsGql) gqlUsed++; + + await tudorRepairItem(ctx, it); + discovered.set(it.url, it); + } + + ctx.logger.ok( + `${ctx.catPrefixOut} | Unique products: ${discovered.size} | detail(html=${htmlUsed}/${htmlBudget}, gql=${gqlUsed}/${gqlBudget})` + ); + + const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { + storeLabel: ctx.store.name, + }); + + const dbObj = buildDbObject(ctx, merged); + writeJsonAtomic(ctx.dbFile, dbObj); + + const elapsed = Date.now() - t0; + + report.categories.push({ + store: ctx.store.name, + label: ctx.cat.label, + key: ctx.cat.key, + dbFile: ctx.dbFile, + scannedPages: done, + discoveredUnique: discovered.size, + newCount: newItems.length, + updatedCount: updatedItems.length, + removedCount: removedItems.length, + restoredCount: restoredItems.length, + elapsedMs: elapsed, + }); + + report.totals.newCount += newItems.length; + report.totals.updatedCount += updatedItems.length; + report.totals.removedCount += removedItems.length; + report.totals.restoredCount += restoredItems.length; + + addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems); +} /* ---------------- store ---------------- */ diff --git a/src/utils/sku.js b/src/utils/sku.js index 6b2b8eb..5dd0bda 100644 --- a/src/utils/sku.js +++ b/src/utils/sku.js @@ -20,8 +20,9 @@ function normalizeUpcDigits(v) { return m ? m[1] : ""; } +// CHANGE: allow 1-11 digits so BCL 3-digit ids like id:141 are preserved function normalizeIdDigits(v) { - const m = String(v ?? "").match(/\b(\d{4,11})\b/); + const m = String(v ?? "").match(/\b(\d{1,11})\b/); return m ? m[1] : ""; } @@ -33,6 +34,35 @@ function makeSyntheticSkuKey({ storeLabel, url }) { return `u:${fnv1a32(`${store}|${u}`)}`; } +/* ---------------- NEW: SKU quality helpers ---------------- */ + +function skuQuality(v) { + const s = String(v ?? "").trim(); + if (!s) return 0; // missing + if (/^u:/i.test(s)) return 0; // synthetic + if (normalizeCspc(s)) return 3; // best (6-digit CSPC) + if (/^upc:/i.test(s)) return 2; + if (/^id:/i.test(s)) return 2; + return 1; // explicit non-synthetic string +} + +// Prefer higher quality; on ties keep existing (stable) value +function pickBetterSku(newSku, oldSku) { + const a = String(newSku ?? "").trim(); + const b = String(oldSku ?? "").trim(); + const qa = skuQuality(a); + const qb = skuQuality(b); + if (qa > qb) return a; + if (qb > qa) return b; + return b || a; +} + +// Only fetch product pages when missing/synthetic +function needsSkuDetail(sku) { + const s = String(sku ?? "").trim(); + return !s || /^u:/i.test(s); +} + /** * Behavior: * - CSPC 6-digit => "123456" @@ -63,4 +93,11 @@ function normalizeSkuKey(v, { storeLabel, url } = {}) { return syn || ""; } -module.exports = { normalizeCspc, normalizeSkuKey, makeSyntheticSkuKey }; +module.exports = { + normalizeCspc, + normalizeSkuKey, + makeSyntheticSkuKey, + skuQuality, + pickBetterSku, + needsSkuDetail, +};