1) {
+ const items = [];
+ for (let i = 1; i < blocks.length; i++) {
+ const block = "
]*class=["'][^"']*t-entry-title[^"']*["'][^>]*>\s*
]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>\s*<\/h3>/i
+ );
+ if (!titleMatch) continue;
+
+ const url = new URL(decodeHtml(titleMatch[1]), base).toString();
+ const name = cleanText(decodeHtml(titleMatch[2]));
+ if (!name) continue;
+
+ const price = extractPriceFromTmbBlock(block);
+
+ const rawSku =
+ block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
+ block.match(/\bSKU[:\s]*([0-9]{6})\b/i)?.[1] ||
+ "";
+
+ const taggedSku = /^\d{1,11}$/.test(String(rawSku).trim())
+ ? `id:${String(rawSku).trim()}`
+ : rawSku;
+
+ const sku = normalizeSkuKey(taggedSku, { storeLabel: ctx?.store?.name, url });
+ const img = extractFirstImgUrl(block, base);
+
+ const item = { name, price, url, sku, img };
+
+ const allowUrl = ctx?.cat?.allowUrl;
+ if (typeof allowUrl === "function" && !allowUrl(item)) continue;
+
+ items.push(item);
+ }
+
+ const uniq = new Map();
+ for (const it of items) uniq.set(it.url, it);
+ return [...uniq.values()];
+ }
+
+ const woo = parseWooProductsHtml(s, ctx);
+ ctx.logger?.dbg?.(`parseProductsSierra: wooItems=${woo.length} bytes=${s.length}`);
+ return woo;
+}
+
/**
 * Extract the numeric product-category term id from a category page's HTML.
 *
 * The page is expected to carry CSS classes such as
 * "tax-product_cat term-<slug> term-1131"; the id is the class made of
 * "term-" followed only by digits.
 *
 * @param {*} html - Page HTML; coerced with String(), falsy values become "".
 * @returns {number|null} The term id, or null when no id class is present.
 */
function extractProductCatTermId(html) {
  const s = String(html || "");

  // The id must be a complete class token. A plain \b treats "-" as a
  // boundary, so slug classes like "term-1800-tequila" or suffixes like
  // "long-term-2024-sale" would otherwise be mistaken for the id.
  const idToken = String.raw`(?<![\w-])term-(\d{1,10})(?![\w-])`;

  // Prefer an id that follows the "tax-product_cat" marker inside the same
  // attribute value (no quote in between, at most 400 chars); otherwise take
  // the first id token anywhere in the document.
  const scoped = new RegExp(`tax-product_cat[^"']{0,400}${idToken}`, "i");
  const anywhere = new RegExp(idToken, "i");

  const m = s.match(scoped) || s.match(anywhere);
  if (!m) return null;

  const n = Number(m[1]);
  return Number.isFinite(n) ? n : null;
}
+
/**
 * Resolve the Woo product-category id for the category in `ctx.cat`.
 *
 * Resolution order:
 *   1. `ctx.cat.wooCategoryId`  - manual override, returned as-is when finite.
 *   2. `ctx.cat._wooCategoryId` - result cached by a previous call (a finite id,
 *      or `null` when a previous inference failed).
 *   3. Fetch `ctx.cat.startUrl` and infer the id from the page HTML via
 *      `extractProductCatTermId`.
 *
 * The outcome of step 3 is written back to `ctx.cat._wooCategoryId`.
 *
 * @param {object} ctx - Scan context; reads `cat`, `store.ua`, `http`,
 *   `logger` and `catPrefixOut`.
 * @returns {Promise<number|null>} The category id, or null if it cannot be inferred.
 */
async function getWooCategoryIdForCat(ctx) {
  // allow manual override if you ever want it
  if (Number.isFinite(ctx?.cat?.wooCategoryId)) return ctx.cat.wooCategoryId;

  // cache per category object
  if (Number.isFinite(ctx?.cat?._wooCategoryId)) return ctx.cat._wooCategoryId;

  // Negative cache: a failed inference stores null below. Honour it here so
  // the category page is not refetched (and the warning repeated) on every call.
  if (ctx?.cat?._wooCategoryId === null) return null;

  // infer from the HTML category page so startUrl stays stable (DB filenames stay stable)
  const { text, finalUrl } = await ctx.http.fetchTextWithRetry(
    ctx.cat.startUrl,
    "discover",
    ctx.store.ua
  );
  const id = extractProductCatTermId(text);

  if (!id) {
    ctx.logger.warn(`${ctx.catPrefixOut} | Could not infer product_cat term id from category page; falling back to HTML parsing only.`);
    ctx.cat._wooCategoryId = null;
    return null;
  }

  ctx.logger.ok(`${ctx.catPrefixOut} | Woo category id: ${id} (${finalUrl || ctx.cat.startUrl})`);
  ctx.cat._wooCategoryId = id;
  return id;
}
+
/**
 * Sierra Springs: override scan to use Woo Store API pagination
 * while keeping original startUrl (so DB hashes and "source" stay unchanged).
 *
 * Pages through `/wp-json/wc/store/v1/products` for the category id resolved
 * by `getWooCategoryIdForCat`, parses each page with the store's
 * `parseProducts` (or `ctx.config.defaultParseProducts`), applies the
 * category's optional `allowUrl` filter, merges the result into the previous
 * DB, writes the DB file and appends the outcome to `report`.
 *
 * @param {object} ctx - Scan context (`store`, `cat`, `http`, `logger`,
 *   `config`, `dbFile`, `catPrefixOut`).
 * @param {*} prevDb - Previous DB state, passed through to `mergeDiscoveredIntoDb`.
 * @param {object} report - Mutated: `categories` gets one entry and `totals`
 *   counters are incremented.
 * @returns {Promise<void>} Resolves early, without touching the DB or the
 *   report, when no category id can be resolved.
 */
async function scanCategoryWooStoreApi(ctx, prevDb, report) {
  const { logger } = ctx;
  const t0 = Date.now();

  const perPage = Number.isFinite(ctx.cat.perPage) ? ctx.cat.perPage : 100;
  const discovered = new Map();

  const catId = await getWooCategoryIdForCat(ctx);

  // If we can't infer id, do nothing special; let existing DB stay as-is.
  // (You can remove this fallback if you prefer hard failure.)
  if (!catId) return;

  const apiBase = new URL(`https://${ctx.store.host}/wp-json/wc/store/v1/products`);
  apiBase.searchParams.set("per_page", String(perPage));
  apiBase.searchParams.set("category", String(catId));

  const hardCap = 500; // safety
  const allow = ctx?.cat?.allowUrl;
  const parse = ctx.store.parseProducts || ctx.config.defaultParseProducts;

  let page = 1;
  let pagesFetched = 0;

  while (page <= hardCap) {
    apiBase.searchParams.set("page", String(page));
    const pageUrl = apiBase.toString();

    const { text, status, bytes, ms, finalUrl } = await ctx.http.fetchTextWithRetry(
      pageUrl,
      `page:${ctx.store.key}:${ctx.cat.key}:${page}`,
      ctx.store.ua
    );
    pagesFetched++;

    const itemsRaw = parse(text, ctx, finalUrl);
    const items =
      typeof allow === "function" ? itemsRaw.filter((it) => allow(it)) : [...itemsRaw];

    logger.ok(
      `${ctx.catPrefixOut} | Page ${String(page).padStart(3, " ")} | ${String(status).padStart(3, " ")} | items=${String(items.length).padStart(3, " ")} | bytes=${String(bytes || 0).padStart(8, " ")} | ${(ms / 1000).toFixed(1).padStart(6, " ")}s`
    );

    // Pagination is decided on the RAW page size, not the allowUrl-filtered
    // one: a full page whose items are partly (or entirely) filtered out is
    // still followed by more pages.
    // NOTE(review): if the parser already applies allowUrl internally, the raw
    // count can still under-report a full page - confirm against parseProducts.
    const rawCount = itemsRaw.length;

    // stop on empty OR short last page (prevents requesting the "[]" next page that triggers Short HTML)
    if (rawCount === 0) break;

    for (const it of items) discovered.set(it.url, it);

    if (rawCount < perPage) break;
    page++;
  }

  logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);

  const { merged, newItems, updatedItems, removedItems, restoredItems, metaChangedItems } =
    mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });

  const dbObj = buildDbObject(ctx, merged);
  writeJsonAtomic(ctx.dbFile, dbObj);

  logger.ok(`${ctx.catPrefixOut} | DB saved: ${logger.dim(ctx.dbFile)} (${dbObj.count} items)`);

  const elapsedMs = Date.now() - t0;

  report.categories.push({
    store: ctx.store.name,
    label: ctx.cat.label,
    key: ctx.cat.key,
    dbFile: ctx.dbFile,
    // Pages actually requested (the loop variable overshoots by one when the hard cap is hit).
    scannedPages: pagesFetched,
    discoveredUnique: discovered.size,
    newCount: newItems.length,
    updatedCount: updatedItems.length,
    removedCount: removedItems.length,
    restoredCount: restoredItems.length,
    metaChangedCount: metaChangedItems.length,
    elapsedMs,
  });

  report.totals.newCount += newItems.length;
  report.totals.updatedCount += updatedItems.length;
  report.totals.removedCount += removedItems.length;
  report.totals.restoredCount += restoredItems.length;
  report.totals.metaChangedCount += metaChangedItems.length;

  addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
}
function createStore(defaultUa) {
+ const ua = defaultUa;
+
return {
key: "sierrasprings",
name: "Sierra Springs",
host: "sierraspringsliquor.ca",
- ua: defaultUa,
+ ua,
parseProducts: parseProductsSierra,
+
+ // store-only override (no changes outside this file)
+ scanCategory: scanCategoryWooStoreApi,
+
+ // RESTORED: original 4 categories, unchanged startUrl so DB hashes match
categories: [
{
key: "whisky",
label: "Whisky",
startUrl: "https://sierraspringsliquor.ca/product-category/whisky-2/",
- discoveryStartPage: 20,
+ discoveryStartPage: 1,
+ perPage: 100,
},
{
key: "fine-rare",
label: "Fine & Rare",
startUrl: "https://sierraspringsliquor.ca/product-category/fine-rare/",
discoveryStartPage: 1,
+ perPage: 100,
},
{
key: "spirits-liquor",
label: "Spirits / Liquor",
- startUrl: "https://sierraspringsliquor.ca/product-category/spirits-liquor/page/2/",
- discoveryStartPage: 15,
+ startUrl: "https://sierraspringsliquor.ca/product-category/spirits-liquor/",
+ discoveryStartPage: 1,
+ perPage: 100,
allowUrl: allowSierraSpiritsLiquorUrlRumWhisky,
},
{
@@ -90,6 +362,7 @@ function createStore(defaultUa) {
label: "Spirits",
startUrl: "https://sierraspringsliquor.ca/product-category/spirits/",
discoveryStartPage: 1,
+ perPage: 100,
},
],
};