/**
 * Category URL filter: accepts only Sierra Springs URLs that mention rum or
 * whisky/whiskey as a standalone word.
 *
 * @param {{url?: string}|null|undefined} item - Parsed product; only `url` is read.
 * @returns {boolean} True when the lower-cased URL starts with
 *   http(s)://sierraspringsliquor.ca/ and contains "rum", "whisky" or
 *   "whiskey" delimited by word boundaries.
 */
function allowSierraUrlRumWhisky(item) {
  const url = item?.url ? String(item.url).toLowerCase() : "";
  if (!/^https?:\/\/sierraspringsliquor\.ca\//.test(url)) return false;
  return /\b(rum|whisk(?:e)?y)\b/.test(url);
}

// Keep old name referenced historically in this store config
const allowSierraSpiritsLiquorUrlRumWhisky = allowSierraUrlRumWhisky;

/**
 * Formats a price object into a display string such as "$19.99".
 *
 * Fields read from `prices`: `price`, `regular_price`, `sale_price` (tried in
 * that order), `currency_minor_unit`, `currency_prefix`, `currency_symbol`,
 * `currency_suffix`.
 * NOTE(review): presumably this is the `prices` object of a Woo Store API
 * product, with amounts expressed in minor units - confirm against the API.
 *
 * @param {object|null|undefined} prices - Price descriptor.
 * @returns {string|null} `${prefix}${value}${suffix}`, or null when `prices`
 *   is missing or none of the price fields contains a digit.
 */
function formatWooStorePrice(prices) {
  if (!prices) return null;

  // toFixed() only accepts 0..100 digits; anything else (negative, fractional,
  // non-numeric) falls back to two decimals instead of throwing a RangeError.
  const unit = prices.currency_minor_unit;
  const minor = Number.isInteger(unit) && unit >= 0 && unit <= 100 ? unit : 2;

  // Use the first price field that actually contains digits. Without this,
  // an empty or non-numeric string became Number("") === 0 and was reported
  // as a real "$0.00" price.
  let digits = "";
  for (const candidate of [prices.price, prices.regular_price, prices.sale_price]) {
    if (candidate == null) continue;
    digits = String(candidate).replace(/\D/g, "");
    if (digits) break;
  }
  if (!digits) return null;

  const amount = Number(digits);
  if (!Number.isFinite(amount)) return null;

  const value = (amount / 10 ** minor).toFixed(minor);
  const prefix = prices.currency_prefix ?? prices.currency_symbol ?? "$";
  const suffix = prices.currency_suffix ?? "";
  return `${prefix}${value}${suffix}`;
}
]*class=["'][^"']*t-entry-title[^"']*["'][^>]*>\s*]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>\s*<\/h3>/i - ); - if (!titleMatch) continue; + for (const p of data) { + const url = (p && p.permalink) ? String(p.permalink) : ""; + if (!url) continue; - const url = new URL(decodeHtml(titleMatch[1]), base).toString(); - const name = cleanText(decodeHtml(titleMatch[2])); + const name = (p && p.name) ? cleanText(decodeHtml(String(p.name))) : ""; if (!name) continue; - const price = extractPriceFromTmbBlock(block); - - const rawSku = - block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] || - block.match(/\bSKU[:\s]*([0-9]{6})\b/i)?.[1] || - ""; - - // Sierra uses short numeric SKUs like "1222" -> treat as id: - const taggedSku = /^\d{1,11}$/.test(String(rawSku).trim()) - ? `id:${String(rawSku).trim()}` - : rawSku; + const price = formatWooStorePrice(p.prices); + const id = (p && (p.id ?? p.id === 0)) ? String(p.id) : ""; + const taggedSku = id ? `id:${id}` : ""; const sku = normalizeSkuKey(taggedSku, { storeLabel: ctx?.store?.name, url }); - - - const img = extractFirstImgUrl(block, base); - items.push({ name, price, url, sku, img }); + const img = + (p.images && Array.isArray(p.images) && p.images[0] && p.images[0].src) + ? String(p.images[0].src) + : null; + + const item = { name, price, url, sku, img }; + + const allowUrl = ctx?.cat?.allowUrl; + if (typeof allowUrl === "function" && !allowUrl(item)) continue; + + items.push(item); } const uniq = new Map(); @@ -57,32 +78,283 @@ function parseProductsSierra(html, ctx) { return [...uniq.values()]; } +function parseWooProductsHtml(html, ctx) { + const s = String(html || ""); + const items = []; + + const base = `https://${(ctx && ctx.store && ctx.store.host) || "sierraspringsliquor.ca"}/`; + const parts = s.split(//i); + const block = (endIdx >= 0 ? 
chunk.slice(0, endIdx + 5) : chunk); + + const hrefs = [...block.matchAll(/]*href=["']([^"']+)["']/gi)].map(m => m[1]); + const href = hrefs.find(h => !/add-to-cart=|\/cart\/|\/checkout\//i.test(h)) || ""; + if (!href) continue; + + const url = new URL(decodeHtml(href), base).toString(); + + const nameHtml = + block.match(/]*class=["'][^"']*woocommerce-loop-product__title[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i)?.[1] || + block.match(/]*>([\s\S]*?)<\/h3>/i)?.[1] || + ""; + const name = cleanText(decodeHtml(nameHtml)); + if (!name) continue; + + const price = extractPriceFromTmbBlock(block); + + const rawSku = + block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] || + block.match(/\bdata-product_id=["']([^"']+)["']/i)?.[1] || + ""; + + const taggedSku = /^\d{1,11}$/.test(String(rawSku).trim()) + ? `id:${String(rawSku).trim()}` + : String(rawSku || "").trim(); + + const sku = normalizeSkuKey(taggedSku, { storeLabel: ctx?.store?.name, url }); + const img = extractFirstImgUrl(block, base); + + const item = { name, price, url, sku, img }; + + const allowUrl = ctx?.cat?.allowUrl; + if (typeof allowUrl === "function" && !allowUrl(item)) continue; + + items.push(item); + } + + const uniq = new Map(); + for (const it of items) uniq.set(it.url, it); + return [...uniq.values()]; +} + +function parseProductsSierra(body, ctx) { + const s = String(body || ""); + const t = s.trimStart(); + + if (t.startsWith("[") || t.startsWith("{")) { + const jsonItems = parseWooStoreProductsJson(s, ctx); + ctx.logger?.dbg?.(`parseProductsSierra: storeApiItems=${jsonItems.length} bytes=${s.length}`); + return jsonItems; + } + + const blocks = s.split(/
1) { + const items = []; + for (let i = 1; i < blocks.length; i++) { + const block = "
/**
 * Extracts a numeric `term-<id>` class token from category-page HTML.
 *
 * Typical body classes contain: "tax-product_cat term-<slug> term-1131 ...".
 * A token that follows `tax-product_cat` within the same quoted attribute is
 * preferred; otherwise the first numeric `term-<id>` token anywhere is used.
 *
 * The token must stand alone: it may not be preceded or followed by a word
 * character or a hyphen. This prevents slug classes that begin with digits
 * ("term-12-year-old") and unrelated classes ("post-term-9") from being
 * mistaken for the id.
 *
 * @param {string|null|undefined} html - Page markup.
 * @returns {number|null} The id, or null when no token is found.
 */
function extractProductCatTermId(html) {
  const s = String(html || "");
  const m =
    s.match(/tax-product_cat[^"']{0,400}(?<![\w-])term-(\d{1,10})(?![\w-])/i) ||
    s.match(/(?<![\w-])term-(\d{1,10})(?![\w-])/i);
  if (!m) return null;
  const n = Number(m[1]);
  return Number.isFinite(n) ? n : null;
}

/**
 * Resolves the Woo category id for `ctx.cat`.
 *
 * Order of precedence: the manual override `ctx.cat.wooCategoryId`, then the
 * cached `ctx.cat._wooCategoryId`, then inference from the HTML at
 * `ctx.cat.startUrl` (fetched with `ctx.http.fetchTextWithRetry`). Inferring
 * from the HTML page keeps startUrl stable (DB filenames stay stable).
 *
 * The result is written to `ctx.cat._wooCategoryId`. A failed inference stores
 * null, which the finite-number cache check does not treat as a hit, so a
 * failure is retried on the next call.
 *
 * NOTE(review): the warning text says "falling back to HTML parsing only",
 * but the only caller in this file simply returns when the id is missing.
 *
 * @param {object} ctx - Scan context; reads `cat`, `http`, `store.ua`,
 *   `logger`, `catPrefixOut`.
 * @returns {Promise<number|null>} The category id, or null when it cannot be inferred.
 */
async function getWooCategoryIdForCat(ctx) {
  // allow manual override if you ever want it
  if (Number.isFinite(ctx?.cat?.wooCategoryId)) return ctx.cat.wooCategoryId;

  // cache per category object
  if (Number.isFinite(ctx?.cat?._wooCategoryId)) return ctx.cat._wooCategoryId;

  const { text, finalUrl } = await ctx.http.fetchTextWithRetry(ctx.cat.startUrl, "discover", ctx.store.ua);
  const id = extractProductCatTermId(text);

  if (!id) {
    ctx.logger.warn(`${ctx.catPrefixOut} | Could not infer product_cat term id from category page; falling back to HTML parsing only.`);
    ctx.cat._wooCategoryId = null;
    return null;
  }

  ctx.logger.ok(`${ctx.catPrefixOut} | Woo category id: ${id} (${finalUrl || ctx.cat.startUrl})`);
  ctx.cat._wooCategoryId = id;
  return id;
}

/**
 * Counts the entries of a JSON-array response body.
 *
 * @param {string} text - Raw response body.
 * @returns {number|null} Array length, or null when the body is not a JSON array.
 */
function countJsonArrayEntries(text) {
  try {
    const parsed = JSON.parse(String(text ?? ""));
    return Array.isArray(parsed) ? parsed.length : null;
  } catch {
    // Not JSON: the caller falls back to the parsed item count.
    return null;
  }
}

/**
 * Sierra Springs: override scan to use Woo Store API pagination
 * while keeping original startUrl (so DB hashes and "source" stay unchanged).
 *
 * Pages `https://<host>/wp-json/wc/store/v1/products?per_page=&category=&page=`
 * are fetched until an empty or short page is seen, or 500 pages were read.
 * Discovered items are merged into `prevDb`, the DB is written to
 * `ctx.dbFile`, and `report.categories` / `report.totals` are updated.
 *
 * If the category id cannot be inferred, nothing is done and the existing DB
 * stays as-is. (Remove this fallback if you prefer hard failure.)
 *
 * @param {object} ctx - Scan context (store, cat, http, logger, config, dbFile, catPrefixOut).
 * @param {object} prevDb - Previous DB, passed to `mergeDiscoveredIntoDb`.
 * @param {object} report - Mutated in place: `categories` gets one entry and `totals` is incremented.
 * @returns {Promise<void>}
 */
async function scanCategoryWooStoreApi(ctx, prevDb, report) {
  const { logger } = ctx;
  const t0 = Date.now();

  const perPage = Number.isFinite(ctx.cat.perPage) ? ctx.cat.perPage : 100;
  const discovered = new Map();

  const catId = await getWooCategoryIdForCat(ctx);
  if (!catId) return;

  const apiBase = new URL(`https://${ctx.store.host}/wp-json/wc/store/v1/products`);
  apiBase.searchParams.set("per_page", String(perPage));
  apiBase.searchParams.set("category", String(catId));

  const parse = ctx.store.parseProducts || ctx.config.defaultParseProducts;
  const allow = ctx.cat.allowUrl;
  const hardCap = 500; // safety
  let scannedPages = 0;

  for (let page = 1; page <= hardCap; page++) {
    apiBase.searchParams.set("page", String(page));
    const pageUrl = apiBase.toString();

    const { text, status, bytes, ms, finalUrl } = await ctx.http.fetchTextWithRetry(
      pageUrl,
      `page:${ctx.store.key}:${ctx.cat.key}:${page}`,
      ctx.store.ua
    );
    scannedPages = page;

    const itemsRaw = parse(text, ctx, finalUrl);
    const items = [];
    for (const it of itemsRaw) {
      if (typeof allow === "function" && !allow(it)) continue;
      items.push(it);
    }

    logger.ok(
      `${ctx.catPrefixOut} | Page ${String(page).padStart(3, " ")} | ${String(status).padStart(3, " ")} | items=${String(items.length).padStart(3, " ")} | bytes=${String(bytes || 0).padStart(8, " ")} | ${(ms / 1000).toFixed(1).padStart(6, " ")}s`
    );

    // End-of-data must be judged on how many products the API returned, not
    // on how many survived allowUrl. Otherwise a filtered category stops on
    // the first page where fewer than perPage items were allowed.
    const pageSize = countJsonArrayEntries(text) ?? itemsRaw.length;

    // stop on empty OR short last page (prevents requesting the "[]" next page that triggers Short HTML)
    if (pageSize === 0) break;

    for (const it of items) discovered.set(it.url, it);

    if (pageSize < perPage) break;
  }

  logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);

  const { merged, newItems, updatedItems, removedItems, restoredItems, metaChangedItems } =
    mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });

  const dbObj = buildDbObject(ctx, merged);
  writeJsonAtomic(ctx.dbFile, dbObj);

  logger.ok(`${ctx.catPrefixOut} | DB saved: ${logger.dim(ctx.dbFile)} (${dbObj.count} items)`);

  const elapsedMs = Date.now() - t0;

  report.categories.push({
    store: ctx.store.name,
    label: ctx.cat.label,
    key: ctx.cat.key,
    dbFile: ctx.dbFile,
    // Number of pages actually requested (the old value overshot by one at the cap).
    scannedPages,
    discoveredUnique: discovered.size,
    newCount: newItems.length,
    updatedCount: updatedItems.length,
    removedCount: removedItems.length,
    restoredCount: restoredItems.length,
    metaChangedCount: metaChangedItems.length,
    elapsedMs,
  });

  report.totals.newCount += newItems.length;
  report.totals.updatedCount += updatedItems.length;
  report.totals.removedCount += removedItems.length;
  report.totals.restoredCount += restoredItems.length;
  report.totals.metaChangedCount += metaChangedItems.length;

  addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
}