From fa6fd99991ca6c717bd8d0cc6858db32697f2530 Mon Sep 17 00:00:00 2001 From: "Brennan Wilkes (Text Groove)" Date: Sat, 24 Jan 2026 12:55:59 -0800 Subject: [PATCH] feat: Support for the Gull --- src/stores/gull.js | 99 ++++++++++++++++++++++++++++++++++++ src/stores/index.js | 8 +-- src/tracker/category_scan.js | 81 +++++++++++++++++++++++++++-- src/tracker/run_all.js | 66 ++++++++++++++++-------- 4 files changed, 226 insertions(+), 28 deletions(-) create mode 100644 src/stores/gull.js diff --git a/src/stores/gull.js b/src/stores/gull.js new file mode 100644 index 0000000..e1a2fa2 --- /dev/null +++ b/src/stores/gull.js @@ -0,0 +1,99 @@ +"use strict"; + +const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html"); +const { normalizeCspc } = require("../utils/sku"); +const { extractPriceFromTmbBlock } = require("../utils/woocommerce"); +const { makePageUrl } = require("../utils/url"); + +function looksInStock(block) { + const s = String(block || ""); + if (/\boutofstock\b/i.test(s)) return false; + // your sample has:

<p class="stock in-stock">1 in stock</p>

+ if (/\bin-stock\b/i.test(s)) return true; + if (/\binstock\b/i.test(s)) return true; + if (/>\s*\d+\s+in\s+stock\s* + const parts = s.split(/]*class=["'][^"']*\bproduct\b[^"']*["'][^>]*>/i); + if (parts.length <= 1) return items; + + const base = `https://${(ctx && ctx.store && ctx.store.host) || "gullliquorstore.com"}/`; + + for (let i = 1; i < parts.length; i++) { + const block = '
  • ]*href=["']([^"']+)["'][^>]*class=["'][^"']*\bwoocommerce-LoopProduct-link\b/i); + if (!hrefM || !hrefM[1]) continue; + + let url; + try { + url = new URL(decodeHtml(hrefM[1]), base).toString(); + } catch { + continue; + } + + const titleM = block.match(/]*class=["'][^"']*\bwoocommerce-loop-product__title\b[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i); + const name = cleanText(decodeHtml(titleM ? titleM[1] : "")); + if (!name) continue; + + // Price is in standard Woo ... + const price = extractPriceFromTmbBlock(block) || ""; + + const sku = normalizeCspc( + block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] || + block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] || + url + ); + + const img = extractFirstImgUrl(block, base); + + items.push({ name, price, url, sku, img }); + } + + const uniq = new Map(); + for (const it of items) uniq.set(it.url, it); + return [...uniq.values()]; +} + +function createStore(defaultUa) { + return { + key: "gull", + name: "Gull Liquor", + host: "gullliquorstore.com", + ua: defaultUa, + parseProducts: parseProductsGull, + makePageUrl, // enables /page/N/ paging + categories: [ + { + key: "whisky", + label: "Whisky", + startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky", + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000 + }, + { + key: "rum", + label: "Rum", + startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum", + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000 + }, + ], + }; +} + +module.exports = { createStore, parseProductsGull }; diff --git a/src/stores/index.js b/src/stores/index.js index 22be4e4..69bdf59 100644 --- a/src/stores/index.js +++ b/src/stores/index.js @@ -9,17 +9,19 @@ const { createStore: createCraftCellars } = require("./craftcellars"); const { createStore: createBCL } = require("./bcl"); const { createStore: createStrath } = 
require("./strath"); const { createStore: createLegacy } = require("./legacyliquor"); +const { createStore: createGull } = require("./gull"); function createStores({ defaultUa } = {}) { return [ + createGull(defaultUa), createSierra(defaultUa), - createBSW(defaultUa), createKWM(defaultUa), + createCraftCellars(defaultUa), + createStrath(defaultUa), + createBSW(defaultUa), createKegNCork(defaultUa), createMaltsAndGrains(defaultUa), - createCraftCellars(defaultUa), createBCL(defaultUa), - createStrath(defaultUa), createLegacy(defaultUa), ]; } diff --git a/src/tracker/category_scan.js b/src/tracker/category_scan.js index 1b0d09a..cbcdaef 100644 --- a/src/tracker/category_scan.js +++ b/src/tracker/category_scan.js @@ -1,5 +1,7 @@ "use strict"; +const { setTimeout: sleep } = require("timers/promises"); + const { humanBytes } = require("../utils/bytes"); const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string"); const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url"); @@ -94,8 +96,38 @@ function shouldTrackItem(ctx, finalUrl, item) { return allow(item, ctx, finalUrl); } +/** + * Best-effort extraction of total pages from Woo pagination markup on page 1. + * Looks for: + * - /page/N/ + * - ?paged=N + * inside links that often have "page-numbers" class, but works even without it. + */ +function extractTotalPagesFromPaginationHtml(html) { + const s = String(html || ""); + let max = 0; + + // /page/23/ + for (const m of s.matchAll(/href=["'][^"']*\/page\/(\d+)\/[^"']*["']/gi)) { + const n = Number(m[1]); + if (Number.isFinite(n) && n > max) max = n; + } + + // ?paged=23 + for (const m of s.matchAll(/href=["'][^"']*[?&]paged=(\d+)[^"']*["']/gi)) { + const n = Number(m[1]); + if (Number.isFinite(n) && n > max) max = n; + } + + // Sometimes themes render plain numbers without /page/ in href; keep it conservative: + // Only trust these if we already found at least one pagination-ish token. 
+ if (max > 1) return max; + + return 0; +} + async function pageHasProducts(ctx, url) { - const { http, config, logger } = ctx; + const { http, config } = ctx; try { const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua); @@ -113,6 +145,10 @@ async function pageHasProducts(ctx, url) { async function probePage(ctx, baseUrl, pageNum, state) { const url = makePageUrlForCtx(ctx, baseUrl, pageNum); + + const delay = Number.isFinite(ctx?.cat?.discoveryDelayMs) ? ctx.cat.discoveryDelayMs : 0; + if (delay > 0) await sleep(delay); + const t0 = Date.now(); const r = await pageHasProducts(ctx, url); const ms = Date.now() - t0; @@ -168,12 +204,44 @@ async function binaryFindLastOk(ctx, baseUrl, loOk, hiMiss, state) { async function discoverTotalPagesFast(ctx, baseUrl, guess, step) { const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 }; - const p1 = await probePage(ctx, baseUrl, 1, state); - if (!p1.ok) { + // Fetch page 1 ONCE and try to extract total pages from pagination. + const url1 = makePageUrlForCtx(ctx, baseUrl, 1); + const t0 = Date.now(); + const { text: html1, ms, status, bytes, finalUrl } = await ctx.http.fetchTextWithRetry(url1, "discover", ctx.store.ua); + const pMs = Date.now() - t0; + + if (typeof ctx.store.isEmptyListingPage === "function") { + if (ctx.store.isEmptyListingPage(html1, ctx, url1)) { + ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`); + return 1; + } + } + + const parser = ctx.store.parseProducts || ctx.config.defaultParseProducts; + const items1 = parser(html1, ctx, finalUrl).length; + + logProgressLine( + ctx.logger, + ctx, + `Discover probe page=${padLeftV(1, 4)}`, + items1 > 0 ? 
"OK" : "MISS", + items1 > 0, + discoverProg(state), + `items=${padLeftV(items1, 3)} | bytes=${padLeftV("", 8)} | ${padRightV(ctx.http.inflightStr(), 11)} | ${secStr(ms || pMs)}` + ); + + if (items1 <= 0) { ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`); return 1; } + const extracted = extractTotalPagesFromPaginationHtml(html1); + if (extracted && extracted >= 1) { + ctx.logger.ok(`${ctx.catPrefixOut} | Total pages (from pagination): ${extracted}`); + return extracted; + } + + // Fallback to probing if pagination parse fails const g = Math.max(2, guess); const pg = await probePage(ctx, baseUrl, g, state); if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state); @@ -202,7 +270,7 @@ async function discoverAndScanCategory(ctx, prevDb, report) { const t0 = Date.now(); const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess; - const step = config.discoveryStep; + const step = Number.isFinite(ctx.cat.discoveryStep) ? ctx.cat.discoveryStep : config.discoveryStep; const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step); const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages); @@ -214,7 +282,10 @@ async function discoverAndScanCategory(ctx, prevDb, report) { let donePages = 0; - const perPageItems = await parallelMapStaggered(pages, config.concurrency, config.staggerMs, async (pageUrl, idx) => { + const pageConc = Number.isFinite(ctx.cat.pageConcurrency) ? ctx.cat.pageConcurrency : config.concurrency; + const pageStagger = Number.isFinite(ctx.cat.pageStaggerMs) ? 
ctx.cat.pageStaggerMs : config.staggerMs; + + const perPageItems = await parallelMapStaggered(pages, pageConc, pageStagger, async (pageUrl, idx) => { const pnum = idx + 1; const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry( diff --git a/src/tracker/run_all.js b/src/tracker/run_all.js index c79d0e7..965c31a 100644 --- a/src/tracker/run_all.js +++ b/src/tracker/run_all.js @@ -1,7 +1,7 @@ "use strict"; const { createReport } = require("./report"); -const { parallelMapStaggered } = require("../utils/async"); +const { setTimeout: sleep } = require("timers/promises"); const { makeCatPrefixers, @@ -43,28 +43,54 @@ async function runAllStores(stores, { config, logger, http }) { } } - await parallelMapStaggered( - workItems, - Math.min(config.categoryConcurrency, workItems.length), - 0, - async (w) => { - try { - await discoverAndScanCategory(w.ctx, w.prevDb, report); - } catch (e) { - const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store"; - const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category"; + // Host-level serialization: never run two categories from the same host concurrently. + const maxWorkers = Math.min(config.categoryConcurrency, workItems.length); + const queue = workItems.slice(); + const inflightHosts = new Set(); - // Keep it loud in logs, but do not fail the entire run. - logger.warn( - `Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}` - ); + async function runOne(w) { + try { + await discoverAndScanCategory(w.ctx, w.prevDb, report); + } catch (e) { + const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store"; + const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category"; - // If you want failures surfaced in the final report later, you could also - // push a "failed category" record onto report.categories here. - } - return null; + // Keep it loud in logs, but do not fail the entire run. 
+ logger.warn(`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`); } - ); + } + + async function worker() { + while (true) { + if (queue.length === 0) return; + + // Pick next item whose host isn't currently running. + const idx = queue.findIndex((w) => { + const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || ""); + return host && !inflightHosts.has(host); + }); + + if (idx === -1) { + // Nothing available right now; wait a bit. + await sleep(50); + continue; + } + + const w = queue.splice(idx, 1)[0]; + const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || ""); + + inflightHosts.add(host); + try { + await runOne(w); + } finally { + inflightHosts.delete(host); + } + } + } + + const workers = []; + for (let i = 0; i < maxWorkers; i++) workers.push(worker()); + await Promise.all(workers); return report; }