diff --git a/src/stores/gull.js b/src/stores/gull.js
new file mode 100644
index 0000000..e1a2fa2
--- /dev/null
+++ b/src/stores/gull.js
@@ -0,0 +1,99 @@
+"use strict";
+
+const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
+const { normalizeCspc } = require("../utils/sku");
+const { extractPriceFromTmbBlock } = require("../utils/woocommerce");
+const { makePageUrl } = require("../utils/url");
+
+function looksInStock(block) {
+ const s = String(block || "");
+ if (/\boutofstock\b/i.test(s)) return false;
+ // your sample has the text: "1 in stock"
+ if (/\bin-stock\b/i.test(s)) return true;
+ if (/\binstock\b/i.test(s)) return true;
+ if (/>\s*\d+\s+in\s+stock\s*
+ const parts = s.split(/]*class=["'][^"']*\bproduct\b[^"']*["'][^>]*>/i);
+ if (parts.length <= 1) return items;
+
+ const base = `https://${(ctx && ctx.store && ctx.store.host) || "gullliquorstore.com"}/`;
+
+ for (let i = 1; i < parts.length; i++) {
+ const block = ']*href=["']([^"']+)["'][^>]*class=["'][^"']*\bwoocommerce-LoopProduct-link\b/i);
+ if (!hrefM || !hrefM[1]) continue;
+
+ let url;
+ try {
+ url = new URL(decodeHtml(hrefM[1]), base).toString();
+ } catch {
+ continue;
+ }
+
+ const titleM = block.match(/]*class=["'][^"']*\bwoocommerce-loop-product__title\b[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i);
+ const name = cleanText(decodeHtml(titleM ? titleM[1] : ""));
+ if (!name) continue;
+
+ // Price is in standard Woo ...
+ const price = extractPriceFromTmbBlock(block) || "";
+
+ const sku = normalizeCspc(
+ block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
+ block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
+ url
+ );
+
+ const img = extractFirstImgUrl(block, base);
+
+ items.push({ name, price, url, sku, img });
+ }
+
+ const uniq = new Map();
+ for (const it of items) uniq.set(it.url, it);
+ return [...uniq.values()];
+}
+
+/**
+ * Build the store descriptor for Gull Liquor.
+ *
+ * @param {*} defaultUa - Stored unchanged on the descriptor's `ua` field.
+ * @returns {object} Descriptor holding the store identity (`key`, `name`,
+ *   `host`), the `ua`, the Gull product parser, the page-URL builder, and
+ *   one category entry each for whisky and rum.
+ */
+function createStore(defaultUa) {
+  // Both categories use exactly the same discovery / paging settings.
+  const sharedSettings = {
+    discoveryStartPage: 3,
+    discoveryStep: 2,
+    pageConcurrency: 1,
+    pageStaggerMs: 10000,
+    discoveryDelayMs: 10000,
+  };
+
+  // Identity fields first, then a fresh copy of the shared settings.
+  const buildCategory = (key, label, startUrl) => ({
+    key,
+    label,
+    startUrl,
+    ...sharedSettings,
+  });
+
+  const categories = [
+    buildCategory(
+      "whisky",
+      "Whisky",
+      "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky"
+    ),
+    buildCategory(
+      "rum",
+      "Rum",
+      "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum"
+    ),
+  ];
+
+  const store = {
+    key: "gull",
+    name: "Gull Liquor",
+    host: "gullliquorstore.com",
+    ua: defaultUa,
+    parseProducts: parseProductsGull,
+    makePageUrl, // enables /page/N/ paging
+    categories,
+  };
+  return store;
+}
+
+module.exports = { createStore, parseProductsGull };
diff --git a/src/stores/index.js b/src/stores/index.js
index 22be4e4..69bdf59 100644
--- a/src/stores/index.js
+++ b/src/stores/index.js
@@ -9,17 +9,19 @@ const { createStore: createCraftCellars } = require("./craftcellars");
const { createStore: createBCL } = require("./bcl");
const { createStore: createStrath } = require("./strath");
const { createStore: createLegacy } = require("./legacyliquor");
+const { createStore: createGull } = require("./gull");
+/**
+ * Build the list of store descriptors by calling every store factory with
+ * the same default user agent.
+ *
+ * @param {object} [options]
+ * @param {*} [options.defaultUa] - Passed unchanged to each factory.
+ * @returns {Array} One descriptor per factory call, in the order listed below.
+ *   NOTE(review): whether callers depend on this order is not visible here — confirm.
+ */
function createStores({ defaultUa } = {}) {
return [
+ createGull(defaultUa),
createSierra(defaultUa),
- createBSW(defaultUa),
createKWM(defaultUa),
+ createCraftCellars(defaultUa),
+ createStrath(defaultUa),
+ createBSW(defaultUa),
createKegNCork(defaultUa),
createMaltsAndGrains(defaultUa),
- createCraftCellars(defaultUa),
createBCL(defaultUa),
- createStrath(defaultUa),
createLegacy(defaultUa),
];
}
diff --git a/src/tracker/category_scan.js b/src/tracker/category_scan.js
index 1b0d09a..cbcdaef 100644
--- a/src/tracker/category_scan.js
+++ b/src/tracker/category_scan.js
@@ -1,5 +1,7 @@
"use strict";
+const { setTimeout: sleep } = require("timers/promises");
+
const { humanBytes } = require("../utils/bytes");
const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string");
const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url");
@@ -94,8 +96,38 @@ function shouldTrackItem(ctx, finalUrl, item) {
return allow(item, ctx, finalUrl);
}
+/**
+ * Best-effort extraction of the total page count from the pagination links
+ * in a listing page's HTML.
+ *
+ * Every `href` attribute is scanned for either of two page-number forms and
+ * the largest number seen wins:
+ *   - a path segment:    /page/N/
+ *   - a query parameter: ?paged=N or &paged=N, including the HTML-escaped
+ *     separators `&amp;` and `&#038;` that appear when the markup escapes `&`
+ * The links are not required to carry a "page-numbers" class.
+ *
+ * @param {*} html - Page markup; coerced with String(), falsy values become "".
+ * @returns {number} The largest page number found when it is greater than 1,
+ *   otherwise 0 (nothing usable was found, so the caller should fall back).
+ */
+function extractTotalPagesFromPaginationHtml(html) {
+  const s = String(html || "");
+
+  const pagePatterns = [
+    // /page/23/
+    /href=["'][^"']*\/page\/(\d+)\/[^"']*["']/gi,
+    // ?paged=23, &paged=23, &amp;paged=23, &#038;paged=23
+    /href=["'][^"']*(?:[?&]|&amp;|&#0*38;)paged=(\d+)[^"']*["']/gi,
+  ];
+
+  let max = 0;
+  for (const pattern of pagePatterns) {
+    for (const m of s.matchAll(pattern)) {
+      const n = Number(m[1]);
+      // A very long digit run converts to Infinity; ignore it.
+      if (Number.isFinite(n) && n > max) max = n;
+    }
+  }
+
+  // A lone "page 1" link is not evidence of real pagination, so report 0.
+  return max > 1 ? max : 0;
+}
+
async function pageHasProducts(ctx, url) {
- const { http, config, logger } = ctx;
+ const { http, config } = ctx;
try {
const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua);
@@ -113,6 +145,10 @@ async function pageHasProducts(ctx, url) {
async function probePage(ctx, baseUrl, pageNum, state) {
const url = makePageUrlForCtx(ctx, baseUrl, pageNum);
+
+ const delay = Number.isFinite(ctx?.cat?.discoveryDelayMs) ? ctx.cat.discoveryDelayMs : 0;
+ if (delay > 0) await sleep(delay);
+
const t0 = Date.now();
const r = await pageHasProducts(ctx, url);
const ms = Date.now() - t0;
@@ -168,12 +204,44 @@ async function binaryFindLastOk(ctx, baseUrl, loOk, hiMiss, state) {
async function discoverTotalPagesFast(ctx, baseUrl, guess, step) {
const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 };
- const p1 = await probePage(ctx, baseUrl, 1, state);
- if (!p1.ok) {
+ // Fetch page 1 ONCE and try to extract total pages from pagination.
+ const url1 = makePageUrlForCtx(ctx, baseUrl, 1);
+ const t0 = Date.now();
+ const { text: html1, ms, status, bytes, finalUrl } = await ctx.http.fetchTextWithRetry(url1, "discover", ctx.store.ua);
+ const pMs = Date.now() - t0;
+
+ if (typeof ctx.store.isEmptyListingPage === "function") {
+ if (ctx.store.isEmptyListingPage(html1, ctx, url1)) {
+ ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
+ return 1;
+ }
+ }
+
+ const parser = ctx.store.parseProducts || ctx.config.defaultParseProducts;
+ const items1 = parser(html1, ctx, finalUrl).length;
+
+ logProgressLine(
+ ctx.logger,
+ ctx,
+ `Discover probe page=${padLeftV(1, 4)}`,
+ items1 > 0 ? "OK" : "MISS",
+ items1 > 0,
+ discoverProg(state),
+ `items=${padLeftV(items1, 3)} | bytes=${padLeftV("", 8)} | ${padRightV(ctx.http.inflightStr(), 11)} | ${secStr(ms || pMs)}`
+ );
+
+ if (items1 <= 0) {
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
return 1;
}
+ const extracted = extractTotalPagesFromPaginationHtml(html1);
+ if (extracted && extracted >= 1) {
+ ctx.logger.ok(`${ctx.catPrefixOut} | Total pages (from pagination): ${extracted}`);
+ return extracted;
+ }
+
+ // Fallback to probing if pagination parse fails
const g = Math.max(2, guess);
const pg = await probePage(ctx, baseUrl, g, state);
if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state);
@@ -202,7 +270,7 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
const t0 = Date.now();
const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess;
- const step = config.discoveryStep;
+ const step = Number.isFinite(ctx.cat.discoveryStep) ? ctx.cat.discoveryStep : config.discoveryStep;
const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step);
const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages);
@@ -214,7 +282,10 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
let donePages = 0;
- const perPageItems = await parallelMapStaggered(pages, config.concurrency, config.staggerMs, async (pageUrl, idx) => {
+ const pageConc = Number.isFinite(ctx.cat.pageConcurrency) ? ctx.cat.pageConcurrency : config.concurrency;
+ const pageStagger = Number.isFinite(ctx.cat.pageStaggerMs) ? ctx.cat.pageStaggerMs : config.staggerMs;
+
+ const perPageItems = await parallelMapStaggered(pages, pageConc, pageStagger, async (pageUrl, idx) => {
const pnum = idx + 1;
const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry(
diff --git a/src/tracker/run_all.js b/src/tracker/run_all.js
index c79d0e7..965c31a 100644
--- a/src/tracker/run_all.js
+++ b/src/tracker/run_all.js
@@ -1,7 +1,7 @@
"use strict";
const { createReport } = require("./report");
-const { parallelMapStaggered } = require("../utils/async");
+const { setTimeout: sleep } = require("timers/promises");
const {
makeCatPrefixers,
@@ -43,28 +43,54 @@ async function runAllStores(stores, { config, logger, http }) {
}
}
- await parallelMapStaggered(
- workItems,
- Math.min(config.categoryConcurrency, workItems.length),
- 0,
- async (w) => {
- try {
- await discoverAndScanCategory(w.ctx, w.prevDb, report);
- } catch (e) {
- const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
- const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
+ // Host-level serialization: never run two categories from the same host concurrently.
+ const maxWorkers = Math.min(config.categoryConcurrency, workItems.length);
+ const queue = workItems.slice();
+ const inflightHosts = new Set();
- // Keep it loud in logs, but do not fail the entire run.
- logger.warn(
- `Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`
- );
+ // Run discovery + scan for a single work item (w.ctx, w.prevDb) against the
+ // shared report. An error thrown by discoverAndScanCategory is caught and
+ // logged as a warning naming the store (name, else host) and the category
+ // (label, else key); it is not rethrown.
+ async function runOne(w) {
+ try {
+ await discoverAndScanCategory(w.ctx, w.prevDb, report);
+ } catch (e) {
+ const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
+ const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
- // If you want failures surfaced in the final report later, you could also
- // push a "failed category" record onto report.categories here.
- }
- return null;
+ // Keep it loud in logs, but do not fail the entire run.
+ logger.warn(`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`);
}
- );
+ }
+
+ /**
+  * Queue consumer: repeatedly removes the first queued work item whose host
+  * is not currently being processed, runs it via runOne, and returns once
+  * the queue is empty.
+  *
+  * An item with neither `store.host` nor `store.key` has nothing to
+  * serialize on, so it is treated as always runnable and is never recorded
+  * in `inflightHosts`. Without this, such an item would never be selected,
+  * the queue would never drain, and every worker would poll forever.
+  *
+  * @returns {Promise<void>} Resolves when this worker observes an empty queue.
+  */
+ async function worker() {
+   const hostOf = (item) => String(item?.ctx?.store?.host || item?.ctx?.store?.key || "");
+
+   while (queue.length > 0) {
+     // Pick the next item that is host-less or whose host isn't currently running.
+     const idx = queue.findIndex((item) => {
+       const itemHost = hostOf(item);
+       return itemHost === "" || !inflightHosts.has(itemHost);
+     });
+
+     if (idx === -1) {
+       // Every queued item belongs to a host that is busy right now; poll again shortly.
+       await sleep(50);
+       continue;
+     }
+
+     const [w] = queue.splice(idx, 1);
+     const host = hostOf(w);
+
+     if (host !== "") inflightHosts.add(host);
+     try {
+       await runOne(w);
+     } finally {
+       // Release the host even if runOne throws, so its other categories can proceed.
+       if (host !== "") inflightHosts.delete(host);
+     }
+   }
+ }
+
+ const workers = [];
+ for (let i = 0; i < maxWorkers; i++) workers.push(worker());
+ await Promise.all(workers);
return report;
}