feat: Support for the Gull

This commit is contained in:
Brennan Wilkes (Text Groove) 2026-01-24 12:55:59 -08:00
parent a4da64ef4a
commit fa6fd99991
4 changed files with 226 additions and 28 deletions

99
src/stores/gull.js Normal file
View file

@ -0,0 +1,99 @@
"use strict";
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
const { normalizeCspc } = require("../utils/sku");
const { extractPriceFromTmbBlock } = require("../utils/woocommerce");
const { makePageUrl } = require("../utils/url");
/**
 * Decide whether a product card's HTML indicates the item is purchasable.
 *
 * Checks, in priority order:
 *   1. any "outofstock" token (Woo stock class)                  -> false
 *   2. an "in-stock" class, e.g. <p class="stock in-stock">…</p> -> true
 *   3. an "instock" class token                                  -> true
 *   4. visible ">N in stock<" element text                       -> true
 *   5. otherwise                                                 -> false
 *
 * @param {string} block - HTML fragment for one product card (coerced via String).
 * @returns {boolean} true when the card looks in stock.
 */
function looksInStock(block) {
  const s = String(block || "");
  if (/\boutofstock\b/i.test(s)) return false;
  // your sample has: <p class="stock in-stock">1 in stock</p>
  if (/\bin-stock\b/i.test(s)) return true;
  if (/\binstock\b/i.test(s)) return true;
  if (/>\s*\d+\s+in\s+stock\s*</i.test(s)) return true;
  // Was `return /\bin-stock\b/i.test(s)` — that regex already returned true
  // above on any match, so it could only ever yield false here; say so directly.
  return false;
}
/**
 * Parse a Gull Liquor (WooCommerce) category listing page into product records.
 *
 * @param {string} html - Raw listing-page HTML.
 * @param {object} ctx - Scrape context; ctx.store.host overrides the default
 *   host used as the base for resolving relative URLs.
 * @returns {Array<{name:string, price:string, url:string, sku:string, img:string}>}
 *   In-stock products, de-duplicated by URL. Empty array when no product
 *   cards are present.
 */
function parseProductsGull(html, ctx) {
  const page = String(html || "");
  const found = [];
  // Each product card opens with <li class="… product …">; split on that opener.
  const chunks = page.split(/<li\b[^>]*class=["'][^"']*\bproduct\b[^"']*["'][^>]*>/i);
  if (chunks.length <= 1) return found;
  const base = `https://${(ctx && ctx.store && ctx.store.host) || "gullliquorstore.com"}/`;
  for (const chunk of chunks.slice(1)) {
    // Re-attach a representative opener so class-based checks still see one.
    const card = '<li class="product"' + chunk;
    if (!looksInStock(card)) continue;
    const href = card.match(/<a\b[^>]*href=["']([^"']+)["'][^>]*class=["'][^"']*\bwoocommerce-LoopProduct-link\b/i);
    if (!href || !href[1]) continue;
    let url;
    try {
      url = new URL(decodeHtml(href[1]), base).toString();
    } catch {
      continue; // malformed href — skip this card
    }
    const title = card.match(/<h2\b[^>]*class=["'][^"']*\bwoocommerce-loop-product__title\b[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i);
    const name = cleanText(decodeHtml(title ? title[1] : ""));
    if (!name) continue;
    // Price is in standard Woo <span class="price"> ... </span>
    const price = extractPriceFromTmbBlock(card) || "";
    // SKU preference: explicit data attribute, then a 6-digit "SKU …" text, then the URL.
    const skuSource =
      card.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
      card.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
      url;
    found.push({
      name,
      price,
      url,
      sku: normalizeCspc(skuSource),
      img: extractFirstImgUrl(card, base),
    });
  }
  // De-duplicate by URL; later occurrences replace earlier ones (Map.set).
  const byUrl = new Map();
  for (const item of found) byUrl.set(item.url, item);
  return [...byUrl.values()];
}
/**
 * Build the Gull Liquor store descriptor consumed by the scraper runner.
 *
 * @param {string} defaultUa - Default user-agent forwarded to HTTP requests.
 * @returns {object} Store config: identity fields, the listing parser, the
 *   /page/N/ URL builder, and the category definitions.
 */
function createStore(defaultUa) {
  // Crawl-pacing settings shared by every category: serial page fetches with
  // 10s gaps, discovery probing starting at page 3 and stepping by 2.
  const pacing = {
    discoveryStartPage: 3,
    discoveryStep: 2,
    pageConcurrency: 1,
    pageStaggerMs: 10000,
    discoveryDelayMs: 10000,
  };
  // One-line category builder so the pacing block is declared exactly once.
  const category = (key, label, startUrl) => ({ key, label, startUrl, ...pacing });
  return {
    key: "gull",
    name: "Gull Liquor",
    host: "gullliquorstore.com",
    ua: defaultUa,
    parseProducts: parseProductsGull,
    makePageUrl, // enables /page/N/ paging
    categories: [
      category(
        "whisky",
        "Whisky",
        "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky"
      ),
      category(
        "rum",
        "Rum",
        "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum"
      ),
    ],
  };
}
module.exports = { createStore, parseProductsGull };

View file

@ -9,17 +9,19 @@ const { createStore: createCraftCellars } = require("./craftcellars");
const { createStore: createBCL } = require("./bcl"); const { createStore: createBCL } = require("./bcl");
const { createStore: createStrath } = require("./strath"); const { createStore: createStrath } = require("./strath");
const { createStore: createLegacy } = require("./legacyliquor"); const { createStore: createLegacy } = require("./legacyliquor");
const { createStore: createGull } = require("./gull");
function createStores({ defaultUa } = {}) { function createStores({ defaultUa } = {}) {
return [ return [
createGull(defaultUa),
createSierra(defaultUa), createSierra(defaultUa),
createBSW(defaultUa),
createKWM(defaultUa), createKWM(defaultUa),
createCraftCellars(defaultUa),
createStrath(defaultUa),
createBSW(defaultUa),
createKegNCork(defaultUa), createKegNCork(defaultUa),
createMaltsAndGrains(defaultUa), createMaltsAndGrains(defaultUa),
createCraftCellars(defaultUa),
createBCL(defaultUa), createBCL(defaultUa),
createStrath(defaultUa),
createLegacy(defaultUa), createLegacy(defaultUa),
]; ];
} }

View file

@ -1,5 +1,7 @@
"use strict"; "use strict";
const { setTimeout: sleep } = require("timers/promises");
const { humanBytes } = require("../utils/bytes"); const { humanBytes } = require("../utils/bytes");
const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string"); const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string");
const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url"); const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url");
@ -94,8 +96,38 @@ function shouldTrackItem(ctx, finalUrl, item) {
return allow(item, ctx, finalUrl); return allow(item, ctx, finalUrl);
} }
/**
* Best-effort extraction of total pages from Woo pagination markup on page 1.
* Looks for:
* - /page/N/
* - ?paged=N
* inside links that often have "page-numbers" class, but works even without it.
*/
/**
 * Best-effort extraction of the total page count from Woo pagination markup
 * on page 1 of a listing.
 *
 * Scans every href for:
 *   - pretty permalinks:  /page/N/
 *   - query-style links:  ?paged=N or &paged=N, including the entity-encoded
 *     form "&amp;paged=N" — the only legal way "&" appears inside an HTML
 *     attribute value, which the old [?&]paged= pattern missed.
 * and returns the largest N seen.
 *
 * @param {string} html - Raw HTML of the first listing page.
 * @returns {number} Highest page number found, or 0 when pagination is absent
 *   or only page 1 is referenced (reported as "unknown" so the caller falls
 *   back to probing).
 */
function extractTotalPagesFromPaginationHtml(html) {
  const s = String(html || "");
  let max = 0;
  // /page/23/
  for (const m of s.matchAll(/href=["'][^"']*\/page\/(\d+)\/[^"']*["']/gi)) {
    const n = Number(m[1]);
    if (Number.isFinite(n) && n > max) max = n;
  }
  // ?paged=23 — the optional (?:amp;) accepts entity-encoded "&amp;paged=".
  for (const m of s.matchAll(/href=["'][^"']*[?&](?:amp;)?paged=(\d+)[^"']*["']/gi)) {
    const n = Number(m[1]);
    if (Number.isFinite(n) && n > max) max = n;
  }
  // A lone "page 1" link is not real pagination; report 0 so the caller
  // probes instead of trusting it.
  return max > 1 ? max : 0;
}
async function pageHasProducts(ctx, url) { async function pageHasProducts(ctx, url) {
const { http, config, logger } = ctx; const { http, config } = ctx;
try { try {
const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua); const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua);
@ -113,6 +145,10 @@ async function pageHasProducts(ctx, url) {
async function probePage(ctx, baseUrl, pageNum, state) { async function probePage(ctx, baseUrl, pageNum, state) {
const url = makePageUrlForCtx(ctx, baseUrl, pageNum); const url = makePageUrlForCtx(ctx, baseUrl, pageNum);
const delay = Number.isFinite(ctx?.cat?.discoveryDelayMs) ? ctx.cat.discoveryDelayMs : 0;
if (delay > 0) await sleep(delay);
const t0 = Date.now(); const t0 = Date.now();
const r = await pageHasProducts(ctx, url); const r = await pageHasProducts(ctx, url);
const ms = Date.now() - t0; const ms = Date.now() - t0;
@ -168,12 +204,44 @@ async function binaryFindLastOk(ctx, baseUrl, loOk, hiMiss, state) {
async function discoverTotalPagesFast(ctx, baseUrl, guess, step) { async function discoverTotalPagesFast(ctx, baseUrl, guess, step) {
const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 }; const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 };
const p1 = await probePage(ctx, baseUrl, 1, state); // Fetch page 1 ONCE and try to extract total pages from pagination.
if (!p1.ok) { const url1 = makePageUrlForCtx(ctx, baseUrl, 1);
const t0 = Date.now();
const { text: html1, ms, status, bytes, finalUrl } = await ctx.http.fetchTextWithRetry(url1, "discover", ctx.store.ua);
const pMs = Date.now() - t0;
if (typeof ctx.store.isEmptyListingPage === "function") {
if (ctx.store.isEmptyListingPage(html1, ctx, url1)) {
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
return 1;
}
}
const parser = ctx.store.parseProducts || ctx.config.defaultParseProducts;
const items1 = parser(html1, ctx, finalUrl).length;
logProgressLine(
ctx.logger,
ctx,
`Discover probe page=${padLeftV(1, 4)}`,
items1 > 0 ? "OK" : "MISS",
items1 > 0,
discoverProg(state),
`items=${padLeftV(items1, 3)} | bytes=${padLeftV("", 8)} | ${padRightV(ctx.http.inflightStr(), 11)} | ${secStr(ms || pMs)}`
);
if (items1 <= 0) {
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`); ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
return 1; return 1;
} }
const extracted = extractTotalPagesFromPaginationHtml(html1);
if (extracted && extracted >= 1) {
ctx.logger.ok(`${ctx.catPrefixOut} | Total pages (from pagination): ${extracted}`);
return extracted;
}
// Fallback to probing if pagination parse fails
const g = Math.max(2, guess); const g = Math.max(2, guess);
const pg = await probePage(ctx, baseUrl, g, state); const pg = await probePage(ctx, baseUrl, g, state);
if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state); if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state);
@ -202,7 +270,7 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
const t0 = Date.now(); const t0 = Date.now();
const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess; const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess;
const step = config.discoveryStep; const step = Number.isFinite(ctx.cat.discoveryStep) ? ctx.cat.discoveryStep : config.discoveryStep;
const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step); const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step);
const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages); const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages);
@ -214,7 +282,10 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
let donePages = 0; let donePages = 0;
const perPageItems = await parallelMapStaggered(pages, config.concurrency, config.staggerMs, async (pageUrl, idx) => { const pageConc = Number.isFinite(ctx.cat.pageConcurrency) ? ctx.cat.pageConcurrency : config.concurrency;
const pageStagger = Number.isFinite(ctx.cat.pageStaggerMs) ? ctx.cat.pageStaggerMs : config.staggerMs;
const perPageItems = await parallelMapStaggered(pages, pageConc, pageStagger, async (pageUrl, idx) => {
const pnum = idx + 1; const pnum = idx + 1;
const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry( const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry(

View file

@ -1,7 +1,7 @@
"use strict"; "use strict";
const { createReport } = require("./report"); const { createReport } = require("./report");
const { parallelMapStaggered } = require("../utils/async"); const { setTimeout: sleep } = require("timers/promises");
const { const {
makeCatPrefixers, makeCatPrefixers,
@ -43,28 +43,54 @@ async function runAllStores(stores, { config, logger, http }) {
} }
} }
await parallelMapStaggered( // Host-level serialization: never run two categories from the same host concurrently.
workItems, const maxWorkers = Math.min(config.categoryConcurrency, workItems.length);
Math.min(config.categoryConcurrency, workItems.length), const queue = workItems.slice();
0, const inflightHosts = new Set();
async (w) => {
try {
await discoverAndScanCategory(w.ctx, w.prevDb, report);
} catch (e) {
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
// Keep it loud in logs, but do not fail the entire run. async function runOne(w) {
logger.warn( try {
`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}` await discoverAndScanCategory(w.ctx, w.prevDb, report);
); } catch (e) {
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
// If you want failures surfaced in the final report later, you could also // Keep it loud in logs, but do not fail the entire run.
// push a "failed category" record onto report.categories here. logger.warn(`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`);
}
return null;
} }
); }
async function worker() {
while (true) {
if (queue.length === 0) return;
// Pick next item whose host isn't currently running.
const idx = queue.findIndex((w) => {
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
return host && !inflightHosts.has(host);
});
if (idx === -1) {
// Nothing available right now; wait a bit.
await sleep(50);
continue;
}
const w = queue.splice(idx, 1)[0];
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
inflightHosts.add(host);
try {
await runOne(w);
} finally {
inflightHosts.delete(host);
}
}
}
const workers = [];
for (let i = 0; i < maxWorkers; i++) workers.push(worker());
await Promise.all(workers);
return report; return report;
} }