mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-03-25 09:25:51 +00:00
feat: Support for the Gull
This commit is contained in:
parent
a4da64ef4a
commit
fa6fd99991
4 changed files with 226 additions and 28 deletions
99
src/stores/gull.js
Normal file
99
src/stores/gull.js
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
"use strict";
|
||||
|
||||
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
|
||||
const { normalizeCspc } = require("../utils/sku");
|
||||
const { extractPriceFromTmbBlock } = require("../utils/woocommerce");
|
||||
const { makePageUrl } = require("../utils/url");
|
||||
|
||||
/**
 * Heuristic in-stock check for one WooCommerce product card.
 *
 * @param {string|null|undefined} block - Raw HTML of a single product <li>.
 * @returns {boolean} true when the markup signals the product is purchasable.
 */
function looksInStock(block) {
  const s = String(block || "");
  // An explicit out-of-stock class wins over every positive signal.
  if (/\boutofstock\b/i.test(s)) return false;
  // your sample has: <p class="stock in-stock">1 in stock</p>
  if (/\bin-stock\b/i.test(s)) return true;
  if (/\binstock\b/i.test(s)) return true;
  // Plain-text quantity, e.g. ">3 in stock<" without a stock class.
  if (/>\s*\d+\s+in\s+stock\s*</i.test(s)) return true;
  // Fix: the original ended with `return /\bin-stock\b/i.test(s);`, which is
  // dead code — that exact regex already returned true above, so the final
  // test could only ever yield false. Make the fall-through explicit.
  return false;
}
|
||||
|
||||
/**
 * Parse a Gull Liquor category listing page into product items.
 *
 * @param {string} html - Full listing-page HTML.
 * @param {object} ctx - Scan context; ctx.store.host overrides the link base.
 * @returns {Array<{name: string, price: string, url: string, sku: string, img: *}>}
 *   In-stock products, de-duplicated by resolved URL (last occurrence wins).
 */
function parseProductsGull(html, ctx) {
  const source = String(html || "");
  const host = (ctx && ctx.store && ctx.store.host) || "gullliquorstore.com";
  const base = `https://${host}/`;

  // split on <li class="product ..."> — each trailing chunk is one card
  const chunks = source.split(/<li\b[^>]*class=["'][^"']*\bproduct\b[^"']*["'][^>]*>/i);
  if (chunks.length <= 1) return [];

  // Turn one card's markup into an item, or null when it should be skipped.
  const toItem = (chunk) => {
    const block = '<li class="product"' + chunk;

    if (!looksInStock(block)) return null;

    const hrefMatch = block.match(/<a\b[^>]*href=["']([^"']+)["'][^>]*class=["'][^"']*\bwoocommerce-LoopProduct-link\b/i);
    if (!hrefMatch || !hrefMatch[1]) return null;

    let url;
    try {
      url = new URL(decodeHtml(hrefMatch[1]), base).toString();
    } catch {
      return null; // unparseable href — drop the card
    }

    const titleMatch = block.match(/<h2\b[^>]*class=["'][^"']*\bwoocommerce-loop-product__title\b[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i);
    const name = cleanText(decodeHtml(titleMatch ? titleMatch[1] : ""));
    if (!name) return null;

    // Price is in standard Woo <span class="price"> ... </span>
    const price = extractPriceFromTmbBlock(block) || "";

    // Prefer the explicit SKU attribute, then a 6-digit "SKU ######" text
    // pattern, and finally fall back to the URL itself.
    const sku = normalizeCspc(
      block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
        block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
        url
    );

    const img = extractFirstImgUrl(block, base);

    return { name, price, url, sku, img };
  };

  // De-duplicate by URL while preserving "last one wins" semantics.
  const byUrl = new Map();
  for (let i = 1; i < chunks.length; i++) {
    const item = toItem(chunks[i]);
    if (item) byUrl.set(item.url, item);
  }
  return [...byUrl.values()];
}
|
||||
|
||||
/**
 * Build one Gull category descriptor. Every Gull category shares the same
 * conservative pacing (probe from page 3 stepping by 2, one page at a time,
 * 10s gaps) — this host is crawled gently.
 *
 * @param {string} key - Category key.
 * @param {string} label - Human-readable label.
 * @param {string} startUrl - First listing page URL.
 * @returns {object} Category config consumed by the scanner.
 */
function gullCategory(key, label, startUrl) {
  return {
    key,
    label,
    startUrl,
    discoveryStartPage: 3,
    discoveryStep: 2,
    pageConcurrency: 1,
    pageStaggerMs: 10000,
    discoveryDelayMs: 10000,
  };
}

/**
 * Store descriptor for Gull Liquor (gullliquorstore.com).
 *
 * @param {string} defaultUa - User-agent string used for all requests.
 * @returns {object} Store config with parser, paging helper, and categories.
 */
function createStore(defaultUa) {
  return {
    key: "gull",
    name: "Gull Liquor",
    host: "gullliquorstore.com",
    ua: defaultUa,
    parseProducts: parseProductsGull,
    makePageUrl, // enables /page/N/ paging
    categories: [
      gullCategory("whisky", "Whisky", "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky"),
      gullCategory("rum", "Rum", "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum"),
    ],
  };
}

module.exports = { createStore, parseProductsGull };
|
|
@ -9,17 +9,19 @@ const { createStore: createCraftCellars } = require("./craftcellars");
|
|||
const { createStore: createBCL } = require("./bcl");
|
||||
const { createStore: createStrath } = require("./strath");
|
||||
const { createStore: createLegacy } = require("./legacyliquor");
|
||||
const { createStore: createGull } = require("./gull");
|
||||
|
||||
/**
 * Instantiate every tracked store.
 *
 * @param {{defaultUa?: string}} [options] - defaultUa is passed to each factory.
 * @returns {Array<object>} One store descriptor per tracked retailer.
 */
function createStores({ defaultUa } = {}) {
  // Each store must appear exactly once: the original list called createBSW,
  // createCraftCellars, and createStrath twice each, which would scrape those
  // hosts twice per run and double their load and their items.
  return [
    createGull(defaultUa),
    createSierra(defaultUa),
    createBSW(defaultUa),
    createKWM(defaultUa),
    createCraftCellars(defaultUa),
    createStrath(defaultUa),
    createKegNCork(defaultUa),
    createMaltsAndGrains(defaultUa),
    createBCL(defaultUa),
    createLegacy(defaultUa),
  ];
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"use strict";
|
||||
|
||||
const { setTimeout: sleep } = require("timers/promises");
|
||||
|
||||
const { humanBytes } = require("../utils/bytes");
|
||||
const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string");
|
||||
const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url");
|
||||
|
|
@ -94,8 +96,38 @@ function shouldTrackItem(ctx, finalUrl, item) {
|
|||
return allow(item, ctx, finalUrl);
|
||||
}
|
||||
|
||||
/**
 * Best-effort extraction of the total page count from WooCommerce pagination
 * markup on page 1 of a listing.
 *
 * Recognized link shapes (usually carrying a "page-numbers" class, though the
 * class is not required):
 *   - href="...​/page/N/..."
 *   - href="...?paged=N..."
 *
 * @param {string} html - Listing-page HTML.
 * @returns {number} Highest page number found, or 0 when no pagination
 *   evidence exists. A lone "page 1" link is treated as no evidence.
 */
function extractTotalPagesFromPaginationHtml(html) {
  const markup = String(html || "");

  // Both supported pagination URL shapes, scanned the same way.
  const patterns = [
    /href=["'][^"']*\/page\/(\d+)\/[^"']*["']/gi, // pretty permalinks: /page/23/
    /href=["'][^"']*[?&]paged=(\d+)[^"']*["']/gi, // query-string form: ?paged=23
  ];

  let highest = 0;
  for (const re of patterns) {
    for (const m of markup.matchAll(re)) {
      const page = Number(m[1]);
      if (Number.isFinite(page) && page > highest) highest = page;
    }
  }

  // Conservative: themes sometimes render bare numbers without pagination
  // hrefs, so only trust a result when a real multi-page link was seen.
  return highest > 1 ? highest : 0;
}
|
||||
|
||||
async function pageHasProducts(ctx, url) {
|
||||
const { http, config, logger } = ctx;
|
||||
const { http, config } = ctx;
|
||||
try {
|
||||
const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua);
|
||||
|
||||
|
|
@ -113,6 +145,10 @@ async function pageHasProducts(ctx, url) {
|
|||
|
||||
async function probePage(ctx, baseUrl, pageNum, state) {
|
||||
const url = makePageUrlForCtx(ctx, baseUrl, pageNum);
|
||||
|
||||
const delay = Number.isFinite(ctx?.cat?.discoveryDelayMs) ? ctx.cat.discoveryDelayMs : 0;
|
||||
if (delay > 0) await sleep(delay);
|
||||
|
||||
const t0 = Date.now();
|
||||
const r = await pageHasProducts(ctx, url);
|
||||
const ms = Date.now() - t0;
|
||||
|
|
@ -168,12 +204,44 @@ async function binaryFindLastOk(ctx, baseUrl, loOk, hiMiss, state) {
|
|||
async function discoverTotalPagesFast(ctx, baseUrl, guess, step) {
|
||||
const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 };
|
||||
|
||||
const p1 = await probePage(ctx, baseUrl, 1, state);
|
||||
if (!p1.ok) {
|
||||
// Fetch page 1 ONCE and try to extract total pages from pagination.
|
||||
const url1 = makePageUrlForCtx(ctx, baseUrl, 1);
|
||||
const t0 = Date.now();
|
||||
const { text: html1, ms, status, bytes, finalUrl } = await ctx.http.fetchTextWithRetry(url1, "discover", ctx.store.ua);
|
||||
const pMs = Date.now() - t0;
|
||||
|
||||
if (typeof ctx.store.isEmptyListingPage === "function") {
|
||||
if (ctx.store.isEmptyListingPage(html1, ctx, url1)) {
|
||||
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const parser = ctx.store.parseProducts || ctx.config.defaultParseProducts;
|
||||
const items1 = parser(html1, ctx, finalUrl).length;
|
||||
|
||||
logProgressLine(
|
||||
ctx.logger,
|
||||
ctx,
|
||||
`Discover probe page=${padLeftV(1, 4)}`,
|
||||
items1 > 0 ? "OK" : "MISS",
|
||||
items1 > 0,
|
||||
discoverProg(state),
|
||||
`items=${padLeftV(items1, 3)} | bytes=${padLeftV("", 8)} | ${padRightV(ctx.http.inflightStr(), 11)} | ${secStr(ms || pMs)}`
|
||||
);
|
||||
|
||||
if (items1 <= 0) {
|
||||
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const extracted = extractTotalPagesFromPaginationHtml(html1);
|
||||
if (extracted && extracted >= 1) {
|
||||
ctx.logger.ok(`${ctx.catPrefixOut} | Total pages (from pagination): ${extracted}`);
|
||||
return extracted;
|
||||
}
|
||||
|
||||
// Fallback to probing if pagination parse fails
|
||||
const g = Math.max(2, guess);
|
||||
const pg = await probePage(ctx, baseUrl, g, state);
|
||||
if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state);
|
||||
|
|
@ -202,7 +270,7 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
|
|||
const t0 = Date.now();
|
||||
|
||||
const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess;
|
||||
const step = config.discoveryStep;
|
||||
const step = Number.isFinite(ctx.cat.discoveryStep) ? ctx.cat.discoveryStep : config.discoveryStep;
|
||||
|
||||
const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step);
|
||||
const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages);
|
||||
|
|
@ -214,7 +282,10 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
|
|||
|
||||
let donePages = 0;
|
||||
|
||||
const perPageItems = await parallelMapStaggered(pages, config.concurrency, config.staggerMs, async (pageUrl, idx) => {
|
||||
const pageConc = Number.isFinite(ctx.cat.pageConcurrency) ? ctx.cat.pageConcurrency : config.concurrency;
|
||||
const pageStagger = Number.isFinite(ctx.cat.pageStaggerMs) ? ctx.cat.pageStaggerMs : config.staggerMs;
|
||||
|
||||
const perPageItems = await parallelMapStaggered(pages, pageConc, pageStagger, async (pageUrl, idx) => {
|
||||
const pnum = idx + 1;
|
||||
|
||||
const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry(
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"use strict";
|
||||
|
||||
const { createReport } = require("./report");
|
||||
const { parallelMapStaggered } = require("../utils/async");
|
||||
const { setTimeout: sleep } = require("timers/promises");
|
||||
|
||||
const {
|
||||
makeCatPrefixers,
|
||||
|
|
@ -43,28 +43,54 @@ async function runAllStores(stores, { config, logger, http }) {
|
|||
}
|
||||
}
|
||||
|
||||
await parallelMapStaggered(
|
||||
workItems,
|
||||
Math.min(config.categoryConcurrency, workItems.length),
|
||||
0,
|
||||
async (w) => {
|
||||
try {
|
||||
await discoverAndScanCategory(w.ctx, w.prevDb, report);
|
||||
} catch (e) {
|
||||
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
|
||||
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
|
||||
// Host-level serialization: never run two categories from the same host concurrently.
|
||||
const maxWorkers = Math.min(config.categoryConcurrency, workItems.length);
|
||||
const queue = workItems.slice();
|
||||
const inflightHosts = new Set();
|
||||
|
||||
// Keep it loud in logs, but do not fail the entire run.
|
||||
logger.warn(
|
||||
`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`
|
||||
);
|
||||
async function runOne(w) {
|
||||
try {
|
||||
await discoverAndScanCategory(w.ctx, w.prevDb, report);
|
||||
} catch (e) {
|
||||
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
|
||||
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
|
||||
|
||||
// If you want failures surfaced in the final report later, you could also
|
||||
// push a "failed category" record onto report.categories here.
|
||||
}
|
||||
return null;
|
||||
// Keep it loud in logs, but do not fail the entire run.
|
||||
logger.warn(`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
async function worker() {
|
||||
while (true) {
|
||||
if (queue.length === 0) return;
|
||||
|
||||
// Pick next item whose host isn't currently running.
|
||||
const idx = queue.findIndex((w) => {
|
||||
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
|
||||
return host && !inflightHosts.has(host);
|
||||
});
|
||||
|
||||
if (idx === -1) {
|
||||
// Nothing available right now; wait a bit.
|
||||
await sleep(50);
|
||||
continue;
|
||||
}
|
||||
|
||||
const w = queue.splice(idx, 1)[0];
|
||||
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
|
||||
|
||||
inflightHosts.add(host);
|
||||
try {
|
||||
await runOne(w);
|
||||
} finally {
|
||||
inflightHosts.delete(host);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const workers = [];
|
||||
for (let i = 0; i < maxWorkers; i++) workers.push(worker());
|
||||
await Promise.all(workers);
|
||||
|
||||
return report;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue