mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-03-25 09:25:51 +00:00
feat: Support for the Gull
This commit is contained in:
parent
a4da64ef4a
commit
fa6fd99991
4 changed files with 226 additions and 28 deletions
99
src/stores/gull.js
Normal file
99
src/stores/gull.js
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
"use strict";
|
||||||
|
|
||||||
|
const { decodeHtml, cleanText, extractFirstImgUrl } = require("../utils/html");
|
||||||
|
const { normalizeCspc } = require("../utils/sku");
|
||||||
|
const { extractPriceFromTmbBlock } = require("../utils/woocommerce");
|
||||||
|
const { makePageUrl } = require("../utils/url");
|
||||||
|
|
||||||
|
/**
 * Heuristically decide whether a WooCommerce product block is in stock.
 *
 * Negative signal ("outofstock" class) is checked first and wins; any of the
 * positive signals then marks the block as available.
 *
 * @param {string} block - Raw HTML for a single product <li> block.
 * @returns {boolean} true when the markup looks in stock, false otherwise.
 */
function looksInStock(block) {
  const s = String(block || "");

  // WooCommerce marks unavailable items with an "outofstock" class.
  if (/\boutofstock\b/i.test(s)) return false;

  // your sample has: <p class="stock in-stock">1 in stock</p>
  if (/\bin-stock\b/i.test(s)) return true;
  if (/\binstock\b/i.test(s)) return true;

  // Plain-text quantity between tags, e.g. ">3 in stock<".
  if (/>\s*\d+\s+in\s+stock\s*</i.test(s)) return true;

  // No positive signal found. (The original re-tested /\bin-stock\b/i here,
  // which was dead code: that pattern already returned true above, so the
  // final test could only ever evaluate to false.)
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Parse product listings out of Gull Liquor's WooCommerce category HTML.
 *
 * @param {string} html - Full listing-page HTML.
 * @param {object} ctx - Scan context; ctx.store.host (when present) overrides
 *   the base host used to resolve relative URLs.
 * @returns {Array<{name: string, price: string, url: string, sku: string, img: string}>}
 *   Product records de-duplicated by URL; empty array when nothing parses.
 */
function parseProductsGull(html, ctx) {
  const source = String(html || "");

  // split on <li class="product ...">
  const chunks = source.split(/<li\b[^>]*class=["'][^"']*\bproduct\b[^"']*["'][^>]*>/i);
  if (chunks.length <= 1) return [];

  const host = (ctx && ctx.store && ctx.store.host) || "gullliquorstore.com";
  const base = `https://${host}/`;

  // Insertion-ordered, keyed by URL: a later duplicate replaces an earlier
  // one, matching the original push-then-Map dedupe semantics.
  const byUrl = new Map();

  for (const chunk of chunks.slice(1)) {
    const block = '<li class="product"' + chunk;

    if (!looksInStock(block)) continue;

    const hrefMatch = block.match(/<a\b[^>]*href=["']([^"']+)["'][^>]*class=["'][^"']*\bwoocommerce-LoopProduct-link\b/i);
    if (!hrefMatch || !hrefMatch[1]) continue;

    let url;
    try {
      url = new URL(decodeHtml(hrefMatch[1]), base).toString();
    } catch {
      continue; // unparseable href — skip this product
    }

    const titleMatch = block.match(/<h2\b[^>]*class=["'][^"']*\bwoocommerce-loop-product__title\b[^"']*["'][^>]*>([\s\S]*?)<\/h2>/i);
    const name = cleanText(decodeHtml(titleMatch ? titleMatch[1] : ""));
    if (!name) continue;

    // Price is in standard Woo <span class="price"> ... </span>
    const price = extractPriceFromTmbBlock(block) || "";

    // Prefer the explicit data attribute, then a "SKU 123456"-style text
    // match, then fall back to deriving a SKU from the product URL.
    const sku = normalizeCspc(
      block.match(/\bdata-product_sku=["']([^"']+)["']/i)?.[1] ||
      block.match(/\bSKU\b[^0-9]{0,20}(\d{6})\b/i)?.[1] ||
      url
    );

    const img = extractFirstImgUrl(block, base);

    byUrl.set(url, { name, price, url, sku, img });
  }

  return [...byUrl.values()];
}
|
||||||
|
|
||||||
|
/**
 * Build the store descriptor for Gull Liquor (gullliquorstore.com).
 *
 * @param {string} defaultUa - User-agent string used for this store's requests.
 * @returns {object} Store configuration consumed by the scanner runtime.
 */
function createStore(defaultUa) {
  // Both categories share identical, deliberately slow pacing: one page at a
  // time with 10s stagger/delay, starting discovery at page 3 stepping by 2.
  const pacing = {
    discoveryStartPage: 3,
    discoveryStep: 2,
    pageConcurrency: 1,
    pageStaggerMs: 10000,
    discoveryDelayMs: 10000,
  };

  return {
    key: "gull",
    name: "Gull Liquor",
    host: "gullliquorstore.com",
    ua: defaultUa,
    parseProducts: parseProductsGull,
    makePageUrl, // enables /page/N/ paging
    categories: [
      {
        key: "whisky",
        label: "Whisky",
        startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=whisky",
        ...pacing,
      },
      {
        key: "rum",
        label: "Rum",
        startUrl: "https://gullliquorstore.com/product-category/spirits/?spirit_type=rum",
        ...pacing,
      },
    ],
  };
}
|
||||||
|
|
||||||
|
module.exports = { createStore, parseProductsGull };
|
||||||
|
|
@ -9,17 +9,19 @@ const { createStore: createCraftCellars } = require("./craftcellars");
|
||||||
const { createStore: createBCL } = require("./bcl");
|
const { createStore: createBCL } = require("./bcl");
|
||||||
const { createStore: createStrath } = require("./strath");
|
const { createStore: createStrath } = require("./strath");
|
||||||
const { createStore: createLegacy } = require("./legacyliquor");
|
const { createStore: createLegacy } = require("./legacyliquor");
|
||||||
|
const { createStore: createGull } = require("./gull");
|
||||||
|
|
||||||
function createStores({ defaultUa } = {}) {
|
function createStores({ defaultUa } = {}) {
|
||||||
return [
|
return [
|
||||||
|
createGull(defaultUa),
|
||||||
createSierra(defaultUa),
|
createSierra(defaultUa),
|
||||||
createBSW(defaultUa),
|
|
||||||
createKWM(defaultUa),
|
createKWM(defaultUa),
|
||||||
|
createCraftCellars(defaultUa),
|
||||||
|
createStrath(defaultUa),
|
||||||
|
createBSW(defaultUa),
|
||||||
createKegNCork(defaultUa),
|
createKegNCork(defaultUa),
|
||||||
createMaltsAndGrains(defaultUa),
|
createMaltsAndGrains(defaultUa),
|
||||||
createCraftCellars(defaultUa),
|
|
||||||
createBCL(defaultUa),
|
createBCL(defaultUa),
|
||||||
createStrath(defaultUa),
|
|
||||||
createLegacy(defaultUa),
|
createLegacy(defaultUa),
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
|
const { setTimeout: sleep } = require("timers/promises");
|
||||||
|
|
||||||
const { humanBytes } = require("../utils/bytes");
|
const { humanBytes } = require("../utils/bytes");
|
||||||
const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string");
|
const { padLeft, padRight, padLeftV, padRightV } = require("../utils/string");
|
||||||
const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url");
|
const { normalizeBaseUrl, makePageUrlForCtx } = require("../utils/url");
|
||||||
|
|
@ -94,8 +96,38 @@ function shouldTrackItem(ctx, finalUrl, item) {
|
||||||
return allow(item, ctx, finalUrl);
|
return allow(item, ctx, finalUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Best-effort extraction of the total page count from WooCommerce pagination
 * markup on page 1. Recognizes two link shapes:
 *   - pretty permalinks:  href=".../page/N/..."
 *   - query-string form:  href="...?paged=N" (or "&paged=N")
 * These usually carry a "page-numbers" class, but any matching href counts.
 *
 * @param {string} html - Listing-page HTML (page 1).
 * @returns {number} Highest page number found when greater than 1, else 0.
 */
function extractTotalPagesFromPaginationHtml(html) {
  const text = String(html || "");

  const linkPatterns = [
    /href=["'][^"']*\/page\/(\d+)\/[^"']*["']/gi, // e.g. /page/23/
    /href=["'][^"']*[?&]paged=(\d+)[^"']*["']/gi, // e.g. ?paged=23
  ];

  let highest = 0;
  for (const re of linkPatterns) {
    for (const match of text.matchAll(re)) {
      const page = Number(match[1]);
      if (Number.isFinite(page) && page > highest) highest = page;
    }
  }

  // Stay conservative: a lone "page 1" link is not real evidence of
  // pagination, so only trust a maximum above 1; otherwise report 0
  // ("unknown") and let the caller fall back to probing.
  return highest > 1 ? highest : 0;
}
|
||||||
|
|
||||||
async function pageHasProducts(ctx, url) {
|
async function pageHasProducts(ctx, url) {
|
||||||
const { http, config, logger } = ctx;
|
const { http, config } = ctx;
|
||||||
try {
|
try {
|
||||||
const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua);
|
const { text } = await http.fetchTextWithRetry(url, "discover", ctx.store.ua);
|
||||||
|
|
||||||
|
|
@ -113,6 +145,10 @@ async function pageHasProducts(ctx, url) {
|
||||||
|
|
||||||
async function probePage(ctx, baseUrl, pageNum, state) {
|
async function probePage(ctx, baseUrl, pageNum, state) {
|
||||||
const url = makePageUrlForCtx(ctx, baseUrl, pageNum);
|
const url = makePageUrlForCtx(ctx, baseUrl, pageNum);
|
||||||
|
|
||||||
|
const delay = Number.isFinite(ctx?.cat?.discoveryDelayMs) ? ctx.cat.discoveryDelayMs : 0;
|
||||||
|
if (delay > 0) await sleep(delay);
|
||||||
|
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
const r = await pageHasProducts(ctx, url);
|
const r = await pageHasProducts(ctx, url);
|
||||||
const ms = Date.now() - t0;
|
const ms = Date.now() - t0;
|
||||||
|
|
@ -168,12 +204,44 @@ async function binaryFindLastOk(ctx, baseUrl, loOk, hiMiss, state) {
|
||||||
async function discoverTotalPagesFast(ctx, baseUrl, guess, step) {
|
async function discoverTotalPagesFast(ctx, baseUrl, guess, step) {
|
||||||
const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 };
|
const state = { phase: "pre", loOk: 1, hiMiss: 2, binInitialSpan: 0 };
|
||||||
|
|
||||||
const p1 = await probePage(ctx, baseUrl, 1, state);
|
// Fetch page 1 ONCE and try to extract total pages from pagination.
|
||||||
if (!p1.ok) {
|
const url1 = makePageUrlForCtx(ctx, baseUrl, 1);
|
||||||
|
const t0 = Date.now();
|
||||||
|
const { text: html1, ms, status, bytes, finalUrl } = await ctx.http.fetchTextWithRetry(url1, "discover", ctx.store.ua);
|
||||||
|
const pMs = Date.now() - t0;
|
||||||
|
|
||||||
|
if (typeof ctx.store.isEmptyListingPage === "function") {
|
||||||
|
if (ctx.store.isEmptyListingPage(html1, ctx, url1)) {
|
||||||
|
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const parser = ctx.store.parseProducts || ctx.config.defaultParseProducts;
|
||||||
|
const items1 = parser(html1, ctx, finalUrl).length;
|
||||||
|
|
||||||
|
logProgressLine(
|
||||||
|
ctx.logger,
|
||||||
|
ctx,
|
||||||
|
`Discover probe page=${padLeftV(1, 4)}`,
|
||||||
|
items1 > 0 ? "OK" : "MISS",
|
||||||
|
items1 > 0,
|
||||||
|
discoverProg(state),
|
||||||
|
`items=${padLeftV(items1, 3)} | bytes=${padLeftV("", 8)} | ${padRightV(ctx.http.inflightStr(), 11)} | ${secStr(ms || pMs)}`
|
||||||
|
);
|
||||||
|
|
||||||
|
if (items1 <= 0) {
|
||||||
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
|
ctx.logger.warn(`${ctx.store.name} | ${ctx.cat.label} | Page 1 did not look like a listing. Defaulting to 1.`);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const extracted = extractTotalPagesFromPaginationHtml(html1);
|
||||||
|
if (extracted && extracted >= 1) {
|
||||||
|
ctx.logger.ok(`${ctx.catPrefixOut} | Total pages (from pagination): ${extracted}`);
|
||||||
|
return extracted;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to probing if pagination parse fails
|
||||||
const g = Math.max(2, guess);
|
const g = Math.max(2, guess);
|
||||||
const pg = await probePage(ctx, baseUrl, g, state);
|
const pg = await probePage(ctx, baseUrl, g, state);
|
||||||
if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state);
|
if (!pg.ok) return await binaryFindLastOk(ctx, baseUrl, 1, g, state);
|
||||||
|
|
@ -202,7 +270,7 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
|
|
||||||
const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess;
|
const guess = Number.isFinite(ctx.cat.discoveryStartPage) ? ctx.cat.discoveryStartPage : config.discoveryGuess;
|
||||||
const step = config.discoveryStep;
|
const step = Number.isFinite(ctx.cat.discoveryStep) ? ctx.cat.discoveryStep : config.discoveryStep;
|
||||||
|
|
||||||
const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step);
|
const totalPages = await discoverTotalPagesFast(ctx, ctx.baseUrl, guess, step);
|
||||||
const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages);
|
const scanPages = config.maxPages === null ? totalPages : Math.min(config.maxPages, totalPages);
|
||||||
|
|
@ -214,7 +282,10 @@ async function discoverAndScanCategory(ctx, prevDb, report) {
|
||||||
|
|
||||||
let donePages = 0;
|
let donePages = 0;
|
||||||
|
|
||||||
const perPageItems = await parallelMapStaggered(pages, config.concurrency, config.staggerMs, async (pageUrl, idx) => {
|
const pageConc = Number.isFinite(ctx.cat.pageConcurrency) ? ctx.cat.pageConcurrency : config.concurrency;
|
||||||
|
const pageStagger = Number.isFinite(ctx.cat.pageStaggerMs) ? ctx.cat.pageStaggerMs : config.staggerMs;
|
||||||
|
|
||||||
|
const perPageItems = await parallelMapStaggered(pages, pageConc, pageStagger, async (pageUrl, idx) => {
|
||||||
const pnum = idx + 1;
|
const pnum = idx + 1;
|
||||||
|
|
||||||
const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry(
|
const { text: html, ms, bytes, status, finalUrl } = await ctx.http.fetchTextWithRetry(
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const { createReport } = require("./report");
|
const { createReport } = require("./report");
|
||||||
const { parallelMapStaggered } = require("../utils/async");
|
const { setTimeout: sleep } = require("timers/promises");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
makeCatPrefixers,
|
makeCatPrefixers,
|
||||||
|
|
@ -43,28 +43,54 @@ async function runAllStores(stores, { config, logger, http }) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await parallelMapStaggered(
|
// Host-level serialization: never run two categories from the same host concurrently.
|
||||||
workItems,
|
const maxWorkers = Math.min(config.categoryConcurrency, workItems.length);
|
||||||
Math.min(config.categoryConcurrency, workItems.length),
|
const queue = workItems.slice();
|
||||||
0,
|
const inflightHosts = new Set();
|
||||||
async (w) => {
|
|
||||||
try {
|
|
||||||
await discoverAndScanCategory(w.ctx, w.prevDb, report);
|
|
||||||
} catch (e) {
|
|
||||||
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
|
|
||||||
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
|
|
||||||
|
|
||||||
// Keep it loud in logs, but do not fail the entire run.
|
async function runOne(w) {
|
||||||
logger.warn(
|
try {
|
||||||
`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`
|
await discoverAndScanCategory(w.ctx, w.prevDb, report);
|
||||||
);
|
} catch (e) {
|
||||||
|
const storeName = w?.ctx?.store?.name || w?.ctx?.store?.host || "unknown-store";
|
||||||
|
const catLabel = w?.ctx?.cat?.label || w?.ctx?.cat?.key || "unknown-category";
|
||||||
|
|
||||||
// If you want failures surfaced in the final report later, you could also
|
// Keep it loud in logs, but do not fail the entire run.
|
||||||
// push a "failed category" record onto report.categories here.
|
logger.warn(`Category failed (continuing): ${storeName} | ${catLabel}\n${formatErr(e)}`);
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
);
|
}
|
||||||
|
|
||||||
|
async function worker() {
|
||||||
|
while (true) {
|
||||||
|
if (queue.length === 0) return;
|
||||||
|
|
||||||
|
// Pick next item whose host isn't currently running.
|
||||||
|
const idx = queue.findIndex((w) => {
|
||||||
|
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
|
||||||
|
return host && !inflightHosts.has(host);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (idx === -1) {
|
||||||
|
// Nothing available right now; wait a bit.
|
||||||
|
await sleep(50);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const w = queue.splice(idx, 1)[0];
|
||||||
|
const host = String(w?.ctx?.store?.host || w?.ctx?.store?.key || "");
|
||||||
|
|
||||||
|
inflightHosts.add(host);
|
||||||
|
try {
|
||||||
|
await runOne(w);
|
||||||
|
} finally {
|
||||||
|
inflightHosts.delete(host);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const workers = [];
|
||||||
|
for (let i = 0; i < maxWorkers; i++) workers.push(worker());
|
||||||
|
await Promise.all(workers);
|
||||||
|
|
||||||
return report;
|
return report;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue