diff --git a/src/core/http.js b/src/core/http.js
index 01b2764..a60552c 100644
--- a/src/core/http.js
+++ b/src/core/http.js
@@ -1,3 +1,4 @@
+// src/core/http.js
"use strict";
const { setTimeout: sleep } = require("timers/promises");
@@ -134,29 +135,70 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
// host -> epoch ms when next request is allowed
const hostNextOkAt = new Map();
- const minHostIntervalMs = 900;
+
+ // Conservative pacing defaults (slow > blocked)
+ const minHostIntervalMs = 2500;
+
+ // Per-host inflight clamp (prevents bursts when global concurrency is high)
+ const hostInflight = new Map();
+ const maxHostInflight = 1;
function inflightStr() {
return `inflight=${inflight}`;
}
+ async function acquireHost(url) {
+ const host = hostFromUrl(url);
+ if (!host) return () => {};
+
+ while (true) {
+ const cur = hostInflight.get(host) || 0;
+ if (cur < maxHostInflight) {
+ hostInflight.set(host, cur + 1);
+ return () => {
+ const n = (hostInflight.get(host) || 1) - 1;
+ if (n <= 0) hostInflight.delete(host);
+ else hostInflight.set(host, n);
+ };
+ }
+ await sleep(50);
+ }
+ }
+
+ // ✅ Pre-pacing reservation: reserve the next slot BEFORE the fetch is sent
async function throttleHost(url) {
const host = hostFromUrl(url);
if (!host) return;
- const now = Date.now();
- const next = hostNextOkAt.get(host) || 0;
- if (next > now) {
- logger?.dbg?.(`THROTTLE host=${host} wait=${next - now}ms`);
- await sleep(next - now);
+
+ while (true) {
+ const now = Date.now();
+ const next = hostNextOkAt.get(host) || 0;
+ const wait = next - now;
+
+ if (wait > 0) {
+ logger?.dbg?.(`THROTTLE host=${host} wait=${wait}ms`);
+ await sleep(wait);
+ continue;
+ }
+
+ // Reserve immediately to prevent concurrent pass-through
+ hostNextOkAt.set(host, now + minHostIntervalMs);
+ return;
}
}
function noteHost(url, extraDelayMs = 0) {
const host = hostFromUrl(url);
if (!host) return;
- const until = Date.now() + minHostIntervalMs + extraDelayMs;
- hostNextOkAt.set(host, until);
- logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${until - Date.now()}ms`);
+
+ const now = Date.now();
+ const current = hostNextOkAt.get(host) || 0;
+
+ // Extend (never shorten) any existing cooldown
+ const target = now + minHostIntervalMs + Math.max(0, extraDelayMs);
+ hostNextOkAt.set(host, Math.max(current, target));
+
+ logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${Math.max(0, (hostNextOkAt.get(host) || 0) - Date.now())}ms`);
}
async function fetchWithRetry(
@@ -170,9 +212,9 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const start = Date.now();
inflight++;
- logger?.dbg?.(
- `REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`
- );
+ logger?.dbg?.(`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`);
+
+ const releaseHost = await acquireHost(url);
try {
await throttleHost(url);
@@ -181,11 +223,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const t = setTimeoutCb(() => ctrl.abort(), timeoutMs);
const cookieHdr =
- cookies &&
- !("Cookie" in headers) &&
- !("cookie" in headers)
- ? cookieJar.cookieHeaderFor(url)
- : "";
+ cookies && !("Cookie" in headers) && !("cookie" in headers) ? cookieJar.cookieHeaderFor(url) : "";
const res = await fetch(url, {
method,
@@ -207,20 +245,20 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const finalUrl = res.url || url;
const elapsed = Date.now() - start;
+ // Always pace the host a bit after any response
noteHost(finalUrl);
if (cookies) cookieJar.storeFromResponse(url, res);
- logger?.dbg?.(
- `REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`
- );
+ logger?.dbg?.(`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`);
if (status === 429) {
- const raMs = retryAfterMs(res);
- if (raMs > 0) noteHost(finalUrl, raMs);
+ let raMs = retryAfterMs(res);
- logger?.dbg?.(
- `REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`
- );
+ // ✅ If no Retry-After header, enforce a real cooldown (Shopify often omits it)
+ if (raMs <= 0) raMs = 15000 + Math.floor(Math.random() * 5000);
+
+ noteHost(finalUrl, raMs);
+ logger?.dbg?.(`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`);
throw new RetryableError("HTTP 429");
}
@@ -231,9 +269,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
if (status >= 400) {
const bodyTxt = await safeText(res);
throw new Error(
- `HTTP ${status} bodyHead=${String(bodyTxt)
- .slice(0, 160)
- .replace(/\s+/g, " ")}`
+ `HTTP ${status} bodyHead=${String(bodyTxt).slice(0, 160).replace(/\s+/g, " ")}`
);
}
@@ -274,6 +310,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
logger?.warn?.(`Request failed, retrying in ${delay}ms (${attempt + 1}/${maxRetries})`);
await sleep(delay);
} finally {
+ releaseHost();
inflight--;
logger?.dbg?.(`REQ#${reqId} END ${tag} (${inflightStr()})`);
}
diff --git a/src/stores/craftcellars.js b/src/stores/craftcellars.js
index de3ee5f..2f0a052 100644
--- a/src/stores/craftcellars.js
+++ b/src/stores/craftcellars.js
@@ -1,5 +1,8 @@
+// src/stores/craftcellars.js
"use strict";
+const { setTimeout: sleep } = require("timers/promises");
+
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
const { sanitizeName } = require("../utils/text");
const { normalizeCspc } = require("../utils/sku");
@@ -29,7 +32,8 @@ function canonicalizeCraftProductUrl(raw) {
function extractShopifyCardPrice(block) {
const b = String(block || "");
- const dollars = (txt) => [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
+ const dollars = (txt) =>
+ [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
const saleRegion = b.split(/sale price/i)[1] || "";
const saleD = dollars(saleRegion);
@@ -83,12 +87,8 @@ function parseProductsCraftCellarsInner(html, ctx) {
url = canonicalizeCraftProductUrl(url);
const nameHtml =
- block.match(
- /]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
- )?.[1] ||
+ block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
@@ -105,13 +105,16 @@ function parseProductsCraftCellarsInner(html, ctx) {
return [...uniq.values()];
}
-
function usdFromShopifyPriceStr(s) {
const n = Number(String(s || "").replace(/[^0-9.]/g, ""));
if (!Number.isFinite(n)) return "";
return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`;
}
+function cfgNum(v, fallback) {
+ return Number.isFinite(v) ? v : fallback;
+}
+
/**
* Craft Cellars:
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
@@ -120,6 +123,18 @@ function usdFromShopifyPriceStr(s) {
async function scanCategoryCraftCellars(ctx, prevDb, report) {
const t0 = Date.now();
+ // Strongly prefer "slow and steady" to avoid 429s.
+ // Use per-category knobs if present; otherwise default conservative.
+ const perPageDelayMs = Math.max(
+ 0,
+ cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
+ ) || 0;
+
+ const perJsonPageDelayMs = Math.max(
+ 0,
+ cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
+ );
+
// 1) HTML scan: allowlist of in-stock listing URLs
const htmlMap = new Map(); // url -> {name, price, url, img}
@@ -128,6 +143,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let emptyStreak = 0;
for (let p = 1; p <= maxPages; p++) {
+ if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
+
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
htmlPagesFetched++;
@@ -151,9 +168,7 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
// If HTML returns nothing, don't let JSON invent a category
if (!htmlMap.size) {
- ctx.logger.warn(
- `${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`
- );
+ ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
}
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
@@ -170,6 +185,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let jsonPagesFetched = 0;
while (true) {
+ if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
+
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
@@ -240,7 +257,9 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
- const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });
+ const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
+ storeLabel: ctx.store.name,
+ });
const dbObj = buildDbObject(ctx, merged);
writeJsonAtomic(ctx.dbFile, dbObj);
@@ -271,7 +290,6 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
}
-
function createStore(defaultUa) {
return {
key: "craftcellars",
@@ -292,37 +310,68 @@ function createStore(defaultUa) {
key: "whisky",
label: "Whisky",
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
- discoveryStartPage: 10,
+
+ // slow-and-safe defaults (override globally if you want)
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
{
key: "rum",
label: "Rum",
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
- discoveryStartPage: 5,
+
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
{
key: "single-malt-scotch",
label: "Single Malt Scotch",
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
- discoveryStartPage: 1,
+
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
{
key: "other-scotch-styles",
label: "Other Scotch Styles",
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
- discoveryStartPage: 1,
+
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
{
key: "single-grain-scotch",
label: "Single Grain Scotch",
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
- discoveryStartPage: 1,
+
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
{
key: "blended-malt-scotch",
label: "Blended Malt Scotch",
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
- discoveryStartPage: 10,
+
+ discoveryStartPage: 3,
+ discoveryStep: 2,
+ pageConcurrency: 1,
+ pageStaggerMs: 10000,
+ discoveryDelayMs: 10000,
},
],
};