feat: Slow down

This commit is contained in:
Brennan Wilkes (Text Groove) 2026-01-30 09:42:06 -08:00
parent 7ff1bde8bd
commit d53b68c3a3
2 changed files with 133 additions and 47 deletions

View file

@ -1,3 +1,4 @@
// src/core/http.js
"use strict"; "use strict";
const { setTimeout: sleep } = require("timers/promises"); const { setTimeout: sleep } = require("timers/promises");
@ -134,29 +135,70 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
// host -> epoch ms when next request is allowed // host -> epoch ms when next request is allowed
const hostNextOkAt = new Map(); const hostNextOkAt = new Map();
const minHostIntervalMs = 900;
// Conservative pacing defaults (slow > blocked)
const minHostIntervalMs = 2500;
// Per-host inflight clamp (prevents bursts when global concurrency is high)
const hostInflight = new Map();
const maxHostInflight = 1;
function inflightStr() { function inflightStr() {
return `inflight=${inflight}`; return `inflight=${inflight}`;
} }
async function acquireHost(url) {
const host = hostFromUrl(url);
if (!host) return () => {};
while (true) {
const cur = hostInflight.get(host) || 0;
if (cur < maxHostInflight) {
hostInflight.set(host, cur + 1);
return () => {
const n = (hostInflight.get(host) || 1) - 1;
if (n <= 0) hostInflight.delete(host);
else hostInflight.set(host, n);
};
}
await sleep(50);
}
}
// ✅ Pre-pacing reservation: reserve the next slot BEFORE the fetch is sent
async function throttleHost(url) { async function throttleHost(url) {
const host = hostFromUrl(url); const host = hostFromUrl(url);
if (!host) return; if (!host) return;
const now = Date.now();
const next = hostNextOkAt.get(host) || 0; while (true) {
if (next > now) { const now = Date.now();
logger?.dbg?.(`THROTTLE host=${host} wait=${next - now}ms`); const next = hostNextOkAt.get(host) || 0;
await sleep(next - now); const wait = next - now;
if (wait > 0) {
logger?.dbg?.(`THROTTLE host=${host} wait=${wait}ms`);
await sleep(wait);
continue;
}
// Reserve immediately to prevent concurrent pass-through
hostNextOkAt.set(host, now + minHostIntervalMs);
return;
} }
} }
function noteHost(url, extraDelayMs = 0) { function noteHost(url, extraDelayMs = 0) {
const host = hostFromUrl(url); const host = hostFromUrl(url);
if (!host) return; if (!host) return;
const until = Date.now() + minHostIntervalMs + extraDelayMs;
hostNextOkAt.set(host, until); const now = Date.now();
logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${until - Date.now()}ms`); const current = hostNextOkAt.get(host) || 0;
// Extend (never shorten) any existing cooldown
const target = now + minHostIntervalMs + Math.max(0, extraDelayMs);
hostNextOkAt.set(host, Math.max(current, target));
logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${Math.max(0, (hostNextOkAt.get(host) || 0) - Date.now())}ms`);
} }
async function fetchWithRetry( async function fetchWithRetry(
@ -170,9 +212,9 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const start = Date.now(); const start = Date.now();
inflight++; inflight++;
logger?.dbg?.( logger?.dbg?.(`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`);
`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`
); const releaseHost = await acquireHost(url);
try { try {
await throttleHost(url); await throttleHost(url);
@ -181,11 +223,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const t = setTimeoutCb(() => ctrl.abort(), timeoutMs); const t = setTimeoutCb(() => ctrl.abort(), timeoutMs);
const cookieHdr = const cookieHdr =
cookies && cookies && !("Cookie" in headers) && !("cookie" in headers) ? cookieJar.cookieHeaderFor(url) : "";
!("Cookie" in headers) &&
!("cookie" in headers)
? cookieJar.cookieHeaderFor(url)
: "";
const res = await fetch(url, { const res = await fetch(url, {
method, method,
@ -207,20 +245,20 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const finalUrl = res.url || url; const finalUrl = res.url || url;
const elapsed = Date.now() - start; const elapsed = Date.now() - start;
// Always pace the host a bit after any response
noteHost(finalUrl); noteHost(finalUrl);
if (cookies) cookieJar.storeFromResponse(url, res); if (cookies) cookieJar.storeFromResponse(url, res);
logger?.dbg?.( logger?.dbg?.(`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`);
`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`
);
if (status === 429) { if (status === 429) {
const raMs = retryAfterMs(res); let raMs = retryAfterMs(res);
if (raMs > 0) noteHost(finalUrl, raMs);
logger?.dbg?.( // ✅ If no Retry-After header, enforce a real cooldown (Shopify often omits it)
`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}` if (raMs <= 0) raMs = 15000 + Math.floor(Math.random() * 5000);
);
noteHost(finalUrl, raMs);
logger?.dbg?.(`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`);
throw new RetryableError("HTTP 429"); throw new RetryableError("HTTP 429");
} }
@ -231,9 +269,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
if (status >= 400) { if (status >= 400) {
const bodyTxt = await safeText(res); const bodyTxt = await safeText(res);
throw new Error( throw new Error(
`HTTP ${status} bodyHead=${String(bodyTxt) `HTTP ${status} bodyHead=${String(bodyTxt).slice(0, 160).replace(/\s+/g, " ")}`
.slice(0, 160)
.replace(/\s+/g, " ")}`
); );
} }
@ -274,6 +310,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
logger?.warn?.(`Request failed, retrying in ${delay}ms (${attempt + 1}/${maxRetries})`); logger?.warn?.(`Request failed, retrying in ${delay}ms (${attempt + 1}/${maxRetries})`);
await sleep(delay); await sleep(delay);
} finally { } finally {
releaseHost();
inflight--; inflight--;
logger?.dbg?.(`REQ#${reqId} END ${tag} (${inflightStr()})`); logger?.dbg?.(`REQ#${reqId} END ${tag} (${inflightStr()})`);
} }

View file

@ -1,5 +1,8 @@
// src/stores/craftcellars.js
"use strict"; "use strict";
const { setTimeout: sleep } = require("timers/promises");
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html"); const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
const { sanitizeName } = require("../utils/text"); const { sanitizeName } = require("../utils/text");
const { normalizeCspc } = require("../utils/sku"); const { normalizeCspc } = require("../utils/sku");
@ -29,7 +32,8 @@ function canonicalizeCraftProductUrl(raw) {
function extractShopifyCardPrice(block) { function extractShopifyCardPrice(block) {
const b = String(block || ""); const b = String(block || "");
const dollars = (txt) => [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, "")); const dollars = (txt) =>
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
const saleRegion = b.split(/sale price/i)[1] || ""; const saleRegion = b.split(/sale price/i)[1] || "";
const saleD = dollars(saleRegion); const saleD = dollars(saleRegion);
@ -83,12 +87,8 @@ function parseProductsCraftCellarsInner(html, ctx) {
url = canonicalizeCraftProductUrl(url); url = canonicalizeCraftProductUrl(url);
const nameHtml = const nameHtml =
block.match( block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i)?.[1] ||
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i block.match(/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
)?.[1] ||
block.match(
/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
)?.[1] ||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1]; block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
const name = sanitizeName(stripTags(decodeHtml(nameHtml || ""))); const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
@ -105,13 +105,16 @@ function parseProductsCraftCellarsInner(html, ctx) {
return [...uniq.values()]; return [...uniq.values()];
} }
function usdFromShopifyPriceStr(s) { function usdFromShopifyPriceStr(s) {
const n = Number(String(s || "").replace(/[^0-9.]/g, "")); const n = Number(String(s || "").replace(/[^0-9.]/g, ""));
if (!Number.isFinite(n)) return ""; if (!Number.isFinite(n)) return "";
return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`; return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`;
} }
function cfgNum(v, fallback) {
return Number.isFinite(v) ? v : fallback;
}
/** /**
* Craft Cellars: * Craft Cellars:
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in) * - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
@ -120,6 +123,18 @@ function usdFromShopifyPriceStr(s) {
async function scanCategoryCraftCellars(ctx, prevDb, report) { async function scanCategoryCraftCellars(ctx, prevDb, report) {
const t0 = Date.now(); const t0 = Date.now();
// Strongly prefer "slow and steady" to avoid 429s.
// Use per-category knobs if present; otherwise default conservative.
const perPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
) || 0;
const perJsonPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
);
// 1) HTML scan: allowlist of in-stock listing URLs // 1) HTML scan: allowlist of in-stock listing URLs
const htmlMap = new Map(); // url -> {name, price, url, img} const htmlMap = new Map(); // url -> {name, price, url, img}
@ -128,6 +143,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let emptyStreak = 0; let emptyStreak = 0;
for (let p = 1; p <= maxPages; p++) { for (let p = 1; p <= maxPages; p++) {
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p); const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua); const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
htmlPagesFetched++; htmlPagesFetched++;
@ -151,9 +168,7 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
// If HTML returns nothing, don't let JSON invent a category // If HTML returns nothing, don't let JSON invent a category
if (!htmlMap.size) { if (!htmlMap.size) {
ctx.logger.warn( ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`
);
} }
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON) // 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
@ -170,6 +185,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let jsonPagesFetched = 0; let jsonPagesFetched = 0;
while (true) { while (true) {
if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`; const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua); const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
@ -240,7 +257,9 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`); ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name }); const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
storeLabel: ctx.store.name,
});
const dbObj = buildDbObject(ctx, merged); const dbObj = buildDbObject(ctx, merged);
writeJsonAtomic(ctx.dbFile, dbObj); writeJsonAtomic(ctx.dbFile, dbObj);
@ -271,7 +290,6 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems); addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
} }
function createStore(defaultUa) { function createStore(defaultUa) {
return { return {
key: "craftcellars", key: "craftcellars",
@ -292,37 +310,68 @@ function createStore(defaultUa) {
key: "whisky", key: "whisky",
label: "Whisky", label: "Whisky",
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
discoveryStartPage: 10,
// slow-and-safe defaults (override globally if you want)
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
{ {
key: "rum", key: "rum",
label: "Rum", label: "Rum",
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
discoveryStartPage: 5,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
{ {
key: "single-malt-scotch", key: "single-malt-scotch",
label: "Single Malt Scotch", label: "Single Malt Scotch",
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
{ {
key: "other-scotch-styles", key: "other-scotch-styles",
label: "Other Scotch Styles", label: "Other Scotch Styles",
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
{ {
key: "single-grain-scotch", key: "single-grain-scotch",
label: "Single Grain Scotch", label: "Single Grain Scotch",
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
{ {
key: "blended-malt-scotch", key: "blended-malt-scotch",
label: "Blended Malt Scotch", label: "Blended Malt Scotch",
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1", startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
discoveryStartPage: 10,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
}, },
], ],
}; };