feat: Slow down

This commit is contained in:
Brennan Wilkes (Text Groove) 2026-01-30 09:42:06 -08:00
parent 7ff1bde8bd
commit d53b68c3a3
2 changed files with 133 additions and 47 deletions

View file

@ -1,3 +1,4 @@
// src/core/http.js
"use strict";
const { setTimeout: sleep } = require("timers/promises");
@ -134,29 +135,70 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
// host -> epoch ms when next request is allowed
const hostNextOkAt = new Map();
const minHostIntervalMs = 900;
// Conservative pacing defaults (slow > blocked)
const minHostIntervalMs = 2500;
// Per-host inflight clamp (prevents bursts when global concurrency is high)
const hostInflight = new Map();
const maxHostInflight = 1;
function inflightStr() {
return `inflight=${inflight}`;
}
async function acquireHost(url) {
const host = hostFromUrl(url);
if (!host) return () => {};
while (true) {
const cur = hostInflight.get(host) || 0;
if (cur < maxHostInflight) {
hostInflight.set(host, cur + 1);
return () => {
const n = (hostInflight.get(host) || 1) - 1;
if (n <= 0) hostInflight.delete(host);
else hostInflight.set(host, n);
};
}
await sleep(50);
}
}
// ✅ Pre-pacing reservation: reserve the next slot BEFORE the fetch is sent
async function throttleHost(url) {
const host = hostFromUrl(url);
if (!host) return;
const now = Date.now();
const next = hostNextOkAt.get(host) || 0;
if (next > now) {
logger?.dbg?.(`THROTTLE host=${host} wait=${next - now}ms`);
await sleep(next - now);
while (true) {
const now = Date.now();
const next = hostNextOkAt.get(host) || 0;
const wait = next - now;
if (wait > 0) {
logger?.dbg?.(`THROTTLE host=${host} wait=${wait}ms`);
await sleep(wait);
continue;
}
// Reserve immediately to prevent concurrent pass-through
hostNextOkAt.set(host, now + minHostIntervalMs);
return;
}
}
function noteHost(url, extraDelayMs = 0) {
const host = hostFromUrl(url);
if (!host) return;
const until = Date.now() + minHostIntervalMs + extraDelayMs;
hostNextOkAt.set(host, until);
logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${until - Date.now()}ms`);
const now = Date.now();
const current = hostNextOkAt.get(host) || 0;
// Extend (never shorten) any existing cooldown
const target = now + minHostIntervalMs + Math.max(0, extraDelayMs);
hostNextOkAt.set(host, Math.max(current, target));
logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${Math.max(0, (hostNextOkAt.get(host) || 0) - Date.now())}ms`);
}
async function fetchWithRetry(
@ -170,9 +212,9 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const start = Date.now();
inflight++;
logger?.dbg?.(
`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`
);
logger?.dbg?.(`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`);
const releaseHost = await acquireHost(url);
try {
await throttleHost(url);
@ -181,11 +223,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const t = setTimeoutCb(() => ctrl.abort(), timeoutMs);
const cookieHdr =
cookies &&
!("Cookie" in headers) &&
!("cookie" in headers)
? cookieJar.cookieHeaderFor(url)
: "";
cookies && !("Cookie" in headers) && !("cookie" in headers) ? cookieJar.cookieHeaderFor(url) : "";
const res = await fetch(url, {
method,
@ -207,20 +245,20 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
const finalUrl = res.url || url;
const elapsed = Date.now() - start;
// Always pace the host a bit after any response
noteHost(finalUrl);
if (cookies) cookieJar.storeFromResponse(url, res);
logger?.dbg?.(
`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`
);
logger?.dbg?.(`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`);
if (status === 429) {
const raMs = retryAfterMs(res);
if (raMs > 0) noteHost(finalUrl, raMs);
let raMs = retryAfterMs(res);
logger?.dbg?.(
`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`
);
// ✅ If no Retry-After header, enforce a real cooldown (Shopify often omits it)
if (raMs <= 0) raMs = 15000 + Math.floor(Math.random() * 5000);
noteHost(finalUrl, raMs);
logger?.dbg?.(`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`);
throw new RetryableError("HTTP 429");
}
@ -231,9 +269,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
if (status >= 400) {
const bodyTxt = await safeText(res);
throw new Error(
`HTTP ${status} bodyHead=${String(bodyTxt)
.slice(0, 160)
.replace(/\s+/g, " ")}`
`HTTP ${status} bodyHead=${String(bodyTxt).slice(0, 160).replace(/\s+/g, " ")}`
);
}
@ -274,6 +310,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) {
logger?.warn?.(`Request failed, retrying in ${delay}ms (${attempt + 1}/${maxRetries})`);
await sleep(delay);
} finally {
releaseHost();
inflight--;
logger?.dbg?.(`REQ#${reqId} END ${tag} (${inflightStr()})`);
}

View file

@ -1,5 +1,8 @@
// src/stores/craftcellars.js
"use strict";
const { setTimeout: sleep } = require("timers/promises");
const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html");
const { sanitizeName } = require("../utils/text");
const { normalizeCspc } = require("../utils/sku");
@ -29,7 +32,8 @@ function canonicalizeCraftProductUrl(raw) {
function extractShopifyCardPrice(block) {
const b = String(block || "");
const dollars = (txt) => [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
const dollars = (txt) =>
[...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, ""));
const saleRegion = b.split(/sale price/i)[1] || "";
const saleD = dollars(saleRegion);
@ -83,12 +87,8 @@ function parseProductsCraftCellarsInner(html, ctx) {
url = canonicalizeCraftProductUrl(url);
const nameHtml =
block.match(
/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i
)?.[1] ||
block.match(
/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i
)?.[1] ||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*</i)?.[1] ||
block.match(/<h[23]\b[^>]*>[\s\S]*?<a\b[^>]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] ||
block.match(/<a\b[^>]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1];
const name = sanitizeName(stripTags(decodeHtml(nameHtml || "")));
@ -105,13 +105,16 @@ function parseProductsCraftCellarsInner(html, ctx) {
return [...uniq.values()];
}
function usdFromShopifyPriceStr(s) {
const n = Number(String(s || "").replace(/[^0-9.]/g, ""));
if (!Number.isFinite(n)) return "";
return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`;
}
function cfgNum(v, fallback) {
return Number.isFinite(v) ? v : fallback;
}
/**
* Craft Cellars:
* - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in)
@ -120,6 +123,18 @@ function usdFromShopifyPriceStr(s) {
async function scanCategoryCraftCellars(ctx, prevDb, report) {
const t0 = Date.now();
// Strongly prefer "slow and steady" to avoid 429s.
// Use per-category knobs if present; otherwise default conservative.
const perPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0
) || 0;
const perJsonPageDelayMs = Math.max(
0,
cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs)
);
// 1) HTML scan: allowlist of in-stock listing URLs
const htmlMap = new Map(); // url -> {name, price, url, img}
@ -128,6 +143,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let emptyStreak = 0;
for (let p = 1; p <= maxPages; p++) {
if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs);
const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p);
const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua);
htmlPagesFetched++;
@ -151,9 +168,7 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
// If HTML returns nothing, don't let JSON invent a category
if (!htmlMap.size) {
ctx.logger.warn(
`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`
);
ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`);
}
// 2) JSON scan: build SKU index (but do NOT add new URLs from JSON)
@ -170,6 +185,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
let jsonPagesFetched = 0;
while (true) {
if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs);
const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`;
const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua);
@ -240,7 +257,9 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });
const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, {
storeLabel: ctx.store.name,
});
const dbObj = buildDbObject(ctx, merged);
writeJsonAtomic(ctx.dbFile, dbObj);
@ -271,7 +290,6 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) {
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
}
function createStore(defaultUa) {
return {
key: "craftcellars",
@ -292,37 +310,68 @@ function createStore(defaultUa) {
key: "whisky",
label: "Whisky",
startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1",
discoveryStartPage: 10,
// slow-and-safe defaults (override globally if you want)
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "rum",
label: "Rum",
startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1",
discoveryStartPage: 5,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "single-malt-scotch",
label: "Single Malt Scotch",
startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "other-scotch-styles",
label: "Other Scotch Styles",
startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "single-grain-scotch",
label: "Single Grain Scotch",
startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1",
discoveryStartPage: 1,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
{
key: "blended-malt-scotch",
label: "Blended Malt Scotch",
startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1",
discoveryStartPage: 10,
discoveryStartPage: 3,
discoveryStep: 2,
pageConcurrency: 1,
pageStaggerMs: 10000,
discoveryDelayMs: 10000,
},
],
};