diff --git a/src/core/http.js b/src/core/http.js index 01b2764..a60552c 100644 --- a/src/core/http.js +++ b/src/core/http.js @@ -1,3 +1,4 @@ +// src/core/http.js "use strict"; const { setTimeout: sleep } = require("timers/promises"); @@ -134,29 +135,70 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { // host -> epoch ms when next request is allowed const hostNextOkAt = new Map(); - const minHostIntervalMs = 900; + + // Conservative pacing defaults (slow > blocked) + const minHostIntervalMs = 2500; + + // Per-host inflight clamp (prevents bursts when global concurrency is high) + const hostInflight = new Map(); + const maxHostInflight = 1; function inflightStr() { return `inflight=${inflight}`; } + async function acquireHost(url) { + const host = hostFromUrl(url); + if (!host) return () => {}; + + while (true) { + const cur = hostInflight.get(host) || 0; + if (cur < maxHostInflight) { + hostInflight.set(host, cur + 1); + return () => { + const n = (hostInflight.get(host) || 1) - 1; + if (n <= 0) hostInflight.delete(host); + else hostInflight.set(host, n); + }; + } + await sleep(50); + } + } + + // ✅ Pre-pacing reservation: reserve the next slot BEFORE the fetch is sent async function throttleHost(url) { const host = hostFromUrl(url); if (!host) return; - const now = Date.now(); - const next = hostNextOkAt.get(host) || 0; - if (next > now) { - logger?.dbg?.(`THROTTLE host=${host} wait=${next - now}ms`); - await sleep(next - now); + + while (true) { + const now = Date.now(); + const next = hostNextOkAt.get(host) || 0; + const wait = next - now; + + if (wait > 0) { + logger?.dbg?.(`THROTTLE host=${host} wait=${wait}ms`); + await sleep(wait); + continue; + } + + // Reserve immediately to prevent concurrent pass-through + hostNextOkAt.set(host, now + minHostIntervalMs); + return; } } function noteHost(url, extraDelayMs = 0) { const host = hostFromUrl(url); if (!host) return; - const until = Date.now() + minHostIntervalMs + extraDelayMs; - hostNextOkAt.set(host, until); - logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${until - Date.now()}ms`); + + const now = Date.now(); + const current = hostNextOkAt.get(host) || 0; + + // Extend (never shorten) any existing cooldown + const target = now + minHostIntervalMs + Math.max(0, extraDelayMs); + hostNextOkAt.set(host, Math.max(current, target)); + + logger?.dbg?.(`HOST-PACE host=${host} nextOkIn=${Math.max(0, (hostNextOkAt.get(host) || 0) - Date.now())}ms`); } async function fetchWithRetry( @@ -170,9 +212,9 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { const start = Date.now(); inflight++; - logger?.dbg?.( - `REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})` - ); + logger?.dbg?.(`REQ#${reqId} START ${tag} attempt=${attempt + 1}/${maxRetries + 1} ${url} (${inflightStr()})`); + + const releaseHost = await acquireHost(url); try { await throttleHost(url); @@ -181,11 +223,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { const t = setTimeoutCb(() => ctrl.abort(), timeoutMs); const cookieHdr = - cookies && - !("Cookie" in headers) && - !("cookie" in headers) - ? cookieJar.cookieHeaderFor(url) - : ""; + cookies && !("Cookie" in headers) && !("cookie" in headers) ? cookieJar.cookieHeaderFor(url) : ""; const res = await fetch(url, { method, @@ -207,20 +245,20 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { const finalUrl = res.url || url; const elapsed = Date.now() - start; + // Always pace the host a bit after any response noteHost(finalUrl); if (cookies) cookieJar.storeFromResponse(url, res); - logger?.dbg?.( - `REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}` - ); + logger?.dbg?.(`REQ#${reqId} HTTP ${status} ${tag} ms=${elapsed} finalUrl=${finalUrl}`); if (status === 429) { - const raMs = retryAfterMs(res); - if (raMs > 0) noteHost(finalUrl, raMs); + let raMs = retryAfterMs(res); - logger?.dbg?.( - `REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}` - ); + // ✅ If no Retry-After header, enforce a real cooldown (Shopify often omits it) + if (raMs <= 0) raMs = 15000 + Math.floor(Math.random() * 5000); + + noteHost(finalUrl, raMs); + logger?.dbg?.(`REQ#${reqId} 429 retryAfterMs=${raMs} host=${hostFromUrl(finalUrl)}`); throw new RetryableError("HTTP 429"); } @@ -231,9 +269,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { if (status >= 400) { const bodyTxt = await safeText(res); throw new Error( - `HTTP ${status} bodyHead=${String(bodyTxt) - .slice(0, 160) - .replace(/\s+/g, " ")}` + `HTTP ${status} bodyHead=${String(bodyTxt).slice(0, 160).replace(/\s+/g, " ")}` ); } @@ -274,6 +310,7 @@ function createHttpClient({ maxRetries, timeoutMs, defaultUa, logger }) { logger?.warn?.(`Request failed, retrying in ${delay}ms (${attempt + 1}/${maxRetries})`); await sleep(delay); } finally { + releaseHost(); inflight--; logger?.dbg?.(`REQ#${reqId} END ${tag} (${inflightStr()})`); } diff --git a/src/stores/craftcellars.js b/src/stores/craftcellars.js index de3ee5f..2f0a052 100644 --- a/src/stores/craftcellars.js +++ b/src/stores/craftcellars.js @@ -1,5 +1,8 @@ +// src/stores/craftcellars.js "use strict"; +const { setTimeout: sleep } = require("timers/promises"); + const { decodeHtml, stripTags, extractFirstImgUrl } = require("../utils/html"); const { sanitizeName } = require("../utils/text"); const { normalizeCspc } = require("../utils/sku"); @@ -29,7 +32,8 @@ function canonicalizeCraftProductUrl(raw) { function extractShopifyCardPrice(block) { const b = String(block || ""); - const dollars = (txt) => [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, "")); + const dollars = (txt) => + [...String(txt).matchAll(/\$\s*[\d,]+(?:\.\d{2})?/g)].map((m) => m[0].replace(/\s+/g, "")); const saleRegion = b.split(/sale price/i)[1] || ""; const saleD = dollars(saleRegion); @@ -83,12 +87,8 @@ function parseProductsCraftCellarsInner(html, ctx) { url = canonicalizeCraftProductUrl(url); const nameHtml = - block.match( - /]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i - )?.[1] || + block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>\s*<[^>]*>\s*([^<]{2,200}?)\s*]*>[\s\S]*?]*\/products\/[^"']+[^>]*>([\s\S]*?)<\/a>[\s\S]*?<\/h[23]>/i)?.[1] || block.match(/]*href=["'][^"']*\/products\/[^"']+["'][^>]*>([\s\S]*?)<\/a>/i)?.[1]; const name = sanitizeName(stripTags(decodeHtml(nameHtml || ""))); @@ -105,13 +105,16 @@ function parseProductsCraftCellarsInner(html, ctx) { return [...uniq.values()]; } - function usdFromShopifyPriceStr(s) { const n = Number(String(s || "").replace(/[^0-9.]/g, "")); if (!Number.isFinite(n)) return ""; return `$${n.toLocaleString("en-US", { minimumFractionDigits: 2, maximumFractionDigits: 2 })}`; } +function cfgNum(v, fallback) { + return Number.isFinite(v) ? v : fallback; +} + /** * Craft Cellars: * - HTML listing with ?filter.v.availability=1 is the allowlist (prevents OOS leaking in) @@ -120,6 +123,18 @@ function usdFromShopifyPriceStr(s) { async function scanCategoryCraftCellars(ctx, prevDb, report) { const t0 = Date.now(); + // Strongly prefer "slow and steady" to avoid 429s. + // Use per-category knobs if present; otherwise default conservative. + const perPageDelayMs = Math.max( + 0, + cfgNum(ctx?.cat?.pageStaggerMs, cfgNum(ctx?.cat?.discoveryDelayMs, 0)) || 0 + ) || 0; + + const perJsonPageDelayMs = Math.max( + 0, + cfgNum(ctx?.cat?.jsonPageDelayMs, perPageDelayMs) + ); + // 1) HTML scan: allowlist of in-stock listing URLs const htmlMap = new Map(); // url -> {name, price, url, img} @@ -128,6 +143,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { let emptyStreak = 0; for (let p = 1; p <= maxPages; p++) { + if (p > 1 && perPageDelayMs > 0) await sleep(perPageDelayMs); + const pageUrl = makePageUrlShopifyQueryPage(ctx.cat.startUrl, p); const { text: html } = await ctx.http.fetchTextWithRetry(pageUrl, `craft:html:${ctx.cat.key}:p${p}`, ctx.store.ua); htmlPagesFetched++; @@ -151,9 +168,7 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { // If HTML returns nothing, don't let JSON invent a category if (!htmlMap.size) { - ctx.logger.warn( - `${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.` - ); + ctx.logger.warn(`${ctx.catPrefixOut} | HTML listing returned 0 items; refusing to use products.json as source of truth.`); } // 2) JSON scan: build SKU index (but do NOT add new URLs from JSON) @@ -170,6 +185,8 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { let jsonPagesFetched = 0; while (true) { + if (jsonPage > 1 && perJsonPageDelayMs > 0) await sleep(perJsonPageDelayMs); + const url = `https://${ctx.store.host}/collections/${collectionHandle}/products.json?limit=${limit}&page=${jsonPage}`; const r = await ctx.http.fetchJsonWithRetry(url, `craft:coljson:${ctx.cat.key}:p${jsonPage}`, ctx.store.ua); @@ -240,7 +257,9 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { ctx.logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`); - const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name }); + const { merged, newItems, updatedItems, removedItems, restoredItems } = mergeDiscoveredIntoDb(prevDb, discovered, { + storeLabel: ctx.store.name, + }); const dbObj = buildDbObject(ctx, merged); writeJsonAtomic(ctx.dbFile, dbObj); @@ -271,7 +290,6 @@ async function scanCategoryCraftCellars(ctx, prevDb, report) { addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems); } - function createStore(defaultUa) { return { key: "craftcellars", @@ -292,37 +310,68 @@ function createStore(defaultUa) { key: "whisky", label: "Whisky", startUrl: "https://craftcellars.ca/collections/whisky?filter.v.availability=1", - discoveryStartPage: 10, + + // slow-and-safe defaults (override globally if you want) + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, { key: "rum", label: "Rum", startUrl: "https://craftcellars.ca/collections/rum?filter.v.availability=1", - discoveryStartPage: 5, + + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, { key: "single-malt-scotch", label: "Single Malt Scotch", startUrl: "https://craftcellars.ca/collections/single-malt-scotch?filter.v.availability=1", - discoveryStartPage: 1, + + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, { key: "other-scotch-styles", label: "Other Scotch Styles", startUrl: "https://craftcellars.ca/collections/other-scotch-styles?filter.v.availability=1", - discoveryStartPage: 1, + + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, { key: "single-grain-scotch", label: "Single Grain Scotch", startUrl: "https://craftcellars.ca/collections/single-grain-scotch?filter.v.availability=1", - discoveryStartPage: 1, + + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, { key: "blended-malt-scotch", label: "Blended Malt Scotch", startUrl: "https://craftcellars.ca/collections/blended-malt-scotch?filter.v.availability=1", - discoveryStartPage: 10, + + discoveryStartPage: 3, + discoveryStep: 2, + pageConcurrency: 1, + pageStaggerMs: 10000, + discoveryDelayMs: 10000, }, ], };