mirror of
https://github.com/samsonjs/spirit-tracker.git
synced 2026-03-25 09:25:51 +00:00
fix: Sierra scraping
This commit is contained in:
parent
7d615164a5
commit
09cec6a161
1 changed files with 41 additions and 15 deletions
|
|
@ -240,16 +240,13 @@ async function scanCategoryWooStoreApi(ctx, prevDb, report) {
|
||||||
const discovered = new Map();
|
const discovered = new Map();
|
||||||
|
|
||||||
const catId = await getWooCategoryIdForCat(ctx);
|
const catId = await getWooCategoryIdForCat(ctx);
|
||||||
|
|
||||||
// If we can't infer id, do nothing special; let existing DB stay as-is.
|
|
||||||
// (You can remove this fallback if you prefer hard failure.)
|
|
||||||
if (!catId) return;
|
if (!catId) return;
|
||||||
|
|
||||||
const apiBase = new URL(`https://${ctx.store.host}/wp-json/wc/store/v1/products`);
|
const apiBase = new URL(`https://${ctx.store.host}/wp-json/wc/store/v1/products`);
|
||||||
apiBase.searchParams.set("per_page", String(perPage));
|
apiBase.searchParams.set("per_page", String(perPage));
|
||||||
apiBase.searchParams.set("category", String(catId));
|
apiBase.searchParams.set("category", String(catId));
|
||||||
|
|
||||||
const hardCap = 500; // safety
|
const hardCap = 500;
|
||||||
let page = 1;
|
let page = 1;
|
||||||
|
|
||||||
while (page <= hardCap) {
|
while (page <= hardCap) {
|
||||||
|
|
@ -262,32 +259,51 @@ async function scanCategoryWooStoreApi(ctx, prevDb, report) {
|
||||||
ctx.store.ua
|
ctx.store.ua
|
||||||
);
|
);
|
||||||
|
|
||||||
const itemsRaw = (ctx.store.parseProducts || ctx.config.defaultParseProducts)(text, ctx, finalUrl);
|
// IMPORTANT:
|
||||||
const items = [];
|
// Parse WITHOUT allowUrl so pagination is based on real API page size
|
||||||
|
const ctxNoFilter =
|
||||||
|
typeof ctx?.cat?.allowUrl === "function"
|
||||||
|
? { ...ctx, cat: { ...ctx.cat, allowUrl: null } }
|
||||||
|
: ctx;
|
||||||
|
|
||||||
for (const it of itemsRaw) {
|
const itemsAll =
|
||||||
const allow = ctx?.cat?.allowUrl;
|
(ctx.store.parseProducts || ctx.config.defaultParseProducts)(text, ctxNoFilter, finalUrl);
|
||||||
|
|
||||||
|
const rawCount = itemsAll.length;
|
||||||
|
|
||||||
|
// Now apply allowUrl AFTER pagination logic
|
||||||
|
const items = [];
|
||||||
|
const allow = ctx?.cat?.allowUrl;
|
||||||
|
for (const it of itemsAll) {
|
||||||
if (typeof allow === "function" && !allow(it)) continue;
|
if (typeof allow === "function" && !allow(it)) continue;
|
||||||
items.push(it);
|
items.push(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.ok(
|
logger.ok(
|
||||||
`${ctx.catPrefixOut} | Page ${String(page).padStart(3, " ")} | ${String(status).padStart(3, " ")} | items=${String(items.length).padStart(3, " ")} | bytes=${String(bytes || 0).padStart(8, " ")} | ${(ms / 1000).toFixed(1).padStart(6, " ")}s`
|
`${ctx.catPrefixOut} | Page ${String(page).padStart(3, " ")} | ${String(status).padStart(3, " ")} | raw=${String(rawCount).padStart(3, " ")} kept=${String(items.length).padStart(3, " ")} | bytes=${String(bytes || 0).padStart(8, " ")} | ${(ms / 1000).toFixed(1).padStart(6, " ")}s`
|
||||||
);
|
);
|
||||||
|
|
||||||
// stop on empty OR short last page (prevents requesting the "[]" next page that triggers Short HTML)
|
// Stop only when the API page itself is empty
|
||||||
if (!items.length) break;
|
if (!rawCount) break;
|
||||||
|
|
||||||
for (const it of items) discovered.set(it.url, it);
|
for (const it of items) discovered.set(it.url, it);
|
||||||
|
|
||||||
if (items.length < perPage) break;
|
// Last page if API returned fewer than perPage
|
||||||
|
if (rawCount < perPage) break;
|
||||||
|
|
||||||
page++;
|
page++;
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
|
logger.ok(`${ctx.catPrefixOut} | Unique products (this run): ${discovered.size}`);
|
||||||
|
|
||||||
const { merged, newItems, updatedItems, removedItems, restoredItems, metaChangedItems } =
|
const {
|
||||||
mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });
|
merged,
|
||||||
|
newItems,
|
||||||
|
updatedItems,
|
||||||
|
removedItems,
|
||||||
|
restoredItems,
|
||||||
|
metaChangedItems,
|
||||||
|
} = mergeDiscoveredIntoDb(prevDb, discovered, { storeLabel: ctx.store.name });
|
||||||
|
|
||||||
const dbObj = buildDbObject(ctx, merged);
|
const dbObj = buildDbObject(ctx, merged);
|
||||||
writeJsonAtomic(ctx.dbFile, dbObj);
|
writeJsonAtomic(ctx.dbFile, dbObj);
|
||||||
|
|
@ -317,9 +333,19 @@ async function scanCategoryWooStoreApi(ctx, prevDb, report) {
|
||||||
report.totals.restoredCount += restoredItems.length;
|
report.totals.restoredCount += restoredItems.length;
|
||||||
report.totals.metaChangedCount += metaChangedItems.length;
|
report.totals.metaChangedCount += metaChangedItems.length;
|
||||||
|
|
||||||
addCategoryResultToReport(report, ctx.store.name, ctx.cat.label, newItems, updatedItems, removedItems, restoredItems);
|
addCategoryResultToReport(
|
||||||
|
report,
|
||||||
|
ctx.store.name,
|
||||||
|
ctx.cat.label,
|
||||||
|
newItems,
|
||||||
|
updatedItems,
|
||||||
|
removedItems,
|
||||||
|
restoredItems
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function createStore(defaultUa) {
|
function createStore(defaultUa) {
|
||||||
const ua = defaultUa;
|
const ua = defaultUa;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue