Skip to content

Commit

Permalink
crawl state: add getPendingList() to return pending state from either… (
Browse files Browse the repository at this point in the history
#205)

* crawl state: add getPendingList() to return pending state from either memory or redis crawl state, fix stats logging with redis state. Return pending list as json object
logging: check if data object is an error, log fields from error. Convert missing console.* to new logger
* evaluate failure: log with error, not fatal
  • Loading branch information
ikreymer authored Jan 23, 2023
1 parent 1a066db commit a767721
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 12 deletions.
14 changes: 7 additions & 7 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ export class Crawler {

if (this.params.screenshot) {
if (!page.isHTMLPage) {
console.log("Skipping screenshots for non-HTML page");
this.logger.info("Skipping screenshots for non-HTML page");
}
const archiveDir = path.join(this.collDir, "archive");
const screenshots = new Screenshots({page, url: data.url, directory: archiveDir});
Expand Down Expand Up @@ -428,7 +428,7 @@ export class Crawler {
await this.serializeConfig();

} catch (e) {
this.logger.error(`Error crawling page ${data.url}`, e.message);
this.logger.error(`Error crawling page ${data.url}`, e);
await this.markPageFailed(page);
}
}
Expand Down Expand Up @@ -755,16 +755,16 @@ export class Crawler {
}

const realSize = await this.crawlState.realSize();
const pending = await this.crawlState.numRealPending();
const pendingList = await this.crawlState.getPendingList();
const done = await this.crawlState.numDone();
const total = realSize + pending + done;
const total = realSize + pendingList.length + done;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {
"crawled": done,
"total": total,
"pending": pending,
"pending": pendingList.length,
"limit": limit,
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
"pendingPages": pendingList.map(x => JSON.stringify(x))
};

this.logger.info("Crawl statistics", stats, "crawlState");
Expand Down Expand Up @@ -947,7 +947,7 @@ export class Crawler {
await this.sleep(5.5);
}
} catch (e) {
this.logger.warn("Check CF failed, ignoring", e.message);
this.logger.warn("Check CF failed, ignoring", e);
}
}

Expand Down
4 changes: 2 additions & 2 deletions util/browser.js
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ export async function evaluateWithCLI(frame, funcString) {
});

if (exceptionDetails) {
logger.fatal(
"Behavior Evaluation Failed" + exceptionDetails.text
logger.error(
"Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {}
);
}

Expand Down
4 changes: 3 additions & 1 deletion util/logger.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ export class Logger
}

logAsJSON(message, data, context, logLevel="info") {
if (typeof data !== "object") {
if (data instanceof Error) {
data = {"type": "exception", "message": data.message, "stack": data.stack};
} else if (typeof data !== "object") {
data = {"message": data.toString()};
}
let dataToLog = {
Expand Down
11 changes: 10 additions & 1 deletion util/state.js
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ export class MemoryCrawlState extends BaseState

/**
 * Serialize this crawl state into plain JSON strings.
 *
 * NOTE(review): this span is a rendered diff hunk, so it appears to show both
 * the pre-change and post-change declaration of `pending`; only one of them
 * exists in any real revision of the file — confirm against the repository.
 *
 * @returns {Promise<{queued: string[], pending: string[], done: string[]}>}
 *   each list holds one JSON.stringify()'d entry per item
 */
async serialize() {
const queued = this.queue.map(x => JSON.stringify(x));
// Variant that reads the pending collection directly from this.pending.
const pending = Array.from(this.pending.values()).map(x => JSON.stringify(x));
const done = this.done.map(x => JSON.stringify(x));
// Variant that goes through getPendingList() instead of touching this.pending.
const pending = (await this.getPendingList()).map(x => JSON.stringify(x));

return {queued, pending, done};
}
Expand Down Expand Up @@ -179,6 +179,10 @@ export class MemoryCrawlState extends BaseState
async numRealPending() {
return this.pending.size;
}

async getPendingList() {
return Array.from(this.pending.values());
}
}


Expand Down Expand Up @@ -445,6 +449,11 @@ return 0;
return res;
}

async getPendingList() {
const list = await this.redis.hvals(this.pkey);
return list.map(x => JSON.parse(x));
}

async resetPendings() {
const pendingUrls = await this.redis.hkeys(this.pkey);

Expand Down
2 changes: 1 addition & 1 deletion util/storage.js
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ export function initStorage() {
userId: process.env.STORE_USER,
};

console.log("Initing Storage...");
logger.info("Initing Storage...");
return new S3StorageSync(storeInfo, opts);
}

Expand Down

0 comments on commit a767721

Please sign in to comment.