Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(perf): use request streams to reduce memory usage #336

Merged
merged 1 commit into from
Oct 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {EventEmitter} from 'events';
import {URL} from 'url';
import * as http from 'http';
import * as path from 'path';
import {Readable} from 'stream';

import {request, GaxiosResponse} from 'gaxios';

Expand Down Expand Up @@ -222,16 +223,15 @@ export class LinkChecker extends EventEmitter {
// Perform a HEAD or GET request based on the need to crawl
let status = 0;
let state = LinkState.BROKEN;
let data = '';
let shouldRecurse = false;
let res: GaxiosResponse<string> | undefined = undefined;
let res: GaxiosResponse<Readable> | undefined = undefined;
const failures: {}[] = [];
try {
res = await request<string>({
res = await request<Readable>({
method: opts.crawl ? 'GET' : 'HEAD',
url: opts.url.href,
headers,
responseType: opts.crawl ? 'text' : 'stream',
responseType: 'stream',
validateStatus: () => true,
timeout: opts.checkOptions.timeout,
});
Expand All @@ -241,7 +241,7 @@ export class LinkChecker extends EventEmitter {

// If we got an HTTP 405, the server may not like HEAD. GET instead!
if (res.status === 405) {
res = await request<string>({
res = await request<Readable>({
method: 'GET',
url: opts.url.href,
headers,
Expand All @@ -257,7 +257,7 @@ export class LinkChecker extends EventEmitter {
// request failure: invalid domain name, etc.
// this also occasionally catches too many redirects, but is still valid (e.g. https://www.ebay.com)
// for this reason, we also try doing a GET below to see if the link is valid
failures.push(err);
failures.push(err as Error);
}

try {
Expand All @@ -266,10 +266,10 @@ export class LinkChecker extends EventEmitter {
(res === undefined || res.status < 200 || res.status >= 300) &&
!opts.crawl
) {
res = await request<string>({
res = await request<Readable>({
method: 'GET',
url: opts.url.href,
responseType: 'text',
responseType: 'stream',
validateStatus: () => true,
headers,
timeout: opts.checkOptions.timeout,
Expand All @@ -279,13 +279,12 @@ export class LinkChecker extends EventEmitter {
}
}
} catch (ex) {
failures.push(ex);
failures.push(ex as Error);
// catch the next failure
}

if (res !== undefined) {
status = res.status;
data = res.data;
shouldRecurse = isHtml(res);
}

Expand All @@ -309,7 +308,9 @@ export class LinkChecker extends EventEmitter {
// If we need to go deeper, scan the next level of depth for links and crawl
if (opts.crawl && shouldRecurse) {
this.emit('pagestart', opts.url);
const urlResults = getLinks(data, opts.url.href);
const urlResults = res?.data
? await getLinks(res.data, opts.url.href)
: [];
for (const result of urlResults) {
// if there was some sort of problem parsing the link while
// creating a new URL obj, treat it as a broken link.
Expand Down
17 changes: 11 additions & 6 deletions src/links.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import * as htmlParser from 'htmlparser2';
import * as htmlParser from 'htmlparser2/lib/WritableStream';
import {Readable} from 'stream';
import {URL} from 'url';

const linksAttr = {
Expand Down Expand Up @@ -42,11 +43,14 @@ export interface ParsedUrl {
url?: URL;
}

export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
export async function getLinks(
source: Readable,
baseUrl: string
): Promise<ParsedUrl[]> {
let realBaseUrl = baseUrl;
let baseSet = false;
const links = new Array<ParsedUrl>();
const parser = new htmlParser.Parser({
const parser = new htmlParser.WritableStream({
onopentag(tag: string, attributes: {[s: string]: string}) {
// Allow alternate base URL to be specified in tag:
if (tag === 'base' && !baseSet) {
Expand Down Expand Up @@ -79,8 +83,9 @@ export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
}
},
});
parser.write(source);
parser.end();
await new Promise((resolve, reject) => {
source.pipe(parser).on('finish', resolve).on('error', reject);
});
return links;
}

Expand Down Expand Up @@ -110,6 +115,6 @@ function parseLink(link: string, baseUrl: string): ParsedUrl {
url.hash = '';
return {link, url};
} catch (error) {
return {link, error};
return {link, error: error as Error};
}
}