Skip to content

Commit

Permalink
fix: 403 on some sitemaps because of Cloudflare protection
Browse files Browse the repository at this point in the history
  • Loading branch information
gmpetrov committed Jul 25, 2023
1 parent 5687175 commit d24c013
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
27 changes: 27 additions & 0 deletions utils/browser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import playwright from 'playwright';

/**
 * Loads `url` in a headless Chromium instance and returns the serialized
 * page markup.
 *
 * A real browser with a desktop Chrome user agent is used instead of a plain
 * HTTP client; per the accompanying change this is meant to avoid 403
 * responses on sitemaps served behind Cloudflare protection.
 *
 * @param url - Address to navigate to.
 * @returns The page content as produced by `page.content()` once the
 *   `domcontentloaded` event has fired.
 * @throws Propagates any Playwright error (launch failure, navigation error,
 *   or the 100 s navigation timeout). The browser is closed before the error
 *   reaches the caller.
 */
export const fetchWithBrowser = async (url: string): Promise<string> => {
  const customUserAgent =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36';

  const browser = await playwright.chromium.launch({
    headless: true,
  });

  try {
    const context = await browser.newContext({
      userAgent: customUserAgent,
    });

    const page = await context.newPage();
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 100000,
    });

    const content = await page.content();

    // Graceful shutdown on the success path, same order as before:
    // context first, then the browser (in `finally`).
    await context.close();

    return content;
  } finally {
    // Always tear the browser down, even when navigation throws or times
    // out; otherwise every failed fetch leaks a Chromium process.
    await browser.close();
  }
};
4 changes: 3 additions & 1 deletion utils/find-domain-pages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import pTimeout from 'p-timeout';
import path from 'path';

import addSlashUrl from './add-slash-url';
import { fetchWithBrowser } from './browser';

export const getUrlsFromSitemap = (data: any) => {
const pages: string[] = [];
Expand Down Expand Up @@ -51,7 +52,8 @@ export const getSitemapPages = async (sitemapURL: string) => {
};

try {
const { data } = await axios.get(sitemapURL);
// const { data } = await axios.get(sitemapURL);
const data = await fetchWithBrowser(sitemapURL);

return getUrlsFromSitemap(data);
} catch (err) {
Expand Down

0 comments on commit d24c013

Please sign in to comment.