From d24c013652daea6630b587dd9f1c06ffe90f8937 Mon Sep 17 00:00:00 2001
From: Georges Petrov
Date: Tue, 25 Jul 2023 12:15:54 +0200
Subject: [PATCH] fix: 403 on some sitemaps because of cloudflare protection

---
 utils/browser.ts           | 42 ++++++++++++++++++++++++++++++++++++++++++
 utils/find-domain-pages.ts |  4 +++-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 utils/browser.ts

diff --git a/utils/browser.ts b/utils/browser.ts
new file mode 100644
index 000000000..fc898450c
--- /dev/null
+++ b/utils/browser.ts
@@ -0,0 +1,42 @@
+import playwright from 'playwright';
+
+/**
+ * Loads `url` in headless Chromium and returns the page's serialized markup.
+ *
+ * A real browser with a desktop Chrome user agent is used because some hosts
+ * reject plain HTTP clients with a 403 (Cloudflare protection).
+ *
+ * @param url - Address to navigate to.
+ * @returns The markup returned by `page.content()` after navigation.
+ * @throws Any Playwright error from launch or navigation, including the
+ *   100 s navigation timeout. The context and browser are closed first.
+ */
+export const fetchWithBrowser = async (url: string): Promise<string> => {
+  const customUserAgent =
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36';
+
+  const browser = await playwright.chromium.launch({
+    headless: true,
+  });
+
+  // try/finally: a failed navigation must not leave a Chromium process behind.
+  try {
+    const context = await browser.newContext({
+      userAgent: customUserAgent,
+    });
+
+    try {
+      const page = await context.newPage();
+      await page.goto(url, {
+        waitUntil: 'domcontentloaded',
+        timeout: 100000,
+      });
+
+      return await page.content();
+    } finally {
+      await context.close();
+    }
+  } finally {
+    await browser.close();
+  }
+};
diff --git a/utils/find-domain-pages.ts b/utils/find-domain-pages.ts
index d868bd41b..fac961b3a 100644
--- a/utils/find-domain-pages.ts
+++ b/utils/find-domain-pages.ts
@@ -5,6 +5,7 @@ import pTimeout from 'p-timeout';
 import path from 'path';
 
 import addSlashUrl from './add-slash-url';
+import { fetchWithBrowser } from './browser';
 
 export const getUrlsFromSitemap = (data: any) => {
   const pages: string[] = [];
@@ -51,7 +52,8 @@ export const getSitemapPages = async (sitemapURL: string) => {
   };
 
   try {
-    const { data } = await axios.get(sitemapURL);
+    // const { data } = await axios.get(sitemapURL);
+    const data = await fetchWithBrowser(sitemapURL);
     return getUrlsFromSitemap(data);
   } catch (err) {