diff --git a/index.js b/index.js index 0d4e0e0..accf609 100644 --- a/index.js +++ b/index.js @@ -5,7 +5,7 @@ var regex * Refresh the local regex variable (clusure) */ function update () { - regex = new RegExp('(' + list.join('|') + ')', 'i') + regex = new RegExp(list.join('|'), 'i') } /** @@ -74,6 +74,9 @@ try { // Addresses: libhttp browser list.splice(list.lastIndexOf('http'), 1) list.push('(? docs/CNAME", diff --git a/scripts/download-fixtures.sh b/scripts/download-fixtures.sh index f758779..e394ecf 100755 --- a/scripts/download-fixtures.sh +++ b/scripts/download-fixtures.sh @@ -2,7 +2,7 @@ echo "Download crawler list from user-agents.net" curl -f -d 'browser_type=bot-crawler&download=txt' https://user-agents.net/download > tests/fixtures/user-agents.net.bot-crawler.txt echo "Download cralwer list from myip.ms" -curl -f https://www.myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt +curl -f https://myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt echo "Download bot list from matomo" curl -f https://raw.githubusercontent.com/matomo-org/device-detector/master/Tests/fixtures/bots.yml > tests/fixtures/matomo-bots.yml diff --git a/tests/fixtures/user-agents.net-bots-ignore-list.txt b/tests/fixtures/user-agents.net-bots-ignore-list.txt index 4c4d587..3fb3f12 100644 --- a/tests/fixtures/user-agents.net-bots-ignore-list.txt +++ b/tests/fixtures/user-agents.net-bots-ignore-list.txt @@ -7,6 +7,7 @@ Mozilla/5.0 (Windows; rv:49.0) Gecko/20100101 Firefox/49.0 Mozilla/5.0 (Windows; rv:55.0) Gecko/20100101 Firefox/55.0 Mozilla/5.0 (Windows; rv:65.0) Gecko/20100101 Firefox/65.0 Mozilla/5.0 (Windows; rv:81.0) Gecko/20100101 Firefox/81.0 +NokiaC3-00/5.0 (08.65) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/5.0 (Java; U; en-us; nokiac3-00) UCBrowser8.3.0.154/69/444/UCWEB Mobile UNTRUSTED/1.0 NokiaX2-05/2.0 (08.30) Profile/MIDP-2.1 Configuration/CLDC-1.1 UCWEB/2.0 (Java; U; MIDP-2.0; en-US; NokiaX2-05) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile UNTRUSTED/1.0 SonyEricssonJ20i/R7CA Profile/MIDP-2.1 Configuration/CLDC-1.1 UNTRUSTED/1.0 UCWEB/2.0 (Java; U; MIDP-2.0; ru; SonyEricssonJ20i) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile windows 7 pro 64 bit, opera stable software browser, active x controls, java updater , java script diff --git a/tests/helpers/index.js b/tests/helpers/index.js index c181363..2493b03 100644 --- a/tests/helpers/index.js +++ b/tests/helpers/index.js @@ -24,6 +24,15 @@ const ignoreList = read(botsIgnoreList) line => !line.startsWith('#') ) +/** + * For some reason, UCWEB are all considered bots by these guys + * @type RegExp + */ +const USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN = new RegExp([ + 'ucmini', + 'NokiaC3-00\\/5\\.0 \\(\\d+\\.\\d+\\) Profile\\/MIDP-2\\.1 Configuration\\/CLDC-1\\.1 UCWEB\\/2\\.0 \\(Java; U; MIDP-2\\.0;' +].join('|'), 'i') + /** * List of known crawlers * @type {string[]} @@ -31,7 +40,9 @@ const ignoreList = read(botsIgnoreList) module.exports.crawlers = [ // Read from text file - ...read(crawlerUserAgentsText).trim().split('\n'), + ...read(crawlerUserAgentsText).trim().split('\n').filter( + line => !USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN.test(line) + ), // Read from a different text file ...read(