Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce pattern complexity #115

Merged
merged 2 commits into from
Dec 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ var regex
 * Refresh the local regex variable (closure)
*/
function update () {
regex = new RegExp('(' + list.join('|') + ')', 'i')
regex = new RegExp(list.join('|'), 'i')
}

/**
Expand Down Expand Up @@ -74,6 +74,9 @@ try {
// Addresses: libhttp browser
list.splice(list.lastIndexOf('http'), 1)
list.push('(?<!(lib))http')
// Addresses: java based browsers
list.splice(list.lastIndexOf('java'), 1)
list.push('java(?!;)')
} catch (error) {
// ignore errors
}
Expand Down
26 changes: 4 additions & 22 deletions list.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
" daum[/|\\s]",
" deusu/",
" splash ",
"(^|\\s)site",
"(?:^|\\s)site",
"@[a-z]",
"\\(at\\)[a-z]",
"\\[at\\][a-z]",
"^&lt;",
"^12345",
"^<",
"^\\[",
"^ace explorer",
"^acoon",
"^activebookmark",
Expand All @@ -25,7 +23,6 @@
"^asafaweb\\.com",
"^avsdevicesdk/",
"^axios/",
"^azureus",
"^biglotron",
"^blackboard safeassign",
"^blocknote.net",
Expand All @@ -52,8 +49,7 @@
"^evernote clip resolver",
"^facebook",
"^faraday",
"^fdm \\d",
"^fdm/\\d",
"^fdm[/\\s]\\d",
"^flashget",
"^friendica",
"^getright/",
Expand All @@ -67,11 +63,9 @@
"^hobbit",
"^hotzonu",
"^hwcdn/",
"^ice browser",
"^infox-wisg",
"^ingrid/\\d",
"^integrity/",
"^java",
"^jeode/",
"^jetbrains",
"^jetty/",
Expand All @@ -92,13 +86,11 @@
"^monit",
"^movabletype",
"^mozilla/\\d\\.\\d \\(compatible;?\\)$",
"^mucommander",
"^my browser$",
"^navermailapp",
"^netsurf",
"^ning",
"^node-superagent",
"^nokiac3-00/5\\.0",
"^notetextview",
"^nuzzel",
"^octopus",
Expand All @@ -115,20 +107,16 @@
"^robozilla/",
"^ruby$",
"^scrapy",
"^selenium/",
"^seo",
"^set:",
"^shareaza",
"^shortlinktranslate",
"^signalr",
"^sistrix",
"^sixy.ch/",
"^smallproxy",
"^snap$",
"^snapchat",
"^space bison",
"^spotify/",
"^spring ",
"^sprinklr",
"^svn",
"^swcd ",
Expand Down Expand Up @@ -191,7 +179,6 @@
"finder",
"firephp",
"freesafeip",
"fuck",
"ghost",
"gomezagent",
"google",
Expand All @@ -204,9 +191,7 @@
"images",
"index",
"ips-agent",
"java/",
"javafx",
"javaos",
"java",
"jorgee",
"library",
"mail\\.ru/",
Expand All @@ -216,8 +201,6 @@
"nutch",
"offbyone",
"optimize",
"org\\.eclipse\\.ui\\.ide\\.workbench",
"outbrain",
"pagespeed",
"parse",
"perl",
Expand All @@ -236,7 +219,6 @@
"server",
"sogou",
"sparkler/",
"speedmode",
"spider",
"statuscake",
"stumbleupon\\.com",
Expand All @@ -251,7 +233,7 @@
"twingly recon",
"url",
"valid",
"wapchoi",
"wapchoi/",
"wappalyzer",
"webglance",
"webkit2png",
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "isbot",
"version": "3.0.20",
"version": "3.0.21",
"description": "🤖 detect bots/crawlers/spiders via the user agent.",
"keywords": [
"bot",
Expand All @@ -24,6 +24,7 @@
"main": "index.js",
"types": "index.d.ts",
"scripts": {
"clean": "rm -rf .cache && rm -rf docs",
"prebuild": "which parcel || npm i parcel-bundler --no-save",
"build": "parcel build page/index.pug --out-dir docs --public-url .",
"postbuild": "echo isbot.js.org > docs/CNAME",
Expand Down
2 changes: 1 addition & 1 deletion scripts/download-fixtures.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ echo "Download crawler list from user-agents.net"
curl -f -d 'browser_type=bot-crawler&download=txt' https://user-agents.net/download > tests/fixtures/user-agents.net.bot-crawler.txt

echo "Download crawler list from myip.ms"
curl -f https://www.myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt
curl -f https://myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt

echo "Download bot list from matomo"
curl -f https://raw.githubusercontent.com/matomo-org/device-detector/master/Tests/fixtures/bots.yml > tests/fixtures/matomo-bots.yml
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/user-agents.net-bots-ignore-list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Mozilla/5.0 (Windows; rv:49.0) Gecko/20100101 Firefox/49.0
Mozilla/5.0 (Windows; rv:55.0) Gecko/20100101 Firefox/55.0
Mozilla/5.0 (Windows; rv:65.0) Gecko/20100101 Firefox/65.0
Mozilla/5.0 (Windows; rv:81.0) Gecko/20100101 Firefox/81.0
NokiaC3-00/5.0 (08.65) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/5.0 (Java; U; en-us; nokiac3-00) UCBrowser8.3.0.154/69/444/UCWEB Mobile UNTRUSTED/1.0
NokiaX2-05/2.0 (08.30) Profile/MIDP-2.1 Configuration/CLDC-1.1 UCWEB/2.0 (Java; U; MIDP-2.0; en-US; NokiaX2-05) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile UNTRUSTED/1.0
SonyEricssonJ20i/R7CA Profile/MIDP-2.1 Configuration/CLDC-1.1 UNTRUSTED/1.0 UCWEB/2.0 (Java; U; MIDP-2.0; ru; SonyEricssonJ20i) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile
windows 7 pro 64 bit, opera stable software browser, active x controls, java updater , java script
13 changes: 12 additions & 1 deletion tests/helpers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,25 @@ const ignoreList = read(botsIgnoreList)
line => !line.startsWith('#')
)

/**
* For some reason, UCWEB are all considered bots by these guys
* @type RegExp
*/
const USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN = new RegExp([
'ucmini',
'NokiaC3-00\\/5\\.0 \\(\\d+\\.\\d+\\) Profile\\/MIDP-2\\.1 Configuration\\/CLDC-1\\.1 UCWEB\\/2\\.0 \\(Java; U; MIDP-2\\.0;'
].join('|'), 'i')

/**
* List of known crawlers
* @type {string[]}
*/
module.exports.crawlers = [

// Read from text file
...read(crawlerUserAgentsText).trim().split('\n'),
...read(crawlerUserAgentsText).trim().split('\n').filter(
line => !USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN.test(line)
),

// Read from a different text file
...read(
Expand Down