Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce pattern complexity #115

Merged
merged 2 commits into from
Dec 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ var regex
 * Refresh the local regex variable (closure)
*/
function update () {
regex = new RegExp('(' + list.join('|') + ')', 'i')
regex = new RegExp(list.join('|'), 'i')
}

/**
Expand Down Expand Up @@ -74,6 +74,9 @@ try {
// Addresses: libhttp browser
list.splice(list.lastIndexOf('http'), 1)
list.push('(?<!(lib))http')
// Addresses: java based browsers
list.splice(list.lastIndexOf('java'), 1)
list.push('java(?!;)')
} catch (error) {
// ignore errors
}
Expand Down
26 changes: 4 additions & 22 deletions list.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
" daum[/|\\s]",
" deusu/",
" splash ",
"(^|\\s)site",
"(?:^|\\s)site",
"@[a-z]",
"\\(at\\)[a-z]",
"\\[at\\][a-z]",
"^&lt;",
"^12345",
"^<",
"^\\[",
"^ace explorer",
"^acoon",
"^activebookmark",
Expand All @@ -25,7 +23,6 @@
"^asafaweb\\.com",
"^avsdevicesdk/",
"^axios/",
"^azureus",
"^biglotron",
"^blackboard safeassign",
"^blocknote.net",
Expand All @@ -52,8 +49,7 @@
"^evernote clip resolver",
"^facebook",
"^faraday",
"^fdm \\d",
"^fdm/\\d",
"^fdm[/\\s]\\d",
"^flashget",
"^friendica",
"^getright/",
Expand All @@ -67,11 +63,9 @@
"^hobbit",
"^hotzonu",
"^hwcdn/",
"^ice browser",
"^infox-wisg",
"^ingrid/\\d",
"^integrity/",
"^java",
"^jeode/",
"^jetbrains",
"^jetty/",
Expand All @@ -92,13 +86,11 @@
"^monit",
"^movabletype",
"^mozilla/\\d\\.\\d \\(compatible;?\\)$",
"^mucommander",
"^my browser$",
"^navermailapp",
"^netsurf",
"^ning",
"^node-superagent",
"^nokiac3-00/5\\.0",
"^notetextview",
"^nuzzel",
"^octopus",
Expand All @@ -115,20 +107,16 @@
"^robozilla/",
"^ruby$",
"^scrapy",
"^selenium/",
"^seo",
"^set:",
"^shareaza",
"^shortlinktranslate",
"^signalr",
"^sistrix",
"^sixy.ch/",
"^smallproxy",
"^snap$",
"^snapchat",
"^space bison",
"^spotify/",
"^spring ",
"^sprinklr",
"^svn",
"^swcd ",
Expand Down Expand Up @@ -191,7 +179,6 @@
"finder",
"firephp",
"freesafeip",
"fuck",
"ghost",
"gomezagent",
"google",
Expand All @@ -204,9 +191,7 @@
"images",
"index",
"ips-agent",
"java/",
"javafx",
"javaos",
"java",
"jorgee",
"library",
"mail\\.ru/",
Expand All @@ -216,8 +201,6 @@
"nutch",
"offbyone",
"optimize",
"org\\.eclipse\\.ui\\.ide\\.workbench",
"outbrain",
"pagespeed",
"parse",
"perl",
Expand All @@ -236,7 +219,6 @@
"server",
"sogou",
"sparkler/",
"speedmode",
"spider",
"statuscake",
"stumbleupon\\.com",
Expand All @@ -251,7 +233,7 @@
"twingly recon",
"url",
"valid",
"wapchoi",
"wapchoi/",
"wappalyzer",
"webglance",
"webkit2png",
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "isbot",
"version": "3.0.20",
"version": "3.0.21",
"description": "🤖 detect bots/crawlers/spiders via the user agent.",
"keywords": [
"bot",
Expand All @@ -24,6 +24,7 @@
"main": "index.js",
"types": "index.d.ts",
"scripts": {
"clean": "rm -rf .cache && rm -rf docs",
"prebuild": "which parcel || npm i parcel-bundler --no-save",
"build": "parcel build page/index.pug --out-dir docs --public-url .",
"postbuild": "echo isbot.js.org > docs/CNAME",
Expand Down
2 changes: 1 addition & 1 deletion scripts/download-fixtures.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ echo "Download crawler list from user-agents.net"
curl -f -d 'browser_type=bot-crawler&download=txt' https://user-agents.net/download > tests/fixtures/user-agents.net.bot-crawler.txt

echo "Download crawler list from myip.ms"
curl -f https://www.myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt
curl -f https://myip.ms/files/bots/live_webcrawlers.txt > tests/fixtures/live_webcrawlers.txt

echo "Download bot list from matomo"
curl -f https://raw.githubusercontent.com/matomo-org/device-detector/master/Tests/fixtures/bots.yml > tests/fixtures/matomo-bots.yml
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/user-agents.net-bots-ignore-list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Mozilla/5.0 (Windows; rv:49.0) Gecko/20100101 Firefox/49.0
Mozilla/5.0 (Windows; rv:55.0) Gecko/20100101 Firefox/55.0
Mozilla/5.0 (Windows; rv:65.0) Gecko/20100101 Firefox/65.0
Mozilla/5.0 (Windows; rv:81.0) Gecko/20100101 Firefox/81.0
NokiaC3-00/5.0 (08.65) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/5.0 (Java; U; en-us; nokiac3-00) UCBrowser8.3.0.154/69/444/UCWEB Mobile UNTRUSTED/1.0
NokiaX2-05/2.0 (08.30) Profile/MIDP-2.1 Configuration/CLDC-1.1 UCWEB/2.0 (Java; U; MIDP-2.0; en-US; NokiaX2-05) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile UNTRUSTED/1.0
SonyEricssonJ20i/R7CA Profile/MIDP-2.1 Configuration/CLDC-1.1 UNTRUSTED/1.0 UCWEB/2.0 (Java; U; MIDP-2.0; ru; SonyEricssonJ20i) U2/1.0.0 UCBrowser/9.5.0.449 U2/1.0.0 Mobile
windows 7 pro 64 bit, opera stable software browser, active x controls, java updater , java script
13 changes: 12 additions & 1 deletion tests/helpers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,25 @@ const ignoreList = read(botsIgnoreList)
line => !line.startsWith('#')
)

/**
* For some reason, UCWEB are all considered bots by these guys
* @type RegExp
*/
const USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN = new RegExp([
'ucmini',
'NokiaC3-00\\/5\\.0 \\(\\d+\\.\\d+\\) Profile\\/MIDP-2\\.1 Configuration\\/CLDC-1\\.1 UCWEB\\/2\\.0 \\(Java; U; MIDP-2\\.0;'
].join('|'), 'i')

/**
* List of known crawlers
* @type {string[]}
*/
module.exports.crawlers = [

// Read from text file
...read(crawlerUserAgentsText).trim().split('\n'),
...read(crawlerUserAgentsText).trim().split('\n').filter(
line => !USERAGENT_NET_CRAWLER_EXCLUDE_PATTERN.test(line)
),

// Read from a different text file
...read(
Expand Down