From 32a26b0fb22e6525da3c6e5fa64e9f3a79fbebc5 Mon Sep 17 00:00:00 2001 From: Yaroslav Polyakov Date: Mon, 27 Mar 2023 17:09:09 +0700 Subject: [PATCH] --resume --- README.md | 11 ++++++++- bin/detect-image-nsfw-api.py | 11 +++++++++ bin/nudecrawler | 43 ++++++++++++++++++++++++++++-------- bin/refresh-nsfw-api.sh | 10 +++++++++ setup.py | 6 ++++- 5 files changed, 70 insertions(+), 11 deletions(-) create mode 100755 bin/refresh-nsfw-api.sh diff --git a/README.md b/README.md index f560d88..553f250 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,16 @@ This list (~300Kb, 11k urls) created from 1.5M words russian wordlist. There are Now you can use this file as wordlist (nudecrawler will detect it's already base URL, and will only append date to URL). -## Example usage: +### Stop/Resume +When working with wordlists and a `--stats` file, the current status is periodically saved to this file. When starting again, if the stats file exists, nudecrawler will continue from the last recorded point. To start from the beginning, delete the stats file or use a different `--stats` filename. 
+ +### Example usage: +Check one page (using built-in :nude filter): +~~~ +bin/nudecrawler -v --url1 https://telegra.ph/your-page-address +~~~ + + ~~~ bin/nudecrawler -w urls.txt --nude 5 -d 30 -f 5 --stats .local/mystats.json --log .local/nudecrawler.log ~~~ diff --git a/bin/detect-image-nsfw-api.py b/bin/detect-image-nsfw-api.py index 8e080bf..09e17a7 100755 --- a/bin/detect-image-nsfw-api.py +++ b/bin/detect-image-nsfw-api.py @@ -31,6 +31,17 @@ def detect_image(path, address, thresholds, verbose=False): 'image': open(path, 'rb') } r = requests.post(req_url, files=files) + if r.status_code == 500: + if verbose: + print(r.text) + return 0 + + if r.status_code != 200: + print(os.getenv('NUDECRAWLER_PAGE_URL')) + print(os.getenv('NUDECRAWLER_IMAGE_URL')) + print(r.text) + print(r.json()) + except requests.Timeout as e: # timeout: not interesting image print("TIMEOUT") diff --git a/bin/nudecrawler b/bin/nudecrawler index cf8e925..8732b18 100755 --- a/bin/nudecrawler +++ b/bin/nudecrawler @@ -6,6 +6,8 @@ import json import time import sys import os +import shlex +import subprocess import nudecrawler from nudecrawler import Page, Unbuffered from nudecrawler.page import get_processed_images @@ -23,6 +25,7 @@ stats = { 'word': None, 'url': None, 'now': None, + 'processed_images': 0, 'found_interesting_pages': 0, 'found_nude_images': 0, 'resume': dict() @@ -37,6 +40,8 @@ started = time.time() logfile = None stop_after = None +stop_each = None +refresh = None detect_image = None detect_url = None @@ -90,7 +95,7 @@ video = 1 verbose = False all_found = True -def get_args(): +def get_args(argv=None): parser = argparse.ArgumentParser(description='Telegra.ph Spider') def_total =5 @@ -104,7 +109,6 @@ def get_args(): parser.add_argument('--url1', metavar="URL", help='process only one url') parser.add_argument('-f', '--fails', type=int, default=0, help='stop searching next pages with same words after N failures') parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 
'DAY'), help='Current date (default is today) example: --day 12 31') - parser.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop after N images processed (or little after)') g = parser.add_argument_group('Image filtering options') @@ -123,15 +127,20 @@ def get_args(): g = parser.add_argument_group('list-related options') g.add_argument('-w', '--wordlist', help='wordlist (urllist) file') - g.add_argument('--stats', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file') + g.add_argument('--stats', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file') + g.add_argument('--resume', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='resume from STATS_FILE (other args are not needed)') + g.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop (or --refresh) after N images processed (or little after)') + g.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+', help='run this refresh script every --stop NUM_IMAGES images') - return parser.parse_args() + return parser.parse_args(argv) def analyse(url): + global stop_after + p = Page(url, neednnudes=nude, neednvideo=video, all_found=all_found, detect_url=detect_url, detect_image=detect_image) @@ -154,8 +163,16 @@ def analyse(url): save_stats(force=True) if stop_after is not None and get_processed_images() > stop_after: - print("Stopping after processed", get_processed_images(), "images") - sys.exit(2) + print("Stop/refresh after processed", get_processed_images(), "images...") + if refresh: + # print("Refresh:", refresh) + subprocess.run(refresh) + + # schedule next stop + stop_after = get_processed_images() + stop_each + else: + print("No --refresh, exiting with code 2") + sys.exit(2) return p @@ -169,6 +186,7 @@ def save_stats(force=False): if time.time() > stats_next_write or force: stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S") stats['uptime'] = int(time.time() - started) + 
stats['processed_images'] = get_processed_images() with open(stats_file, "w") as fh: json.dump(stats, fh, indent=4) @@ -230,15 +248,20 @@ def sanity_check(args): def main(): global nude, video, verbose, all_found, stats_file, stats, logfile, \ - stop_after, detect_image, detect_url, page_image_minsize, page_extensions, \ - page_mintotal + stop_after, stop_each, detect_image, detect_url, page_image_minsize, page_extensions, \ + page_mintotal, refresh words = None args = get_args() + if args.resume: + print("Resume from", args.resume) + with open(args.resume) as fh: + stats = json.load(fh) + cmd = stats['cmd'] + args = get_args(shlex.split(cmd)[1:]) sanity_check(args) - nude = args.nude video = args.video verbose = args.verbose @@ -246,6 +269,8 @@ def main(): matched_resume = False skipped_words = 0 stop_after = args.stop + stop_each = args.stop + refresh = args.refresh detect_url = args.detect_url detect_image = args.detect_image diff --git a/bin/refresh-nsfw-api.sh b/bin/refresh-nsfw-api.sh new file mode 100755 index 0000000..2e478ef --- /dev/null +++ b/bin/refresh-nsfw-api.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +IMAGE=ghcr.io/arnidan/nsfw-api:latest +NAME=nsfw-api + +echo stop current container.... +sudo docker stop $NAME + +echo start new container.... +sudo docker run --rm --name $NAME -d -p 3000:3000 $IMAGE diff --git a/setup.py b/setup.py index 567903c..338ce4a 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,11 @@ def get_version(path): scripts=[ 'bin/nudecrawler', 'bin/detect-image-aid.py', - 'bin/detect-url-nsfw-api.py'], + 'bin/detect-image-nsfw-api.py', + 'bin/detect-image-nudenet.py', + 'bin/detect-server-nudenet.py', + 'bin/refresh-nsfw-api.py' + ], install_requires=['bs4', 'requests', 'pillow', 'nudepy', 'transliterate'],