--resume
yaroslaff committed Mar 27, 2023
1 parent 0987449 commit 32a26b0
Showing 5 changed files with 70 additions and 11 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -157,7 +157,16 @@ This list (~300Kb, 11k urls) created from 1.5M words russian wordlist. There are

Now you can use this file as a wordlist (nudecrawler will detect that it's already a base URL, and will only append the date to the URL).

## Example usage:
### Stop/Resume
When working with wordlists and a `--stats` file, the current status is periodically saved to that file. When started again, if the stats file exists, nudecrawler will continue from the last recorded point. To start from the beginning, delete the stats file or use a different `--stats` filename.
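
For example, a stop/resume cycle might look like this (paths illustrative):
~~~
# first run, interrupted at some point (Ctrl-C, crash, reboot)
bin/nudecrawler -w urls.txt --stats .local/mystats.json

# continue from the last recorded point
bin/nudecrawler --resume .local/mystats.json
~~~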

### Example usage:
Check one page (using built-in :nude filter):
~~~
bin/nudecrawler -v --url1 https://telegra.ph/your-page-address
~~~

Crawl pages generated from a wordlist, saving periodic stats and a log:
~~~
bin/nudecrawler -w urls.txt --nude 5 -d 30 -f 5 --stats .local/mystats.json --log .local/nudecrawler.log
~~~
11 changes: 11 additions & 0 deletions bin/detect-image-nsfw-api.py
@@ -31,6 +31,17 @@ def detect_image(path, address, thresholds, verbose=False):
            'image': open(path, 'rb')
        }
        r = requests.post(req_url, files=files)
        if r.status_code == 500:
            if verbose:
                print(r.text)
            return 0

        if r.status_code != 200:
            print(os.getenv('NUDECRAWLER_PAGE_URL'))
            print(os.getenv('NUDECRAWLER_IMAGE_URL'))
            print(r.text)
            print(r.json())

    except requests.Timeout as e:
        # timeout: not interesting image
        print("TIMEOUT")
43 changes: 34 additions & 9 deletions bin/nudecrawler
@@ -6,6 +6,8 @@ import json
import time
import sys
import os
import shlex
import subprocess
import nudecrawler
from nudecrawler import Page, Unbuffered
from nudecrawler.page import get_processed_images
@@ -23,6 +25,7 @@ stats = {
    'word': None,
    'url': None,
    'now': None,
    'processed_images': 0,
    'found_interesting_pages': 0,
    'found_nude_images': 0,
    'resume': dict()
@@ -37,6 +40,8 @@ started = time.time()

logfile = None
stop_after = None
stop_each = None
refresh = None
detect_image = None
detect_url = None

@@ -90,7 +95,7 @@ video = 1
verbose = False
all_found = True

def get_args():
def get_args(argv=None):
    parser = argparse.ArgumentParser(description='Telegra.ph Spider')

    def_total =5
@@ -104,7 +109,6 @@ def get_args():
    parser.add_argument('--url1', metavar="URL", help='process only one url')
    parser.add_argument('-f', '--fails', type=int, default=0, help='stop searching next pages with same words after N failures')
    parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 'DAY'), help='Current date (default is today) example: --day 12 31')
    parser.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop after N images processed (or little after)')


    g = parser.add_argument_group('Image filtering options')
@@ -123,15 +127,20 @@

    g = parser.add_argument_group('list-related options')
    g.add_argument('-w', '--wordlist', help='wordlist (urllist) file')
    g.add_argument('--stats', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file')
    g.add_argument('--stats', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file')
    g.add_argument('--resume', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='resume from STATS_FILE (other args are not needed)')
    g.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop (or --refresh) after N images processed (or little after)')
    g.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+', help='run this refresh script every --stop NUM_IMAGES images')

    return parser.parse_args()
    return parser.parse_args(argv)




def analyse(url):

    global stop_after

    p = Page(url, neednnudes=nude, neednvideo=video, all_found=all_found,
             detect_url=detect_url, detect_image=detect_image)

@@ -154,8 +163,16 @@

    save_stats(force=True)
    if stop_after is not None and get_processed_images() > stop_after:
        print("Stopping after processed", get_processed_images(), "images")
        sys.exit(2)
        print("Stop/refresh after processed", get_processed_images(), "images...")
        if refresh:
            # print("Refresh:", refresh)
            subprocess.run(refresh)

            # schedule next stop
            stop_after = get_processed_images() + stop_each
        else:
            print("No --refresh, exiting with code 2")
            sys.exit(2)

    return p

@@ -169,6 +186,7 @@ def save_stats(force=False):
    if time.time() > stats_next_write or force:
        stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        stats['uptime'] = int(time.time() - started)
        stats['processed_images'] = get_processed_images()

        with open(stats_file, "w") as fh:
            json.dump(stats, fh, indent=4)
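
For reference, a stats file produced by this function might look roughly like this (all values illustrative; 'cmd' is presumably stored where the stats dict is first filled, outside the lines shown here):
~~~
{
    "cmd": "bin/nudecrawler -w urls.txt --stats .local/mystats.json",
    "word": "example",
    "url": "https://telegra.ph/example-03-27",
    "now": "03/27/2023 12:34:56",
    "uptime": 3600,
    "processed_images": 1234,
    "found_interesting_pages": 5,
    "found_nude_images": 17,
    "resume": {}
}
~~~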
@@ -230,22 +248,29 @@ def sanity_check(args):

def main():
    global nude, video, verbose, all_found, stats_file, stats, logfile, \
        stop_after, detect_image, detect_url, page_image_minsize, page_extensions, \
        page_mintotal
        stop_after, stop_each, detect_image, detect_url, page_image_minsize, page_extensions, \
        page_mintotal, refresh

    words = None
    args = get_args()
    if args.resume:
        print("Resume from", args.resume)
        with open(args.resume) as fh:
            stats = json.load(fh)
        cmd = stats['cmd']
        args = get_args(shlex.split(cmd)[1:])

    sanity_check(args)


    nude = args.nude
    video = args.video
    verbose = args.verbose
    all_found = args.all
    matched_resume = False
    skipped_words = 0
    stop_after = args.stop
    stop_each = args.stop
    refresh = args.refresh
    detect_url = args.detect_url
    detect_image = args.detect_image
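
To spell out the resume path above: the stats file is expected to carry the original command line under 'cmd', and that string is simply re-parsed, so every option from the first run is restored. A minimal sketch of the round trip, with an illustrative command line:
~~~
import json
import shlex

with open("/tmp/nudecrawler-stats.txt") as fh:   # the default --stats path
    stats = json.load(fh)

# e.g. stats["cmd"] == "bin/nudecrawler -w urls.txt --nude 5"
argv = shlex.split(stats["cmd"])[1:]   # drop argv[0], keep only the options
args = get_args(argv)                  # re-parse exactly as on the first run
~~~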

10 changes: 10 additions & 0 deletions bin/refresh-nsfw-api.sh
@@ -0,0 +1,10 @@
#!/bin/bash

IMAGE=ghcr.io/arnidan/nsfw-api:latest
NAME=nsfw-api

echo stop current container....
sudo docker stop $NAME

echo start new container....
sudo docker run --rm --name $NAME -d -p 3000:3000 $IMAGE
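
Together with the new --stop and --refresh options, this script lets a long crawl periodically restart the classifier container instead of exiting. A usage sketch (wordlist name and interval illustrative):
~~~
bin/nudecrawler -w urls.txt --stop 1000 --refresh bin/refresh-nsfw-api.sh
~~~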
6 changes: 5 additions & 1 deletion setup.py
@@ -20,7 +20,11 @@ def get_version(path):
    scripts=[
        'bin/nudecrawler',
        'bin/detect-image-aid.py',
        'bin/detect-url-nsfw-api.py'],
        'bin/detect-image-nsfw-api.py',
        'bin/detect-image-nudenet.py',
        'bin/detect-server-nudenet.py',
        'bin/refresh-nsfw-api.sh'
    ],

    install_requires=['bs4', 'requests', 'pillow', 'nudepy', 'transliterate'],
