From 32a26b0fb22e6525da3c6e5fa64e9f3a79fbebc5 Mon Sep 17 00:00:00 2001 From: Yaroslav Polyakov Date: Mon, 27 Mar 2023 17:09:09 +0700 Subject: [PATCH] --resume --- README.md | 11 ++++++++- bin/detect-image-nsfw-api.py | 11 +++++++++ bin/nudecrawler | 43 ++++++++++++++++++++++++++++-------- bin/refresh-nsfw-api.sh | 10 +++++++++ setup.py | 6 ++++- 5 files changed, 70 insertions(+), 11 deletions(-) create mode 100755 bin/refresh-nsfw-api.sh diff --git a/README.md b/README.md index f560d88..553f250 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,16 @@ This list (~300Kb, 11k urls) created from 1.5M words russian wordlist. There are Now you can use this file as wordlist (nudecrawler will detect it's already base URL, and will only append date to URL). -## Example usage: +### Stop/Resume +When working with wordlists and a `--stats` file, the current status is periodically saved to this file. When starting again, if the stats file exists, nudecrawler will continue from the last recorded point. To start from the beginning, delete the stats file or use a different `--stats` filename. 
+ +### Example usage: +Check one page (using built-in :nude filter): +~~~ +bin/nudecrawler -v --url1 https://telegra.ph/your-page-address +~~~ + + ~~~ bin/nudecrawler -w urls.txt --nude 5 -d 30 -f 5 --stats .local/mystats.json --log .local/nudecrawler.log ~~~ diff --git a/bin/detect-image-nsfw-api.py b/bin/detect-image-nsfw-api.py index 8e080bf..09e17a7 100755 --- a/bin/detect-image-nsfw-api.py +++ b/bin/detect-image-nsfw-api.py @@ -31,6 +31,17 @@ def detect_image(path, address, thresholds, verbose=False): 'image': open(path, 'rb') } r = requests.post(req_url, files=files) + if r.status_code == 500: + if verbose: + print(r.text) + return 0 + + if r.status_code != 200: + print(os.getenv('NUDECRAWLER_PAGE_URL')) + print(os.getenv('NUDECRAWLER_IMAGE_URL')) + print(r.text) + print(r.json()) + except requests.Timeout as e: # timeout: not interesting image print("TIMEOUT") diff --git a/bin/nudecrawler b/bin/nudecrawler index cf8e925..8732b18 100755 --- a/bin/nudecrawler +++ b/bin/nudecrawler @@ -6,6 +6,8 @@ import json import time import sys import os +import shlex +import subprocess import nudecrawler from nudecrawler import Page, Unbuffered from nudecrawler.page import get_processed_images @@ -23,6 +25,7 @@ stats = { 'word': None, 'url': None, 'now': None, + 'processed_images': 0, 'found_interesting_pages': 0, 'found_nude_images': 0, 'resume': dict() @@ -37,6 +40,8 @@ started = time.time() logfile = None stop_after = None +stop_each = None +refresh = None detect_image = None detect_url = None @@ -90,7 +95,7 @@ video = 1 verbose = False all_found = True -def get_args(): +def get_args(argv=None): parser = argparse.ArgumentParser(description='Telegra.ph Spider') def_total =5 @@ -104,7 +109,6 @@ def get_args(): parser.add_argument('--url1', metavar="URL", help='process only one url') parser.add_argument('-f', '--fails', type=int, default=0, help='stop searching next pages with same words after N failures') parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 
'DAY'), help='Current date (default is today) example: --day 12 31') - parser.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop after N images processed (or little after)') g = parser.add_argument_group('Image filtering options') @@ -123,15 +127,20 @@ def get_args(): g = parser.add_argument_group('list-related options') g.add_argument('-w', '--wordlist', help='wordlist (urllist) file') - g.add_argument('--stats', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file') + g.add_argument('--stats', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file') + g.add_argument('--resume', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='resume from STATS_FILE (other args are not needed)') + g.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop (or --refresh) after N images processed (or little after)') + g.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+', help='run this refresh script every --stop NUM_IMAGES images') - return parser.parse_args() + return parser.parse_args(argv) def analyse(url): + global stop_after + p = Page(url, neednnudes=nude, neednvideo=video, all_found=all_found, detect_url=detect_url, detect_image=detect_image) @@ -154,8 +163,16 @@ def analyse(url): save_stats(force=True) if stop_after is not None and get_processed_images() > stop_after: - print("Stopping after processed", get_processed_images(), "images") - sys.exit(2) + print("Stop/refresh after processed", get_processed_images(), "images...") + if refresh: + # print("Refresh:", refresh) + subprocess.run(refresh) + + # schedule next stop + stop_after = get_processed_images() + stop_each + else: + print("No --refresh, exiting with code 2") + sys.exit(2) return p @@ -169,6 +186,7 @@ def save_stats(force=False): if time.time() > stats_next_write or force: stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S") stats['uptime'] = int(time.time() - started) + 
stats['processed_images'] = get_processed_images() with open(stats_file, "w") as fh: json.dump(stats, fh, indent=4) @@ -230,15 +248,20 @@ def sanity_check(args): def main(): global nude, video, verbose, all_found, stats_file, stats, logfile, \ - stop_after, detect_image, detect_url, page_image_minsize, page_extensions, \ - page_mintotal + stop_after, stop_each, detect_image, detect_url, page_image_minsize, page_extensions, \ + page_mintotal, refresh words = None args = get_args() + if args.resume: + print("Resume from", args.resume) + with open(args.resume) as fh: + stats = json.load(fh) + cmd = stats['cmd'] + args = get_args(shlex.split(cmd)[1:]) sanity_check(args) - nude = args.nude video = args.video verbose = args.verbose @@ -246,6 +269,8 @@ def main(): matched_resume = False skipped_words = 0 stop_after = args.stop + stop_each = args.stop + refresh = args.refresh detect_url = args.detect_url detect_image = args.detect_image diff --git a/bin/refresh-nsfw-api.sh b/bin/refresh-nsfw-api.sh new file mode 100755 index 0000000..2e478ef --- /dev/null +++ b/bin/refresh-nsfw-api.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +IMAGE=ghcr.io/arnidan/nsfw-api:latest +NAME=nsfw-api + +echo stop current container.... +sudo docker stop $NAME + +echo start new container.... +sudo docker run --rm --name $NAME -d -p 3000:3000 $IMAGE diff --git a/setup.py b/setup.py index 567903c..338ce4a 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,11 @@ def get_version(path): scripts=[ 'bin/nudecrawler', 'bin/detect-image-aid.py', - 'bin/detect-url-nsfw-api.py'], + 'bin/detect-image-nsfw-api.py', + 'bin/detect-image-nudenet.py', + 'bin/detect-server-nudenet.py', + 'bin/refresh-nsfw-api.py' + ], install_requires=['bs4', 'requests', 'pillow', 'nudepy', 'transliterate'],