--resume
yaroslaff committed Mar 27, 2023
1 parent 0987449 commit 32a26b0
Showing 5 changed files with 70 additions and 11 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -157,7 +157,16 @@ This list (~300Kb, 11k urls) created from 1.5M words russian wordlist. There are

Now you can use this file as a wordlist (nudecrawler will detect that it's already a base URL, and will only append the date to the URL).

## Example usage:
### Stop/Resume
When working with wordlists and a `--stats` file, the current status is periodically saved to that file. When started again, if the stats file exists, nudecrawler will continue from the last recorded point. To start from the beginning, delete the stats file or use a different `--stats` filename.
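
For example, a stop/resume cycle might look like this (paths illustrative):
~~~
# first run, interrupted at some point (Ctrl-C, crash, reboot)
bin/nudecrawler -w urls.txt --stats .local/mystats.json

# continue from the last recorded point
bin/nudecrawler --resume .local/mystats.json
~~~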

### Example usage:
Check one page (using built-in :nude filter):
~~~
bin/nudecrawler -v --url1 https://telegra.ph/your-page-address
~~~

Crawl pages generated from a wordlist, saving periodic stats and a log:
~~~
bin/nudecrawler -w urls.txt --nude 5 -d 30 -f 5 --stats .local/mystats.json --log .local/nudecrawler.log
~~~
11 changes: 11 additions & 0 deletions bin/detect-image-nsfw-api.py
@@ -31,6 +31,17 @@ def detect_image(path, address, thresholds, verbose=False):
            'image': open(path, 'rb')
        }
        r = requests.post(req_url, files=files)
        if r.status_code == 500:
            if verbose:
                print(r.text)
            return 0

        if r.status_code != 200:
            print(os.getenv('NUDECRAWLER_PAGE_URL'))
            print(os.getenv('NUDECRAWLER_IMAGE_URL'))
            print(r.text)
            print(r.json())

    except requests.Timeout as e:
        # timeout: not interesting image
        print("TIMEOUT")
43 changes: 34 additions & 9 deletions bin/nudecrawler
@@ -6,6 +6,8 @@ import json
import time
import sys
import os
import shlex
import subprocess
import nudecrawler
from nudecrawler import Page, Unbuffered
from nudecrawler.page import get_processed_images
@@ -23,6 +25,7 @@ stats = {
    'word': None,
    'url': None,
    'now': None,
    'processed_images': 0,
    'found_interesting_pages': 0,
    'found_nude_images': 0,
    'resume': dict()
@@ -37,6 +40,8 @@ started = time.time()

logfile = None
stop_after = None
stop_each = None
refresh = None
detect_image = None
detect_url = None

@@ -90,7 +95,7 @@ video = 1
verbose = False
all_found = True

def get_args():
def get_args(argv=None):
    parser = argparse.ArgumentParser(description='Telegra.ph Spider')

    def_total =5
@@ -104,7 +109,6 @@ def get_args():
    parser.add_argument('--url1', metavar="URL", help='process only one url')
    parser.add_argument('-f', '--fails', type=int, default=0, help='stop searching next pages with same words after N failures')
    parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 'DAY'), help='Current date (default is today) example: --day 12 31')
    parser.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop after N images processed (or little after)')


    g = parser.add_argument_group('Image filtering options')
@@ -123,15 +127,20 @@

    g = parser.add_argument_group('list-related options')
    g.add_argument('-w', '--wordlist', help='wordlist (urllist) file')
    g.add_argument('--stats', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file')
    g.add_argument('--stats', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file')
    g.add_argument('--resume', metavar='STATS_FILE', default='/tmp/nudecrawler-stats.txt', help='resume from STATS_FILE (other args are not needed)')
    g.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop (or --refresh) after N images processed (or little after)')
    g.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+', help='run this refresh script every --stop NUM_IMAGES images')

    return parser.parse_args()
    return parser.parse_args(argv)




def analyse(url):

    global stop_after

    p = Page(url, neednnudes=nude, neednvideo=video, all_found=all_found,
             detect_url=detect_url, detect_image=detect_image)

@@ -154,8 +163,16 @@

    save_stats(force=True)
    if stop_after is not None and get_processed_images() > stop_after:
        print("Stopping after processed", get_processed_images(), "images")
        sys.exit(2)
        print("Stop/refresh after processed", get_processed_images(), "images...")
        if refresh:
            # print("Refresh:", refresh)
            subprocess.run(refresh)

            # schedule next stop
            stop_after = get_processed_images() + stop_each
        else:
            print("No --refresh, exiting with code 2")
            sys.exit(2)

    return p

@@ -169,6 +186,7 @@ def save_stats(force=False):
    if time.time() > stats_next_write or force:
        stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        stats['uptime'] = int(time.time() - started)
        stats['processed_images'] = get_processed_images()

        with open(stats_file, "w") as fh:
            json.dump(stats, fh, indent=4)
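
For reference, a stats file produced by this function might look roughly like this (all values illustrative; 'cmd' is presumably stored where the stats dict is first filled, outside the lines shown here):
~~~
{
    "cmd": "bin/nudecrawler -w urls.txt --stats .local/mystats.json",
    "word": "example",
    "url": "https://telegra.ph/example-03-27",
    "now": "03/27/2023 12:34:56",
    "uptime": 3600,
    "processed_images": 1234,
    "found_interesting_pages": 5,
    "found_nude_images": 17,
    "resume": {}
}
~~~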
@@ -230,22 +248,29 @@ def sanity_check(args):

def main():
    global nude, video, verbose, all_found, stats_file, stats, logfile, \
        stop_after, detect_image, detect_url, page_image_minsize, page_extensions, \
        page_mintotal
        stop_after, stop_each, detect_image, detect_url, page_image_minsize, page_extensions, \
        page_mintotal, refresh

    words = None
    args = get_args()
    if args.resume:
        print("Resume from", args.resume)
        with open(args.resume) as fh:
            stats = json.load(fh)
        cmd = stats['cmd']
        args = get_args(shlex.split(cmd)[1:])

    sanity_check(args)


    nude = args.nude
    video = args.video
    verbose = args.verbose
    all_found = args.all
    matched_resume = False
    skipped_words = 0
    stop_after = args.stop
    stop_each = args.stop
    refresh = args.refresh
    detect_url = args.detect_url
    detect_image = args.detect_image
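
To spell out the resume path above: the stats file is expected to carry the original command line under 'cmd', and that string is simply re-parsed, so every option from the first run is restored. A minimal sketch of the round trip, with an illustrative command line:
~~~
import json
import shlex

with open("/tmp/nudecrawler-stats.txt") as fh:   # the default --stats path
    stats = json.load(fh)

# e.g. stats["cmd"] == "bin/nudecrawler -w urls.txt --nude 5"
argv = shlex.split(stats["cmd"])[1:]   # drop argv[0], keep only the options
args = get_args(argv)                  # re-parse exactly as on the first run
~~~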

10 changes: 10 additions & 0 deletions bin/refresh-nsfw-api.sh
@@ -0,0 +1,10 @@
#!/bin/bash

IMAGE=ghcr.io/arnidan/nsfw-api:latest
NAME=nsfw-api

echo stop current container....
sudo docker stop $NAME

echo start new container....
sudo docker run --rm --name $NAME -d -p 3000:3000 $IMAGE
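
Together with the new --stop and --refresh options, this script lets a long crawl periodically restart the classifier container instead of exiting. A usage sketch (wordlist name and interval illustrative):
~~~
bin/nudecrawler -w urls.txt --stop 1000 --refresh bin/refresh-nsfw-api.sh
~~~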
6 changes: 5 additions & 1 deletion setup.py
@@ -20,7 +20,11 @@ def get_version(path):
    scripts=[
        'bin/nudecrawler',
        'bin/detect-image-aid.py',
        'bin/detect-url-nsfw-api.py'],
        'bin/detect-image-nsfw-api.py',
        'bin/detect-image-nudenet.py',
        'bin/detect-server-nudenet.py',
        'bin/refresh-nsfw-api.sh'
    ],

    install_requires=['bs4', 'requests', 'pillow', 'nudepy', 'transliterate'],
