diff --git a/.env.dist b/.env.dist
index de5dd88..5b5f98f 100644
--- a/.env.dist
+++ b/.env.dist
@@ -33,3 +33,6 @@ HUBGREP_OLD_RUN_AGE=3600
 
 # max retries per crawler block before we ignore it
 HUBGREP_BLOCK_MAX_RETRIES=3
+
+# keep the data of the last finished hoster crawl in db - needed for manual csv export (0: drop it after the export)
+HUBGREP_KEEP_LAST_RUN_IN_DB=1
diff --git a/hubgrep_indexer/cli_blueprint/repos.py b/hubgrep_indexer/cli_blueprint/repos.py
index 56f5222..cdf32e7 100644
--- a/hubgrep_indexer/cli_blueprint/repos.py
+++ b/hubgrep_indexer/cli_blueprint/repos.py
@@ -10,8 +10,12 @@
 @cli_bp.cli.command()
 @click.argument("hosting_service")
 def export_repos(hosting_service):
+    hosting_service_api_url = hosting_service
     hosting_service: HostingService = HostingService.query.filter_by(
-        api_url=hosting_service
+        api_url=hosting_service_api_url
     ).first()
+    if not hosting_service:
+        print('could not find hosting service!')
+        raise SystemExit(1)
     hosting_service.export_repos()
 
@@ -57,3 +61,19 @@ def prune_exports(keep, hosting_service=None):
     for export in old_exports_unified:
         print(f"deleting export {export}")
         export.delete_file()
+
+
+@cli_bp.cli.command(help="drop a hosting services 'finished' table")
+@click.argument("hosting_service")
+def drop_finished_run_table(hosting_service):
+    hosting_service_api_url = hosting_service
+    hosting_service: HostingService = HostingService.query.filter_by(
+        api_url=hosting_service_api_url
+    ).first()
+    if not hosting_service:
+        print('could not find hosting service!')
+        raise SystemExit(1)
+
+    print(f"found {hosting_service}, dropping finished table")
+    hosting_service.drop_finished_run_table()
+
diff --git a/hubgrep_indexer/config/dotenv.py b/hubgrep_indexer/config/dotenv.py
index 6343c7c..79f2270 100644
--- a/hubgrep_indexer/config/dotenv.py
+++ b/hubgrep_indexer/config/dotenv.py
@@ -19,3 +19,4 @@ class DotEnvConfig(Config):
     LOGLEVEL = os.environ.get("HUBGREP_INDEXER_LOGLEVEL", "debug")
 
     BLOCK_MAX_RETRIES = int(os.environ.get("HUBGREP_BLOCK_MAX_RETRIES", 3))
+    KEEP_LAST_RUN_IN_DB = bool(int(os.environ.get("HUBGREP_KEEP_LAST_RUN_IN_DB", 1)))
diff --git a/hubgrep_indexer/config/testing.py b/hubgrep_indexer/config/testing.py
index 845a853..cfab9a3 100644
--- a/hubgrep_indexer/config/testing.py
+++ b/hubgrep_indexer/config/testing.py
@@ -19,3 +19,4 @@ class TestingConfig(Config):
     LOGIN_DISABLED = True
 
     BLOCK_MAX_RETRIES = 3
+    KEEP_LAST_RUN_IN_DB = True
diff --git a/hubgrep_indexer/models/hosting_service.py b/hubgrep_indexer/models/hosting_service.py
index 25a768e..6550abf 100644
--- a/hubgrep_indexer/models/hosting_service.py
+++ b/hubgrep_indexer/models/hosting_service.py
@@ -16,6 +16,7 @@
 from hubgrep_indexer import db
 from hubgrep_indexer.models.export_meta import ExportMeta
 from hubgrep_indexer.models.repositories.abstract_repository import Repository
+from hubgrep_indexer.lib.table_helper import TableHelper
 
 logger = logging.getLogger(__name__)
 
@@ -161,6 +162,18 @@ def handle_finished_run(self):
         repo_class.rotate(self)
-        logger.debug(f"rotated repos for {self} - took {ts_rotate_start - time.time()}s")
+        logger.debug(f"rotated repos for {self} - took {time.time() - ts_rotate_start}s")
         self.export_repos()
+        logger.debug(f"export for {self} finished")
+        # the finished table is only kept for manual csv exports; drop it unless configured to keep it
+        if not current_app.config['KEEP_LAST_RUN_IN_DB']:
+            self.drop_finished_run_table()
+
+    def drop_finished_run_table(self):
+        """Drop the table named by Repository.get_finished_table_name(self)."""
+        logger.debug(f"dropping table for exported {self}")
+
+        target_table = Repository.get_finished_table_name(self)
+        with TableHelper._cursor() as cur:
+            TableHelper.drop_table(cur, target_table)
 
     @property
     def repos(self) -> ResultProxy: