From 9a9f61b1792443a05fd76cf6f9a5e3defc093958 Mon Sep 17 00:00:00 2001 From: jan Date: Sun, 20 Feb 2022 18:20:53 +0100 Subject: [PATCH 1/2] add option to remove finished run from db after export --- .env.dist | 3 +++ hubgrep_indexer/cli_blueprint/repos.py | 3 ++- hubgrep_indexer/config/dotenv.py | 1 + hubgrep_indexer/config/testing.py | 1 + hubgrep_indexer/models/hosting_service.py | 9 +++++++++ 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.env.dist b/.env.dist index de5dd88..5b5f98f 100644 --- a/.env.dist +++ b/.env.dist @@ -33,3 +33,6 @@ HUBGREP_OLD_RUN_AGE=3600 # max retries per crawler block before we ignore it HUBGREP_BLOCK_MAX_RETRIES=3 + +# keep the data of the last finished hoster crawl in db - needed for manual csv export +HUBGREP_KEEP_LAST_RUN_IN_DB=1 diff --git a/hubgrep_indexer/cli_blueprint/repos.py b/hubgrep_indexer/cli_blueprint/repos.py index 56f5222..c37e9b3 100644 --- a/hubgrep_indexer/cli_blueprint/repos.py +++ b/hubgrep_indexer/cli_blueprint/repos.py @@ -10,8 +10,9 @@ @cli_bp.cli.command() @click.argument("hosting_service") def export_repos(hosting_service): + hosting_service_api_url = hosting_service hosting_service: HostingService = HostingService.query.filter_by( - api_url=hosting_service + api_url=hosting_service_api_url ).first() hosting_service.export_repos() diff --git a/hubgrep_indexer/config/dotenv.py b/hubgrep_indexer/config/dotenv.py index 6343c7c..79f2270 100644 --- a/hubgrep_indexer/config/dotenv.py +++ b/hubgrep_indexer/config/dotenv.py @@ -19,3 +19,4 @@ class DotEnvConfig(Config): LOGLEVEL = os.environ.get("HUBGREP_INDEXER_LOGLEVEL", "debug") BLOCK_MAX_RETRIES = int(os.environ.get("HUBGREP_BLOCK_MAX_RETRIES", 3)) + KEEP_LAST_RUN_IN_DB = bool(int(os.environ.get("HUBGREP_KEEP_LAST_RUN_IN_DB", 1))) diff --git a/hubgrep_indexer/config/testing.py b/hubgrep_indexer/config/testing.py index 845a853..cfab9a3 100644 --- a/hubgrep_indexer/config/testing.py +++ b/hubgrep_indexer/config/testing.py @@ -19,3 +19,4 @@ class TestingConfig(Config): LOGIN_DISABLED = True BLOCK_MAX_RETRIES = 3 + KEEP_LAST_RUN_IN_DB = 1 diff --git a/hubgrep_indexer/models/hosting_service.py b/hubgrep_indexer/models/hosting_service.py index 25a768e..a6e5a47 100644 --- a/hubgrep_indexer/models/hosting_service.py +++ b/hubgrep_indexer/models/hosting_service.py @@ -16,6 +16,7 @@ from hubgrep_indexer import db from hubgrep_indexer.models.export_meta import ExportMeta from hubgrep_indexer.models.repositories.abstract_repository import Repository +from hubgrep_indexer.lib.table_helper import TableHelper logger = logging.getLogger(__name__) @@ -161,6 +162,14 @@ def handle_finished_run(self): repo_class.rotate(self) logger.debug(f"rotated repos for {self} - took {ts_rotate_start - time.time()}s") self.export_repos() + logger.debug(f"export for {self} finished") + if not current_app.config['KEEP_LAST_RUN_IN_DB']: + logger.debug(f"dropping table for exported {self}") + + target_table = Repository.get_finished_table_name(self) + with TableHelper._cursor() as cur: + TableHelper.drop_table(cur, target_table) + @property def repos(self) -> ResultProxy: From e821b001a1cc794b535f81a361b3fab110d24f73 Mon Sep 17 00:00:00 2001 From: jan Date: Sun, 20 Feb 2022 18:41:41 +0100 Subject: [PATCH 2/2] add cli command to remove finished export manually --- hubgrep_indexer/cli_blueprint/repos.py | 16 ++++++++++++++++ hubgrep_indexer/models/hosting_service.py | 10 ++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/hubgrep_indexer/cli_blueprint/repos.py b/hubgrep_indexer/cli_blueprint/repos.py index c37e9b3..cdf32e7 100644 --- a/hubgrep_indexer/cli_blueprint/repos.py +++ b/hubgrep_indexer/cli_blueprint/repos.py @@ -58,3 +58,19 @@ def prune_exports(keep, hosting_service=None): for export in old_exports_unified: print(f"deleting export {export}") export.delete_file() + + +@cli_bp.cli.command(help="drop a hosting services 'finished' table") +@click.argument("hosting_service") +def drop_finished_run_table(hosting_service): + hosting_service_api_url = hosting_service + hosting_service: HostingService = HostingService.query.filter_by( + api_url=hosting_service_api_url + ).first() + if not hosting_service: + print('could not find hosting service!') + exit(1) + + print(f"found {hosting_service}, dropping finished table") + hosting_service.drop_finished_run_table() + diff --git a/hubgrep_indexer/models/hosting_service.py b/hubgrep_indexer/models/hosting_service.py index a6e5a47..6550abf 100644 --- a/hubgrep_indexer/models/hosting_service.py +++ b/hubgrep_indexer/models/hosting_service.py @@ -164,12 +164,14 @@ def handle_finished_run(self): self.export_repos() logger.debug(f"export for {self} finished") if not current_app.config['KEEP_LAST_RUN_IN_DB']: - logger.debug(f"dropping table for exported {self}") + self.drop_finished_run_table() - target_table = Repository.get_finished_table_name(self) - with TableHelper._cursor() as cur: - TableHelper.drop_table(cur, target_table) + def drop_finished_run_table(self): + logger.debug(f"dropping table for exported {self}") + target_table = Repository.get_finished_table_name(self) + with TableHelper._cursor() as cur: + TableHelper.drop_table(cur, target_table) @property def repos(self) -> ResultProxy: