diff --git a/.env.dist b/.env.dist index de5dd88..5b5f98f 100644 --- a/.env.dist +++ b/.env.dist @@ -33,3 +33,6 @@ HUBGREP_OLD_RUN_AGE=3600 # max retries per crawler block before we ignore it HUBGREP_BLOCK_MAX_RETRIES=3 + +# keep the data of the last finished hoster crawl in the db (1) instead of dropping its table after the export (0); needed for manual csv export +HUBGREP_KEEP_LAST_RUN_IN_DB=1 diff --git a/hubgrep_indexer/cli_blueprint/repos.py b/hubgrep_indexer/cli_blueprint/repos.py index 56f5222..c37e9b3 100644 --- a/hubgrep_indexer/cli_blueprint/repos.py +++ b/hubgrep_indexer/cli_blueprint/repos.py @@ -10,8 +10,9 @@ @cli_bp.cli.command() @click.argument("hosting_service") def export_repos(hosting_service): + hosting_service_api_url = hosting_service hosting_service: HostingService = HostingService.query.filter_by( - api_url=hosting_service + api_url=hosting_service_api_url ).first() hosting_service.export_repos() diff --git a/hubgrep_indexer/config/dotenv.py b/hubgrep_indexer/config/dotenv.py index 6343c7c..79f2270 100644 --- a/hubgrep_indexer/config/dotenv.py +++ b/hubgrep_indexer/config/dotenv.py @@ -19,3 +19,4 @@ class DotEnvConfig(Config): LOGLEVEL = os.environ.get("HUBGREP_INDEXER_LOGLEVEL", "debug") BLOCK_MAX_RETRIES = int(os.environ.get("HUBGREP_BLOCK_MAX_RETRIES", 3)) + KEEP_LAST_RUN_IN_DB = bool(int(os.environ.get("HUBGREP_KEEP_LAST_RUN_IN_DB", 1))) diff --git a/hubgrep_indexer/config/testing.py b/hubgrep_indexer/config/testing.py index 845a853..cfab9a3 100644 --- a/hubgrep_indexer/config/testing.py +++ b/hubgrep_indexer/config/testing.py @@ -19,3 +19,4 @@ class TestingConfig(Config): LOGIN_DISABLED = True BLOCK_MAX_RETRIES = 3 + KEEP_LAST_RUN_IN_DB = 1 diff --git a/hubgrep_indexer/models/hosting_service.py b/hubgrep_indexer/models/hosting_service.py index 25a768e..a6e5a47 100644 --- a/hubgrep_indexer/models/hosting_service.py +++ b/hubgrep_indexer/models/hosting_service.py @@ -16,6 +16,7 @@ from hubgrep_indexer import db from hubgrep_indexer.models.export_meta import ExportMeta from 
hubgrep_indexer.models.repositories.abstract_repository import Repository +from hubgrep_indexer.lib.table_helper import TableHelper logger = logging.getLogger(__name__) @@ -161,6 +162,14 @@ def handle_finished_run(self): repo_class.rotate(self) logger.debug(f"rotated repos for {self} - took {ts_rotate_start - time.time()}s") self.export_repos() + logger.debug(f"export for {self} finished") + if not current_app.config['KEEP_LAST_RUN_IN_DB']: + logger.debug(f"dropping table for exported {self}") + + target_table = Repository.get_finished_table_name(self) + with TableHelper._cursor() as cur: + TableHelper.drop_table(cur, target_table) + @property def repos(self) -> ResultProxy: