Skip to content

Commit

Permalink
[DH-5317] Add click house hyperloglog support (#426)
Browse files Browse the repository at this point in the history
  • Loading branch information
jcjc712 authored Mar 13, 2024
1 parent 29c96f1 commit 61a92c9
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
36 changes: 36 additions & 0 deletions dataherald/db_scanner/services/click_house_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sqlalchemy
from overrides import override
from sqlalchemy.sql import func
from sqlalchemy.sql.schema import Column

from dataherald.db_scanner.models.types import QueryHistory
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner
from dataherald.sql_database.base import SQLDatabase

# Bounds applied to a column's estimated distinct-value count in
# ClickHouseScanner.cardinality_values: the estimate must be strictly greater
# than MIN_CATEGORY_VALUE and at most MAX_CATEGORY_VALUE.
MIN_CATEGORY_VALUE = 1
MAX_CATEGORY_VALUE = 100
# NOTE(review): not referenced by the code visible in this file; presumably a
# cap on the number of query-log rows to fetch — confirm before relying on it.
MAX_LOGS = 5_000


class ClickHouseScanner(AbstractScanner):
    """Scanner service with ClickHouse-specific cardinality and log handling."""

    @override
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return the distinct values of ``column`` when it looks categorical.

        The number of distinct values is first estimated with ClickHouse's
        ``uniqHLL12`` aggregate. Only when that estimate is strictly greater
        than ``MIN_CATEGORY_VALUE`` and at most ``MAX_CATEGORY_VALUE`` are the
        actual distinct values fetched.

        Args:
            column: The column whose values are inspected.
            db_engine: Database wrapper whose ``engine`` executes the queries.

        Returns:
            The distinct values converted to ``str`` (at most 101 of them), or
            ``None`` when the estimate is missing or outside the bounds.
        """
        estimate_stmt = sqlalchemy.select([func.uniqHLL12(column)])
        estimate_rows = db_engine.engine.execute(estimate_stmt).fetchall()

        # No result row (or an empty row) means there is nothing to judge by.
        if len(estimate_rows) == 0 or len(estimate_rows[0]) == 0:
            return None

        estimate = estimate_rows[0][0]
        if not (MIN_CATEGORY_VALUE < estimate <= MAX_CATEGORY_VALUE):
            return None

        distinct_stmt = sqlalchemy.select([func.distinct(column)]).limit(101)
        distinct_rows = db_engine.engine.execute(distinct_stmt).fetchall()
        return [str(row[0]) for row in distinct_rows]

    @override
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str  # noqa: ARG002
    ) -> list[QueryHistory]:
        """Return the query history for ``table``.

        This scanner does not collect any query history: the arguments are
        ignored and an empty list is always returned.
        """
        return []
2 changes: 2 additions & 0 deletions dataherald/db_scanner/sqlalchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner
from dataherald.db_scanner.services.base_scanner import BaseScanner
from dataherald.db_scanner.services.big_query_scanner import BigQueryScanner
from dataherald.db_scanner.services.click_house_scanner import ClickHouseScanner
from dataherald.db_scanner.services.postgre_sql_scanner import PostgreSqlScanner
from dataherald.db_scanner.services.snowflake_scanner import SnowflakeScanner
from dataherald.db_scanner.services.sql_server_scanner import SqlServerScanner
Expand Down Expand Up @@ -275,6 +276,7 @@ def scan(
"bigquery": BigQueryScanner,
"psycopg2": PostgreSqlScanner,
"pymssql": SqlServerScanner,
"http": ClickHouseScanner,
}
self.scanner_service = BaseScanner()
if db_engine.engine.driver in services.keys():
Expand Down

0 comments on commit 61a92c9

Please sign in to comment.