From 9de8d8513deed51b71f8f3e7fe09a43cbf8a5e28 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:27:07 +0000 Subject: [PATCH 001/157] Bump duckdb from 1.1.1 to 1.1.2 Bumps [duckdb](https://github.com/duckdb/duckdb) from 1.1.1 to 1.1.2. - [Release notes](https://github.com/duckdb/duckdb/releases) - [Changelog](https://github.com/duckdb/duckdb/blob/main/tools/release-pip.py) - [Commits](https://github.com/duckdb/duckdb/compare/v1.1.1...v1.1.2) --- updated-dependencies: - dependency-name: duckdb dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index 82c467fd9..fd0d91aac 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -18,7 +18,7 @@ firestore sqlalchemy pymysql psycopg2-binary -duckdb==1.1.1 # 1040 +duckdb==1.1.2 # 1040 duckdb-engine==0.13.2 # 1040 setuptools_rust \ No newline at end of file From eb40aae20754d24159098b9c0baaa477bb1c07bb Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 22 Oct 2024 15:51:27 +0100 Subject: [PATCH 002/157] #2073 --- opteryx/compiled/structures/node.pyx | 6 ++ opteryx/functions/other_functions.py | 52 ++++++++--- .../managers/expression/binary_operators.py | 25 +++-- opteryx/managers/expression/formatter.py | 4 +- opteryx/managers/expression/ops.py | 23 ++++- .../planner/cost_based_optimizer/__init__.py | 1 + .../strategies/__init__.py | 2 + .../strategies/limit_pushdown.py | 63 +++++++++++++ .../strategies/operator_fusion.py | 5 + .../strategies/optimization_strategy.py | 5 +- .../strategies/predicate_pushdown.py | 6 +- .../strategies/predicate_rewriter.py | 5 +- .../strategies/projection_pushdown.py | 16 +++- .../strategies/redundant_operators.py | 5 + .../split_conjunctive_predicates.py | 7 ++ opteryx/planner/sql_rewriter.py | 86 ++++++++++-------- testdata/flat/nvd/nvd.parquet | Bin 0 -> 14615 bytes .../test_optimizations_invoked.py | 3 +- .../test_temporal_extraction.py | 10 ++ tests/sql_battery/test_battery_sql92.py | 6 +- .../test_shapes_and_errors_battery.py | 34 +++++++ 21 files changed, 297 insertions(+), 67 deletions(-) create mode 100644 opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py create mode 100644 testdata/flat/nvd/nvd.parquet diff --git a/opteryx/compiled/structures/node.pyx b/opteryx/compiled/structures/node.pyx index 962835e85..80963275a 100644 --- a/opteryx/compiled/structures/node.pyx +++ b/opteryx/compiled/structures/node.pyx @@ -1,4 +1,10 @@ # cython: language_level=3 +# cython: nonecheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: infer_types=True +# cython: wraparound=True +# cython: boundscheck=False """ Node Module diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index 27b5720d1..b8145b13e 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -15,6 +15,7 @@ import numpy import pyarrow +import simdjson from pyarrow import compute from opteryx.exceptions import SqlError @@ -196,17 +197,46 @@ def cosine_similarity( return similarities -def jsonb_object_keys(arr): +def jsonb_object_keys(arr: numpy.ndarray): + """ + Extract the keys from a NumPy array of JSON objects or JSON strings/bytes. + + Parameters: + arr: numpy.ndarray + A NumPy array of dictionaries or JSON-encoded strings/bytes. 
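+
+    Example (illustrative sketch; output shown as a comment):
+        jsonb_object_keys(numpy.array([b'{"a": 1, "b": 2}'], dtype=object))
+        # -> [['a', 'b']] (the keys for each input row)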
+ + Returns: + pyarrow.Array + A PyArrow Array containing lists of keys for each input element. + """ + # Early exit for empty input if len(arr) == 0: - return [] - result = [] - if isinstance(arr[0], dict): - result = [[str(key) for key in row] for row in arr] - if isinstance(arr[0], (str, bytes)): - import simdjson + return numpy.array([]) + + # we may get pyarrow arrays here - usually not though + if isinstance(arr, pyarrow.Array): + arr = arr.to_numpy(zero_copy_only=False) + + # Determine type based on dtype of the array + if not numpy.issubdtype(arr.dtype, numpy.object_): + raise ValueError( + "Unsupported array dtype. Expected object dtype for dicts or strings/bytes." + ) - def keys(doc): - return simdjson.Parser().parse(doc).keys() # type:ignore + # Pre-create the result array as a NumPy boolean array set to False + result = numpy.empty(arr.shape, dtype=list) + + if isinstance(arr[0], dict): + # Process dictionaries + for i, row in enumerate(arr): + result[i] = [str(key) for key in row.keys()] + elif isinstance(arr[0], (str, bytes)): + # SIMD-JSON parser instance for JSON string/bytes + parser = simdjson.Parser() + for i, row in enumerate(arr): + result[i] = [str(key) for key in parser.parse(row).keys()] + else: + raise ValueError("Unsupported dtype for array elements. Expected dict, str, or bytes.") - result = [[str(key) for key in keys(row)] for row in arr] - return pyarrow.array(result) + # Return the result as a PyArrow array + return result diff --git a/opteryx/managers/expression/binary_operators.py b/opteryx/managers/expression/binary_operators.py index 1e97ad667..95098e039 100644 --- a/opteryx/managers/expression/binary_operators.py +++ b/opteryx/managers/expression/binary_operators.py @@ -18,33 +18,41 @@ import numpy import pyarrow +import simdjson from orso.types import OrsoTypes from pyarrow import compute from opteryx.compiled import list_ops +# Initialize simdjson parser once +parser = simdjson.Parser() + def ArrowOp(documents, elements) -> pyarrow.Array: """JSON Selector""" element = elements[0] - # if it's dicts, extract the value from the dict + # Fast path: if the documents are dicts, delegate to the cython optimized op if len(documents) > 0 and isinstance(documents[0], dict): return list_ops.cython_arrow_op(documents, element) - # if it's a string, parse and extract, we don't need a dict (dicts are s_l_o_w) - # so we can use a library which allows us to access the values directly - import simdjson + if hasattr(documents, "to_numpy"): + documents = documents.to_numpy(zero_copy_only=False) + # Function to extract value from a document def extract(doc: bytes, elem: Union[bytes, str]) -> Any: - value = simdjson.Parser().parse(doc).get(elem) # type:ignore + value = parser.parse(doc).get(elem) # type:ignore if hasattr(value, "as_list"): return value.as_list() if hasattr(value, "as_dict"): - return value.as_dict() + return value.mini return value - return pyarrow.array([None if d is None else extract(d, element) for d in documents]) + # Use a generator expression to lazily evaluate the extraction + extracted_values = (None if d is None else extract(d, element) for d in documents) + + # Return the result as a PyArrow array + return pyarrow.array(extracted_values) def LongArrowOp(documents, elements) -> pyarrow.Array: @@ -54,6 +62,9 @@ def LongArrowOp(documents, elements) -> pyarrow.Array: if len(documents) > 0 and isinstance(documents[0], dict): return list_ops.cython_long_arrow_op(documents, element) + if hasattr(documents, "to_numpy"): + documents = 
documents.to_numpy(zero_copy_only=False) + import simdjson def extract(doc: bytes, elem: Union[bytes, str]) -> bytes: diff --git a/opteryx/managers/expression/formatter.py b/opteryx/managers/expression/formatter.py index af7b3f1ba..fa1e644dd 100644 --- a/opteryx/managers/expression/formatter.py +++ b/opteryx/managers/expression/formatter.py @@ -101,8 +101,6 @@ def format_expression(root, qualify: bool = False): "ShiftRight": ">>", "Arrow": "->", "LongArrow": "->>", - "AtQuestion": "@?", - "AtArrow": "@>", } return f"{format_expression(root.left, qualify)} {_map.get(root.value, root.value).upper()} {format_expression(root.right, qualify)}" if node_type == NodeType.EXPRESSION_LIST: @@ -116,6 +114,8 @@ def format_expression(root, qualify: bool = False): "BitwiseOr": "|", "LtEq": "<=", "GtEq": ">=", + "AtQuestion": "@?", + "AtArrow": "@>", } return f"{format_expression(root.left, qualify)} {_map.get(root.value, root.value).upper()} {format_expression(root.right, qualify)}" if node_type == NodeType.UNARY_OPERATOR: diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index e6f59e75b..474a961fd 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -178,9 +178,28 @@ def _inner_filter_operations(arr, operator, value): import simdjson - # Don't warn on rule SIM118, the object isn't actually a dictionary + parser = simdjson.Parser() + + if not element.startswith("$."): + # Don't warn on rule SIM118, the object isn't actually a dictionary + return pyarrow.array( + [element in parser.parse(doc).keys() for doc in arr], + type=pyarrow.bool_(), # type:ignore + ) + + _keys = element[2:].split(".") + + def json_path_extract(current_value, keys): + for key in keys: + if key not in current_value: + return False # Key doesn't exist + + # Proceed to the next level of the JSON object + current_value = current_value[key] + return True # Key exists if traversal succeeds + return pyarrow.array( - [element in simdjson.Parser().parse(doc).keys() for doc in arr], + [json_path_extract(parser.parse(doc), _keys) for doc in arr], type=pyarrow.bool_(), # type:ignore ) diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 2cdf7fb25..8d24c37c6 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -88,6 +88,7 @@ def __init__(self, statistics: QueryStatistics): ProjectionPushdownStrategy(statistics), DistinctPushdownStrategy(statistics), OperatorFusionStrategy(statistics), + LimitPushdownStrategy(statistics), RedundantOperationsStrategy(statistics), ConstantFoldingStrategy(statistics), ] diff --git a/opteryx/planner/cost_based_optimizer/strategies/__init__.py b/opteryx/planner/cost_based_optimizer/strategies/__init__.py index 6820472f7..6fc7973a6 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/__init__.py +++ b/opteryx/planner/cost_based_optimizer/strategies/__init__.py @@ -1,6 +1,7 @@ from .boolean_simplication import BooleanSimplificationStrategy from .constant_folding import ConstantFoldingStrategy from .distinct_pushdown import DistinctPushdownStrategy +from .limit_pushdown import LimitPushdownStrategy from .operator_fusion import OperatorFusionStrategy from .predicate_pushdown import PredicatePushdownStrategy from .predicate_rewriter import PredicateRewriteStrategy @@ -12,6 +13,7 @@ "BooleanSimplificationStrategy", "ConstantFoldingStrategy", "DistinctPushdownStrategy", + "LimitPushdownStrategy", 
"OperatorFusionStrategy", "PredicatePushdownStrategy", "PredicateRewriteStrategy", diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py new file mode 100644 index 000000000..86105d750 --- /dev/null +++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py @@ -0,0 +1,63 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Optimization Rule - Limit Pushdown + +Type: Heuristic +Goal: Reduce Rows + +We try to push the limit to the other side of PROJECTS +""" + +from opteryx.planner.logical_planner import LogicalPlan +from opteryx.planner.logical_planner import LogicalPlanNode +from opteryx.planner.logical_planner import LogicalPlanStepType + +from .optimization_strategy import OptimizationStrategy +from .optimization_strategy import OptimizerContext + + +class LimitPushdownStrategy(OptimizationStrategy): + def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext: + if not context.optimized_plan: + context.optimized_plan = context.pre_optimized_tree.copy() # type: ignore + + if node.node_type == LogicalPlanStepType.Limit: + node.nid = context.node_id + context.collected_limits.append(node) + return context + + if node.node_type in ( + LogicalPlanStepType.Join, + LogicalPlanStepType.Scan, + LogicalPlanStepType.AggregateAndGroup, + LogicalPlanStepType.Aggregate, + LogicalPlanStepType.Subquery, + LogicalPlanStepType.Union, + LogicalPlanStepType.Filter, + ): + # we don't push past here + for limit_node in context.collected_limits: + self.statistics.optimization_limit_pushdown += 1 + context.optimized_plan.remove_node(limit_node.nid, heal=True) + context.optimized_plan.insert_node_after( + limit_node.nid, limit_node, context.node_id + ) + limit_node.columns = [] + context.collected_limits.clear() + + return context + + def complete(self, plan: LogicalPlan, context: OptimizerContext) -> LogicalPlan: + # No finalization needed for this strategy + return plan diff --git a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py index 1c3369547..77d8fba5c 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py +++ b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py @@ -11,6 +11,11 @@ # limitations under the License. """ +Optimization Rule - Operator Fusion + +Type: Heuristic +Goal: Chose more efficient physical implementations. + Some operators can be fused to be faster. 'Fused' opertors are when physical operations perform multiple logical operations. 
diff --git a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py index 9dd6daaea..ec4e3cbe1 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py +++ b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py @@ -34,7 +34,10 @@ def __init__(self, tree: LogicalPlan): """We collect column identities so we can push column selection as close to the read as possible, including off to remote systems""" self.collected_distincts: list = [] - """We collect distincts to try to eliminate records earlier""" + """We collect distincts to try to eliminate rows earlier""" + + self.collected_limits: list = [] + """We collect limits to to to eliminate rows earlier""" class OptimizationStrategy: diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py index f832c99bf..ddcaa1884 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py @@ -11,7 +11,10 @@ # limitations under the License. """ -PUSH DOWN +Optimization Rule - Predicate Pushdown + +Type: Heuristic +Goal: Filter rows as early as possible One main heuristic strategy is it eliminate rows to be processed as early as possible, to do that we try to push filter conditions to as close to the @@ -62,7 +65,6 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo ): # Handle predicates specific to node types context = self._handle_predicates(node, context) - self.statistics.optimization_predicate_pushdown += 1 context.optimized_plan.add_node(context.node_id, LogicalPlanNode(**node.properties)) if context.last_nid: context.optimized_plan.add_edge(context.node_id, context.last_nid) diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py index 1f0d5b6e5..6d64cfbf8 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py @@ -11,7 +11,10 @@ # limitations under the License. """ -PREDICATE REWRITER +Optimization Rule - Predicate rewriter + +Type: Heuristic +Goal: Chose more efficient predicate evaluations We rewrite some conditions to a more optimal form; for example if doing a LIKE comparison and the pattern contains no wildcards, we rewrite to be an diff --git a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py index 87fabeb7d..5f936cbc0 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py @@ -10,6 +10,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Optimization Rule - Projection Pushdown + +Type: Heuristic +Goal: Limit columns which need to be moved around + +We bind from the the scans, exposing the available columns to each operator +as we make our way to the top of the plan (usually the SELECT). The projection +pushdown is done as part of the optimizers, but isn't quite like the other +optimizations; this is collecting used column information as it goes from the +top of the plan down to the selects. 
The other optimizations tend to move or +remove operations, or update what a step does, this is just collecting and +updating the used columns. +""" + from typing import Set from opteryx.managers.expression import NodeType @@ -81,7 +96,6 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo node.columns = node_columns context.optimized_plan.add_node(context.node_id, LogicalPlanNode(**node.properties)) - self.statistics.optimization_projection_pushdown += 1 if context.parent_nid: context.optimized_plan.add_edge(context.node_id, context.parent_nid) diff --git a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py index 8a3df5c54..2c2bd205a 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py +++ b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py @@ -11,6 +11,11 @@ # limitations under the License. """ +Optimization Rule - Remove Redundant Operators + +Type: Heuristic +Goal: Remove steps which don't affect the result + This optimization runs toward the end of the set, it removes operators which were useful during planning and optimization. diff --git a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py index f670cdb02..777865bb8 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py +++ b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py @@ -10,6 +10,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Optimization Rule - Split Conjections + +Type: Heuristic +Goal: Break filters into units which are easier to handle +""" + from orso.tools import random_string from opteryx.managers.expression import NodeType diff --git a/opteryx/planner/sql_rewriter.py b/opteryx/planner/sql_rewriter.py index 9a173f5c6..f67371c47 100644 --- a/opteryx/planner/sql_rewriter.py +++ b/opteryx/planner/sql_rewriter.py @@ -70,7 +70,7 @@ from opteryx.exceptions import InvalidTemporalRangeFilterError from opteryx.utils import dates -COLLECT_RELATION = [ +COLLECT_RELATION = { r"FROM", r"INNER\sJOIN", r"CROSS\sJOIN", @@ -87,11 +87,11 @@ r"JOIN", r"CREATE\sTABLE", r"ANALYZE\sTABLE", -] +} -COLLECT_TEMPORAL = [r"FOR"] +COLLECT_TEMPORAL = {r"FOR"} -STOP_COLLECTING = [ +STOP_COLLECTING = { r"GROUP\sBY", r"HAVING", r"LIKE", @@ -107,32 +107,35 @@ r";", r",", r"UNION", -] +} -COLLECT_ALIAS = [r"AS"] +COLLECT_ALIAS = {r"AS"} -BOUNDARIES = [r"(", r")"] +BOUNDARIES = {r"(", r")"} + +FOR_DATE_CLAUSES = { + r"DATES\sIN\s\w+", + r"DATES\sBETWEEN\s[^\r\n\t\f\v]AND\s[^\r\n\t\f\v]", + r"DATES\sSINCE\s\w+", +} + +FUNCTIONS_WITH_FROM_SYNTAX = {"EXTRACT", "SUBSTRING", "TRIM"} SQL_PARTS = ( - COLLECT_RELATION - + COLLECT_TEMPORAL - + STOP_COLLECTING - + COLLECT_ALIAS - + [ - r"DATES\sIN\s\w+", - r"DATES\sBETWEEN\s[^\r\n\t\f\v]AND\s[^\r\n\t\f\v]", - r"DATES\sSINCE\s\w+", - ] + COLLECT_RELATION.union(COLLECT_TEMPORAL) + .union(STOP_COLLECTING) + .union(COLLECT_ALIAS) + .union(FOR_DATE_CLAUSES) ) COMBINE_WHITESPACE_REGEX = re.compile(r"\r\n\t\f\v+") # states for the collection algorithm WAITING: int = 1 -RELATION: int = 4 -TEMPORAL: int = 16 -ALIAS: int = 64 -FUNCTION_RELATION: int = 128 +RELATION: int = 2 +TEMPORAL: int = 4 +ALIAS: int = 8 +FUNCTION_RELATION: int = 16 def sql_parts(string): @@ -259,6 +262,9 @@ def 
_temporal_extration_state_machine( # # We're essentially using a bit mask to record state and transitions. + in_special_function = False + special_function_brackets = 0 + state = WAITING relation = "" temporal = "" @@ -270,29 +276,37 @@ def _temporal_extration_state_machine( transition = [state] comparable_part = part.upper().replace(" ", r"\s") + if comparable_part in FUNCTIONS_WITH_FROM_SYNTAX: + in_special_function = True + special_function_brackets = open_count + # work out what our current state is - if comparable_part in BOUNDARIES: + elif comparable_part in BOUNDARIES: + if comparable_part == "(": + open_count += 1 + if comparable_part == ")": + open_count -= 1 if relation == "": state = WAITING else: # function relations, like FAKE(234,234) need the items between the # brackets be be consumed state = FUNCTION_RELATION - if comparable_part == "(": - open_count += 1 - if comparable_part == ")": - open_count -= 1 - if comparable_part in STOP_COLLECTING: - if state == FUNCTION_RELATION and open_count > 0: - pass - else: - state = WAITING - if comparable_part in COLLECT_RELATION: - state = RELATION - if comparable_part in COLLECT_TEMPORAL: - state = TEMPORAL - if comparable_part in COLLECT_ALIAS: - state = ALIAS + elif in_special_function and open_count == special_function_brackets: + in_special_function = False + + if not in_special_function: + if comparable_part in STOP_COLLECTING: + if state == FUNCTION_RELATION and open_count > 0: + pass + else: + state = WAITING + if comparable_part in COLLECT_RELATION: + state = RELATION + if comparable_part in COLLECT_TEMPORAL: + state = TEMPORAL + if comparable_part in COLLECT_ALIAS: + state = ALIAS transition.append(state) # based on what the state was and what it is now, do something diff --git a/testdata/flat/nvd/nvd.parquet b/testdata/flat/nvd/nvd.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0429614e58101b70a28185b2036915bd6cfc5161 GIT binary patch literal 14615 zcmeHOdvF_de*Y<6y<%x2%eEpLEB#s6j-5!Whb383uoF3!iHYMlw&Mhn;H;#zw6U}+ zc2`n-ZiW~pNg)FlxNA~w<|w%oC~#$tfTy94gIviC&p^2;6gu2Zay*V}&j&poSKu`L z?)nkic^ri@?H|_sW*2xqK!biDPl~zWK z3v5TacT7ub2Wm8#|@q=}}8+rvkiDL|q#b6KR!E`#!7ep*D86HNWC5euY z07Y2h;m$owT*9;vgGZ%>QCwhR#_TD~3wXo8rf!^PX)LASrgSWq!C6|MfmMv)z=aeW zPhnQXIgv@I#b8Ukx8SV6X5`^nQOq%L1qbwDbz2r_ z#KXCC8itlqIKhhXG%x0W?gA^NRM%Ts4miOks`k5q3|fL`$TM_WRL5&rPLyCGx;Cxf zK(L%@O^Oi&7%jtyBAsL?`@UVUK3Rc}rI}3A(`bxZhC;_$e;9jEz6$%9^8|S%UHc7?#j4PfvpQ&<0bSVoCXimsua1527?P0Ir3ct zEU+`W#_G(bc!_tamSeBS(?X@BOgj8CWL>|xr?abXlkGKAFD=DWeUG8ZJhDA&@`OVE zmhd5@Ywzl?-Ril%C!lhOOgtxm7!C9BoW5v?u?d%cz{Z%oo}e!jW{|Z%x_)amxI-S#UAB%e?&d)ba3`B)a2FR( z=fcjKFKjz+x-I-TGIeh4?(XpBm$co5+#nrsisgoR*V=R|{R(o9zA(^`Y4v#g zEulcb7x1{_o41Nn=0p!^*O`N!LJ`}`%7r!;3y zLFXwAw7XAfOmsS%qQ~owKeQj1ziG6C6+mTp5N3~?@=*aya|z7QLYifS;~S9&Uu=3*4GB;_d{PAT}+6D#Dd9 zIOdoH9v1kFY7G+3Jo?TFOPfvyMx=PNCVaEz^cc|+iUc0rtC3f##UCCl>HId)cCE&n zE+N9MJ^KjM61n+bRv984gU<1Ble5_$@P_lqG`O`l+H=FpCBLaQ<#V90%u`l|OQu;d z<^45bfCY4^x0^t9_FX+0Dq%z8%ZeZBU;9vR=-<+fE#KA~fC5ZCLYUvy7ab*|4imE% z1c=lxwSTNvjn#m#W`G@{{kQt!%95$pwz32TXV2gux00}{CwFVk>_#oc-J0{eH8q=Z zX^G9IndrozyM$nw;}iGahv`HHY#RtDv(*g;jK|%A-Bb(Ze{QqgVszv{c{z!V(-M== zx74sA7%+*=n}1-ixQyRhQ&Q9DX~Joi8zGHGGG;J1aHCgmHh`8V$Tdcyu~~1j4#m^7 zC_2u(M>P7371X;1_47m{t*=JnImdIO!A5@*%ny>8x+%v9qi>v5y`yW4HIaixBpBW2 zI>^&TcVnOlCtwZJJe?R)y>Cc@Waw`RuR|za)ua+XwjYTGGvc1*YR*$V6(V{j?rAs8@{St;SW8#)bpN;xCN% z{=yg$$=xQke%(+uolAlpb;WW?U|xKrQeVr~tz{Ff&7P3IfsQ_AY86v_A?(!#J~6g^ zS6fV@RX&5x;}5hyZZP#lBmJBA{76e4F_6oQCV`JJ1)|l3%AQ8*eyaqyyKmdB){&`Fwu2 
[... remainder of base85 binary patch data for testdata/flat/nvd/nvd.parquet truncated ...]

diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py
 ("SELECT BP->'state' FROM (SELECT VARCHAR(birth_place) AS BP FROM $astronauts) AS I", 357, 1, None),
 ("SELECT BP->>'address' FROM (SELECT VARCHAR(birth_place) AS BP FROM $astronauts) AS I", 357, 1, None),
 ("SELECT dict->>'list', dict->'list' AS thisisalongercolumnname, STRUCT(dict)->'list', dict->>'once', dict->'once' FROM testdata.flat.struct", 6, 5, None),
+    ("SELECT cve -> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None),
+    ("SELECT cve ->> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None),
+    ("SELECT cve -> 'CVE_data_meta' -> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None),
 ("SELECT dict @? 'list' FROM testdata.flat.struct", 6, 1, None),
 ("SELECT struct(dict) @? 'list' FROM testdata.flat.struct", 6, 1, None),
 ("SELECT birth_place @? 'town' FROM $astronauts", 357, 1, None),
+    ("SELECT dict @? '$.list' FROM testdata.flat.struct", 6, 1, None),
+    ("SELECT cve @? '$.CVE_data_meta.ASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None),
+    ("SELECT cve @? '$.data_meta.ASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None),
+    ("SELECT cve @? 
'$.CVE_data_meta' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? 'CVE_data_meta' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? '$.CVE_data_meta.REASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT struct(dict) @? '$.list' FROM testdata.flat.struct", 6, 1, None), + ("SELECT birth_place @? '$.town' FROM $astronauts", 357, 1, None), ("SELECT birth_place['town'] FROM $astronauts", 357, 1, None), ("SELECT missions[0] FROM $astronauts", 357, 1, None), ("SELECT birth_place['town'] FROM $astronauts WHERE birth_place['town'] = 'Warsaw'", 1, 1, None), @@ -1287,6 +1298,20 @@ ("SELECT name FROM $planets WHERE SUBSTRING ( name, 2, 1 ) = 'a'", 3, 1, None), ("SELECT name FROM $planets WHERE SUBSTRING ( name, 3 ) = 'rth'", 1, 1, None), ("SELECT name FROM $planets WHERE SUBSTRING ( name, -1 ) = 's'", 3, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 1 FOR 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 2 FOR 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM -1 ) = 's'", 3, 1, None), + ("SELECT SUBSTRING ( name FROM 5 FOR 2 ) FROM $planets", 9, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 1, 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 2, 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, -1 ) = 's'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 1 FOR 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 2 FOR 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM -1 ) = 's'", 3, 1, None), + ("SELECT SUBSTRING ( name FROM 5 FOR 2 ) FROM $planets FOR TODAY ", 9, 1, None), ("SELECT TIMESTAMP '2022-01-02', DATEDIFF('days', TIMESTAMP '2022-01-02', TIMESTAMP '2022-10-01') FROM $astronauts;", 357, 2, None), ("SELECT * FROM $satellites WHERE NULLIF(planetId, 5) IS NULL", 67, 8, None), ("SELECT * FROM $satellites WHERE NULLIF(planetId, 5) IS NOT NULL", 110, 8, None), @@ -1350,6 +1375,8 @@ ("SELECT TRIM(LEADING 'E' FROM name) FROM $planets;", 9, 1, None), ("SELECT * FROM $planets WHERE TRIM(TRAILING 'arth' FROM name) = 'E'", 1, 20, None), ("SELECT * FROM $planets WHERE TRIM(TRAILING 'ahrt' FROM name) = 'E'", 1, 20, None), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets", 9, 1, None), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets FOR TODAY", 9, 1, None), ("SELECT user_name, user_verified FROM testdata.flat.formats.parquet WITH(NO_PARTITION) WHERE user_verified IS TRUE", 711, 2, None), ("SELECT user_name, user_verified FROM testdata.flat.formats.parquet WITH(NO_PARTITION) WHERE user_verified = TRUE", 711, 2, None), @@ -1724,6 +1751,13 @@ ("SELECT name FROM (SELECT MD5(name) AS hash, name FROM $planets) AS S", 9, 1, None), + ("SELECT jsonb_object_keys(birth_place) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(VARCHAR(birth_place)) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(BLOB(birth_place)) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(birth_place) FROM testdata.astronauts", 357, 1, None), + ("SELECT 
jsonb_object_keys(VARCHAR(birth_place)) FROM testdata.astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(BLOB(birth_place)) FROM testdata.astronauts", 357, 1, None), + # Edge Case with Empty Joins ("SELECT * FROM $planets LEFT JOIN (SELECT id FROM $satellites WHERE planetId < 0) AS S ON $planets.id = S.id", 9, 21, None), # Handling NULL Comparisons in WHERE Clause From 8d4454f9f31e6d3beffbb2094a402d79b2e427fc Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 22 Oct 2024 14:52:11 +0000 Subject: [PATCH 003/157] Opteryx Version 0.18.1 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 5330b2b1d..779c1e9a2 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 836 +__build__ = 838 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 83c663089e6cd88e400c74c9becb26f2b16d8007 Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 26 Oct 2024 10:53:22 +0000 Subject: [PATCH 004/157] #2061/1 --- opteryx/__init__.py | 3 + opteryx/config.py | 3 + opteryx/connectors/capabilities/__init__.py | 3 +- .../connectors/capabilities/limit_pushable.py | 16 + opteryx/connectors/sql_connector.py | 9 +- opteryx/cursor.py | 2 + opteryx/functions/string_functions.py | 6 + opteryx/models/__init__.py | 4 +- .../{execution_tree.py => physical_plan.py} | 55 ++- opteryx/models/serial_engine.py | 7 + .../operators/bench/metadata_writer_node.py_ | 72 ---- .../bench/morsel_defragment_node.py_ | 129 ------ opteryx/operators/read_node.py | 5 +- opteryx/operatorsv2/__init__.py | 53 +++ .../operatorsv2/aggregate_and_group_node.py | 153 +++++++ opteryx/operatorsv2/aggregate_node.py | 256 ++++++++++++ opteryx/operatorsv2/async_read_node.py | 211 ++++++++++ opteryx/operatorsv2/base_plan_node.py | 115 ++++++ .../bench/#information_schema_node.py | 186 +++++++++ .../operatorsv2/bench/#show_databases_node.py | 79 ++++ opteryx/operatorsv2/cross_join_node.py | 377 ++++++++++++++++++ opteryx/operatorsv2/distinct_node.py | 73 ++++ opteryx/operatorsv2/exit_node.py | 107 +++++ opteryx/operatorsv2/explain_node.py | 49 +++ opteryx/operatorsv2/filter_node.py | 81 ++++ opteryx/operatorsv2/function_dataset_node.py | 150 +++++++ opteryx/operatorsv2/heap_sort_node.py | 139 +++++++ opteryx/operatorsv2/inner_join_node.py | 134 +++++++ opteryx/operatorsv2/inner_join_node_single.py | 215 ++++++++++ opteryx/operatorsv2/join_node.py | 97 +++++ opteryx/operatorsv2/limit_node.py | 77 ++++ opteryx/operatorsv2/noop_node.py | 48 +++ opteryx/operatorsv2/outer_join_node.py | 331 +++++++++++++++ opteryx/operatorsv2/projection_node.py | 72 ++++ opteryx/operatorsv2/read_node.py | 224 +++++++++++ opteryx/operatorsv2/set_variable_node.py | 53 +++ opteryx/operatorsv2/show_columns_node.py | 118 ++++++ opteryx/operatorsv2/show_create_node.py | 65 +++ opteryx/operatorsv2/show_value_node.py | 60 +++ opteryx/operatorsv2/sort_node.py | 100 +++++ opteryx/operatorsv2/union_node.py | 60 +++ opteryx/planner/__init__.py | 12 +- .../strategies/limit_pushdown.py | 11 +- .../logical_planner/logical_planner.py | 5 +- opteryx/planner/physical_planner.py | 112 ++++++ opteryx/planner/temporary_physical_planner.py | 6 +- pyproject.toml | 2 +- .../test_limit_pushdown_postgres.py | 92 +++++ .../test_limit_pushdown_sqlite.py | 90 +++++ .../test_projection_pushdown_sqlite.py | 4 +- tests/query_execution/test_execution_tree.py | 2 +- .../test_shapes_and_errors_battery.py | 6 
+ 52 files changed, 4117 insertions(+), 222 deletions(-) create mode 100644 opteryx/connectors/capabilities/limit_pushable.py rename opteryx/models/{execution_tree.py => physical_plan.py} (78%) create mode 100644 opteryx/models/serial_engine.py delete mode 100644 opteryx/operators/bench/metadata_writer_node.py_ delete mode 100644 opteryx/operators/bench/morsel_defragment_node.py_ create mode 100644 opteryx/operatorsv2/__init__.py create mode 100644 opteryx/operatorsv2/aggregate_and_group_node.py create mode 100644 opteryx/operatorsv2/aggregate_node.py create mode 100644 opteryx/operatorsv2/async_read_node.py create mode 100644 opteryx/operatorsv2/base_plan_node.py create mode 100644 opteryx/operatorsv2/bench/#information_schema_node.py create mode 100644 opteryx/operatorsv2/bench/#show_databases_node.py create mode 100644 opteryx/operatorsv2/cross_join_node.py create mode 100644 opteryx/operatorsv2/distinct_node.py create mode 100644 opteryx/operatorsv2/exit_node.py create mode 100644 opteryx/operatorsv2/explain_node.py create mode 100644 opteryx/operatorsv2/filter_node.py create mode 100644 opteryx/operatorsv2/function_dataset_node.py create mode 100644 opteryx/operatorsv2/heap_sort_node.py create mode 100644 opteryx/operatorsv2/inner_join_node.py create mode 100644 opteryx/operatorsv2/inner_join_node_single.py create mode 100644 opteryx/operatorsv2/join_node.py create mode 100644 opteryx/operatorsv2/limit_node.py create mode 100644 opteryx/operatorsv2/noop_node.py create mode 100644 opteryx/operatorsv2/outer_join_node.py create mode 100644 opteryx/operatorsv2/projection_node.py create mode 100644 opteryx/operatorsv2/read_node.py create mode 100644 opteryx/operatorsv2/set_variable_node.py create mode 100644 opteryx/operatorsv2/show_columns_node.py create mode 100644 opteryx/operatorsv2/show_create_node.py create mode 100644 opteryx/operatorsv2/show_value_node.py create mode 100644 opteryx/operatorsv2/sort_node.py create mode 100644 opteryx/operatorsv2/union_node.py create mode 100644 opteryx/planner/physical_planner.py create mode 100644 tests/plan_optimization/test_limit_pushdown_postgres.py create mode 100644 tests/plan_optimization/test_limit_pushdown_sqlite.py diff --git a/opteryx/__init__.py b/opteryx/__init__.py index 712ed5bb5..e749087e1 100644 --- a/opteryx/__init__.py +++ b/opteryx/__init__.py @@ -38,6 +38,9 @@ # Set Decimal precision to 28 globally getcontext().prec = 28 +# end-of-stream marker +EOS = object() + def is_mac() -> bool: # pragma: no cover """ diff --git a/opteryx/config.py b/opteryx/config.py index 28940bd14..297ba68c0 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -172,6 +172,9 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]: DATA_CATALOG_CONFIGURATION: Optional[str] = get("DATA_CATALOG_CONFIGURATION") """Data Catalog configuration, different catalogs have different config formats.""" +EXPERIMENTAL_EXECUTION_ENGINE: bool = bool(get("EXPERIMENTAL_EXECUTION_ENGINE", False)) +"""Use the experimental/incomplete generation 2 execution engine.""" + # GCP project ID - for Google Cloud Data GCP_PROJECT_ID: str = get("GCP_PROJECT_ID") # don't try to raise the priority of the server process diff --git a/opteryx/connectors/capabilities/__init__.py b/opteryx/connectors/capabilities/__init__.py index 854acaf85..1fed4fc37 100644 --- a/opteryx/connectors/capabilities/__init__.py +++ b/opteryx/connectors/capabilities/__init__.py @@ -12,7 +12,8 @@ from opteryx.connectors.capabilities.asynchronous import Asynchronous from 
opteryx.connectors.capabilities.cacheable import Cacheable +from opteryx.connectors.capabilities.limit_pushable import LimitPushable from opteryx.connectors.capabilities.partitionable import Partitionable from opteryx.connectors.capabilities.predicate_pushable import PredicatePushable -__all__ = ("Asynchronous", "Cacheable", "Partitionable", "PredicatePushable") +__all__ = ("Asynchronous", "Cacheable", "LimitPushable", "Partitionable", "PredicatePushable") diff --git a/opteryx/connectors/capabilities/limit_pushable.py b/opteryx/connectors/capabilities/limit_pushable.py new file mode 100644 index 000000000..e8d9340b0 --- /dev/null +++ b/opteryx/connectors/capabilities/limit_pushable.py @@ -0,0 +1,16 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class LimitPushable: + def __init__(self, **kwargs): + self.limit: int = None diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py index efe2bcf8b..7b807811c 100644 --- a/opteryx/connectors/sql_connector.py +++ b/opteryx/connectors/sql_connector.py @@ -19,6 +19,7 @@ from typing import Any from typing import Dict from typing import Generator +from typing import Optional from typing import Tuple import pyarrow @@ -34,6 +35,7 @@ from opteryx.connectors.base.base_connector import INITIAL_CHUNK_SIZE from opteryx.connectors.base.base_connector import MIN_CHUNK_SIZE from opteryx.connectors.base.base_connector import BaseConnector +from opteryx.connectors.capabilities import LimitPushable from opteryx.connectors.capabilities import PredicatePushable from opteryx.exceptions import MissingDependencyError from opteryx.exceptions import UnmetRequirementError @@ -55,7 +57,7 @@ def _handle_operand(operand: Node, parameters: dict) -> Tuple[Any, dict]: return f":{name}", parameters -class SqlConnector(BaseConnector, PredicatePushable): +class SqlConnector(BaseConnector, LimitPushable, PredicatePushable): __mode__ = "Sql" __type__ = "SQL" @@ -95,6 +97,7 @@ class SqlConnector(BaseConnector, PredicatePushable): def __init__(self, *args, connection: str = None, engine=None, **kwargs): BaseConnector.__init__(self, **kwargs) + LimitPushable.__init__(self, **kwargs) PredicatePushable.__init__(self, **kwargs) try: @@ -129,6 +132,7 @@ def read_dataset( # type:ignore columns: list = None, predicates: list = None, chunk_size: int = INITIAL_CHUNK_SIZE, # type:ignore + limit: Optional[int] = None, ) -> Generator[pyarrow.Table, None, None]: # type:ignore from sqlalchemy.sql import text @@ -168,6 +172,9 @@ def read_dataset( # type:ignore query_builder.WHERE(f"{left_value} {operator} {right_value}") + if limit is not None: + query_builder.LIMIT(str(limit)) + at_least_once = False convert_time = 0.0 diff --git a/opteryx/cursor.py b/opteryx/cursor.py index 655a7445c..37ff5016f 100644 --- a/opteryx/cursor.py +++ b/opteryx/cursor.py @@ -349,6 +349,8 @@ def execute_to_arrow( result_data, self._result_type = next(results, (ResultType._UNDEFINED, None)) if limit is not None: result_data = utils.arrow.limit_records(result_data, limit) # type: ignore + 
if isinstance(result_data, pyarrow.Table): + return result_data try: return pyarrow.concat_tables(result_data, promote_options="permissive") except ( diff --git a/opteryx/functions/string_functions.py b/opteryx/functions/string_functions.py index c2300e89e..9764d3d30 100644 --- a/opteryx/functions/string_functions.py +++ b/opteryx/functions/string_functions.py @@ -263,11 +263,17 @@ def substring( if len(arr) == 0: return [[]] + if hasattr(arr, "to_numpy"): + arr = arr.to_numpy(zero_copy_only=False) + def _inner(val, _from, _for): + if _from is None: + _from = 0 if _from > 0: _from -= 1 _for = int(_for) if _for and _for == _for else None # nosec if _for is None: + print(val, _from) return val[_from:] return val[_from : _for + _from] diff --git a/opteryx/models/__init__.py b/opteryx/models/__init__.py index 21765bb3c..e40ff5095 100644 --- a/opteryx/models/__init__.py +++ b/opteryx/models/__init__.py @@ -12,18 +12,18 @@ from opteryx.compiled.structures.node import Node from opteryx.models.connection_context import ConnectionContext -from opteryx.models.execution_tree import ExecutionTree from opteryx.models.logical_column import LogicalColumn from opteryx.models.non_tabular_result import NonTabularResult +from opteryx.models.physical_plan import PhysicalPlan from opteryx.models.query_properties import QueryProperties from opteryx.models.query_statistics import QueryStatistics __all__ = ( "ConnectionContext", - "ExecutionTree", "LogicalColumn", "Node", "NonTabularResult", + "PhysicalPlan", "QueryProperties", "QueryStatistics", ) diff --git a/opteryx/models/execution_tree.py b/opteryx/models/physical_plan.py similarity index 78% rename from opteryx/models/execution_tree.py rename to opteryx/models/physical_plan.py index 3d6357ee5..03a7e9fc0 100644 --- a/opteryx/models/execution_tree.py +++ b/opteryx/models/physical_plan.py @@ -29,18 +29,25 @@ import pyarrow +from opteryx import EOS +from opteryx import config from opteryx.constants import ResultType from opteryx.exceptions import InvalidInternalStateError from opteryx.third_party.travers import Graph -class ExecutionTree(Graph): +class PhysicalPlan(Graph): """ The execution tree is defined separately to the planner to simplify the complex code which is the planner from the tree that describes the plan. 
""" - def execute( + def execute(self) -> Generator[Tuple[Union[pyarrow.Table, Any], ResultType], None, None]: + if config.EXPERIMENTAL_EXECUTION_ENGINE: + return self.push_executor() + return self.legacy_executor() + + def legacy_executor( self, ) -> Generator[Tuple[Union[pyarrow.Table, Any], ResultType], None, None]: """ @@ -170,3 +177,47 @@ def _inner_explain(node, depth): table = pyarrow.Table.from_pylist(plan) yield table + + def push_executor(self): + pump_nodes = self.get_entry_points() + for pump_node in pump_nodes: + pump_instance = self[pump_node] + for morsel in pump_instance(None): + yield from self.process_node(pump_node, morsel) + + def process_node(self, nid, morsel): + from opteryx.operatorsv2 import ReaderNode + + node = self[nid] + + if isinstance(node, ReaderNode): + children = [t for s, t, r in self.outgoing_edges(nid)] + for child in children: + results = self.process_node(child, morsel) + yield from results + else: + results = node(morsel) + if results is None: + return None + if not isinstance(results, list): + results = [results] + if morsel == EOS and not any(r == EOS for r in results): + results.append(EOS) + for result in results: + if result is not None: + children = [t for s, t, r in self.outgoing_edges(nid)] + for child in children: + yield from self.process_node(child, result) + if len(children) == 0: + yield result, ResultType.TABULAR + + def sensors(self): + readings = {} + for nid in self.nodes(): + node = self[nid] + readings[node.identity] = node.sensors() + return readings + + def __del__(self): + pass +# print(self.sensors()) diff --git a/opteryx/models/serial_engine.py b/opteryx/models/serial_engine.py new file mode 100644 index 000000000..9a7dc5d05 --- /dev/null +++ b/opteryx/models/serial_engine.py @@ -0,0 +1,7 @@ +import gc + +import pyarrow + +from opteryx.constants import ResultType +from opteryx.exceptions import InvalidInternalStateError +from opteryx.third_party.travers import Graph diff --git a/opteryx/operators/bench/metadata_writer_node.py_ b/opteryx/operators/bench/metadata_writer_node.py_ deleted file mode 100644 index 9560d0ca6..000000000 --- a/opteryx/operators/bench/metadata_writer_node.py_ +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Union Node - -This is a SQL Query Execution Plan Node. 
-""" -from typing import Generator - -import orso - -from opteryx.constants import QueryStatus -from opteryx.models import NonTabularResult -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode, OperatorType - - -class MetadataWriterNode(BasePlanNode): - - operator_type = OperatorType.PASSTHRU - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "MetadataWriter" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self) -> Generator: - - # don't use the standard scanner for blob-based data - - # read the manifest from disk/either .manifest / .complete - # if we used .complete (maybe if we have blobs not in the manifest) - # build statistics for the blobs - # save the manifest to a .manifest file - # save the manifest to the metastore () - - # if it's a collection or sql, we should accumulate - # if it's a blob store we can write back - # save the metastore - - morsel_count = -1 - for morsel_count, morsel in enumerate(self._producers[0].execute()): - metadata = morsel.schema.metadata - if metadata is None: - raise Exception("Cannot analyze dataset") - df = orso.DataFrame.from_arrow(morsel) - profile = df.profile - - print(profile.to_dicts()) - - return NonTabularResult(record_count=morsel_count + 1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operators/bench/morsel_defragment_node.py_ b/opteryx/operators/bench/morsel_defragment_node.py_ deleted file mode 100644 index be360215d..000000000 --- a/opteryx/operators/bench/morsel_defragment_node.py_ +++ /dev/null @@ -1,129 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Morsel Defragment Node - -This is a SQL Query Execution Plan Node. - - Orignally implemented to test if datasets have any records as they pass through the DAG, this - function normalizes the number of bytes per morsel. - - This is to balance two competing demands: - - operate in a low memory environment, if the morsels are too large they may cause the - process to fail. - - operate quickly, if we spend our time doing Vecorization/SIMD on morsel with few records - we're not working as fast as we can. - - The low-water mark is 75% of the target size, less than this we look to merge morsels together. - This is more common following the implementation of projection push down, one column doesn't - take up a lot of memory so we consolidate tens of morsels into a single morsel. - - The high-water mark is 199% of the target size, more than this we split the morsel. Splitting - at a size any less than this will end up with morsels less that the target morsel size. - - We also have a record count limit, this is because of quirks with PyArrow, it changes long - arrays into ChunkedArrays which behave differently to Arrays in some circumstances. 
-""" -import time -from typing import Generator - -import pyarrow - -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode, OperatorType - -MORSEL_SIZE_BYTES: int = 64 * 1024 * 1024 # 64Mb -MORSEL_SIZE_COUNT: int = 500000 # hard record count limit, half a million -HIGH_WATER: float = 1.99 # Split morsels over 199% of MORSEL_SIZE -LOW_WATER: float = 0.75 # Merge morsels under 75% of MORSEL_SIZE - - -class MorselDefragmentNode(BasePlanNode): - - operator_type = OperatorType.PASSTHRU - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Morsel Defragment" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self) -> Generator: # pragma: no cover - morsels = self._producers[0] # type:ignore - - row_counter = 0 - collected_rows = None - at_least_one_morsel = False - - for morsel in morsels.execute(): - if morsel.num_rows > 0: - start = time.monotonic_ns() - # add what we've collected before to the table - if collected_rows: # pragma: no cover - self.statistics.morsel_merges += 1 - morsel = pyarrow.concat_tables([collected_rows, morsel], promote_options="none") - collected_rows = None - self.statistics.time_defragmenting += time.monotonic_ns() - start - - # work out some stats about what we have - morsel_bytes = morsel.nbytes - morsel_records = morsel.num_rows - - # if we're more than double the target size, let's do something - if ( - morsel_bytes > (MORSEL_SIZE_BYTES * HIGH_WATER) - or morsel_records > MORSEL_SIZE_COUNT - ): # pragma: no cover - start = time.monotonic_ns() - - average_record_size = morsel_bytes / morsel_records - new_row_count = min( - int(MORSEL_SIZE_BYTES / average_record_size), MORSEL_SIZE_COUNT - ) - row_counter += new_row_count - self.statistics.morsel_splits += 1 - new_morsel = morsel.slice(offset=0, length=new_row_count) - at_least_one_morsel = True - collected_rows = morsel.slice(offset=new_row_count) - - self.statistics.time_defragmenting += time.monotonic_ns() - start - - yield new_morsel - # if we're less than 75% of the morsel size, save hold what we have so far and go - # collect the next morsel - elif morsel_bytes < (MORSEL_SIZE_BYTES * LOW_WATER): - collected_rows = morsel - # otherwise the morsel size is okay so we can emit the current morsel - else: - row_counter += morsel_records - yield morsel - at_least_one_morsel = True - elif not at_least_one_morsel: - # we have to emit something to the next step, but don't emit multiple empty morsels - yield morsel - at_least_one_morsel = True - - # if we're at the end and haven't emitted all the records, emit them now - if collected_rows: - row_counter += collected_rows.num_rows - yield collected_rows diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 89841bbf0..71e055860 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -144,6 +144,7 @@ def __init__(self, properties: QueryProperties, **parameters): self.connector = parameters.get("connector") self.schema = parameters.get("schema") + self.limit = parameters.get("limit") if len(self.hints) != 0: self.statistics.add_message("All HINTS are currently ignored") @@ -201,7 +202,9 @@ def execute(self) -> Generator: orso_schema.columns = orso_schema_cols arrow_schema = None start_clock = time.monotonic_ns() - reader = 
self.connector.read_dataset(columns=self.columns, predicates=self.predicates) + reader = self.connector.read_dataset( + columns=self.columns, predicates=self.predicates, limit=self.limit + ) for morsel in reader: # try to make each morsel have the same schema morsel = struct_to_jsonb(morsel) diff --git a/opteryx/operatorsv2/__init__.py b/opteryx/operatorsv2/__init__.py new file mode 100644 index 000000000..4f2505e0b --- /dev/null +++ b/opteryx/operatorsv2/__init__.py @@ -0,0 +1,53 @@ +# isort: skip + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .base_plan_node import BasePlanDataObject # isort: skip +from .base_plan_node import BasePlanNode # isort: skip + +from .aggregate_and_group_node import AggregateAndGroupNode # Group is always followed by aggregate +from .aggregate_node import AGGREGATORS +from .aggregate_node import AggregateNode # aggregate data +from .async_read_node import AsyncReaderNode + +# from .build_statistics_node import BuildStatisticsNode # Analyze Tables +from .cross_join_node import CrossJoinNode # CROSS JOIN +from .distinct_node import DistinctNode # remove duplicate records +from .exit_node import ExitNode +from .explain_node import ExplainNode # EXPLAIN queries +from .filter_node import FilterNode # filter unwanted rows +from .function_dataset_node import FunctionDatasetNode # Dataset Constructors +from .heap_sort_node import HeapSortNode # Heap + +# from .information_schema_node import InformationSchemaNode # information_schema +from .inner_join_node import InnerJoinNode +from .inner_join_node_single import InnerJoinSingleNode +from .join_node import JoinNode +from .limit_node import LimitNode # select the first N records + +# from .metadata_writer_node import MetadataWriterNode +# from .morsel_defragment_node import MorselDefragmentNode # consolidate small morsels +from .noop_node import NoOpNode # No Operation +from .outer_join_node import OuterJoinNode +from .projection_node import ProjectionNode # remove unwanted columns including renames +from .read_node import ReaderNode +from .set_variable_node import SetVariableNode +from .show_columns_node import ShowColumnsNode # column details +from .show_create_node import ShowCreateNode # SHOW CREATE VIEW + +# from .show_databases_node import ShowDatabasesNode # SHOW DATABASES +# from .show_functions_node import ShowFunctionsNode # supported functions +from .show_value_node import ShowValueNode # display node for SHOW +from .sort_node import SortNode # order by selected columns +from .union_node import UnionNode diff --git a/opteryx/operatorsv2/aggregate_and_group_node.py b/opteryx/operatorsv2/aggregate_and_group_node.py new file mode 100644 index 000000000..a0f2c2b15 --- /dev/null +++ b/opteryx/operatorsv2/aggregate_and_group_node.py @@ -0,0 +1,153 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Grouping Node + +This is a SQL Query Execution Plan Node. + +This is the grouping node, it is always followed by the aggregation node, but +the aggregation node doesn't need the grouping node. + + +""" + +from dataclasses import dataclass + +import numpy +import pyarrow +from orso.types import OrsoTypes + +from opteryx import EOS +from opteryx.managers.expression import NodeType +from opteryx.managers.expression import evaluate_and_append +from opteryx.managers.expression import get_all_nodes_of_type +from opteryx.models import QueryProperties +from opteryx.operators.aggregate_node import build_aggregations +from opteryx.operators.aggregate_node import extract_evaluations +from opteryx.operators.aggregate_node import project +from opteryx.operators.base_plan_node import BasePlanDataObject + +from . import BasePlanNode + + +@dataclass +class AggregateAndGroupDataObject(BasePlanDataObject): + groups: list = None + aggregates: list = None + all_identifiers: list = None + evaluatable_nodes: list = None + group_by_columns: list = None + column_map: list = None + aggregate_functions: list = None + + +class AggregateAndGroupNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self.groups = list(config["groups"]) + self.aggregates = list(config["aggregates"]) + projection = list(config["projection"]) + + # we're going to preload some of the evaluation + + # Replace offset based GROUP BYs with their column + self.groups = [ + ( + group + if not (group.node_type == NodeType.LITERAL and group.type == OrsoTypes.INTEGER) + else projection[group.value - 1] + ) + for group in self.groups + ] + + # get all the columns anywhere in the groups or aggregates + all_identifiers = [ + node.schema_column.identity + for node in get_all_nodes_of_type( + self.groups + self.aggregates, select_nodes=(NodeType.IDENTIFIER,) + ) + ] + self.all_identifiers = list(dict.fromkeys(all_identifiers)) + + # Get any functions we need to execute before aggregating + self.evaluatable_nodes = extract_evaluations(self.aggregates) + + # get the aggregated groupings and functions + self.group_by_columns = list({node.schema_column.identity for node in self.groups}) + self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) + + self.do = AggregateAndGroupDataObject() + + self.buffer = [] + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + from opteryx.managers.expression import format_expression + + return f"AGGREGATE ({', '.join(format_expression(col) for col in self.aggregates)}) GROUP BY ({', '.join(format_expression(col) for col in self.groups)})" + + @property + def name(self): # pragma: no cover + return "Group" + + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + # merge all the morsels together into one table, selecting only the columns + # we're pretty sure we're going to use - this will fail for datasets + # larger than memory + table = pyarrow.concat_tables( + 
self.buffer,
+                promote_options="permissive",
+            )
+
+            # do the group by and aggregates
+            table = table.combine_chunks()
+            groups = table.group_by(self.group_by_columns)
+            groups = groups.aggregate(self.aggregate_functions)
+
+            # do the secondary activities for ARRAY_AGG
+            for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.AGGREGATOR,)):
+                if node.value == "ARRAY_AGG" and (node.order or node.limit):
+                    # rip the column out of the table
+                    column_name = self.column_map[node.schema_column.identity]
+                    # retain the field definition so the column can be reattached below
+                    column_def = groups.field(column_name)
+                    column = groups.column(column_name).to_pylist()
+                    groups = groups.drop([column_name])
+                    if node.order:
+                        column = [sorted(c, reverse=bool(node.order[0][1])) for c in column]
+                    if node.limit:
+                        column = [c[: node.limit] for c in column]
+                    # put the new column into the table
+                    groups = groups.append_column(column_def, [column])
+
+            # project to the desired column names from the pyarrow names
+            groups = groups.select(list(self.column_map.values()) + self.group_by_columns)
+            groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns)
+
+            return [groups, EOS]
+
+        morsel = project(morsel, self.all_identifiers)
+        # Add a "*" column; this is an int because when it's a bool it miscounts
+        if "*" not in morsel.column_names:
+            morsel = morsel.append_column(
+                "*", [numpy.ones(shape=morsel.num_rows, dtype=numpy.int8)]
+            )
+        if self.evaluatable_nodes:
+            morsel = evaluate_and_append(self.evaluatable_nodes, morsel)
+        morsel = evaluate_and_append(self.groups, morsel)
+
+        self.buffer.append(morsel)
diff --git a/opteryx/operatorsv2/aggregate_node.py b/opteryx/operatorsv2/aggregate_node.py
new file mode 100644
index 000000000..606d4f232
--- /dev/null
+++ b/opteryx/operatorsv2/aggregate_node.py
@@ -0,0 +1,256 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Aggregation Node
+
+This is a SQL Query Execution Plan Node.
+
+This node performs aggregates without performing groupings.
+"""
+
+from dataclasses import dataclass
+
+import numpy
+import pyarrow
+
+from opteryx import EOS
+from opteryx.exceptions import UnsupportedSyntaxError
+from opteryx.managers.expression import NodeType
+from opteryx.managers.expression import evaluate_and_append
+from opteryx.managers.expression import get_all_nodes_of_type
+from opteryx.models import QueryProperties
+from opteryx.operators.base_plan_node import BasePlanDataObject
+
+from . import BasePlanNode
+
+COUNT_STAR: str = "COUNT(*)"
+
+# use the aggregators from pyarrow
+AGGREGATORS = {
+    "ALL": "all",
+    "ANY": "any",
+    "APPROXIMATE_MEDIAN": "approximate_median",
+    "ARRAY_AGG": "hash_list",
+    "COUNT": "count",  # counts only non-nulls
+    "COUNT_DISTINCT": "count_distinct",
+    "DISTINCT": "distinct",  # fated (deprecated)
+    "LIST": "hash_list",  # fated (deprecated)
+    "MAX": "max",
+    "MAXIMUM": "max",  # alias
+    "MEAN": "mean",
+    "AVG": "mean",  # alias
+    "AVERAGE": "mean",  # alias
+    "MIN": "min",
+    "MINIMUM": "min",  # alias
+    "MIN_MAX": "min_max",
+    "ONE": "hash_one",
+    "ANY_VALUE": "hash_one",
+    "PRODUCT": "product",
+    "STDDEV": "stddev",
+    "SUM": "sum",
+    "VARIANCE": "variance",
+}
+
+
+def _is_count_star(aggregates):
+    """
+    Is the SELECT clause `SELECT COUNT(*)` with no GROUP BY
+    """
+    if len(aggregates) != 1:
+        return False
+    if aggregates[0].value != "COUNT":
+        return False
+    return aggregates[0].parameters[0].node_type == NodeType.WILDCARD
+
+
+def _count_star(morsel_promise, column_name):
+    count = sum(morsel.num_rows for morsel in morsel_promise)
+    table = pyarrow.Table.from_pylist([{column_name: count}])
+    return table
+
+
+def project(tables, column_names):
+    for table in tables:
+        row_count = table.num_rows
+        if len(column_names) > 0:
+            yield table.select(dict.fromkeys(column_names))
+        else:
+            # no columns were requested, add a placeholder column to retain the row count
+            yield pyarrow.Table.from_pydict({"*": numpy.full(row_count, 1, dtype=numpy.int8)})
+
+
+def build_aggregations(aggregators):
+    column_map = {}
+    aggs = []
+
+    if not isinstance(aggregators, list):
+        aggregators = [aggregators]
+
+    for root in aggregators:
+        for aggregator in get_all_nodes_of_type(root, select_nodes=(NodeType.AGGREGATOR,)):
+            field_node = aggregator.parameters[0]
+            count_options = None
+
+            if field_node.node_type == NodeType.WILDCARD:
+                field_name = "*"
+                # COUNT(*) counts nulls as well
+                count_options = pyarrow.compute.CountOptions(mode="all")
+            else:
+                field_name = field_node.schema_column.identity
+            function = AGGREGATORS[aggregator.value]
+            # if the array agg is distinct, base off that function instead
+            if aggregator.value == "ARRAY_AGG" and aggregator.distinct:
+                function = "distinct"
+            aggs.append((field_name, function, count_options))
+            column_map[aggregator.schema_column.identity] = f"{field_name}_{function}".replace(
+                "_hash_", "_"
+            )
+
+    return column_map, aggs
+
+
+def _non_group_aggregates(aggregates, table):
+    """
+    If we're not doing a GROUP BY, we're just doing aggregations; the pyarrow
+    group-by aggregate functionality doesn't apply, so we do the calculation
+    ourselves. It's relatively straightforward because we're summarizing the
+    entire table.
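+
+    For example (illustrative only), `SELECT SUM(value) FROM t` resolves "SUM"
+    to "sum" via the AGGREGATORS map and is evaluated directly as
+    `pyarrow.compute.sum(column_values).as_py()`, producing a single-row table.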
+ """ + + result = {} + + for aggregate in aggregates: + if aggregate.node_type in (NodeType.AGGREGATOR,): + column_node = aggregate.parameters[0] + if column_node.node_type == NodeType.LITERAL: + raw_column_values = numpy.full(table.num_rows, column_node.value) + elif ( + aggregate.value == "COUNT" + and aggregate.parameters[0].node_type == NodeType.WILDCARD + ): + result[aggregate.schema_column.identity] = table.num_rows + continue + else: + raw_column_values = table[column_node.schema_column.identity].to_numpy() + aggregate_function_name = AGGREGATORS[aggregate.value] + # this maps a string which is the function name to that function on the + # pyarrow.compute module + if not hasattr(pyarrow.compute, aggregate_function_name): + raise UnsupportedSyntaxError( + f"Aggregate `{aggregate.value}` can only be used with GROUP BY" + ) + aggregate_function = getattr(pyarrow.compute, aggregate_function_name) + aggregate_column_value = aggregate_function(raw_column_values).as_py() + result[aggregate.schema_column.identity] = aggregate_column_value + + return pyarrow.Table.from_pylist([result]) + + +def extract_evaluations(aggregates): + # extract any inner evaluations, like the IIF in SUM(IIF(x, 1, 0)) + + all_evaluatable_nodes = get_all_nodes_of_type( + aggregates, + select_nodes=( + NodeType.FUNCTION, + NodeType.BINARY_OPERATOR, + NodeType.COMPARISON_OPERATOR, + NodeType.LITERAL, + ), + ) + + evaluatable_nodes = [] + for node in all_evaluatable_nodes: + aggregators = get_all_nodes_of_type(node, select_nodes=(NodeType.AGGREGATOR,)) + if len(aggregators) == 0: + evaluatable_nodes.append(node) + + return evaluatable_nodes + + +@dataclass +class AggregateDataObject(BasePlanDataObject): + aggregates: list = None + all_identifiers: list = None + evaluatable_nodes: list = None + column_map: list = None + aggregate_functions: list = None + + +class AggregateNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + self.aggregates = config.get("aggregates", []) + + # get all the columns anywhere in the aggregates + all_identifiers = [ + node.schema_column.identity + for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.IDENTIFIER,)) + ] + self.all_identifiers = list(dict.fromkeys(all_identifiers)) + + # Get any functions we need to execute before aggregating + self.evaluatable_nodes = extract_evaluations(self.aggregates) + + self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) + + self.do = AggregateDataObject() + self.buffer = [] + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + return str(self.aggregates) + + @property + def name(self): # pragma: no cover + return "Aggregation" + + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + if _is_count_star(self.aggregates): + return _count_star( + morsel_promise=self.buffer, + column_name=self.aggregates[0].schema_column.identity, + ) + + # merge all the morsels together into one table, selecting only the columns + # we're pretty sure we're going to use - this will fail for datasets + # larger than memory until we implement some form of partitioning + table = pyarrow.concat_tables( + project(self.buffer, self.all_identifiers), promote_options="none" + ) + + # Allow grouping by functions by evaluating them first + if self.evaluatable_nodes: + table = evaluate_and_append(self.evaluatable_nodes, 
table) + + # Add a "*" column, this is an int because when a bool it miscounts + if "*" not in table.column_names: + table = table.append_column( + "*", [numpy.full(shape=table.num_rows, fill_value=1, dtype=numpy.int8)] + ) + + # we're not a group_by - we're aggregating without grouping + aggregates = _non_group_aggregates(self.aggregates, table) + del table + + # name the aggregate fields and add them to the Columns data + aggregates = aggregates.select(list(self.column_map.keys())) + + return [aggregates, EOS] + + self.buffer.append(morsel) diff --git a/opteryx/operatorsv2/async_read_node.py b/opteryx/operatorsv2/async_read_node.py new file mode 100644 index 000000000..8859d748d --- /dev/null +++ b/opteryx/operatorsv2/async_read_node.py @@ -0,0 +1,211 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Async Scanner Node + +This is the SQL Query Execution Plan Node responsible for the reading of data. + +It wraps different internal readers (e.g. GCP Blob reader, SQL Reader), +normalizes the data into the format for internal processing. +""" + +import asyncio +import queue +import threading +import time +from dataclasses import dataclass +from typing import Generator + +import aiohttp +import pyarrow +import pyarrow.parquet +from orso.schema import convert_orso_schema_to_arrow_schema + +from opteryx import EOS +from opteryx import config +from opteryx.exceptions import DataError +from opteryx.operators.base_plan_node import BasePlanDataObject +from opteryx.shared import AsyncMemoryPool +from opteryx.shared import MemoryPool +from opteryx.utils.file_decoders import get_decoder + +from .read_node import ReaderNode +from .read_node import normalize_morsel +from .read_node import struct_to_jsonb + +CONCURRENT_READS = config.CONCURRENT_READS +MAX_READ_BUFFER_CAPACITY = config.MAX_READ_BUFFER_CAPACITY + + +async def fetch_data(blob_names, pool, reader, reply_queue, statistics): + semaphore = asyncio.Semaphore(CONCURRENT_READS) + session = aiohttp.ClientSession() + + async def fetch_and_process(blob_name): + async with semaphore: + start_per_blob = time.monotonic_ns() + reference = await reader( + blob_name=blob_name, pool=pool, session=session, statistics=statistics + ) + reply_queue.put((blob_name, reference)) # Put data onto the queue + statistics.time_reading_blobs += time.monotonic_ns() - start_per_blob + + tasks = (fetch_and_process(blob) for blob in blob_names) + + await asyncio.gather(*tasks) + reply_queue.put(None) + await session.close() + + +@dataclass +class AsyncReaderDataObject(BasePlanDataObject): + pass + + +class AsyncReaderNode(ReaderNode): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pool = MemoryPool(MAX_READ_BUFFER_CAPACITY, f"ReadBuffer <{self.parameters['alias']}>") + + self.do = AsyncReaderDataObject() + self.predicates = kwargs.get("predicates") + + @classmethod + def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover + raise NotImplementedError() + + def execute(self, morsel) -> Generator: + from opteryx import 
system_statistics
+
+        # Perform this step, time how long is spent doing work
+        orso_schema = self.parameters["schema"]
+        reader = self.parameters["connector"]
+
+        orso_schema_cols = []
+        for col in orso_schema.columns:
+            if col.identity in [c.schema_column.identity for c in self.columns]:
+                orso_schema_cols.append(col)
+        orso_schema.columns = orso_schema_cols
+
+        self.statistics.columns_read = len(orso_schema.columns)
+
+        blob_names = reader.partition_scheme.get_blobs_in_partition(
+            start_date=reader.start_date,
+            end_date=reader.end_date,
+            blob_list_getter=reader.get_list_of_blob_names,
+            prefix=reader.dataset,
+            predicates=self.predicates,
+        )
+
+        if len(blob_names) == 0:
+            # if we don't have any matching blobs, create an empty dataset
+            # TODO: rewrite
+            from orso import DataFrame
+
+            as_arrow = DataFrame(rows=[], schema=orso_schema).arrow()
+            renames = [orso_schema.column(col).identity for col in as_arrow.column_names]
+            as_arrow = as_arrow.rename_columns(renames)
+            yield as_arrow
+
+        data_queue: queue.Queue = queue.Queue()
+
+        loop = asyncio.new_event_loop()
+        read_thread = threading.Thread(
+            target=lambda: loop.run_until_complete(
+                fetch_data(
+                    blob_names,
+                    AsyncMemoryPool(self.pool),
+                    reader.async_read_blob,
+                    data_queue,
+                    self.statistics,
+                )
+            ),
+            daemon=True,
+        )
+        read_thread.start()
+
+        morsel = None
+        arrow_schema = None
+
+        while True:
+            try:
+                # Attempt to get an item with a timeout.
+                item = data_queue.get(timeout=0.1)
+            except queue.Empty:
+                # Increment stall count if the queue is empty.
+                self.statistics.stalls_reading_from_read_buffer += 1
+                system_statistics.io_wait_seconds += 0.1
+                continue  # Skip the rest of the loop and try to get an item again.
+
+            if item is None:
+                # Break out of the loop if the item is None, indicating a termination condition.
+                break
+
+            blob_name, reference = item
+
+            decoder = get_decoder(blob_name)
+
+            try:
+                # the sync readers include the decode time as part of the read time
+                try:
+                    # This pool is being used by async processes in another thread; using
+                    # zero copy versions occasionally results in data getting corrupted
+                    # due to a read-after-free type error
+                    start = time.monotonic_ns()
+                    blob_bytes = self.pool.read_and_release(reference, zero_copy=False)
+                    decoded = decoder(
+                        blob_bytes, projection=self.columns, selection=self.predicates
+                    )
+                except Exception as err:
+                    from pyarrow import ArrowInvalid
+
+                    if isinstance(err, ArrowInvalid) and "No match for" in str(err):
+                        raise DataError(
+                            f"Unable to read blob {blob_name} - this error is likely caused by the blob having a significantly different schema from previously handled blobs or from the data catalog."
+ ) + raise DataError(f"Unable to read blob {blob_name} - error {err}") from err + self.statistics.time_reading_blobs += time.monotonic_ns() - start + num_rows, _, morsel = decoded + self.statistics.rows_seen += num_rows + + morsel = struct_to_jsonb(morsel) + morsel = normalize_morsel(orso_schema, morsel) + + if arrow_schema: + morsel = morsel.cast(arrow_schema) + else: + arrow_schema = morsel.schema + + self.statistics.blobs_read += 1 + self.records_out += morsel.num_rows + self.bytes_out += morsel.nbytes + + yield morsel + except Exception as err: + self.statistics.add_message(f"failed to read {blob_name}") + self.statistics.failed_reads += 1 + import warnings + + warnings.warn(f"failed to read {blob_name} - {err}") + + # Ensure the thread is closed + read_thread.join() + + if morsel is None: + self.statistics.empty_datasets += 1 + arrow_schema = convert_orso_schema_to_arrow_schema(orso_schema, use_identities=True) + yield pyarrow.Table.from_arrays( + [pyarrow.array([]) for _ in arrow_schema], schema=arrow_schema + ) + + yield EOS diff --git a/opteryx/operatorsv2/base_plan_node.py b/opteryx/operatorsv2/base_plan_node.py new file mode 100644 index 000000000..0188159b4 --- /dev/null +++ b/opteryx/operatorsv2/base_plan_node.py @@ -0,0 +1,115 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import time +from dataclasses import dataclass +from typing import Optional + +import pyarrow +from orso.tools import random_string + +from opteryx import EOS + + +@dataclass +class BasePlanDataObject: + operation: Optional[str] = None + query_id: str = None + identity: str = None + + def __post_init__(self): + # Perform actions after initialization + if self.identity is None: + self.identity = random_string() + if self.operation is None: + self.operation = self.__class__.__name__.replace("DataObject", "Node") + + +class BasePlanNode: + def __init__(self, *, properties, **parameters): + """ + This is the base class for nodes in the execution plan. + + The initializer accepts a QueryStatistics node which is populated by different nodes + differently to record what happened during the query execution. 
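+
+        A minimal subclass (an illustrative sketch only) implements `execute`,
+        which is called once per morsel and finally with the EOS marker:
+
+            class PassThruNode(BasePlanNode):
+                def execute(self, morsel):
+                    if morsel == EOS:
+                        return EOS
+                    return morsel  # pass the morsel through unchanged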
+ """ + from opteryx.models import QueryProperties + from opteryx.models import QueryStatistics + + self.properties: QueryProperties = properties + self.statistics: QueryStatistics = QueryStatistics(properties.qid) + self.parameters = parameters + self.execution_time = 0 + self.identity = random_string() + self.do: Optional[BasePlanDataObject] = None + self.calls = 0 + self.records_in = 0 + self.bytes_in = 0 + self.records_out = 0 + self.bytes_out = 0 + + def to_json(self) -> bytes: # pragma: no cover + import orjson + + from opteryx.utils import dataclass_to_dict + + return orjson.dumps(dataclass_to_dict(self.do)) + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + def config(self) -> str: + return "" + + @property + def name(self): # pragma: no cover + """ + Friendly Name of this node + """ + return "no name" + + @property + def node_type(self) -> str: + return self.name + + def __str__(self) -> str: + return f"{self.name} {self.sensors()}" + + def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: no cover + pass + + def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: + if morsel is not None and morsel != EOS: + self.records_in += morsel.num_rows + self.bytes_in += morsel.nbytes + self.calls += 1 + + start_time = time.monotonic_ns() + result = self.execute(morsel) + + self.execution_time += time.monotonic_ns() - start_time + if result is not None and result != EOS and hasattr(result, "num_rows"): + self.records_out += result.num_rows + self.bytes_out += result.nbytes + return result + + def sensors(self): + return { + "calls": self.calls, + "execution_time": self.execution_time, + "records_in": self.records_in, + "records_out": self.records_out, + "bytes_in": self.bytes_in, + "bytes_out": self.bytes_out, + } diff --git a/opteryx/operatorsv2/bench/#information_schema_node.py b/opteryx/operatorsv2/bench/#information_schema_node.py new file mode 100644 index 000000000..97cc5f847 --- /dev/null +++ b/opteryx/operatorsv2/bench/#information_schema_node.py @@ -0,0 +1,186 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Collection Reader Node + +This is a SQL Query Execution Plan Node. + +This Node primarily is used for reading NoSQL sources like MongoDB and Firestore. 
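+
+(Note: despite the generic description above, this bench implementation serves
+the INFORMATION_SCHEMA relations - `information_schema.tables`,
+`information_schema.views` and `information_schema.routines` - e.g.
+`SELECT * FROM information_schema.tables`.)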
+""" + +import datetime +from typing import Iterable + +import pyarrow + +from opteryx.exceptions import DatasetNotFoundError +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +def information_schema_routines(): + schema = { + "specific_name": None, + "routine_catalog": None, + "routine_schema": None, + "routine_name": None, + "routine_type": None, + "data_type": None, + "character_maximum_length": None, + "character_octet_length": None, + "numeric_precision": None, + "numeric_scale": None, + "datetime_precision": None, + "character_set_name": None, + "collation_name": None, + "dtd_identifier": None, + "routine_body": None, + "routine_definition": None, + "external_name": None, + "external_language": None, + "parameter_style": None, + "is_deterministic": None, + "sql_data_access": None, + "sql_path": None, + "security_type": None, + "created": None, + "last_altered": None, + "sql_mode": None, + "routine_comment": None, + "definer": None, + "character_set_client": None, + "collation_connection": None, + "database_collation": None, + } + + buffer = [schema] + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="information_schema_routines", + table_aliases=[], + disposition="calculated", + path="information_schema_routines", + ) + + return table + + +def information_schema_views(): + schema = { + "table_catalog": None, + "table_schema": None, + "table_name": None, + "view_definition": None, + "check_option": "NONE", + "is_updatable": "NO", + "definer": None, + "security_type": None, + "character_set_client": None, + "collation_connection": None, + } + + buffer = [schema] + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="information_schema_views", + table_aliases=[], + disposition="calculated", + path="information_schema_views", + ) + + return table + + +def information_schema_tables(): + schema = { + "table_catalog": "opteryx", + "table_schema": None, + "table_name": "$planets", + "table_type": "SYSTEM VIEW", + "engine": "Interal", + "version": "0", + "row_format": "fIXED", + "table_rows": 0, + "avg_row_length": 0, + "data_length": 0, + "max_data_length": 0, + "index_length": 0, + "data_free": 0, + "auto_increment": 0, + "create_time": datetime.datetime.utcnow(), + "update_time": datetime.datetime.utcnow(), + "check_time": datetime.datetime.utcnow(), + "table_collation": None, + "checksum": 0, + "create_options": None, + "table_comment": None, + } + + buffer = [schema] + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="information_schema_tables", + table_aliases=[], + disposition="calculated", + path="information_schema_tables", + ) + + return table + + +class InformationSchemaNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + self._alias = config.get("alias") + self._dataset = config["dataset"].lower() + + # pushed down selection/filter + self._selection = config.get("selection") + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + if self._alias: + return f"{self._dataset} => {self._alias}" + return 
f"{self._dataset}" + + @property + def name(self): # pragma: no cover + return "Information Schema Reader" + + def execute(self) -> Iterable: + if self._dataset == "information_schema.tables": + yield information_schema_tables() + elif self._dataset == "information_schema.views": + yield information_schema_views() + elif self._dataset == "information_schema.routines": + yield information_schema_routines() + else: + raise DatasetNotFoundError(dataset=self._dataset) + return diff --git a/opteryx/operatorsv2/bench/#show_databases_node.py b/opteryx/operatorsv2/bench/#show_databases_node.py new file mode 100644 index 000000000..6dc7e3500 --- /dev/null +++ b/opteryx/operatorsv2/bench/#show_databases_node.py @@ -0,0 +1,79 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Show Stores Node + +This is a SQL Query Execution Plan Node. +""" + +from typing import Iterable + +import pyarrow + +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class ShowDatabasesNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Show Databases" + + @property + def config(self): # pragma: no cover + return "" + + def execute(self) -> Iterable: + from opteryx.connectors import _storage_prefixes + + buffer = [ + { + "Database": "" if s == "_" else s, # type: ignore + "Connector": str(c["connector"].__name__), # type: ignore + "Remove_Prefix": c["remove_prefix"], # type: ignore + "Type": str(c["connector"].mro()[1].__name__[4:-14]), # type: ignore + } + for s, c in _storage_prefixes.items() + if isinstance(c, dict) + ] + buffer.append( + { + "Database": "opteryx", # type: ignore + "Connector": "Internal", # type: ignore + "Remove_Prefix": True, # type: ignore + "Type": "Internal", # type: ignore + } + ) + + table = pyarrow.Table.from_pylist(buffer) + table = Columns.create_table_metadata( + table=table, + expected_rows=len(buffer), + name="show_stores", + table_aliases=[], + disposition="calculated", + path="show_stores", + ) + + yield table + return diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py new file mode 100644 index 000000000..6ab42e4ad --- /dev/null +++ b/opteryx/operatorsv2/cross_join_node.py @@ -0,0 +1,377 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Cross Join Node + +This is a SQL Query Execution Plan Node. + +This performs a CROSS JOIN - CROSS JOIN is not natively supported by PyArrow so this is written +here rather than calling the join() functions +""" + +from dataclasses import dataclass +from typing import Generator +from typing import Set +from typing import Tuple + +import numpy +import pyarrow +from orso.schema import FlatColumn + +from opteryx import EOS +from opteryx.managers.expression import NodeType +from opteryx.models import Node +from opteryx.models import QueryProperties +from opteryx.operators.base_plan_node import BasePlanDataObject + +from . import BasePlanNode + +INTERNAL_BATCH_SIZE: int = 7500 # config +MAX_JOIN_SIZE: int = 1000 # config +MORSEL_SIZE_BYTES: int = 16 * 1024 * 1024 +CROSS_JOIN_UNNEST_BATCH_SIZE = 10000 + + +def _cross_join_unnest_column( + morsels: BasePlanNode = None, + source: Node = None, + target_column: FlatColumn = None, + conditions: Set = None, + statistics=None, + distinct: bool = False, + single_column: bool = False, +) -> Generator[pyarrow.Table, None, None]: + """ + Perform a cross join on an unnested column of pyarrow tables. + + Args: + morsels: An iterable of `pyarrow.Table` objects to be cross joined. + source: The source node indicating the column. + target_column: The column to be unnested. + + Returns: + A generator that yields the resulting `pyarrow.Table` objects. + """ + from opteryx.compiled.cross_join import build_filtered_rows_indices_and_column + from opteryx.compiled.cross_join import build_rows_indices_and_column + from opteryx.compiled.structures import HashSet + from opteryx.compiled.structures import list_distinct + + hash_set = HashSet() + + # Check if the source node type is an identifier, raise error otherwise + if source.node_type != NodeType.IDENTIFIER: + raise NotImplementedError("Can only CROSS JOIN UNNEST on a column") + + batch_size: int = INTERNAL_BATCH_SIZE + at_least_once = False + single_column_collector = [] + + # Loop through each morsel from the morsels execution + for left_morsel in morsels.execute(): + # Break the morsel into batches to avoid memory issues + for left_block in left_morsel.to_batches(max_chunksize=batch_size): + new_block = None + # Fetch the data of the column to be unnested + column_data = left_block[source.schema_column.identity] + + # Filter out null values + valid_offsets = column_data.is_valid() + column_data = column_data.drop_null() + if len(column_data) == 0: + continue + left_block = left_block.filter(valid_offsets) + + # Build indices and new column data + if conditions is None: + indices, new_column_data = build_rows_indices_and_column( + column_data.to_numpy(False) + ) + else: + indices, new_column_data = build_filtered_rows_indices_and_column( + column_data.to_numpy(False), conditions + ) + + if single_column and distinct and indices.size > 0: + # if the unnest target is the only field in the SELECT and we're DISTINCTING + indices = numpy.array(indices, dtype=numpy.int32) + new_column_data, indices, hash_set = list_distinct( + new_column_data, indices, hash_set + ) + + if len(indices) > 0: + if single_column: + single_column_collector.extend(new_column_data) + if len(single_column_collector) > INTERNAL_BATCH_SIZE: + schema = pyarrow.schema( + [ + pyarrow.field( + name=target_column.identity, type=target_column.arrow_field.type + ) + ] + ) + arrow_array = pyarrow.array(single_column_collector) + if 
arrow_array.type != target_column.arrow_field.type: + arrow_array = arrow_array.cast(target_column.arrow_field.type) + new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) + single_column_collector.clear() + del arrow_array + yield new_block + at_least_once = True + else: + # Rebuild the block with the new column data if we have any rows to build for + + total_rows = len(indices) # Both arrays have the same length + block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) + block_size = int(block_size // 1000) * 1000 + + for start_block in range(0, total_rows, block_size): + # Compute the end index for the current chunk + end_block = min(start_block + block_size, total_rows) + + # Slice the current chunk of indices and new_column_data + indices_chunk = indices[start_block:end_block] + new_column_data_chunk = new_column_data[start_block:end_block] + + # Create a new block using the chunk of indices + indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) + new_block = left_block.take(indices_chunk) + new_block = pyarrow.Table.from_batches( + [new_block], schema=left_morsel.schema + ) + + # Append the corresponding chunk of new_column_data to the block + new_block = new_block.append_column( + target_column.identity, pyarrow.array(new_column_data_chunk) + ) + + yield new_block + at_least_once = True + + if single_column_collector: + schema = pyarrow.schema( + [pyarrow.field(name=target_column.identity, type=target_column.arrow_field.type)] + ) + arrow_array = pyarrow.array(single_column_collector) + if arrow_array.type != target_column.arrow_field.type: + arrow_array = arrow_array.cast(target_column.arrow_field.type) + new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) + yield new_block + at_least_once = True + + if not at_least_once: + # Create an empty table with the new schema + schema = left_morsel.schema + new_column = pyarrow.field(target_column.identity, pyarrow.string()) + new_schema = pyarrow.schema(list(schema) + [new_column]) + new_block = pyarrow.Table.from_batches([], schema=new_schema) + yield new_block + + +def _cross_join_unnest_literal( + morsels: BasePlanNode, source: Tuple, target_column: FlatColumn, statistics +) -> Generator[pyarrow.Table, None, None]: + joined_list_size = len(source) + + # Loop through each morsel from the morsels execution + for left_morsel in morsels.execute(): + # Break the morsel into batches to avoid memory issues + for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): + left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) + block_size = left_block.num_rows + + # Repeat each row in the table n times + repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) + appended_table = left_block.take(repeated_indices) + + # Tile the array to match the new number of rows + tiled_array = numpy.tile(source, block_size) + + # Convert tiled_array to PyArrow array and append it to the table + array_column = pyarrow.array(tiled_array) + appended_table = appended_table.append_column(target_column.identity, array_column) + + yield appended_table + + +def _cartesian_product(*arrays): + """ + Cartesian product of arrays creates every combination of the elements in the arrays + """ + array_count = len(arrays) + arr = numpy.empty([len(array) for array in arrays] + [array_count], dtype=numpy.int64) + for i, array in enumerate(numpy.ix_(*arrays)): + arr[..., i] = array + return numpy.hsplit(arr.reshape(-1, array_count), array_count) + + +def 
_cross_join(left_morsel, right, statistics):
+    """
+    A cross join is the cartesian product of two tables - this usually isn't very
+    useful, but it does allow you to do theta joins (non-equi joins).
+    """
+
+    def _chunker(seq_1, seq_2, size):
+        """
+        Chunk two equal-length iterables into chunks of `size` items.
+
+        This returns a generator.
+        """
+        return (
+            (seq_1[pos : pos + size], seq_2[pos : pos + size]) for pos in range(0, len(seq_1), size)
+        )
+
+    from opteryx.utils.arrow import align_tables
+
+    at_least_once = False
+    left_schema = left_morsel.schema
+    right_schema = right.schema
+
+    # Iterate through left table in chunks of size INTERNAL_BATCH_SIZE
+    for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE):
+        # Convert the chunk to a table to retain column names
+        left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema)
+
+        # Create an array of row indices for each table
+        left_array = numpy.arange(left_block.num_rows, dtype=numpy.int64)
+        right_array = numpy.arange(right.num_rows, dtype=numpy.int64)
+
+        # Calculate the cartesian product of the two arrays of row indices
+        left_align, right_align = _cartesian_product(left_array, right_array)
+
+        # Further break down the result into manageable chunks of size MAX_JOIN_SIZE
+        for left_chunk, right_chunk in _chunker(left_align, right_align, MAX_JOIN_SIZE):
+            # Align the tables using the specified chunks of row indices
+            table = align_tables(left_block, right, left_chunk.flatten(), right_chunk.flatten())
+
+            # Yield the resulting table to the caller
+            yield table
+            at_least_once = True
+
+    if not at_least_once:
+        # no rows were emitted, return an empty table with the combined schema
+        fields = [pyarrow.field(name=f.name, type=f.type) for f in right_schema] + [
+            pyarrow.field(name=f.name, type=f.type) for f in left_schema
+        ]
+        combined_schemas = pyarrow.schema(fields)
+        yield pyarrow.Table.from_arrays(
+            [pyarrow.array([]) for _ in combined_schemas], schema=combined_schemas
+        )
+
+
+@dataclass
+class CrossJoinDataObject(BasePlanDataObject):
+    source: str = None
+    _unnest_column: str = None
+    _unnest_target: str = None
+    _filters: str = None
+    _distinct: bool = False
+
+
+class CrossJoinNode(BasePlanNode):
+    """
+    Implements a SQL CROSS JOIN
+    """
+
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+
+        self.source = config.get("column")
+
+        self._left_relation = config.get("left_relation_names")
+        self._right_relation = config.get("right_relation_names")
+
+        # do we have unnest details?
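+        # CROSS JOIN UNNEST expands an ARRAY column (or a literal list) into rows,
+        # pairing each element with its parent record - an illustrative example
+        # (column names assumed):
+        #
+        #   SELECT name, mission
+        #     FROM $astronauts
+        #    CROSS JOIN UNNEST (missions) AS mission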
+ self._unnest_column = config.get("unnest_column") + self._unnest_target = config.get("unnest_target") + self._filters = config.get("filters") + self._distinct = config.get("distinct", False) + + # handle variation in how the unnested column is represented + if self._unnest_column: + if self._unnest_column.node_type == NodeType.NESTED: + self._unnest_column = self._unnest_column.centre + # if we have a literal that's not a tuple, wrap it + if self._unnest_column.node_type == NodeType.LITERAL and not isinstance( + self._unnest_column.value, tuple + ): + self._unnest_column.value = tuple([self._unnest_column.value]) + + self._single_column = config.get("pre_update_columns", set()) == { + self._unnest_target.identity, + } + + self.stream = "left" + self.left_buffer = [] + self.right_buffer = [] + self.left_relation = None + self.right_relation = None + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Cross Join" + + @property + def config(self): # pragma: no cover + filters = "" + if self._filters: + filters = f"({self._unnest_target.name} IN ({', '.join(self._filters)}))" + return f"CROSS JOIN {filters}" + + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self._unnest_column is not None: + if morsel == EOS: + return EOS + if isinstance(self._unnest_column.value, tuple): + if morsel == EOS: + return EOS + return list( + _cross_join_unnest_literal( + morsels=morsel, + source=self._unnest_column.value, + target_column=self._unnest_target, + statistics=self.statistics, + ) + ) + return list( + _cross_join_unnest_column( + morsels=morsel, + source=self._unnest_column, + target_column=self._unnest_target, + conditions=self._filters, + statistics=self.statistics, + distinct=self._distinct, + single_column=self._single_column, + ) + ) + + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + else: + self.left_buffer.append(morsel) + return None + + if self.stream == "right": + if morsel == EOS: + right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none") # type:ignore + self.right_buffer = None + return list(_cross_join(self.left_relation, right_table, self.statistics)) + else: + self.right_buffer.append(morsel) + return None diff --git a/opteryx/operatorsv2/distinct_node.py b/opteryx/operatorsv2/distinct_node.py new file mode 100644 index 000000000..0c4adaf3e --- /dev/null +++ b/opteryx/operatorsv2/distinct_node.py @@ -0,0 +1,73 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Distinct Node + +This is a SQL Query Execution Plan Node. + +This Node eliminates duplicate records. +""" + +from pyarrow import Table + +from opteryx import EOS +from opteryx.models import QueryProperties + +from . 
import BasePlanNode + + +class DistinctNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + from opteryx.compiled.structures import HashSet + + super().__init__(properties=properties) + self._distinct_on = config.get("on") + if self._distinct_on: + self._distinct_on = [col.schema_column.identity for col in self._distinct_on] + self.hash_set = HashSet() + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + return "" + + @property + def name(self): # pragma: no cover + return "Distinction" + + def execute(self, morsel: Table) -> Table: + from opteryx.compiled.structures import distinct + + # We create a HashSet outside the distinct call, this allows us to pass + # the hash to each run of the distinct which means we don't need to concat + # all of the tables together to return a result. + # + # Being able to run morsel-by-morsel means if we have a LIMIT clause, we can + # limit processing + + if morsel == EOS: + return EOS + + unique_indexes, self.hash_set = distinct( + morsel, columns=self._distinct_on, seen_hashes=self.hash_set + ) + + if len(unique_indexes) > 0: + distinct_table = morsel.take(unique_indexes) + return distinct_table + else: + distinct_table = morsel.slice(0, 0) + return distinct_table diff --git a/opteryx/operatorsv2/exit_node.py b/opteryx/operatorsv2/exit_node.py new file mode 100644 index 000000000..a741f98c2 --- /dev/null +++ b/opteryx/operatorsv2/exit_node.py @@ -0,0 +1,107 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Exit Node + +This is a SQL Query Execution Plan Node. + +This does the final preparation before returning results to users. + +This does two things that the projection node doesn't do: + - renames columns from the internal names + - removes all columns not being returned to the user + +This node doesn't do any calculations, it is a pure Projection. +""" + +from dataclasses import dataclass +from dataclasses import field +from typing import List + +from pyarrow import Table + +from opteryx import EOS +from opteryx.exceptions import AmbiguousIdentifierError +from opteryx.exceptions import InvalidInternalStateError +from opteryx.models import LogicalColumn +from opteryx.models import QueryProperties +from opteryx.operators.base_plan_node import BasePlanDataObject + +from . 
import BasePlanNode + + +@dataclass +class ExitDataObject(BasePlanDataObject): + columns: List[LogicalColumn] = field(default_factory=list) + + +class ExitNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self.columns = config.get("columns", []) + + self.do = ExitDataObject(columns=self.columns) + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + return None + + @property + def name(self): # pragma: no cover + return "Exit" + + def execute(self, morsel: Table) -> Table: + if morsel == EOS: + return EOS + + final_columns = [] + final_names = [] + for column in self.columns: + final_columns.append(column.schema_column.identity) + final_names.append(column.current_name) + + if len(final_columns) != len(set(final_columns)): # pragma: no cover + from collections import Counter + + duplicates = [column for column, count in Counter(final_columns).items() if count > 1] + matches = {a for a, b in zip(final_names, final_columns) if b in duplicates} + raise AmbiguousIdentifierError( + message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`" + ) + + if len(set(final_names)) != len(final_names): # we have duplicate names + final_names = [] + for column in self.columns: + # if column.schema_column.origin: + # final_names.append(f"{column.schema_column.origin[0]}.{column.current_name}") + # else: + final_names.append(column.qualified_name) + + if not set(final_columns).issubset(morsel.column_names): # pragma: no cover + mapping = {name: int_name for name, int_name in zip(final_columns, final_names)} + missing_references = { + mapping.get(ref): ref for ref in final_columns if ref not in morsel.column_names + } + + raise InvalidInternalStateError( + f"The following fields were not in the resultset - {', '.join(missing_references.keys())}" + ) + + morsel = morsel.select(final_columns) + morsel = morsel.rename_columns(final_names) + + return morsel diff --git a/opteryx/operatorsv2/explain_node.py b/opteryx/operatorsv2/explain_node.py new file mode 100644 index 000000000..389589c46 --- /dev/null +++ b/opteryx/operatorsv2/explain_node.py @@ -0,0 +1,49 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Explain Node + +This is a SQL Query Execution Plan Node. 
+ +This writes out a query plan +""" + +from typing import Generator + +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class ExplainNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self._query_plan = config.get("query_plan") + + @property + def name(self): # pragma: no cover + return "Explain" + + @property # pragma: no cover + def config(self): + return "" + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + def execute(self) -> Generator: + if self._query_plan: + yield from self._query_plan.explain() diff --git a/opteryx/operatorsv2/filter_node.py b/opteryx/operatorsv2/filter_node.py new file mode 100644 index 000000000..a756c48dd --- /dev/null +++ b/opteryx/operatorsv2/filter_node.py @@ -0,0 +1,81 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Selection Node + +This is a SQL Query Execution Plan Node. + +This node is responsible for applying filters to datasets. +""" + +import numpy +import pyarrow + +from opteryx import EOS +from opteryx.exceptions import SqlError +from opteryx.managers.expression import NodeType +from opteryx.managers.expression import evaluate +from opteryx.managers.expression import evaluate_and_append +from opteryx.managers.expression import format_expression +from opteryx.managers.expression import get_all_nodes_of_type +from opteryx.models import QueryProperties + +from . import BasePlanNode + + +class FilterNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self.filter = config.get("filter") + + self.function_evaluations = get_all_nodes_of_type( + self.filter, + select_nodes=(NodeType.FUNCTION,), + ) + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + return format_expression(self.filter) + + @property + def name(self): # pragma: no cover + return "Filter" + + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + return EOS + + if morsel.num_rows == 0: + return morsel + + if self.function_evaluations: + morsel = evaluate_and_append(self.function_evaluations, morsel) + mask = evaluate(self.filter, morsel) + + if not isinstance(mask, pyarrow.lib.BooleanArray): + try: + mask = pyarrow.array(mask, type=pyarrow.bool_()) + except Exception as err: # nosec + raise SqlError( + f"Unable to filter on expression '{format_expression(self.filter)} {err}'." 
+                )
+        mask = numpy.nonzero(mask)[0]
+
+        # if there are no matching rows, just drop the morsel
+        if mask.size > 0:
+            return morsel.take(pyarrow.array(mask))
+        return morsel.slice(0, 0)
diff --git a/opteryx/operatorsv2/function_dataset_node.py b/opteryx/operatorsv2/function_dataset_node.py
new file mode 100644
index 000000000..409baa294
--- /dev/null
+++ b/opteryx/operatorsv2/function_dataset_node.py
@@ -0,0 +1,150 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Function Dataset Node
+
+This is a SQL Query Execution Plan Node.
+
+This Node creates datasets based on function calls like VALUES and UNNEST.
+"""
+
+import time
+from typing import Generator
+
+import pyarrow
+
+from opteryx import EOS
+from opteryx.exceptions import SqlError
+from opteryx.managers.expression import NodeType
+from opteryx.models import QueryProperties
+from opteryx.utils import series
+
+from .read_node import ReaderNode
+
+
+def _generate_series(**kwargs):
+    value_array = series.generate_series(*kwargs["args"])
+    column_name = kwargs["columns"][0].schema_column.identity
+    return pyarrow.Table.from_arrays([value_array], [column_name])
+
+
+def _unnest(**kwargs):
+    """unnest converts a list into rows"""
+    if kwargs["args"][0].node_type == NodeType.NESTED:
+        list_items = [kwargs["args"][0].centre.value]
+    else:
+        list_items = kwargs["args"][0].value
+    column_name = kwargs["columns"][0].schema_column.identity
+
+    return pyarrow.Table.from_arrays([list_items], [column_name])
+
+
+def _values(**parameters):
+    columns = [col.schema_column.identity for col in parameters["columns"]]
+    values_array = parameters["values"]
+    return [{columns[i]: value.value for i, value in enumerate(values)} for values in values_array]
+
+
+def _fake_data(**kwargs):
+    from orso.faker import generate_fake_data
+
+    rows = kwargs["rows"]
+    schema = kwargs["schema"]
+    for column in schema.columns:
+        column.name = column.identity
+    return generate_fake_data(schema, rows)
+
+
+def _http(**kwargs):
+    aliases = kwargs.get("schema")
+    data = kwargs.get("data")
+
+    renames = [aliases.column(column).identity for column in data.column_names]
+    data = data.rename_columns(renames)
+
+    return data
+
+
+DATASET_FUNCTIONS = {
+    "FAKE": _fake_data,
+    "GENERATE_SERIES": _generate_series,
+    "UNNEST": _unnest,
+    "VALUES": _values,
+    "HTTP": _http,
+}
+
+
+class FunctionDatasetNode(ReaderNode):
+    def __init__(self, properties: QueryProperties, **config):
+        """
+        The Function Dataset Node is responsible for constructing a dataset
+        from a function call (VALUES, UNNEST, GENERATE_SERIES, FAKE, HTTP)
+        and returning a Table/Relation.
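+
+        For example (illustrative), `SELECT * FROM GENERATE_SERIES(1, 5)` is
+        served by `_generate_series`, and `SELECT * FROM UNNEST((1, 2, 3)) AS n`
+        by `_unnest`; both construct a single-column pyarrow Table.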
+        """
+        super().__init__(properties=properties)
+        self.alias = config.get("alias")
+        self.function = config["function"]
+        self.parameters = config
+        self.columns = config.get("columns", [])
+        self.args = config.get("args", [])
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def config(self):  # pragma: no cover
+        from opteryx.managers.expression import format_expression
+
+        if self.function == "FAKE":
+            return f"FAKE ({', '.join(format_expression(arg) for arg in self.args)}{' AS ' + self.alias if self.alias else ''})"
+        if self.function == "GENERATE_SERIES":
+            return f"GENERATE SERIES ({', '.join(format_expression(arg) for arg in self.args)}){' AS ' + self.alias if self.alias else ''}"
+        if self.function == "VALUES":
+            return f"VALUES (({', '.join(self.columns)}) x {len(self.parameters.get('values', []))} AS {self.alias})"
+        if self.function == "UNNEST":
+            return f"UNNEST ({', '.join(format_expression(arg) for arg in self.args)}{' AS ' + self.parameters.get('unnest_target', '')})"
+        if self.function == "HTTP":
+            return f"HTTP ({self.parameters.get('url')}) AS {self.alias}"
+
+    @property
+    def name(self):  # pragma: no cover
+        return "Dataset Constructor"
+
+    @property
+    def can_push_selection(self):
+        return False
+
+    def execute(self, morsel) -> Generator:
+        try:
+            start_time = time.time_ns()
+            data = DATASET_FUNCTIONS[self.function](**self.parameters)  # type:ignore
+            self.statistics.time_evaluate_dataset += time.time_ns() - start_time
+        except TypeError as err:  # pragma: no cover
+            if str(err).startswith("_unnest() takes 2"):
+                raise SqlError(
+                    "UNNEST expects a literal list in parentheses, or a field name as a parameter."
+                )
+            raise err
+
+        if isinstance(data, list):
+            table = pyarrow.Table.from_pylist(data)
+        elif hasattr(data, "arrow"):
+            table = data.arrow()
+        else:
+            table = data
+
+        self.records_out += table.num_rows
+        self.bytes_out += table.nbytes
+        self.statistics.columns_read += len(table.column_names)
+
+        return [table, EOS]
diff --git a/opteryx/operatorsv2/heap_sort_node.py b/opteryx/operatorsv2/heap_sort_node.py
new file mode 100644
index 000000000..7efb101c1
--- /dev/null
+++ b/opteryx/operatorsv2/heap_sort_node.py
@@ -0,0 +1,139 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Heap Sort Node
+
+This is a SQL Query Execution Plan Node.
+
+This node orders a dataset. Note that Heap Sort in this instance isn't the heap
+sort algorithm; it is an approach where a heap of n items (the limit) is
+maintained as the data passes through the operator. Because we are working with
+chunks, we build small batches which we order and then discard the excess items.
+
+This is faster, particularly when working with large datasets, even though we're
+now sorting smaller chunks over and over again.
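+
+For example (illustrative only), with `ORDER BY x LIMIT 3` the operator keeps
+at most three rows after each morsel:
+
+    morsel 1: x = [9, 2, 7]  -> kept: [2, 7, 9]
+    morsel 2: x = [5, 1]     -> kept: [1, 2, 5]
+    morsel 3: x = [8]        -> kept: [1, 2, 5]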
diff --git a/opteryx/operatorsv2/heap_sort_node.py b/opteryx/operatorsv2/heap_sort_node.py
new file mode 100644
index 000000000..7efb101c1
--- /dev/null
+++ b/opteryx/operatorsv2/heap_sort_node.py
@@ -0,0 +1,139 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Heap Sort Node
+
+This is a SQL Query Execution Plan Node.
+
+This node orders a dataset. Note that Heap Sort in this instance isn't the heap sort
+algorithm; it is an approach where a heap of n items (the limit) is maintained as the
+data passes through the operator. Because we are working with chunks, we build small
+batches which we order and then discard the excess items.
+
+This is faster, particularly when working with large datasets, even though we're now
+sorting smaller chunks over and over again.
+"""
+
+from dataclasses import dataclass
+
+import numpy
+import pyarrow
+import pyarrow.compute
+from pyarrow import concat_tables
+
+from opteryx import EOS
+from opteryx.exceptions import ColumnNotFoundError
+from opteryx.models import QueryProperties
+from opteryx.operators.base_plan_node import BasePlanDataObject
+
+from . import BasePlanNode
+
+
+@dataclass
+class HeapSortDataObject(BasePlanDataObject):
+    order_by: list = None
+    limit: int = -1
+
+
+class HeapSortNode(BasePlanNode):
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self.order_by = config.get("order_by", [])
+        self.limit: int = config.get("limit", -1)
+
+        self.do = HeapSortDataObject(order_by=self.order_by, limit=self.limit)
+        self.mapped_order = []
+        self.table = None
+
+        for column, direction in self.order_by:
+            try:
+                self.mapped_order.append(
+                    (
+                        column.schema_column.identity,
+                        direction,
+                    )
+                )
+            except ColumnNotFoundError as cnfe:
+                raise ColumnNotFoundError(
+                    f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}"
+                )
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def config(self):  # pragma: no cover
+        return f"LIMIT = {self.limit} ORDER = " + ", ".join(
+            f"{i[0].value} {i[1][0:3].upper()}" for i in self.order_by
+        )
+
+    @property
+    def name(self):  # pragma: no cover
+        return "Heap Sort"
+
+    def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
+        if morsel == EOS:
+            return [self.table, EOS]
+
+        if self.table:
+            # Concatenate the accumulated table with the new morsel
+            self.table = concat_tables([self.table, morsel], promote_options="permissive")
+        else:
+            self.table = morsel
+
+        # Determine if any columns are string-based
+        use_pyarrow_sort = any(
+            pyarrow.types.is_string(self.table.column(column_name).type)
+            or pyarrow.types.is_binary(self.table.column(column_name).type)
+            for column_name, _ in self.mapped_order
+        )
+
+        # a single string/binary column is sorted faster using pyarrow compute
+        if len(self.mapped_order) == 1 and use_pyarrow_sort:
+            column_name, sort_direction = self.mapped_order[0]
+            column = self.table.column(column_name)
+            if sort_direction == "ascending":
+                sort_indices = pyarrow.compute.sort_indices(column)
+            else:
+                sort_indices = pyarrow.compute.sort_indices(column)[::-1]
+            self.table = self.table.take(sort_indices[: self.limit])
+        # multiple columns including strings are sorted faster using pyarrow
+        elif use_pyarrow_sort:
+            self.table = self.table.sort_by(self.mapped_order).slice(offset=0, length=self.limit)
+        # single column sort using numpy
+        elif len(self.mapped_order) == 1:
+            # Single-column sort using numpy argsort
+            column_name, sort_direction = self.mapped_order[0]
+            column = self.table.column(column_name).to_numpy()
+            if sort_direction == "ascending":
+                sort_indices = numpy.argsort(column)
+            else:
+                sort_indices = numpy.argsort(column)[::-1]  # Reverse for descending
+            # Slice the sorted table
+            self.table = self.table.take(sort_indices[: self.limit])
+        # multi column sort using numpy
+        else:
+            # Multi-column sort using lexsort
+            columns_for_sorting = []
+            directions = []
+            for column_name, sort_direction in self.mapped_order:
+                column = self.table.column(column_name).to_numpy()
+                columns_for_sorting.append(column)
+                directions.append(1 if sort_direction == "ascending" else -1)
+
+            sort_indices = numpy.lexsort(
+                [col[::direction] for col, direction in zip(columns_for_sorting, directions)]
+            )
+            # Slice the sorted table
+            self.table = self.table.take(sort_indices[: self.limit])
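Before the INNER JOIN implementation that follows, a toy sketch (not part of the patch) of the build/probe hash-join idea it uses; plain Python lists stand in for Arrow tables and for the compiled hash_join_map:

    left = {"id": [1, 2, 2, 3]}
    right = {"id": [2, 3, 4]}

    # build: map each left key to the row offsets where it occurs
    build = {}
    for i, key in enumerate(left["id"]):
        build.setdefault(key, []).append(i)

    # probe: for each right row, emit an index pair per matching left row
    left_idx, right_idx = [], []
    for j, key in enumerate(right["id"]):
        for i in build.get(key, ()):
            left_idx.append(i)
            right_idx.append(j)

    print(list(zip(left_idx, right_idx)))  # [(1, 0), (2, 0), (3, 1)]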
diff --git a/opteryx/operatorsv2/inner_join_node.py b/opteryx/operatorsv2/inner_join_node.py
new file mode 100644
index 000000000..1e3c16a34
--- /dev/null
+++ b/opteryx/operatorsv2/inner_join_node.py
@@ -0,0 +1,134 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Inner Join Node
+
+This is a SQL Query Execution Plan Node.
+
+PyArrow has a good LEFT JOIN implementation, but it errors when the
+relations being joined contain STRUCT or ARRAY columns. This is true
+for all of the JOIN types; however, we've only written our own INNER
+and LEFT JOINs.
+
+Its performance is comparable to the PyArrow INNER JOIN: in benchmarks
+sometimes native is faster, sometimes PyArrow is faster. Generally
+PyArrow is more forgiving when the relations are the "wrong" way around
+(unoptimized order), but native is faster for well-ordered relations. As
+we intend to take steps to help ensure relations are well-ordered, this
+should work in our favour.
+
+This is a hash join, completely rewritten from the earlier
+pyarrow_ops implementation, which was a variation of a sort-merge join.
+"""
+
+import pyarrow
+from pyarrow import Table
+
+from opteryx import EOS
+from opteryx.compiled.structures.hash_table import hash_join_map
+from opteryx.models import QueryProperties
+from opteryx.utils.arrow import align_tables
+
+from . import BasePlanNode
+
+
+def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table):
+    """
+    Perform an INNER JOIN using a preprocessed hash table from the left relation.
+
+    Parameters:
+        left_relation: The preprocessed left pyarrow.Table.
+        right_relation: The right pyarrow.Table to join.
+        join_columns: A list of column names to join on.
+        hash_table: The preprocessed hash table from the left table.
+
+    Returns:
+        A pyarrow.Table of the aligned rows matched from the left and right relations.
+    """
+    left_indexes = []
+    right_indexes = []
+
+    right_hash = hash_join_map(right_relation, join_columns)
+
+    for h, right_rows in right_hash.hash_table.items():
+        left_rows = hash_table.get(h)
+        if left_rows is None:
+            continue
+        for left_row in left_rows:
+            for right_row in right_rows:
+                left_indexes.append(left_row)
+                right_indexes.append(right_row)
+
+    return align_tables(right_relation, left_relation, right_indexes, left_indexes)
+
+
+class InnerJoinNode(BasePlanNode):
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self._join_type = config["type"]
+        self._on = config.get("on")
+        self._using = config.get("using")
+
+        self._left_columns = config.get("left_columns")
+        self._left_relation = config.get("left_relation_names")
+
+        self._right_columns = config.get("right_columns")
+        self._right_relation = config.get("right_relation_names")
+
+        self.stream = "left"
+        self.left_buffer = []
+        self.left_hash = None
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return "Inner Join"
+
+    @property
+    def config(self):  # pragma: no cover
+        return ""
+
+    def execute(self, morsel: Table) -> Table:
+        if self.stream == "left":
+            if morsel == EOS:
+                self.stream = "right"
+                self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none")
+                self.left_buffer.clear()
+
+                # in place until #1295 resolved
+                if self._left_columns[0] not in self.left_relation.column_names:
+                    self._right_columns, self._left_columns = (
+                        self._left_columns,
+                        self._right_columns,
+                    )
+
+                self.left_hash = hash_join_map(self.left_relation, self._left_columns)
+            else:
+                self.left_buffer.append(morsel)
+            return None
+
+        if morsel == EOS:
+            return EOS
+
+        # do the join
+        new_morsel = inner_join_with_preprocessed_left_side(
+            left_relation=self.left_relation,
+            right_relation=morsel,
+            join_columns=self._right_columns,
+            hash_table=self.left_hash,
+        )
+
+        return new_morsel
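A sketch, not part of the patch, of the streaming protocol InnerJoinNode follows: buffer left-side morsels until EOS, build the hash table once, then probe with each right-side chunk. The class and attribute names here are hypothetical; rows are plain dicts:

    EOS = object()

    class StreamingHashJoin:
        """Buffer the left side, then probe with right-side chunks."""
        def __init__(self, key):
            self.key = key
            self.reading_left = True
            self.left_rows = []
            self.table = {}

        def push(self, chunk):
            if self.reading_left:
                if chunk is EOS:
                    for i, row in enumerate(self.left_rows):  # build once
                        self.table.setdefault(row[self.key], []).append(i)
                    self.reading_left = False
                    return []
                self.left_rows.extend(chunk)
                return []
            if chunk is EOS:
                return []
            out = []
            for right_row in chunk:  # probe each right chunk immediately
                for i in self.table.get(right_row[self.key], ()):
                    out.append({**self.left_rows[i], **right_row})
            return out

    join = StreamingHashJoin("id")
    join.push([{"id": 1, "a": "x"}])
    join.push(EOS)
    print(join.push([{"id": 1, "b": "y"}]))  # [{'id': 1, 'a': 'x', 'b': 'y'}]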
diff --git a/opteryx/operatorsv2/inner_join_node_single.py b/opteryx/operatorsv2/inner_join_node_single.py
new file mode 100644
index 000000000..2b1b99ed0
--- /dev/null
+++ b/opteryx/operatorsv2/inner_join_node_single.py
@@ -0,0 +1,215 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Inner Join Node (Single Condition)
+
+This is a SQL Query Execution Plan Node.
+
+We have a generic Inner Join node; this node is optimized for single-condition
+joins. It is currently only used for INTEGERS and is about 25% faster than
+the generic INNER JOIN.
+"""
+
+import time
+from typing import Generator
+
+import numpy
+import pyarrow
+from pyarrow import compute
+
+from opteryx.compiled.structures import HashTable
+from opteryx.models import QueryProperties
+from opteryx.operators import BasePlanNode
+from opteryx.operators import OperatorType
+from opteryx.utils.arrow import align_tables
+
+
+def preprocess_left(relation, join_columns):
+    """
+    Build a hash table over the join column of the left relation.
+
+    Parameters:
+        relation (pyarrow.Table): The left relation to preprocess.
+        join_columns (list of str): The (single) join column to hash.
+
+    Returns:
+        HashTable: A hash table mapping hashed values to row offsets.
+    """
+    ht = HashTable()
+
+    array = relation.column(join_columns[0])
+
+    if isinstance(array, pyarrow.ChunkedArray):
+        array = array.combine_chunks()
+
+    num_rows = len(array)
+    # Access the null bitmap buffer
+    null_bitmap = array.buffers()[0]
+
+    if null_bitmap is not None:
+        null_array = [((byte >> bit) & 1) for byte in null_bitmap for bit in range(8)][:num_rows]
+    else:
+        null_array = numpy.ones(num_rows, dtype=bool)
+
+    value_offset_map = numpy.where(null_array)[0]
+    non_null_array = array.filter(compute.is_valid(array))
+
+    if pyarrow.types.is_integer(array.type):
+        for i, val in enumerate(non_null_array.to_numpy()):
+            ht.insert(val, value_offset_map[i])
+
+    elif pyarrow.types.is_fixed_size_binary(array.type) or pyarrow.types.is_floating(
+        array.type
+    ):  # pragma: no cover
+        # Access the data buffer directly for fixed-width types
+        data_buffer = array.buffers()[1]
+        item_size = array.type.bit_width // 8
+
+        for i in range(num_rows):
+            if null_array[i]:
+                start = i * item_size
+                end = start + item_size
+                value_bytes = data_buffer[start:end].to_pybytes()
+                ht.insert(hash(value_bytes), i)
+
+    elif pyarrow.types.is_binary(array.type) or pyarrow.types.is_string(array.type):
+        for i, val in enumerate(array):
+            if null_array[i]:
+                ht.insert(hash(val), i)
+
+    else:
+        raise TypeError(f"Unsupported column type: {array.type}")
+
+    return ht
+
+
+def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table):
+    """
+    Perform an INNER JOIN using a preprocessed hash table from the left relation.
+
+    Parameters:
+        left_relation: The preprocessed left pyarrow.Table.
+        right_relation: The right pyarrow.Table to join.
+        join_columns: A list of column names to join on.
+        hash_table: The preprocessed hash table from the left table.
+
+    Returns:
+        A pyarrow.Table of the aligned rows matched from the left and right relations.
+ """ + left_indexes = [] + right_indexes = [] + + array = right_relation.column(join_columns[0]) + + if isinstance(array, pyarrow.ChunkedArray): + array = array.combine_chunks() + + num_rows = len(array) + # Access the null bitmap buffer + null_bitmap = array.buffers()[0] + + if null_bitmap is not None: + null_array = [((byte >> bit) & 1) for byte in null_bitmap for bit in range(8)][:num_rows] + else: + null_array = numpy.ones(num_rows, dtype=bool) + + value_offset_map = numpy.where(null_array)[0] + non_null_array = array.filter(compute.is_valid(array)) + + if pyarrow.types.is_integer(array.type): + for i, val in enumerate(non_null_array.to_numpy()): + rows = hash_table.get(val) + if rows: + left_indexes.extend(rows) + right_indexes.extend([value_offset_map[i]] * len(rows)) + + elif pyarrow.types.is_fixed_size_binary(array.type) or pyarrow.types.is_floating( + array.type + ): # pragma: no cover + # Access the data buffer directly for fixed-width types + data_buffer = array.buffers()[1] + item_size = array.type.bit_width // 8 + + for i in range(num_rows): + if null_array[i]: + start = i * item_size + end = start + item_size + value_bytes = data_buffer[start:end].to_pybytes() + rows = hash_table.get(hash(value_bytes)) + if rows: + left_indexes.extend(rows) + right_indexes.extend([i] * len(rows)) + + if pyarrow.types.is_binary(array.type) or pyarrow.types.is_string(array.type): + for i, val in enumerate(array): + if null_array[i]: + rows = hash_table.get(hash(val)) + if rows: + left_indexes.extend(rows) + right_indexes.extend([i] * len(rows)) + + return align_tables(right_relation, left_relation, right_indexes, left_indexes) + + +class InnerJoinSingleNode(BasePlanNode): + operator_type = OperatorType.PASSTHRU + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self._join_type = config["type"] + self._on = config.get("on") + self._using = config.get("using") + + self._left_columns = config.get("left_columns") + self._left_relation = config.get("left_relation_names") + + self._right_columns = config.get("right_columns") + self._right_relation = config.get("right_relation_names") + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Inner Join (Single)" + + @property + def config(self): # pragma: no cover + return "" + + def execute(self) -> Generator: + left_node = self._producers[0] # type:ignore + right_node = self._producers[1] # type:ignore + + left_relation = pyarrow.concat_tables(left_node.execute(), promote_options="none") + # in place until #1295 resolved + if self._left_columns[0] not in left_relation.column_names: + self._right_columns, self._left_columns = ( + self._left_columns, + self._right_columns, + ) + + start = time.monotonic_ns() + left_hash = preprocess_left(left_relation, self._left_columns) + self.statistics.time_inner_join += time.monotonic_ns() - start + for morsel in right_node.execute(): + start = time.monotonic_ns() + # do the join + new_morsel = inner_join_with_preprocessed_left_side( + left_relation=left_relation, + right_relation=morsel, + join_columns=self._right_columns, + hash_table=left_hash, + ) + self.statistics.time_inner_join += time.monotonic_ns() - start + yield new_morsel diff --git a/opteryx/operatorsv2/join_node.py b/opteryx/operatorsv2/join_node.py new file mode 100644 index 000000000..1bd2dbc1f --- /dev/null +++ b/opteryx/operatorsv2/join_node.py @@ -0,0 +1,97 @@ +# 
Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Join Node
+
+We have our own implementations of INNER and OUTER joins; this node uses
+PyArrow to implement the less-common ANTI and SEMI joins.
+"""
+
+from typing import Generator
+
+import pyarrow
+
+from opteryx.exceptions import UnsupportedSyntaxError
+from opteryx.models import QueryProperties
+from opteryx.operators import BasePlanNode
+from opteryx.operators import OperatorType
+
+
+class JoinNode(BasePlanNode):
+    operator_type = OperatorType.PASSTHRU
+
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self._join_type = config["type"]
+        self._on = config.get("on")
+        self._using = config.get("using")
+
+        self._left_columns = config.get("left_columns")
+        self._left_relation = config.get("left_relation_names")
+
+        self._right_columns = config.get("right_columns")
+        self._right_relation = config.get("right_relation_names")
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return f"{self._join_type} Join"
+
+    @property
+    def config(self):  # pragma: no cover
+        from opteryx.managers.expression import format_expression
+
+        if self._on:
+            return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})"
+        if self._using:
+            return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})"
+        return f"{self._join_type.upper()}"
+
+    def execute(self) -> Generator:
+        left_node = self._producers[0]  # type:ignore
+        right_node = self._producers[1]  # type:ignore
+
+        left_table = pyarrow.concat_tables(left_node.execute(), promote_options="none")
+        right_table = pyarrow.concat_tables(right_node.execute(), promote_options="none")
+
+        try:
+            new_morsel = left_table.join(
+                right_table,
+                keys=self._left_columns,
+                right_keys=self._right_columns,
+                join_type=self._join_type,
+                coalesce_keys=self._using is not None,
+            )
+        except pyarrow.ArrowInvalid as err:  # pragma: no cover
+            last_token = str(err).split(" ")[-1]
+            column = None
+            for col in left_node.columns:
+                if last_token == col.identity:
+                    column = col.name
+                    break
+            for col in right_node.columns:
+                if last_token == col.identity:
+                    column = col.name
+                    break
+            if column:
+                raise UnsupportedSyntaxError(
+                    f"Unable to ANTI/SEMI JOIN with unsupported column types in table, '{column}'."
+                ) from err
+            raise UnsupportedSyntaxError(
+                "Unable to ANTI/SEMI JOIN with unsupported column types in table."
+            ) from err
+
+        yield new_morsel
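A quick illustration (not part of the patch) of the PyArrow fallback JoinNode relies on; Table.join accepts the "left semi" and "left anti" join types directly:

    import pyarrow

    left = pyarrow.table({"id": [1, 2, 3], "v": ["a", "b", "c"]})
    right = pyarrow.table({"id": [2, 3, 4]})

    # SEMI keeps left rows with a match; ANTI keeps left rows without one
    semi = left.join(right, keys="id", join_type="left semi")
    anti = left.join(right, keys="id", join_type="left anti")
    print(sorted(semi["id"].to_pylist()), anti["id"].to_pylist())  # [2, 3] [1]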
diff --git a/opteryx/operatorsv2/limit_node.py b/opteryx/operatorsv2/limit_node.py
new file mode 100644
index 000000000..55528b7bf
--- /dev/null
+++ b/opteryx/operatorsv2/limit_node.py
@@ -0,0 +1,77 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Limit Node
+
+This is a SQL Query Execution Plan Node.
+
+This Node performs the LIMIT and the OFFSET steps.
+"""
+
+import pyarrow
+
+from opteryx import EOS
+from opteryx.models import QueryProperties
+
+from . import BasePlanNode
+
+
+class LimitNode(BasePlanNode):
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self.limit = config.get("limit")
+        self.offset = config.get("offset", 0)
+
+        self.remaining_rows = self.limit if self.limit is not None else float("inf")
+        self.rows_left_to_skip = max(0, self.offset)
+        self.at_least_one = False
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return "LIMIT"
+
+    @property
+    def config(self):  # pragma: no cover
+        return str(self.limit) + " OFFSET " + str(self.offset)
+
+    def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
+        if morsel == EOS:
+            return EOS
+
+        if self.rows_left_to_skip > 0:
+            if self.rows_left_to_skip >= morsel.num_rows:
+                self.rows_left_to_skip -= morsel.num_rows
+                return None
+            else:
+                morsel = morsel.slice(
+                    offset=self.rows_left_to_skip, length=morsel.num_rows - self.rows_left_to_skip
+                )
+                self.rows_left_to_skip = 0
+
+        if self.remaining_rows <= 0:
+            self.at_least_one = True
+            return morsel.slice(offset=0, length=0)
+
+        if morsel.num_rows > 0:
+            if morsel.num_rows < self.remaining_rows:
+                self.remaining_rows -= morsel.num_rows
+                self.at_least_one = True
+                return morsel
+            else:
+                # take only the rows still needed; the limit is now exhausted
+                rows_to_take = self.remaining_rows
+                self.remaining_rows = 0
+                self.at_least_one = True
+                return morsel.slice(offset=0, length=rows_to_take)
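A worked sketch (not part of the patch) of the LimitNode bookkeeping: OFFSET consumes rows first, then LIMIT counts down across morsels. Morsels are reduced to plain row counts here:

    def apply_limit(morsels, limit, offset):
        to_skip, remaining = offset, limit
        for rows in morsels:  # each morsel is just a row count in this sketch
            if to_skip >= rows:
                to_skip -= rows
                continue
            rows -= to_skip
            to_skip = 0
            take = min(rows, remaining)
            remaining -= take
            if take:
                yield take
            if remaining == 0:
                return

    # LIMIT 5 OFFSET 3 over morsels of 4 rows each: take 1, then 4
    print(list(apply_limit([4, 4, 4], limit=5, offset=3)))  # [1, 4]

The real operator also has to handle the EOS sentinel and emit an empty morsel when nothing qualifies; that state-keeping is what the at_least_one flag above is for.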
diff --git a/opteryx/operatorsv2/noop_node.py b/opteryx/operatorsv2/noop_node.py
new file mode 100644
index 000000000..6ff91cb77
--- /dev/null
+++ b/opteryx/operatorsv2/noop_node.py
@@ -0,0 +1,48 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+No Operation
+
+This is a SQL Query Execution Plan Node.
+"""
+
+from typing import Generator
+
+from opteryx.models import QueryProperties
+from opteryx.operators import BasePlanNode
+from opteryx.operators import OperatorType
+
+
+class NoOpNode(BasePlanNode):
+    operator_type = OperatorType.PASSTHRU
+
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return "NoOp"
+
+    @property
+    def config(self):  # pragma: no cover
+        return ""
+
+    def execute(self) -> Generator:
+        # nodes generally have 0 (scan), 1 (most) or 2 (join, union) producers
+        if self._producers:
+            for morsels in self._producers:
+                yield from morsels.execute()
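Ahead of the OUTER JOIN implementations below, a toy sketch (not part of the patch) of the LEFT OUTER JOIN rule they encode: unmatched left rows survive, with None standing in for the missing right-side index, just as in the index lists built by left_join:

    left_keys = ["a", "b", "c"]
    right_keys = ["b", "d"]

    right_map = {}
    for j, key in enumerate(right_keys):
        right_map.setdefault(key, []).append(j)

    pairs = []
    for i, key in enumerate(left_keys):
        matches = right_map.get(key)
        if matches:
            pairs.extend((i, j) for j in matches)
        else:
            pairs.append((i, None))  # null-filled right side

    print(pairs)  # [(0, None), (1, 0), (2, None)]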
diff --git a/opteryx/operatorsv2/outer_join_node.py b/opteryx/operatorsv2/outer_join_node.py
new file mode 100644
index 000000000..b96c9b03f
--- /dev/null
+++ b/opteryx/operatorsv2/outer_join_node.py
@@ -0,0 +1,331 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Outer Join Node
+
+This is a SQL Query Execution Plan Node.
+
+PyArrow has LEFT/RIGHT/FULL OUTER JOIN implementations, but they error when the
+relations being joined contain STRUCT or ARRAY columns, so we've written our own
+OUTER JOIN implementations.
+
+We also have our own INNER JOIN implementations; it's really just the less-popular
+SEMI and ANTI joins that we leave to PyArrow for now.
+"""
+
+import time
+from typing import Generator
+from typing import List
+
+import pyarrow
+
+from opteryx.compiled.structures import HashTable
+from opteryx.models import QueryProperties
+from opteryx.operators import BasePlanNode
+from opteryx.operators import OperatorType
+from opteryx.utils.arrow import align_tables
+
+
+def left_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
+    """
+    Perform a LEFT JOIN.
+
+    This implementation ensures that all rows from the left table are included in the result set,
+    with rows from the right table matched where possible, and columns from the right table
+    filled with NULLs where no match is found.
+
+    Parameters:
+        left_relation (pyarrow.Table): The left pyarrow.Table to join.
+        right_relation (pyarrow.Table): The right pyarrow.Table to join.
+        left_columns (list of str): Column names from the left table to join on.
+        right_columns (list of str): Column names from the right table to join on.
+
+    Returns:
+        A pyarrow.Table containing the result of the LEFT JOIN operation.
+    """
+    from collections import deque
+
+    from opteryx.compiled.structures.hash_table import hash_join_map
+
+    left_indexes: deque = deque()
+    right_indexes: deque = deque()
+
+    right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none")
+
+    if len(set(left_columns) & set(right_relation.column_names)) > 0:
+        left_columns, right_columns = right_columns, left_columns
+
+    right_hash = hash_join_map(right_relation, right_columns)
+
+    for left_batch in left_relation.execute():
+        left_hash = hash_join_map(left_batch, left_columns)
+        for hash_value, left_rows in left_hash.hash_table.items():
+            right_rows = right_hash.get(hash_value)
+            if right_rows:
+                for l in left_rows:
+                    for r in right_rows:
+                        left_indexes.append(l)
+                        right_indexes.append(r)
+            else:
+                for l in left_rows:
+                    left_indexes.append(l)
+                    right_indexes.append(None)
+
+        if len(left_indexes) > 50_000:
+            table = align_tables(
+                right_relation, left_batch, list(right_indexes), list(left_indexes)
+            )
+            yield table
+            left_indexes.clear()
+            right_indexes.clear()
+
+        if len(left_indexes) > 0:
+            table = align_tables(
+                right_relation, left_batch, list(right_indexes), list(left_indexes)
+            )
+            yield table
+            left_indexes.clear()
+            right_indexes.clear()
+
+
+def full_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
+    """
+    Perform a FULL OUTER JOIN: rows from both tables survive, null-filled where
+    no match is found on the other side.
+    """
+    chunk_size = 1000
+    right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none")
+
+    hash_table = HashTable()
+    non_null_right_values = right_relation.select(right_columns).itercolumns()
+    for i, value_tuple in enumerate(zip(*non_null_right_values)):
+        hash_table.insert(hash(value_tuple), i)
+
+    left_indexes = []
+    right_indexes = []
+
+    left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none")
+    left_values = left_relation.select(left_columns).itercolumns()
+    for i, value_tuple in enumerate(zip(*left_values)):
+        rows = hash_table.get(hash(value_tuple))
+        if rows:
+            right_indexes.extend(rows)
+            left_indexes.extend([i] * len(rows))
+        else:
+            right_indexes.append(None)
+            left_indexes.append(i)
+
+    # use a set for the membership test; `in` on a list is O(n) per probe
+    matched_right_rows = set(right_indexes)
+    for i in range(right_relation.num_rows):
+        if i not in matched_right_rows:
+            right_indexes.append(i)
+            left_indexes.append(None)
+
+    for i in range(0, len(left_indexes), chunk_size):
+        chunk_left_indexes = left_indexes[i : i + chunk_size]
+        chunk_right_indexes = right_indexes[i : i + chunk_size]
+
+        # Align this chunk and add the resulting table to our list
+        yield align_tables(right_relation, left_relation, chunk_right_indexes, chunk_left_indexes)
+
+
+def right_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
+    """
+    Perform a RIGHT JOIN.
+
+    This implementation ensures that all rows from the right table are included in the result set,
+    with rows from the left table matched where possible, and columns from the left table
+    filled with NULLs where no match is found.
+
+    Parameters:
+        left_relation (pyarrow.Table): The left pyarrow.Table to join.
+        right_relation (pyarrow.Table): The right pyarrow.Table to join.
+        left_columns (list of str): Column names from the left table to join on.
+        right_columns (list of str): Column names from the right table to join on.
+
+    Yields:
+        pyarrow.Table: A chunk of the result of the RIGHT JOIN operation.
+ """ + chunk_size = 1000 + left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none") + + hash_table = HashTable() + non_null_left_values = left_relation.select(left_columns).itercolumns() + for i, value_tuple in enumerate(zip(*non_null_left_values)): + hash_table.insert(hash(value_tuple), i) + + # Iterate over the right_relation in chunks + right_batches = right_relation.execute() + for right_batch in right_batches: + for right_chunk in right_batch.to_batches(chunk_size): + left_indexes = [] + right_indexes = [] + + right_values = right_chunk.select(right_columns).itercolumns() + for i, value_tuple in enumerate(zip(*right_values)): + rows = hash_table.get(hash(value_tuple)) + if rows: + left_indexes.extend(rows) + right_indexes.extend([i] * len(rows)) + else: + left_indexes.append(None) + right_indexes.append(i) + + # Yield the aligned chunk + # we intentionally swap them to the other calls so we're building a table + # not a record batch (what the chunk is) + yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) + + +def left_anti_join( + left_relation, right_relation, left_columns: List[str], right_columns: List[str] +): + """ + Perform a LEFT ANTI JOIN. + + This implementation ensures that all rows from the left table are included in the result set, + where there are no matching rows in the right table based on the join columns. + + Parameters: + left_relation (pyarrow.Table): The left pyarrow.Table to join. + right_relation (pyarrow.Table): The right pyarrow.Table to join. + left_columns (list of str): Column names from the left table to join on. + right_columns (list of str): Column names from the right table to join on. + + Returns: + A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. + """ + right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") + + hash_table = HashTable() + non_null_right_values = right_relation.select(right_columns).itercolumns() + for i, value_tuple in enumerate(zip(*non_null_right_values)): + hash_table.insert(hash(value_tuple), i) + + at_least_once = False + # Iterate over the left_relation in chunks + for left_batch in left_relation.execute(): + left_indexes = [] + left_values = left_batch.select(left_columns).itercolumns() + for i, value_tuple in enumerate(zip(*left_values)): + rows = hash_table.get(hash(value_tuple)) + if not rows: # Only include left rows that have no match in the right table + left_indexes.append(i) + + # Filter the left_chunk based on the anti join condition + if left_indexes: + yield left_batch.take(left_indexes) + at_least_once = True + + if not at_least_once: + yield left_batch.slice(0, 0) + + +def left_semi_join( + left_relation, right_relation, left_columns: List[str], right_columns: List[str] +): + """ + Perform a LEFT SEMI JOIN. + + This implementation ensures that all rows from the left table that have a matching row in the right table + based on the join columns are included in the result set. + + Parameters: + left_relation (pyarrow.Table): The left pyarrow.Table to join. + right_relation (pyarrow.Table): The right pyarrow.Table to join. + left_columns (list of str): Column names from the left table to join on. + right_columns (list of str): Column names from the right table to join on. + + Returns: + A pyarrow.Table containing the result of the LEFT SEMI JOIN operation. 
+    """
+    right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none")
+
+    hash_table = HashTable()
+    non_null_right_values = right_relation.select(right_columns).itercolumns()
+    for i, value_tuple in enumerate(zip(*non_null_right_values)):
+        hash_table.insert(hash(value_tuple), i)
+
+    at_least_once = False
+    # Iterate over the left_relation in chunks
+    for left_batch in left_relation.execute():
+        left_indexes = []
+        left_values = left_batch.select(left_columns).itercolumns()
+
+        for i, value_tuple in enumerate(zip(*left_values)):
+            rows = hash_table.get(hash(value_tuple))
+            if rows:  # Only include left rows that have a match in the right table
+                left_indexes.append(i)
+
+        # Filter the left_chunk based on the semi join condition
+        if left_indexes:
+            yield left_batch.take(left_indexes)
+            at_least_once = True
+
+    if not at_least_once:
+        yield left_batch.slice(0, 0)
+
+
+class OuterJoinNode(BasePlanNode):
+    operator_type = OperatorType.PASSTHRU
+
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self._join_type = config["type"]
+        self._on = config.get("on")
+        self._using = config.get("using")
+
+        self._left_columns = config.get("left_columns")
+        self._left_relation = config.get("left_relation_names")
+
+        self._right_columns = config.get("right_columns")
+        self._right_relation = config.get("right_relation_names")
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return self._join_type
+
+    @property
+    def config(self):  # pragma: no cover
+        from opteryx.managers.expression import format_expression
+
+        if self._on:
+            return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})"
+        if self._using:
+            return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})"
+        return f"{self._join_type.upper()}"
+
+    def execute(self) -> Generator:
+        left_node = self._producers[0]  # type:ignore
+        right_node = self._producers[1]  # type:ignore
+
+        join_provider = providers.get(self._join_type)
+
+        start = time.monotonic_ns()
+        for morsel in join_provider(
+            left_relation=left_node,
+            right_relation=right_node,
+            left_columns=self._left_columns,
+            right_columns=self._right_columns,
+        ):
+            self.statistics.time_outer_join += time.monotonic_ns() - start
+            yield morsel
+            start = time.monotonic_ns()
+
+
+providers = {
+    "left outer": left_join,
+    "full outer": full_join,
+    "right outer": right_join,
+    "left anti": left_anti_join,
+    "left semi": left_semi_join,
+}
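A short sketch (not part of the patch) of the chunked-emission pattern full_join and right_join use: accumulated index lists are sliced into fixed-size chunks so each yielded table stays small:

    def emit_in_chunks(left_idx, right_idx, chunk_size=1000):
        for i in range(0, len(left_idx), chunk_size):
            yield left_idx[i : i + chunk_size], right_idx[i : i + chunk_size]

    pairs = list(emit_in_chunks(list(range(2500)), list(range(2500))))
    print([len(left) for left, _ in pairs])  # [1000, 1000, 500]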
diff --git a/opteryx/operatorsv2/projection_node.py b/opteryx/operatorsv2/projection_node.py
new file mode 100644
index 000000000..03d6c9312
--- /dev/null
+++ b/opteryx/operatorsv2/projection_node.py
@@ -0,0 +1,72 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Projection Node
+
+This is a SQL Query Execution Plan Node.
+
+This Node eliminates columns that are not needed in a Relation. This is also the Node
+that performs column renames.
+"""
+
+import pyarrow
+
+from opteryx import EOS
+from opteryx.managers.expression import NodeType
+from opteryx.managers.expression import evaluate_and_append
+from opteryx.models import QueryProperties
+
+from . import BasePlanNode
+
+
+class ProjectionNode(BasePlanNode):
+    def __init__(self, properties: QueryProperties, **config):
+        """
+        Attribute Projection: removes unwanted columns and performs column renames.
+        """
+        super().__init__(properties=properties)
+
+        projection = config["projection"] + config.get("order_by_columns", [])
+
+        self.projection = []
+        for column in projection:
+            self.projection.append(column.schema_column.identity)
+
+        self.evaluations = [
+            column for column in projection if column.node_type != NodeType.IDENTIFIER
+        ]
+
+        self.columns = config["projection"]
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def config(self):  # pragma: no cover
+        from opteryx.managers.expression import format_expression
+
+        return ", ".join(format_expression(col) for col in self.columns)
+
+    @property
+    def name(self):  # pragma: no cover
+        return "Projection"
+
+    def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
+        if morsel == EOS:
+            return EOS
+
+        # If any of the columns need evaluating, we need to do that here
+        morsel = evaluate_and_append(self.evaluations, morsel)
+        morsel = morsel.select(self.projection)
+        return morsel
diff --git a/opteryx/operatorsv2/read_node.py b/opteryx/operatorsv2/read_node.py
new file mode 100644
index 000000000..aa1505568
--- /dev/null
+++ b/opteryx/operatorsv2/read_node.py
@@ -0,0 +1,224 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Read Node
+
+This is the SQL Query Execution Plan Node responsible for the reading of data.
+
+It wraps different internal readers (e.g. GCP Blob reader, SQL Reader) and
+normalizes the data into the format used for internal processing.
+"""
+
+import time
+from typing import Generator
+
+import orjson
+import pyarrow
+from orso.schema import RelationSchema
+from orso.schema import convert_orso_schema_to_arrow_schema
+
+from opteryx import EOS
+from opteryx.models import QueryProperties
+
+from . import BasePlanNode
+
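Before the reader helpers, a self-contained sketch (not part of the patch) of what struct_to_jsonb below does: serialise a STRUCT column to JSON bytes and put it back in the same column position:

    import orjson
    import pyarrow

    table = pyarrow.table({"payload": [{"a": 1}, {"a": 2}]})
    # each struct row becomes JSON-encoded bytes; nulls stay null
    json_col = pyarrow.array(
        [orjson.dumps(row.as_py()) if row.is_valid else None for row in table.column("payload")],
        type=pyarrow.binary(),
    )
    table = table.drop_columns("payload").add_column(
        0, pyarrow.field("payload", pyarrow.binary()), json_col
    )
    print(table.column("payload").to_pylist())  # [b'{"a":1}', b'{"a":2}']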
+def struct_to_jsonb(table: pyarrow.Table) -> pyarrow.Table:
+    """
+    Converts any STRUCT columns in a PyArrow Table to JSON strings and replaces them
+    in the same column position.
+
+    Parameters:
+        table (pyarrow.Table): The PyArrow Table to process.
+
+    Returns:
+        pyarrow.Table: A new PyArrow Table with STRUCT columns converted to JSON strings.
+    """
+    for i in range(table.num_columns):
+        field = table.schema.field(i)
+
+        # Check if the column is a STRUCT
+        if pyarrow.types.is_struct(field.type):
+            # Convert each row in the STRUCT column to a JSON string
+            json_strings = [
+                orjson.dumps(row.as_py()) if row.is_valid else None for row in table.column(i)
+            ]
+            json_array = pyarrow.array(json_strings, type=pyarrow.binary())
+
+            # Drop the original STRUCT column
+            table = table.drop_columns(field.name)
+
+            # Insert the new JSON column at the same position
+            table = table.add_column(
+                i, pyarrow.field(name=field.name, type=pyarrow.binary()), json_array
+            )
+
+    return table
+
+
+def normalize_morsel(schema: RelationSchema, morsel: pyarrow.Table) -> pyarrow.Table:
+    if len(schema.columns) == 0 and morsel.column_names != ["*"]:
+        one_column = pyarrow.array([True] * morsel.num_rows, type=pyarrow.bool_())
+        morsel = morsel.append_column("*", one_column)
+        return morsel.select(["*"])
+
+    # rename columns for internal use
+    target_column_names = []
+    # columns in the data but not in the schema, droppable
+    droppable_columns = []
+
+    # Find which columns to drop and which columns we already have
+    for i, column in enumerate(morsel.column_names):
+        column_name = schema.find_column(column)
+        if column_name is None:
+            droppable_columns.append(i)
+        else:
+            target_column_names.append(str(column_name))
+
+    # Remove from the end otherwise we'll remove the wrong columns after we've removed one
+    droppable_columns.reverse()
+    for droppable in droppable_columns:
+        morsel = morsel.remove_column(droppable)
+
+    # rename columns to the internal names (identities)
+    morsel = morsel.rename_columns(target_column_names)
+
+    # add columns we don't have, populate with nulls but try to get the correct type
+    for column in schema.columns:
+        if column.identity not in target_column_names:
+            null_column = pyarrow.array([None] * morsel.num_rows, type=column.arrow_field.type)
+            field = pyarrow.field(name=column.identity, type=column.arrow_field.type)
+            morsel = morsel.append_column(field, null_column)
+
+    # ensure the columns are in the right order
+    return morsel.select([col.identity for col in schema.columns])
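A small sketch (not part of the patch) of the null-padding step in normalize_morsel above: a column the schema expects but the morsel lacks is appended as an all-null column of the declared type:

    import pyarrow

    morsel = pyarrow.table({"present": [1, 2, 3]})
    missing = pyarrow.field("missing", pyarrow.string())
    nulls = pyarrow.array([None] * morsel.num_rows, type=missing.type)
    morsel = morsel.append_column(missing, nulls)
    print(morsel.schema)  # present: int64, missing: string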
+
+
+def merge_schemas(
+    hypothetical_schema: RelationSchema, observed_schema: pyarrow.Schema
+) -> pyarrow.Schema:
+    """
+    Using the hypothetical schema as the base, replace with fields from the observed schema
+    which are a Decimal or List/Array type.
+    """
+    # convert the Orso schema to an Arrow schema
+    hypothetical_arrow_schema = convert_orso_schema_to_arrow_schema(hypothetical_schema, True)
+
+    # Convert the hypothetical schema to a dictionary for easy modification
+    schema_dict = {field.name: field for field in hypothetical_arrow_schema}
+
+    # Iterate through fields in the observed schema
+    for observed_field in observed_schema:
+        # Check if the field is of type Decimal or List/Array
+        if pyarrow.types.is_decimal(observed_field.type) or pyarrow.types.is_list(
+            observed_field.type
+        ):
+            # Replace or add the field to the schema dictionary
+            schema_dict[observed_field.name] = observed_field
+
+    # Create a new schema from the updated dictionary of fields
+    merged_schema = pyarrow.schema(list(schema_dict.values()))
+
+    return merged_schema
+
+
+class ReaderNode(BasePlanNode):
+    def __init__(self, properties: QueryProperties, **parameters):
+        BasePlanNode.__init__(self, properties=properties, **parameters)
+
+        self.start_date = parameters.get("start_date")
+        self.end_date = parameters.get("end_date")
+        self.hints = parameters.get("hints", [])
+        self.columns = parameters.get("columns", [])
+        self.predicates = parameters.get("predicates", [])
+
+        self.connector = parameters.get("connector")
+        self.schema = parameters.get("schema")
+        self.limit = parameters.get("limit")
+
+        if len(self.hints) != 0:
+            self.statistics.add_message("All HINTS are currently ignored")
+
+    def to_dict(self) -> dict:
+        return {
+            "identity": f"read-{self.identity}",
+            "operator": "ReadNode",
+            "schema": self.columns,
+            "projection": self.columns,
+            "filters": self.predicates,
+        }
+
+    @classmethod
+    def from_dict(cls, dic: dict) -> "BasePlanNode":
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        """friendly name for this step"""
+        return "Read"
+
+    @property
+    def config(self):
+        """Additional details for this step"""
+        date_range = ""
+        if self.parameters.get("start_date") == self.parameters.get("end_date"):
+            if self.parameters.get("start_date") is not None:
+                date_range = f" FOR '{self.parameters.get('start_date')}'"
+        else:
+            date_range = (
+                f" FOR '{self.parameters.get('start_date')}' TO '{self.parameters.get('end_date')}'"
+            )
+        return (
+            f"{self.connector.__type__} "
+            f"({self.parameters.get('relation')}"
+            f"{' AS ' + self.parameters.get('alias') if self.parameters.get('alias') else ''}"
+            f"{date_range}"
+            f"{' WITH(' + ','.join(self.parameters.get('hints')) + ')' if self.parameters.get('hints') else ''})"
+        )
+
+    def execute(self, morsel) -> Generator:
+        """Perform this step, time how long is spent doing work"""
+
+        morsel = None
+        orso_schema = self.schema
+        orso_schema_cols = []
+        for col in orso_schema.columns:
+            if col.identity in [c.schema_column.identity for c in self.columns]:
+                orso_schema_cols.append(col)
+        orso_schema.columns = orso_schema_cols
+        arrow_schema = None
+        start_clock = time.monotonic_ns()
+        reader = self.connector.read_dataset(
+            columns=self.columns, predicates=self.predicates, limit=self.limit
+        )
+        for morsel in reader:
+            # try to make each morsel have the same schema
+            morsel = struct_to_jsonb(morsel)
+            morsel = normalize_morsel(orso_schema, morsel)
+            if arrow_schema is None:
+                arrow_schema = merge_schemas(self.schema, morsel.schema)
+            if arrow_schema.names:
+                morsel = morsel.cast(arrow_schema)
+
+            self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock
+            self.statistics.blobs_read += 1
+            self.records_out += morsel.num_rows
+            self.bytes_out += morsel.nbytes
+            yield morsel
+            
start_clock = time.monotonic_ns() + if morsel: + self.statistics.columns_read += morsel.num_columns + else: + self.statistics.columns_read += len(orso_schema.columns) + + yield EOS diff --git a/opteryx/operatorsv2/set_variable_node.py b/opteryx/operatorsv2/set_variable_node.py new file mode 100644 index 000000000..8d55e0284 --- /dev/null +++ b/opteryx/operatorsv2/set_variable_node.py @@ -0,0 +1,53 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Set Variables Node + +This is a SQL Query Execution Plan Node. +""" + +from typing import Generator + +from opteryx.constants import QueryStatus +from opteryx.models import NonTabularResult +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class SetVariableNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + self.variable = config.get("variable") + self.value = config.get("value") + + self.variables = config.get("variables") + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Set Variables" + + @property + def config(self): # pragma: no cover + return f"{self.variable} TO {self.value}" + + def execute(self) -> Generator: + self.variables[self.variable] = self.value + return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operatorsv2/show_columns_node.py b/opteryx/operatorsv2/show_columns_node.py new file mode 100644 index 000000000..247ac6159 --- /dev/null +++ b/opteryx/operatorsv2/show_columns_node.py @@ -0,0 +1,118 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Show Columns Node + +This is a SQL Query Execution Plan Node. 
+
+Gives information about a dataset's columns.
+"""
+
+from typing import Generator
+
+import pyarrow
+
+from opteryx.models import QueryProperties
+from opteryx.operators import BasePlanNode
+from opteryx.operators import OperatorType
+
+
+def _simple_collector(schema):
+    """
+    We've been given the schema, so just translate to a table
+    """
+    buffer = []
+    for column in schema.columns:
+        new_row = {
+            "name": column.name,
+            "type": column.type,
+            "nullable": column.nullable,
+            "aliases": column.aliases,
+        }
+        buffer.append(new_row)
+
+    table = pyarrow.Table.from_pylist(buffer)
+    return table
+
+
+def _extended_collector(morsels):
+    """
+    Collect summary statistics about each column
+
+    We use orso, which means converting to an orso DataFrame and then converting back
+    to a PyArrow table.
+    """
+    import orso
+
+    profile = None
+    for morsel in morsels:
+        df = orso.DataFrame.from_arrow(morsel)
+        if profile is None:
+            profile = df.profile
+        else:
+            profile += df.profile
+
+    return profile.to_dicts()
+
+
+class ShowColumnsNode(BasePlanNode):
+    operator_type = OperatorType.PRODUCER
+
+    def __init__(self, properties: QueryProperties, **config):
+        super().__init__(properties=properties)
+        self._full = config.get("full")
+        self._extended = config.get("extended")
+        self._schema = config.get("schema")
+        self._column_map = {c.schema_column.identity: c.source_column for c in config["columns"]}
+
+    @classmethod
+    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
+        raise NotImplementedError()
+
+    @property
+    def name(self):  # pragma: no cover
+        return "Show Columns"
+
+    @property
+    def config(self):  # pragma: no cover
+        return ""
+
+    def rename_column(self, dic: dict, renames) -> dict:
+        dic["name"] = renames[dic["name"]]
+        return dic
+
+    def execute(self) -> Generator:
+        morsels = self._producers[0]  # type:ignore
+
+        if morsels is None:
+            return None
+
+        if not (self._full or self._extended):
+            # if it's not full or extended, just get the list of columns and their
+            # types
+            yield _simple_collector(self._schema)
+            return
+
+        if self._full and not self._extended:
+            # we're going to read the full table, so we can count stuff
+            dicts = _extended_collector(morsels.execute())
+            dicts = [self.rename_column(d, self._column_map) for d in dicts]
+            yield pyarrow.Table.from_pylist(dicts)
+            return
+
+        if self._extended:
+            # get everything we can reasonably get
+            dicts = _extended_collector(morsels.execute())
+            dicts = [self.rename_column(d, self._column_map) for d in dicts]
+            yield pyarrow.Table.from_pylist(dicts)
+            return
diff --git a/opteryx/operatorsv2/show_create_node.py b/opteryx/operatorsv2/show_create_node.py
new file mode 100644
index 000000000..c33a5d415
--- /dev/null
+++ b/opteryx/operatorsv2/show_create_node.py
@@ -0,0 +1,65 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Show Create Node
+
+This is a SQL Query Execution Plan Node.
+""" + +from typing import Generator + +import pyarrow + +from opteryx.exceptions import DatasetNotFoundError +from opteryx.exceptions import UnsupportedSyntaxError +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class ShowCreateNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + self.object_type = config.get("object_type") + self.object_name = config.get("object_name") + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Show" + + @property + def config(self): # pragma: no cover + return "" + + def execute(self) -> Generator: + if self.object_type == "VIEW": + from opteryx.planner.views import is_view + from opteryx.planner.views import view_as_sql + + if is_view(self.object_name): + view_sql = view_as_sql(self.object_name) + buffer = [{self.object_name: view_sql}] + table = pyarrow.Table.from_pylist(buffer) + yield table + return + + raise DatasetNotFoundError(self.object_name) + + raise UnsupportedSyntaxError("Invalid SHOW statement") diff --git a/opteryx/operatorsv2/show_value_node.py b/opteryx/operatorsv2/show_value_node.py new file mode 100644 index 000000000..c889b66e1 --- /dev/null +++ b/opteryx/operatorsv2/show_value_node.py @@ -0,0 +1,60 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Show Variables Node + +This is a SQL Query Execution Plan Node. 
+""" + +from typing import Generator + +import pyarrow + +from opteryx.exceptions import SqlError +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class ShowValueNode(BasePlanNode): + operator_type = OperatorType.PRODUCER + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + + self.key = config.get("key") + self.kind = config.get("kind") + self.value = config.get("value") + + if self.kind == "PARAMETER": + if self.value[0] == "@": + raise SqlError("PARAMETERS cannot start with '@'") + self.key = self.value + self.value = properties.variables[self.value] + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Show Value" + + @property + def config(self): # pragma: no cover + return "" + + def execute(self) -> Generator: + buffer = [{"name": self.key, "value": str(self.value)}] + table = pyarrow.Table.from_pylist(buffer) + yield table diff --git a/opteryx/operatorsv2/sort_node.py b/opteryx/operatorsv2/sort_node.py new file mode 100644 index 000000000..e685f12d7 --- /dev/null +++ b/opteryx/operatorsv2/sort_node.py @@ -0,0 +1,100 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Sort Node + +This is a SQL Query Execution Plan Node. + +This node orders a dataset +""" + +import numpy +from orso.types import OrsoTypes +from pyarrow import Table +from pyarrow import concat_tables + +from opteryx import EOS +from opteryx.exceptions import ColumnNotFoundError +from opteryx.exceptions import UnsupportedSyntaxError +from opteryx.managers.expression import NodeType +from opteryx.models import QueryProperties + +from . 
import BasePlanNode + + +class SortNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self.order_by = config.get("order", []) + self.morsels = [] + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def config(self): # pragma: no cover + return ", ".join([f"{i[0].value} {i[1][0:3].upper()}" for i in self.order_by]) + + @property + def name(self): # pragma: no cover + return "Sort" + + def execute(self, morsel: Table) -> Table: + if morsel != EOS and morsel.num_rows > 0: + self.morsels.append(morsel) + return None + + table = concat_tables(self.morsels, promote_options="permissive") + + mapped_order = [] + + for column, direction in self.order_by: + if column.node_type == NodeType.FUNCTION: + # ORDER BY RAND() shuffles the results + # we create a random list, sort that then take the rows from the + # table in that order - this is faster than ordering the data + if column.value in ("RANDOM", "RAND"): + new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows)) + table = table.take(new_order) + return table + + raise UnsupportedSyntaxError( + "`ORDER BY` only supports `RAND()` as a functional sort order." + ) + + elif column.node_type == NodeType.LITERAL and column.type == OrsoTypes.INTEGER: + # we have an index rather than a column name, it's a natural + # number but the list of column names is zero-based, so we + # subtract one + column_name = table.column_names[int(column.value) - 1] + mapped_order.append( + ( + column_name, + direction, + ) + ) + else: + try: + mapped_order.append( + ( + column.schema_column.identity, + direction, + ) + ) + except ColumnNotFoundError as cnfe: # pragma: no cover + raise ColumnNotFoundError( + f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" + ) + + return [table.sort_by(mapped_order), EOS] diff --git a/opteryx/operatorsv2/union_node.py b/opteryx/operatorsv2/union_node.py new file mode 100644 index 000000000..34e102be9 --- /dev/null +++ b/opteryx/operatorsv2/union_node.py @@ -0,0 +1,60 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Union Node + +This is a SQL Query Execution Plan Node. 
+""" + +from typing import Generator + +from opteryx.models import QueryProperties +from opteryx.operators import BasePlanNode +from opteryx.operators import OperatorType + + +class UnionNode(BasePlanNode): + operator_type = OperatorType.PASSTHRU + + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self.columns = config.get("columns", []) + self.column_ids = [c.schema_column.identity for c in self.columns] + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return "Union" + + @property + def config(self): # pragma: no cover + return "" + + def execute(self) -> Generator: + """ + Union needs to ensure the column names are the same and that + coercible types are coerced. + """ + schema = None + if self._producers: + for morsels in self._producers: + for morsel in morsels.execute(): + if schema is None: + schema = morsel.schema + else: + morsel = morsel.rename_columns(schema.names) + morsel = morsel.cast(schema) + yield morsel.select(self.column_ids) diff --git a/opteryx/planner/__init__.py b/opteryx/planner/__init__.py index 832ffe51d..c8604c66c 100644 --- a/opteryx/planner/__init__.py +++ b/opteryx/planner/__init__.py @@ -55,8 +55,8 @@ from opteryx import config from opteryx.managers.expression import NodeType -from opteryx.models import ExecutionTree from opteryx.models import Node +from opteryx.models import PhysicalPlan PROFILE_LOCATION = config.PROFILE_LOCATION @@ -125,7 +125,7 @@ def query_planner( connection, qid: str, statistics, -) -> Generator[ExecutionTree, Any, Any]: +) -> Generator[PhysicalPlan, Any, Any]: from opteryx.exceptions import SqlError from opteryx.models import QueryProperties from opteryx.planner.ast_rewriter import do_ast_rewriter @@ -134,8 +134,9 @@ def query_planner( from opteryx.planner.logical_planner import LogicalPlan from opteryx.planner.logical_planner import apply_visibility_filters from opteryx.planner.logical_planner import do_logical_planning_phase + from opteryx.planner.physical_planner import create_physical_plan from opteryx.planner.sql_rewriter import do_sql_rewrite - from opteryx.planner.temporary_physical_planner import create_physical_plan + from opteryx.planner.temporary_physical_planner import create_legacy_physical_plan from opteryx.third_party import sqloxide # SQL Rewriter extracts temporal filters @@ -200,6 +201,9 @@ def query_planner( # before we write the new optimizer and execution engine, convert to a V1 plan start = time.monotonic_ns() query_properties = QueryProperties(qid=qid, variables=connection.context.variables) - physical_plan = create_physical_plan(optimized_plan, query_properties) + if config.EXPERIMENTAL_EXECUTION_ENGINE: + physical_plan = create_physical_plan(optimized_plan, query_properties) + else: + physical_plan = create_legacy_physical_plan(optimized_plan, query_properties) statistics.time_planning_physical_planner += time.monotonic_ns() - start yield physical_plan diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py index 86105d750..61eb80937 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py @@ -19,6 +19,7 @@ We try to push the limit to the other side of PROJECTS """ +from opteryx.connectors.capabilities import LimitPushable from 
opteryx.planner.logical_planner import LogicalPlan
from opteryx.planner.logical_planner import LogicalPlanNode
from opteryx.planner.logical_planner import LogicalPlanStepType
@@ -37,12 +38,18 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
            context.collected_limits.append(node)
            return context

-        if node.node_type in (
+        if node.node_type == LogicalPlanStepType.Scan and LimitPushable in node.connector.__class__.mro():
+            for limit_node in context.collected_limits:
+                if node.relation in limit_node.all_relations:
+                    self.statistics.optimization_limit_pushdown += 1
+                    context.optimized_plan.remove_node(limit_node.nid, heal=True)
+                    node.limit = limit_node.limit
+                    context.optimized_plan[context.node_id] = node
+        elif node.node_type in (
            LogicalPlanStepType.Join,
            LogicalPlanStepType.Scan,
            LogicalPlanStepType.AggregateAndGroup,
            LogicalPlanStepType.Aggregate,
-            LogicalPlanStepType.Subquery,
            LogicalPlanStepType.Union,
            LogicalPlanStepType.Filter,
        ):
diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py
index 6d69b16ee..2de46489d 100644
--- a/opteryx/planner/logical_planner/logical_planner.py
+++ b/opteryx/planner/logical_planner/logical_planner.py
@@ -136,7 +136,10 @@ def __str__(self):
            predicates = ""
            if self.predicates:
                predicates = " (" + " AND ".join(map(format_expression, self.predicates)) + ")"
-            return f"READ ({self.relation}{alias}{date_range}{' WITH(' + ','.join(self.hints) + ')' if self.hints else ''}){columns}{predicates}"
+            limit = ""
+            if self.limit:
+                limit = f" LIMIT {self.limit}"
+            return f"READ ({self.relation}{alias}{date_range}{' WITH(' + ','.join(self.hints) + ')' if self.hints else ''}){columns}{predicates}{limit}"
        if node_type == LogicalPlanStepType.Set:
            return f"SET ({self.variable} TO {self.value.value})"
        if node_type == LogicalPlanStepType.Show:
diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py
new file mode 100644
index 000000000..69a28b2c5
--- /dev/null
+++ b/opteryx/planner/physical_planner.py
@@ -0,0 +1,112 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
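(A sketch to make the limit pushdown above concrete: when a scan's connector mixes in LimitPushable, the optimizer deletes the LIMIT operator and the connector itself must stop after `limit` rows. The reader class below is hypothetical and for illustration only; it is not part of this patch or of Opteryx's connector API.)

from typing import Iterable, Iterator, Optional

class LimitAwareReader:
    """Hypothetical reader: stops producing rows once a pushed-down limit is met."""

    def __init__(self, limit: Optional[int] = None):
        self.limit = limit  # the optimizer sets this, like `node.limit` above

    def read(self, rows: Iterable[dict]) -> Iterator[dict]:
        for count, row in enumerate(rows):
            if self.limit is not None and count >= self.limit:
                return  # the LIMIT operator was removed from the plan; enforce it here
            yield row

# e.g. list(LimitAwareReader(limit=2).read({"n": i} for i in range(9))) yields 2 rows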
+
+
+from orso.schema import OrsoTypes
+
+from opteryx import operatorsv2 as operators
+from opteryx.exceptions import UnsupportedSyntaxError
+from opteryx.models import PhysicalPlan
+from opteryx.planner.logical_planner import LogicalPlanStepType
+
+
+def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan:
+    plan = PhysicalPlan()
+
+    for nid, logical_node in logical_plan.nodes(data=True):
+        node_type = logical_node.node_type
+        node_config = logical_node.properties
+        node: operators.BasePlanNode = None
+
+        # fmt: off
+        if node_type == LogicalPlanStepType.Aggregate:
+            node = operators.AggregateNode(query_properties, aggregates=node_config["aggregates"])
+        elif node_type == LogicalPlanStepType.AggregateAndGroup:
+            node = operators.AggregateAndGroupNode(query_properties, groups=node_config["groups"], aggregates=node_config["aggregates"], projection=node_config["projection"])
+        # elif node_type == LogicalPlanStepType.Defragment:
+        #     node = operators.MorselDefragmentNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Distinct:
+            node = operators.DistinctNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Exit:
+            node = operators.ExitNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Explain:
+            node = operators.ExplainNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Filter:
+            node = operators.FilterNode(query_properties, filter=node_config["condition"])
+        elif node_type == LogicalPlanStepType.FunctionDataset:
+            node = operators.FunctionDatasetNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.HeapSort:
+            node = operators.HeapSortNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Join:
+            if node_config.get("type") == "inner":
+                # We use our own implementation of INNER JOIN
+                # We have an optimized VARCHAR version
+                if len(node_config["left_columns"]) == 1 and node_config["columns"][0].schema_column.type == OrsoTypes.VARCHAR:
+                    node = operators.InnerJoinSingleNode(query_properties, **node_config)
+                else:
+                    node = operators.InnerJoinNode(query_properties, **node_config)
+            elif node_config.get("type") in ("left outer", "full outer", "right outer", "left anti", "left semi"):
+                # We use our own implementation of OUTER JOINS
+                node = operators.OuterJoinNode(query_properties, **node_config)
+            elif node_config.get("type") == "cross join":
+                # Pyarrow doesn't have a CROSS JOIN
+                node = operators.CrossJoinNode(query_properties, **node_config)
+            else:
+                # Use Pyarrow for all other joins
+                node = operators.JoinNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Limit:
+            node = operators.LimitNode(query_properties, limit=node_config.get("limit"), offset=node_config.get("offset", 0))
+        elif node_type == LogicalPlanStepType.Order:
+            node = operators.SortNode(query_properties, order=node_config["order_by"])
+        elif node_type == LogicalPlanStepType.Project:
+            node = operators.ProjectionNode(query_properties, projection=logical_node.columns)
+        elif node_type == LogicalPlanStepType.Scan:
+            connector = node_config.get("connector")
+            if connector and hasattr(connector, "async_read_blob"):
+                node = operators.AsyncReaderNode(query_properties, **node_config)
+            else:
+                node = operators.ReaderNode(properties=query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Set:
+            node = operators.SetVariableNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Show:
+            if node_config["object_type"] == "VARIABLE":
+                node = operators.ShowValueNode(query_properties, kind=node_config["items"][1], value=node_config["items"][1])
+            elif node_config["object_type"] == "VIEW":
+                node = operators.ShowCreateNode(query_properties, **node_config)
+            else:
+                raise UnsupportedSyntaxError(f"Unsupported SHOW type '{node_config['object_type']}'")
+        elif node_type == LogicalPlanStepType.ShowColumns:
+            node = operators.ShowColumnsNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Subquery:
+            node = operators.NoOpNode(query_properties, **node_config)
+        elif node_type == LogicalPlanStepType.Union:
+            node = operators.UnionNode(query_properties, **node_config)
+        else:  # pragma: no cover
+            raise Exception(f"something unexpected happened - {node_type.name}")
+        # fmt: on
+
+        # DEBUG: from opteryx.exceptions import InvalidInternalStateError
+        # DEBUG:
+        # DEBUG: try:
+        # DEBUG:     config = node.to_json()
+## DEBUG:     print(config)
+        # DEBUG: except Exception as err:
+        # DEBUG:     message = f"Internal Error - node '{node}' unable to be serialized"
+        # DEBUG:     print(message)
+## DEBUG:     raise InvalidInternalStateError(message)
+
+        plan.add_node(nid, node)
+
+    for source, destination, relation in logical_plan.edges():
+        plan.add_edge(source, destination, relation)
+
+    return plan
diff --git a/opteryx/planner/temporary_physical_planner.py b/opteryx/planner/temporary_physical_planner.py
index e80c81f1d..6d51b150a 100644
--- a/opteryx/planner/temporary_physical_planner.py
+++ b/opteryx/planner/temporary_physical_planner.py
@@ -22,12 +22,12 @@
from opteryx import operators
from opteryx.exceptions import UnsupportedSyntaxError
-from opteryx.models import ExecutionTree
+from opteryx.models import PhysicalPlan
from opteryx.planner.logical_planner import LogicalPlanStepType

-def create_physical_plan(logical_plan, query_properties) -> ExecutionTree:
-    plan = ExecutionTree()
+def create_legacy_physical_plan(logical_plan, query_properties) -> PhysicalPlan:
+    plan = PhysicalPlan()

    for nid, logical_node in logical_plan.nodes(data=True):
        node_type = logical_node.node_type
diff --git a/pyproject.toml b/pyproject.toml
index 288686841..5a4d777a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ fast = true

[tool.isort]
profile = "black"
-extend_skip_glob = ["tests/**", "*.pyx", "testdata/**"]
+extend_skip_glob = ["tests/**", "*.pyx", "testdata/**", "**/operatorsv2/__init__.py"]
skip_gitignore = true
line_length = 100
multi_line_output = 9
diff --git a/tests/plan_optimization/test_limit_pushdown_postgres.py b/tests/plan_optimization/test_limit_pushdown_postgres.py
new file mode 100644
index 000000000..b0e309786
--- /dev/null
+++ b/tests/plan_optimization/test_limit_pushdown_postgres.py
@@ -0,0 +1,92 @@
+import os
+import sys
+import pytest
+
+sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+
+import opteryx
+from opteryx.connectors import SqlConnector
+from opteryx.utils.formatter import format_sql
+
+POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
+POSTGRES_USER = os.environ.get("POSTGRES_USER")
+
+
+opteryx.register_store(
+    "pg",
+    SqlConnector,
+    remove_prefix=True,
+    connection=f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@trumpet.db.elephantsql.com/{POSTGRES_USER}",
+)
+
+STATEMENTS = [
+    # baseline
+    ("SELECT name FROM pg.planets;", 9),
+    # push limit
+    ("SELECT name FROM pg.planets LIMIT 1;", 1),
+    # test with filter
+    ("SELECT name FROM pg.planets WHERE gravity > 1;", 8),
+    # pushable filter and limit should push the limit
+    ("SELECT name FROM pg.planets WHERE gravity > 1 LIMIT 1;", 1),
+    # if we can't push the filter, we can't push the limit
+    ("SELECT name FROM pg.planets WHERE SIGNUM(gravity) > 1 LIMIT 1;", 9),
+    # we don't push past ORDER BY
+    ("SELECT * FROM pg.planets ORDER BY name LIMIT 3", 9),
+    # push past subqueries
+    ("SELECT name FROM (SELECT * FROM pg.planets) AS S LIMIT 3", 3),
+]
+
+@pytest.mark.parametrize("query, expected_rows", STATEMENTS)
+def test_postgres_limit_pushdown(query, expected_rows):
+    cur = opteryx.query(query)
+    cur.materialize()
+    assert cur.stats["rows_read"] == expected_rows, cur.stats
+
+if __name__ == "__main__":  # pragma: no cover
+    import shutil
+    import time
+
+    from tests.tools import trunc_printable
+    from opteryx.utils.formatter import format_sql
+
+    start_suite = time.monotonic_ns()
+    passed = 0
+    failed = 0
+
+    width = shutil.get_terminal_size((80, 20))[0] - 15
+
+    print(f"RUNNING BATTERY OF {len(STATEMENTS)} TESTS")
+    for index, (statement, read_columns) in enumerate(STATEMENTS):
+        print(
+            f"\033[38;2;255;184;108m{(index + 1):04}\033[0m"
+            f" {trunc_printable(format_sql(statement), width - 1)}",
+            end="",
+            flush=True,
+        )
+        try:
+            start = time.monotonic_ns()
+            test_postgres_limit_pushdown(statement, read_columns)
+            print(
+                f"\033[38;2;26;185;67m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms\033[0m ✅",
+                end="",
+            )
+            passed += 1
+            if failed > 0:
+                print(" \033[0;31m*\033[0m")
+            else:
+                print()
+        except Exception as err:
+            print(f"\033[0;31m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms ❌ *\033[0m")
+            print(">", err)
+            failed += 1
+
+    print("--- ✅ \033[0;32mdone\033[0m")
+
+    if failed > 0:
+        print("\n\033[38;2;139;233;253m\033[3mFAILURES\033[0m")
+
+    print(
+        f"\n\033[38;2;139;233;253m\033[3mCOMPLETE\033[0m ({((time.monotonic_ns() - start_suite) / 1e9):.2f} seconds)\n"
+        f"  \033[38;2;26;185;67m{passed} passed ({(passed * 100) // (passed + failed)}%)\033[0m\n"
+        f"  \033[38;2;255;121;198m{failed} failed\033[0m"
+    )
diff --git a/tests/plan_optimization/test_limit_pushdown_sqlite.py b/tests/plan_optimization/test_limit_pushdown_sqlite.py
new file mode 100644
index 000000000..81a8cf20e
--- /dev/null
+++ b/tests/plan_optimization/test_limit_pushdown_sqlite.py
@@ -0,0 +1,90 @@
+import os
+import sys
+import pytest
+
+sys.path.insert(1, os.path.join(sys.path[0], "../.."))
+
+import opteryx
+from opteryx.connectors import SqlConnector
+from opteryx.utils.formatter import format_sql
+
+opteryx.register_store(
+    "sqlite",
+    SqlConnector,
+    remove_prefix=True,
+    connection="sqlite:///testdata/sqlite/database.db",
+)
+
+STATEMENTS = [
+    # baseline
+    ("SELECT name FROM sqlite.planets;", 9),
+    # push limit
+    ("SELECT name FROM sqlite.planets LIMIT 1;", 1),
+    # test with filter
+    ("SELECT name FROM sqlite.planets WHERE gravity > 1;", 8),
+    # pushable filter and limit should push the limit
+    ("SELECT name FROM sqlite.planets WHERE gravity > 1 LIMIT 1;", 1),
+    # if we can't push the filter, we can't push the limit
+    ("SELECT name FROM sqlite.planets WHERE SIGNUM(gravity) > 1 LIMIT 1;", 9),
+    # we don't push past ORDER BY
+    ("SELECT * FROM sqlite.planets ORDER BY name LIMIT 3", 9),
+    # push past subqueries
+    ("SELECT name FROM (SELECT * FROM sqlite.planets) AS S LIMIT 3", 3),
+]
+
+@pytest.mark.parametrize("query, expected_rows", STATEMENTS)
+def test_sqlite_limit_pushdown(query, expected_rows):
+    cur = opteryx.query(query)
+    cur.materialize()
+    assert cur.stats["rows_read"] == expected_rows, cur.stats
+
+
+
+if __name__ == "__main__":  # pragma: no cover
+    import shutil
+ import time + + from tests.tools import trunc_printable + from opteryx.utils.formatter import format_sql + + start_suite = time.monotonic_ns() + passed = 0 + failed = 0 + + width = shutil.get_terminal_size((80, 20))[0] - 15 + + print(f"RUNNING BATTERY OF {len(STATEMENTS)} TESTS") + for index, (statement, read_columns) in enumerate(STATEMENTS): + print( + f"\033[38;2;255;184;108m{(index + 1):04}\033[0m" + f" {trunc_printable(format_sql(statement), width - 1)}", + end="", + flush=True, + ) + try: + start = time.monotonic_ns() + test_sqlite_limit_pushdown(statement, read_columns) + print( + f"\033[38;2;26;185;67m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms\033[0m ✅", + end="", + ) + passed += 1 + if failed > 0: + print(" \033[0;31m*\033[0m") + else: + print() + except Exception as err: + print(f"\033[0;31m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms ❌ *\033[0m") + print(">", err) + failed += 1 + + print("--- ✅ \033[0;32mdone\033[0m") + + if failed > 0: + print("\n\033[38;2;139;233;253m\033[3mFAILURES\033[0m") + + print( + f"\n\033[38;2;139;233;253m\033[3mCOMPLETE\033[0m ({((time.monotonic_ns() - start_suite) / 1e9):.2f} seconds)\n" + f" \033[38;2;26;185;67m{passed} passed ({(passed * 100) // (passed + failed)}%)\033[0m\n" + f" \033[38;2;255;121;198m{failed} failed\033[0m" + ) diff --git a/tests/plan_optimization/test_projection_pushdown_sqlite.py b/tests/plan_optimization/test_projection_pushdown_sqlite.py index ef30ce532..cf8e52ce5 100644 --- a/tests/plan_optimization/test_projection_pushdown_sqlite.py +++ b/tests/plan_optimization/test_projection_pushdown_sqlite.py @@ -53,7 +53,7 @@ ] @pytest.mark.parametrize("query, expected_columns", STATEMENTS) -def test_parquet_projection_pushdown(query, expected_columns): +def test_sqlite_projection_pushdown(query, expected_columns): cur = opteryx.query(query) cur.materialize() assert cur.stats["columns_read"] == expected_columns, cur.stats @@ -83,7 +83,7 @@ def test_parquet_projection_pushdown(query, expected_columns): ) try: start = time.monotonic_ns() - test_parquet_projection_pushdown(statement, read_columns) + test_sqlite_projection_pushdown(statement, read_columns) print( f"\033[38;2;26;185;67m{str(int((time.monotonic_ns() - start)/1e6)).rjust(4)}ms\033[0m ✅", end="", diff --git a/tests/query_execution/test_execution_tree.py b/tests/query_execution/test_execution_tree.py index 0a06100ef..cafa582d5 100644 --- a/tests/query_execution/test_execution_tree.py +++ b/tests/query_execution/test_execution_tree.py @@ -7,7 +7,7 @@ sys.path.insert(1, os.path.join(sys.path[0], "../..")) -from opteryx.models.execution_tree import ExecutionTree +from opteryx.models.physical_plan import ExecutionTree def test_execution_tree(): diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index d43408e2f..c092c9048 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1758,6 +1758,12 @@ ("SELECT jsonb_object_keys(VARCHAR(birth_place)) FROM testdata.astronauts", 357, 1, None), ("SELECT jsonb_object_keys(BLOB(birth_place)) FROM testdata.astronauts", 357, 1, None), + ("SELECT VARCHAR(SUBSTRING(BLOB(birth_date) FROM -4)) FROM $astronauts", 357, 1, None), + ("SELECT SUBSTRING(BLOB(birth_date) FROM -4) FROM $astronauts", 357, 1, None), + ("SELECT SUBSTRING(name FROM 4) FROM $astronauts", 357, 1, None), + ("SELECT SUBSTRING(name FROM 1 FOR 1) FROM $astronauts", 357, 1, None), + ("SELECT SUBSTRING(name FROM -1 FOR 1) 
FROM $astronauts", 357, 1, None), + # Edge Case with Empty Joins ("SELECT * FROM $planets LEFT JOIN (SELECT id FROM $satellites WHERE planetId < 0) AS S ON $planets.id = S.id", 9, 21, None), # Handling NULL Comparisons in WHERE Clause From a1d31773f75891f72dc0234d75bd2910fecf601b Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 26 Oct 2024 10:53:49 +0000 Subject: [PATCH 005/157] Opteryx Version 0.18.1 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 5330b2b1d..a3b377443 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 836 +__build__ = 839 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4df8b4162b01cba5266f87418ba563d53582e8df Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 26 Oct 2024 11:00:18 +0000 Subject: [PATCH 006/157] Opteryx Version 0.18.1 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index a3b377443..5de3a95c8 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 839 +__build__ = 840 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 7cd8adaf8e31dc322860e8f73872070eaa13c640 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:22:36 +0000 Subject: [PATCH 007/157] Bump duckdb-engine from 0.13.2 to 0.13.4 Bumps [duckdb-engine](https://github.com/Mause/duckdb_engine) from 0.13.2 to 0.13.4. - [Release notes](https://github.com/Mause/duckdb_engine/releases) - [Changelog](https://github.com/Mause/duckdb_engine/blob/main/CHANGELOG.md) - [Commits](https://github.com/Mause/duckdb_engine/compare/v0.13.2...v0.13.4) --- updated-dependencies: - dependency-name: duckdb-engine dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index fd0d91aac..0a7b711e5 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -19,6 +19,6 @@ sqlalchemy pymysql psycopg2-binary duckdb==1.1.2 # 1040 -duckdb-engine==0.13.2 # 1040 +duckdb-engine==0.13.4 # 1040 setuptools_rust \ No newline at end of file From 968eec1afd91ebb0db3933d174a5edbff03da546 Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 2 Nov 2024 14:40:30 -0400 Subject: [PATCH 008/157] =?UTF-8?q?=E2=9C=A8=20push-based=20execution=20?= =?UTF-8?q?=20#2061?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- opteryx/connectors/capabilities/cacheable.py | 5 +- opteryx/models/physical_plan.py | 117 +++++++++- opteryx/operators/aggregate_node.py | 16 +- opteryx/operatorsv2/__init__.py | 3 +- opteryx/operatorsv2/base_plan_node.py | 4 + opteryx/operatorsv2/cross_join_node.py | 181 ++++++++-------- opteryx/operatorsv2/exit_node.py | 3 +- opteryx/operatorsv2/explain_node.py | 17 +- opteryx/operatorsv2/function_dataset_node.py | 5 +- opteryx/operatorsv2/inner_join_node.py | 4 +- opteryx/operatorsv2/inner_join_node_single.py | 73 ++++--- opteryx/operatorsv2/join_node.py | 97 --------- opteryx/operatorsv2/noop_node.py | 16 +- opteryx/operatorsv2/outer_join_node.py | 203 +++++++++--------- opteryx/operatorsv2/projection_node.py | 3 +- opteryx/operatorsv2/set_variable_node.py | 10 +- opteryx/operatorsv2/show_columns_node.py | 72 +++---- opteryx/operatorsv2/sort_node.py | 2 +- opteryx/operatorsv2/union_node.py | 36 ++-- opteryx/planner/binder/__init__.py | 3 - .../planner/cost_based_optimizer/__init__.py | 2 +- .../logical_planner/logical_planner.py | 9 +- opteryx/planner/physical_planner.py | 6 +- opteryx/planner/sql_rewriter.py | 8 +- opteryx/third_party/travers/graph.py | 2 +- .../test_temporal_extraction.py | 8 + 26 files changed, 443 insertions(+), 462 deletions(-) delete mode 100644 opteryx/operatorsv2/join_node.py diff --git a/opteryx/connectors/capabilities/cacheable.py b/opteryx/connectors/capabilities/cacheable.py index e70360c3f..d996d646f 100644 --- a/opteryx/connectors/capabilities/cacheable.py +++ b/opteryx/connectors/capabilities/cacheable.py @@ -124,11 +124,12 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): raise # Optionally re-raise the error after logging it finally: - # If we found the file, see if we need to write it to the caches - if source != SOURCE_NOT_FOUND and evictions_remaining > 0: + if payload is None and read_buffer_ref is not None: # we set a per-query eviction limit payload = await pool.read(read_buffer_ref) # type: ignore + # If we found the file, see if we need to write it to the caches + if source != SOURCE_NOT_FOUND and evictions_remaining > 0: if source != SOURCE_BUFFER_POOL and len(payload) < buffer_pool.size // 10: # if we didn't get it from the buffer pool (origin or remote cache) we add it evicted = buffer_pool.set(key, payload) diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 03a7e9fc0..ca690c528 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -149,7 +149,7 @@ def map_operators_to_producers(nodes: list) -> None: else: yield results, ResultType.TABULAR - def explain(self): + def explain(self) -> Generator[pyarrow.Table, None, None]: from opteryx import operators def _inner_explain(node, depth): @@ 
-178,12 +178,106 @@ def _inner_explain(node, depth): yield table - def push_executor(self): - pump_nodes = self.get_entry_points() - for pump_node in pump_nodes: - pump_instance = self[pump_node] - for morsel in pump_instance(None): - yield from self.process_node(pump_node, morsel) + def explainv2(self, analyze: bool) -> Generator[pyarrow.Table, None, None]: + from opteryx import operatorsv2 + + def _inner_explain(node, depth): + incoming_operators = self.ingoing_edges(node) + for operator_name in incoming_operators: + operator = self[operator_name[0]] + if isinstance( + operator, (operatorsv2.ExitNode, operatorsv2.ExplainNode) + ): # Skip ExitNode + yield from _inner_explain(operator_name[0], depth) + continue + elif isinstance(operator, operatorsv2.BasePlanNode): + record = { + "tree": depth, + "operator": operator.name, + "config": operator.config, + } + if analyze: + record["time_ms"] = operator.execution_time / 1e6 + record["records_in"] = operator.records_in + record["records_out"] = operator.records_out + yield record + yield from _inner_explain(operator_name[0], depth + 1) + + head = list(dict.fromkeys(self.get_exit_points())) + if len(head) != 1: # pragma: no cover + raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.") + + # for EXPLAIN ANALYZE, we execute the query and report statistics + if analyze: + # we don't want the results, just the details from the plan + temp = None + for temp in self.push_executor(): + pass + del temp + + plan = list(_inner_explain(head[0], 1)) + + table = pyarrow.Table.from_pylist(plan) + + yield table + + def push_executor(self) -> Tuple[Generator[pyarrow.Table, None, None], ResultType]: + from opteryx.operatorsv2 import ExplainNode + from opteryx.operatorsv2 import JoinNode + from opteryx.operatorsv2 import SetVariableNode + + return_type = ResultType.TABULAR + + # Validate query plan to ensure it's acyclic + if not self.is_acyclic(): + raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") + + # Retrieve the tail of the query plan, which should ideally be a single head node + head_nodes = list(set(self.get_exit_points())) + + if len(head_nodes) != 1: + raise InvalidInternalStateError( + f"Query plan has {len(head_nodes)} heads, expected exactly 1." 
+            )
+
+        head_node = self[head_nodes[0]]
+
+        joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)]
+        for nid, join in joins:
+            for s, t, r in self.breadth_first_search(nid, reverse=True):
+                if set(join._left_relation).intersection(
+                    {
+                        self[s].parameters.get("alias"),
+                        self[s].parameters.get("relation"),
+                    }
+                ):
+                    self.remove_edge(s, t, r)
+                    self.add_edge(s, t, "left")
+                elif set(join._right_relation).intersection(
+                    {
+                        self[s].parameters.get("alias"),
+                        self[s].parameters.get("relation"),
+                    }
+                ):
+                    self.remove_edge(s, t, r)
+                    self.add_edge(s, t, "right")
+
+        # Special case handling for 'Explain' queries
+        if isinstance(head_node, ExplainNode):
+            yield self.explainv2(head_node.analyze), ResultType.TABULAR
+
+        if isinstance(head_node, SetVariableNode):
+            yield head_node(None), ResultType.NON_TABULAR
+
+        def inner_execute(plan):
+            pump_nodes = plan.get_entry_points()
+            for pump_node in pump_nodes:
+                pump_instance = plan[pump_node]
+                for morsel in pump_instance(None):
+                    yield from plan.process_node(pump_node, morsel)
+
+        yield inner_execute(self), ResultType.TABULAR

    def process_node(self, nid, morsel):
        from opteryx.operatorsv2 import ReaderNode
@@ -191,9 +285,10 @@ def process_node(self, nid, morsel):
        node = self[nid]

        if isinstance(node, ReaderNode):
-            children = [t for s, t, r in self.outgoing_edges(nid)]
+            children = (t for s, t, r in self.outgoing_edges(nid))
            for child in children:
                results = self.process_node(child, morsel)
+                results = list(results)
                yield from results
        else:
            results = node(morsel)
@@ -208,8 +303,8 @@ def process_node(self, nid, morsel):
            children = [t for s, t, r in self.outgoing_edges(nid)]
            for child in children:
                yield from self.process_node(child, result)
-            if len(children) == 0:
-                yield result, ResultType.TABULAR
+            if len(children) == 0 and result != EOS:
+                yield result

    def sensors(self):
        readings = {}
@@ -220,4 +315,6 @@
    def __del__(self):
        pass
+
+        # print(self.sensors())
diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py
index 436af7ba7..935d517a8 100644
--- a/opteryx/operators/aggregate_node.py
+++ b/opteryx/operators/aggregate_node.py
@@ -21,6 +21,7 @@
import time
from dataclasses import dataclass
from typing import Generator
+from typing import List

import numpy
import pyarrow
@@ -80,14 +81,13 @@ def _count_star(morsel_promise, column_name):
    yield table

-def project(tables, column_names):
-    for table in tables:
-        row_count = table.num_rows
-        if len(column_names) > 0:
-            yield table.select(dict.fromkeys(column_names))
-        else:
-            # if we can't find the column, add a placeholder column
-            yield pyarrow.Table.from_pydict({"*": numpy.full(row_count, 1, dtype=numpy.int8)})
+def project(table: pyarrow.Table, column_names: List) -> pyarrow.Table:
+    row_count = table.num_rows
+    if len(column_names) > 0:
+        return table.select(dict.fromkeys(column_names))
+    else:
+        # if we can't find the column, add a placeholder column
+        return pyarrow.Table.from_pydict({"*": numpy.full(row_count, 1, dtype=numpy.int8)})

def build_aggregations(aggregators):
diff --git a/opteryx/operatorsv2/__init__.py b/opteryx/operatorsv2/__init__.py
index 4f2505e0b..bbcc9d679 100644
--- a/opteryx/operatorsv2/__init__.py
+++ b/opteryx/operatorsv2/__init__.py
@@ -14,7 +14,7 @@

from .base_plan_node import BasePlanDataObject  # isort: skip
-from .base_plan_node import BasePlanNode  # isort: skip
+from .base_plan_node import BasePlanNode, JoinNode  # isort: skip

from 
.aggregate_and_group_node import AggregateAndGroupNode # Group is always followed by aggregate from .aggregate_node import AGGREGATORS @@ -33,7 +33,6 @@ # from .information_schema_node import InformationSchemaNode # information_schema from .inner_join_node import InnerJoinNode from .inner_join_node_single import InnerJoinSingleNode -from .join_node import JoinNode from .limit_node import LimitNode # select the first N records # from .metadata_writer_node import MetadataWriterNode diff --git a/opteryx/operatorsv2/base_plan_node.py b/opteryx/operatorsv2/base_plan_node.py index 0188159b4..7e025efaf 100644 --- a/opteryx/operatorsv2/base_plan_node.py +++ b/opteryx/operatorsv2/base_plan_node.py @@ -113,3 +113,7 @@ def sensors(self): "bytes_in": self.bytes_in, "bytes_out": self.bytes_out, } + + +class JoinNode(BasePlanNode): + pass diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py index 6ab42e4ad..d79b6453a 100644 --- a/opteryx/operatorsv2/cross_join_node.py +++ b/opteryx/operatorsv2/cross_join_node.py @@ -29,12 +29,13 @@ from orso.schema import FlatColumn from opteryx import EOS +from opteryx.compiled.structures import HashSet from opteryx.managers.expression import NodeType -from opteryx.models import Node +from opteryx.models import LogicalColumn from opteryx.models import QueryProperties from opteryx.operators.base_plan_node import BasePlanDataObject -from . import BasePlanNode +from . import JoinNode INTERNAL_BATCH_SIZE: int = 7500 # config MAX_JOIN_SIZE: int = 1000 # config @@ -43,14 +44,15 @@ def _cross_join_unnest_column( - morsels: BasePlanNode = None, - source: Node = None, + *, + morsel: pyarrow.Table = None, + source: LogicalColumn = None, target_column: FlatColumn = None, conditions: Set = None, - statistics=None, distinct: bool = False, single_column: bool = False, -) -> Generator[pyarrow.Table, None, None]: + hash_set=None, +) -> pyarrow.Table: """ Perform a cross join on an unnested column of pyarrow tables. 
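(For orientation: the compiled helpers this function relies on, build_rows_indices_and_column and its filtered variant, pair each unnested value with the index of the parent row it came from, so the parent block can be rebuilt with take(). A pure-Python toy equivalent, illustrative only; the real implementation is Cython.)

def toy_rows_indices_and_column(column_values):
    """Pure-Python stand-in for the compiled helper."""
    indices, flattened = [], []
    for row_index, values in enumerate(column_values):
        for value in values:
            indices.append(row_index)  # parent row to repeat via Table.take()
            flattened.append(value)  # unnested value for the new target column
    return indices, flattened

# toy_rows_indices_and_column([[1, 2], [3]]) == ([0, 0, 1], [1, 2, 3])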
@@ -64,11 +66,8 @@ def _cross_join_unnest_column( """ from opteryx.compiled.cross_join import build_filtered_rows_indices_and_column from opteryx.compiled.cross_join import build_rows_indices_and_column - from opteryx.compiled.structures import HashSet from opteryx.compiled.structures import list_distinct - hash_set = HashSet() - # Check if the source node type is an identifier, raise error otherwise if source.node_type != NodeType.IDENTIFIER: raise NotImplementedError("Can only CROSS JOIN UNNEST on a column") @@ -77,86 +76,78 @@ def _cross_join_unnest_column( at_least_once = False single_column_collector = [] - # Loop through each morsel from the morsels execution - for left_morsel in morsels.execute(): - # Break the morsel into batches to avoid memory issues - for left_block in left_morsel.to_batches(max_chunksize=batch_size): - new_block = None - # Fetch the data of the column to be unnested - column_data = left_block[source.schema_column.identity] - - # Filter out null values - valid_offsets = column_data.is_valid() - column_data = column_data.drop_null() - if len(column_data) == 0: - continue - left_block = left_block.filter(valid_offsets) - - # Build indices and new column data - if conditions is None: - indices, new_column_data = build_rows_indices_and_column( - column_data.to_numpy(False) - ) + # Break the morsel into batches to avoid memory issues + for left_block in morsel.to_batches(max_chunksize=batch_size): + new_block = None + # Fetch the data of the column to be unnested + column_data = left_block[source.schema_column.identity] + + # Filter out null values + valid_offsets = column_data.is_valid() + column_data = column_data.drop_null() + if len(column_data) == 0: + continue + left_block = left_block.filter(valid_offsets) + + # Build indices and new column data + if conditions is None: + indices, new_column_data = build_rows_indices_and_column(column_data.to_numpy(False)) + else: + indices, new_column_data = build_filtered_rows_indices_and_column( + column_data.to_numpy(False), conditions + ) + + if single_column and distinct and indices.size > 0: + # if the unnest target is the only field in the SELECT and we're DISTINCTING + indices = numpy.array(indices, dtype=numpy.int32) + new_column_data, indices, hash_set = list_distinct(new_column_data, indices, hash_set) + + if len(indices) > 0: + if single_column: + single_column_collector.extend(new_column_data) + if len(single_column_collector) > INTERNAL_BATCH_SIZE: + schema = pyarrow.schema( + [ + pyarrow.field( + name=target_column.identity, type=target_column.arrow_field.type + ) + ] + ) + arrow_array = pyarrow.array(single_column_collector) + if arrow_array.type != target_column.arrow_field.type: + arrow_array = arrow_array.cast(target_column.arrow_field.type) + new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) + single_column_collector.clear() + del arrow_array + yield new_block + at_least_once = True else: - indices, new_column_data = build_filtered_rows_indices_and_column( - column_data.to_numpy(False), conditions - ) + # Rebuild the block with the new column data if we have any rows to build for - if single_column and distinct and indices.size > 0: - # if the unnest target is the only field in the SELECT and we're DISTINCTING - indices = numpy.array(indices, dtype=numpy.int32) - new_column_data, indices, hash_set = list_distinct( - new_column_data, indices, hash_set - ) + total_rows = len(indices) # Both arrays have the same length + block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / 
left_block.num_rows) + block_size = int(block_size // 1000) * 1000 + + for start_block in range(0, total_rows, block_size): + # Compute the end index for the current chunk + end_block = min(start_block + block_size, total_rows) + + # Slice the current chunk of indices and new_column_data + indices_chunk = indices[start_block:end_block] + new_column_data_chunk = new_column_data[start_block:end_block] + + # Create a new block using the chunk of indices + indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) + new_block = left_block.take(indices_chunk) + new_block = pyarrow.Table.from_batches([new_block], schema=morsel.schema) + + # Append the corresponding chunk of new_column_data to the block + new_block = new_block.append_column( + target_column.identity, pyarrow.array(new_column_data_chunk) + ) - if len(indices) > 0: - if single_column: - single_column_collector.extend(new_column_data) - if len(single_column_collector) > INTERNAL_BATCH_SIZE: - schema = pyarrow.schema( - [ - pyarrow.field( - name=target_column.identity, type=target_column.arrow_field.type - ) - ] - ) - arrow_array = pyarrow.array(single_column_collector) - if arrow_array.type != target_column.arrow_field.type: - arrow_array = arrow_array.cast(target_column.arrow_field.type) - new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) - single_column_collector.clear() - del arrow_array - yield new_block - at_least_once = True - else: - # Rebuild the block with the new column data if we have any rows to build for - - total_rows = len(indices) # Both arrays have the same length - block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) - block_size = int(block_size // 1000) * 1000 - - for start_block in range(0, total_rows, block_size): - # Compute the end index for the current chunk - end_block = min(start_block + block_size, total_rows) - - # Slice the current chunk of indices and new_column_data - indices_chunk = indices[start_block:end_block] - new_column_data_chunk = new_column_data[start_block:end_block] - - # Create a new block using the chunk of indices - indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) - new_block = left_block.take(indices_chunk) - new_block = pyarrow.Table.from_batches( - [new_block], schema=left_morsel.schema - ) - - # Append the corresponding chunk of new_column_data to the block - new_block = new_block.append_column( - target_column.identity, pyarrow.array(new_column_data_chunk) - ) - - yield new_block - at_least_once = True + yield new_block + at_least_once = True if single_column_collector: schema = pyarrow.schema( @@ -171,7 +162,7 @@ def _cross_join_unnest_column( if not at_least_once: # Create an empty table with the new schema - schema = left_morsel.schema + schema = morsel.schema new_column = pyarrow.field(target_column.identity, pyarrow.string()) new_schema = pyarrow.schema(list(schema) + [new_column]) new_block = pyarrow.Table.from_batches([], schema=new_schema) @@ -179,7 +170,7 @@ def _cross_join_unnest_column( def _cross_join_unnest_literal( - morsels: BasePlanNode, source: Tuple, target_column: FlatColumn, statistics + morsels: pyarrow.Table, source: Tuple, target_column: FlatColumn ) -> Generator[pyarrow.Table, None, None]: joined_list_size = len(source) @@ -215,7 +206,7 @@ def _cartesian_product(*arrays): return numpy.hsplit(arr.reshape(-1, array_count), array_count) -def _cross_join(left_morsel, right, statistics): +def _cross_join(left_morsel, right): """ A cross join is the cartesian product of two tables - this usually isn't very 
useful, but it does allow you to do theta joins (non-equi joins)
@@ -277,7 +268,7 @@ class CrossJoinDataObject(BasePlanDataObject):
    _distinct: bool = False

-class CrossJoinNode(BasePlanNode):
+class CrossJoinNode(JoinNode):
    """
    Implements a SQL CROSS JOIN
    """
@@ -315,6 +306,7 @@ def __init__(self, properties: QueryProperties, **config):
        self.right_buffer = []
        self.left_relation = None
        self.right_relation = None
+        self.hash_set = HashSet()

    @classmethod
    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
@@ -336,23 +328,20 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
        if morsel == EOS:
            return EOS
        if isinstance(self._unnest_column.value, tuple):
-            if morsel == EOS:
-                return EOS
            return list(
                _cross_join_unnest_literal(
                    morsels=morsel,
                    source=self._unnest_column.value,
                    target_column=self._unnest_target,
-                    statistics=self.statistics,
                )
            )
        return list(
            _cross_join_unnest_column(
-                morsels=morsel,
+                morsel=morsel,
                source=self._unnest_column,
                target_column=self._unnest_target,
                conditions=self._filters,
-                statistics=self.statistics,
+                hash_set=self.hash_set,
                distinct=self._distinct,
                single_column=self._single_column,
            )
@@ -371,7 +360,7 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
        if morsel == EOS:
            right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none")  # type:ignore
            self.right_buffer = None
-            return list(_cross_join(self.left_relation, right_table, self.statistics))
+            return list(_cross_join(self.left_relation, right_table))
        else:
            self.right_buffer.append(morsel)
            return None
diff --git a/opteryx/operatorsv2/exit_node.py b/opteryx/operatorsv2/exit_node.py
index a741f98c2..cac38c629 100644
--- a/opteryx/operatorsv2/exit_node.py
+++ b/opteryx/operatorsv2/exit_node.py
@@ -65,8 +65,9 @@ def name(self):  # pragma: no cover
        return "Exit"

    def execute(self, morsel: Table) -> Table:
+        # Exit doesn't return EOS
        if morsel == EOS:
-            return EOS
+            return None

        final_columns = []
        final_names = []
diff --git a/opteryx/operatorsv2/explain_node.py b/opteryx/operatorsv2/explain_node.py
index 389589c46..4aa6c3104 100644
--- a/opteryx/operatorsv2/explain_node.py
+++ b/opteryx/operatorsv2/explain_node.py
@@ -18,19 +18,18 @@
This writes out a query plan
"""

-from typing import Generator
+from pyarrow import Table

from opteryx.models import QueryProperties
-from opteryx.operators import BasePlanNode
-from opteryx.operators import OperatorType
+from . 
import BasePlanNode -class ExplainNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - def __init__(self, properties: QueryProperties, **config): +class ExplainNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): super().__init__(properties=properties) - self._query_plan = config.get("query_plan") + self._query_plan = parameters.get("query_plan") + self.analyze = parameters.get("analyze", False) @property def name(self): # pragma: no cover @@ -44,6 +43,6 @@ def config(self): def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() - def execute(self) -> Generator: + def execute(self, morsel: Table) -> Table: if self._query_plan: - yield from self._query_plan.explain() + return self._query_plan.explain(self.analyze) diff --git a/opteryx/operatorsv2/function_dataset_node.py b/opteryx/operatorsv2/function_dataset_node.py index 409baa294..0252a3039 100644 --- a/opteryx/operatorsv2/function_dataset_node.py +++ b/opteryx/operatorsv2/function_dataset_node.py @@ -23,13 +23,13 @@ import pyarrow +from opteryx import EOS from opteryx.exceptions import SqlError from opteryx.managers.expression import NodeType from opteryx.models import QueryProperties from opteryx.utils import series from .read_node import ReaderNode -from opteryx import EOS def _generate_series(**kwargs): @@ -147,4 +147,5 @@ def execute(self, morsel) -> Generator: self.bytes_out += table.nbytes self.statistics.columns_read += len(table.column_names) - return [table, EOS] + yield table + yield EOS diff --git a/opteryx/operatorsv2/inner_join_node.py b/opteryx/operatorsv2/inner_join_node.py index 1e3c16a34..7ae59ef50 100644 --- a/opteryx/operatorsv2/inner_join_node.py +++ b/opteryx/operatorsv2/inner_join_node.py @@ -39,7 +39,7 @@ from opteryx.models import QueryProperties from opteryx.utils.arrow import align_tables -from . import BasePlanNode +from . import JoinNode def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table): @@ -72,7 +72,7 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c return align_tables(right_relation, left_relation, right_indexes, left_indexes) -class InnerJoinNode(BasePlanNode): +class InnerJoinNode(JoinNode): def __init__(self, properties: QueryProperties, **config): super().__init__(properties=properties) self._join_type = config["type"] diff --git a/opteryx/operatorsv2/inner_join_node_single.py b/opteryx/operatorsv2/inner_join_node_single.py index 2b1b99ed0..53c5c8416 100644 --- a/opteryx/operatorsv2/inner_join_node_single.py +++ b/opteryx/operatorsv2/inner_join_node_single.py @@ -20,19 +20,17 @@ the generic INNER JOIN. """ -import time -from typing import Generator - import numpy import pyarrow from pyarrow import compute +from opteryx import EOS from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.utils.arrow import align_tables +from . 
import JoinNode + def preprocess_left(relation, join_columns): """ @@ -160,9 +158,7 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c return align_tables(right_relation, left_relation, right_indexes, left_indexes) -class InnerJoinSingleNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - +class InnerJoinSingleNode(JoinNode): def __init__(self, properties: QueryProperties, **config): super().__init__(properties=properties) self._join_type = config["type"] @@ -175,6 +171,10 @@ def __init__(self, properties: QueryProperties, **config): self._right_columns = config.get("right_columns") self._right_relation = config.get("right_relation_names") + self.stream = "left" + self.left_buffer = [] + self.left_hash = None + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -187,29 +187,34 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - left_relation = pyarrow.concat_tables(left_node.execute(), promote_options="none") - # in place until #1295 resolved - if self._left_columns[0] not in left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - start = time.monotonic_ns() - left_hash = preprocess_left(left_relation, self._left_columns) - self.statistics.time_inner_join += time.monotonic_ns() - start - for morsel in right_node.execute(): - start = time.monotonic_ns() - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=left_hash, - ) - self.statistics.time_inner_join += time.monotonic_ns() - start - yield new_morsel + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + + # in place until #1295 resolved + if self._left_columns[0] not in self.left_relation.column_names: + self._right_columns, self._left_columns = ( + self._left_columns, + self._right_columns, + ) + + self.left_hash = preprocess_left(self.left_relation, self._left_columns) + else: + self.left_buffer.append(morsel) + return None + + if morsel == EOS: + return EOS + + # do the join + new_morsel = inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=morsel, + join_columns=self._right_columns, + hash_table=self.left_hash, + ) + + return new_morsel diff --git a/opteryx/operatorsv2/join_node.py b/opteryx/operatorsv2/join_node.py deleted file mode 100644 index 1bd2dbc1f..000000000 --- a/opteryx/operatorsv2/join_node.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Join Node - -We have our own implementations of INNER and OUTER joins, this uses PyArrow -to implement less-common joins of ANTI and SEMI joins. -""" - -from typing import Generator - -import pyarrow - -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType - - -class JoinNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") - - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") - - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return f"{self._join_type} Join" - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - if self._on: - return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" - if self._using: - return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" - return f"{self._join_type.upper()}" - - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - left_table = pyarrow.concat_tables(left_node.execute(), promote_options="none") - right_table = pyarrow.concat_tables(right_node.execute(), promote_options="none") - - try: - new_morsel = left_table.join( - right_table, - keys=self._left_columns, - right_keys=self._right_columns, - join_type=self._join_type, - coalesce_keys=self._using is not None, - ) - except pyarrow.ArrowInvalid as err: # pragma: no cover - last_token = str(err).split(" ")[-1] - column = None - for col in left_node.columns: - if last_token == col.identity: - column = col.name - break - for col in right_node.columns: - if last_token == col.identity: - column = col.name - break - if column: - raise UnsupportedSyntaxError( - f"Unable to ANTI/SEMI JOIN with unsupported column types in table, '{column}'." - ) from err - raise UnsupportedSyntaxError( - "Unable to ANTI/SEMI JOIN with unsupported column types in table." - ) from err - - yield new_morsel diff --git a/opteryx/operatorsv2/noop_node.py b/opteryx/operatorsv2/noop_node.py index 6ff91cb77..8e9178884 100644 --- a/opteryx/operatorsv2/noop_node.py +++ b/opteryx/operatorsv2/noop_node.py @@ -16,16 +16,14 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator +from pyarrow import Table from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . 
import BasePlanNode


-class NoOpNode(BasePlanNode):
-    operator_type = OperatorType.PASSTHRU

+class NoOpNode(BasePlanNode):
    def __init__(self, properties: QueryProperties, **config):
        super().__init__(properties=properties)

@@ -41,8 +39,6 @@ def name(self):  # pragma: no cover
    def config(self):  # pragma: no cover
        return ""

-    def execute(self) -> Generator:
-        # nodes generally have 0 (scan), 1 (most) or 2 (join, union) producers
-        if self._producers:
-            for morsels in self._producers:
-                yield from morsels.execute()
+    def execute(self, morsel: Table) -> Table:
+        return [morsel]
diff --git a/opteryx/operatorsv2/outer_join_node.py b/opteryx/operatorsv2/outer_join_node.py
index b96c9b03f..df31ed00c 100644
--- a/opteryx/operatorsv2/outer_join_node.py
+++ b/opteryx/operatorsv2/outer_join_node.py
@@ -23,18 +23,17 @@
popular SEMI and ANTI joins we leave to PyArrow for now.
"""

-import time
-from typing import Generator
from typing import List

import pyarrow

+from opteryx import EOS
from opteryx.compiled.structures import HashTable
from opteryx.models import QueryProperties
-from opteryx.operators import BasePlanNode
-from opteryx.operators import OperatorType
from opteryx.utils.arrow import align_tables

+from . import JoinNode
+

def left_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
    """
@@ -60,47 +59,41 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu
    left_indexes: deque = deque()
    right_indexes: deque = deque()

-    right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none")
-
    if len(set(left_columns) & set(right_relation.column_names)) > 0:
        left_columns, right_columns = right_columns, left_columns

    right_hash = hash_join_map(right_relation, right_columns)
+    left_hash = hash_join_map(left_relation, left_columns)

-    for left_batch in left_relation.execute():
-        left_hash = hash_join_map(left_batch, left_columns)
-        for hash_value, left_rows in left_hash.hash_table.items():
-            right_rows = right_hash.get(hash_value)
-            if right_rows:
-                for l in left_rows:
-                    for r in right_rows:
-                        left_indexes.append(l)
-                        right_indexes.append(r)
-            else:
-                for l in left_rows:
+    for hash_value, left_rows in left_hash.hash_table.items():
+        right_rows = right_hash.get(hash_value)
+        if right_rows:
+            for l in left_rows:
+                for r in right_rows:
                    left_indexes.append(l)
-                    right_indexes.append(None)
-
-            if len(left_indexes) > 50_000:
-                table = align_tables(
-                    right_relation, left_batch, list(right_indexes), list(left_indexes)
-                )
-                yield table
-                left_indexes.clear()
-                right_indexes.clear()
+                    right_indexes.append(r)
+        else:
+            for l in left_rows:
+                left_indexes.append(l)
+                right_indexes.append(None)

-        if len(left_indexes) > 0:
+        if len(left_indexes) > 50_000:
            table = align_tables(
-                right_relation, left_batch, list(right_indexes), list(left_indexes)
+                right_relation, left_relation, list(right_indexes), list(left_indexes)
            )
            yield table
            left_indexes.clear()
            right_indexes.clear()

+    if len(left_indexes) > 0:
+        table = align_tables(right_relation, left_relation, list(right_indexes), list(left_indexes))
+        yield table
+        left_indexes.clear()
+        right_indexes.clear()
+

def full_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
    chunk_size = 1000
-    right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none")
    hash_table = HashTable()
    non_null_right_values = right_relation.select(right_columns).itercolumns()
@@ -110,7 +103,6 @@ def full_join(left_relation, 
right_relation, left_columns: List[str], right_colu left_indexes = [] right_indexes = [] - left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none") left_values = left_relation.select(left_columns).itercolumns() for i, value_tuple in enumerate(zip(*left_values)): rows = hash_table.get(hash(value_tuple)) @@ -152,7 +144,6 @@ def right_join(left_relation, right_relation, left_columns: List[str], right_col pyarrow.Table: A chunk of the result of the RIGHT JOIN operation. """ chunk_size = 1000 - left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none") hash_table = HashTable() non_null_left_values = left_relation.select(left_columns).itercolumns() @@ -160,26 +151,25 @@ def right_join(left_relation, right_relation, left_columns: List[str], right_col hash_table.insert(hash(value_tuple), i) # Iterate over the right_relation in chunks - right_batches = right_relation.execute() - for right_batch in right_batches: - for right_chunk in right_batch.to_batches(chunk_size): - left_indexes = [] - right_indexes = [] - - right_values = right_chunk.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*right_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: - left_indexes.extend(rows) - right_indexes.extend([i] * len(rows)) - else: - left_indexes.append(None) - right_indexes.append(i) - - # Yield the aligned chunk - # we intentionally swap them to the other calls so we're building a table - # not a record batch (what the chunk is) - yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) + + for right_chunk in right_relation.to_batches(chunk_size): + left_indexes = [] + right_indexes = [] + + right_values = right_chunk.select(right_columns).itercolumns() + for i, value_tuple in enumerate(zip(*right_values)): + rows = hash_table.get(hash(value_tuple)) + if rows: + left_indexes.extend(rows) + right_indexes.extend([i] * len(rows)) + else: + left_indexes.append(None) + right_indexes.append(i) + + # Yield the aligned chunk + # we intentionally swap them to the other calls so we're building a table + # not a record batch (what the chunk is) + yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) def left_anti_join( @@ -200,30 +190,23 @@ def left_anti_join( Returns: A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. 
""" - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") - hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() for i, value_tuple in enumerate(zip(*non_null_right_values)): hash_table.insert(hash(value_tuple), i) - at_least_once = False - # Iterate over the left_relation in chunks - for left_batch in left_relation.execute(): - left_indexes = [] - left_values = left_batch.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if not rows: # Only include left rows that have no match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_batch.take(left_indexes) - at_least_once = True + left_indexes = [] + left_values = left_relation.select(left_columns).itercolumns() + for i, value_tuple in enumerate(zip(*left_values)): + rows = hash_table.get(hash(value_tuple)) + if not rows: # Only include left rows that have no match in the right table + left_indexes.append(i) - if not at_least_once: - yield left_batch.slice(0, 0) + # Filter the left_chunk based on the anti join condition + if left_indexes: + yield left_relation.take(left_indexes) + else: + yield left_relation.slice(0, 0) def left_semi_join( @@ -244,36 +227,28 @@ def left_semi_join( Returns: A pyarrow.Table containing the result of the LEFT SEMI JOIN operation. """ - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() for i, value_tuple in enumerate(zip(*non_null_right_values)): hash_table.insert(hash(value_tuple), i) - at_least_once = False - # Iterate over the left_relation in chunks - for left_batch in left_relation.execute(): - left_indexes = [] - left_values = left_batch.select(left_columns).itercolumns() - - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: # Only include left rows that have a match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_batch.take(left_indexes) - at_least_once = True + left_indexes = [] + left_values = left_relation.select(left_columns).itercolumns() - if not at_least_once: - yield left_batch.slice(0, 0) + for i, value_tuple in enumerate(zip(*left_values)): + rows = hash_table.get(hash(value_tuple)) + if rows: # Only include left rows that have a match in the right table + left_indexes.append(i) + # Filter the left_chunk based on the anti join condition + if left_indexes: + yield left_relation.take(left_indexes) + else: + yield left_relation.slice(0, 0) -class OuterJoinNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU +class OuterJoinNode(JoinNode): def __init__(self, properties: QueryProperties, **config): super().__init__(properties=properties) self._join_type = config["type"] @@ -286,6 +261,11 @@ def __init__(self, properties: QueryProperties, **config): self._right_columns = config.get("right_columns") self._right_relation = config.get("right_relation_names") + self.stream = "left" + self.left_buffer = [] + self.right_buffer = [] + self.left_relation = None + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -304,22 +284,35 @@ def config(self): # pragma: no cover return f"{self._join_type.upper()} JOIN (USING 
{','.join(map(format_expression, self._using))})" return f"{self._join_type.upper()}" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - join_provider = providers.get(self._join_type) - - start = time.monotonic_ns() - for morsel in join_provider( - left_relation=left_node, - right_relation=right_node, - left_columns=self._left_columns, - right_columns=self._right_columns, - ): - self.statistics.time_outer_join += time.monotonic_ns() - start - yield morsel - start = time.monotonic_ns() + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + else: + self.left_buffer.append(morsel) + return None + + if self.stream == "right": + if morsel == EOS: + right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") + self.right_buffer.clear() + + join_provider = providers.get(self._join_type) + + return list( + join_provider( + left_relation=self.left_relation, + right_relation=right_relation, + left_columns=self._left_columns, + right_columns=self._right_columns, + ) + ) + [EOS] + + else: + self.right_buffer.append(morsel) + return None providers = { diff --git a/opteryx/operatorsv2/projection_node.py b/opteryx/operatorsv2/projection_node.py index 03d6c9312..7c9cb616d 100644 --- a/opteryx/operatorsv2/projection_node.py +++ b/opteryx/operatorsv2/projection_node.py @@ -68,5 +68,4 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: # If any of the columns need evaluating, we need to do that here morsel = evaluate_and_append(self.evaluations, morsel) - morsel = morsel.select(self.projection) - return morsel + return morsel.select(self.projection) diff --git a/opteryx/operatorsv2/set_variable_node.py b/opteryx/operatorsv2/set_variable_node.py index 8d55e0284..b730a6ded 100644 --- a/opteryx/operatorsv2/set_variable_node.py +++ b/opteryx/operatorsv2/set_variable_node.py @@ -16,18 +16,14 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator - from opteryx.constants import QueryStatus from opteryx.models import NonTabularResult from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class SetVariableNode(BasePlanNode): - operator_type = OperatorType.PRODUCER +class SetVariableNode(BasePlanNode): def __init__(self, properties: QueryProperties, **config): super().__init__(properties=properties) @@ -48,6 +44,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return f"{self.variable} TO {self.value}" - def execute(self) -> Generator: + def execute(self, morsel) -> NonTabularResult: self.variables[self.variable] = self.value return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operatorsv2/show_columns_node.py b/opteryx/operatorsv2/show_columns_node.py index 247ac6159..d1961414b 100644 --- a/opteryx/operatorsv2/show_columns_node.py +++ b/opteryx/operatorsv2/show_columns_node.py @@ -18,13 +18,12 @@ Gives information about a dataset's columns """ -from typing import Generator - import pyarrow +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType + +from . 
import BasePlanNode
 
 
 def _simple_collector(schema):
@@ -41,39 +40,18 @@ def _simple_collector(schema):
         }
         buffer.append(new_row)
 
-    table = pyarrow.Table.from_pylist(buffer)
-    return table
-
-
-def _extended_collector(morsels):
-    """
-    Collect summary statistics about each column
-
-    We use orso, which means converting to an orso DataFrame and then converting back
-    to a PyArrow table.
-    """
-    import orso
-
-    profile = None
-    for morsel in morsels:
-        df = orso.DataFrame.from_arrow(morsel)
-        if profile is None:
-            profile = df.profile
-        else:
-            profile += df.profile
-
-    return profile.to_dicts()
+    return pyarrow.Table.from_pylist(buffer)
 
 
 class ShowColumnsNode(BasePlanNode):
-    operator_type = OperatorType.PRODUCER
-
     def __init__(self, properties: QueryProperties, **config):
         super().__init__(properties=properties)
         self._full = config.get("full")
         self._extended = config.get("extended")
         self._schema = config.get("schema")
         self._column_map = {c.schema_column.identity: c.source_column for c in config["columns"]}
+        self.collector = None
+        self.seen = False
 
     @classmethod
     def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
@@ -91,28 +69,32 @@ def rename_column(self, dic: dict, renames) -> dict:
             dic["name"] = renames[dic["name"]]
         return dic
 
-    def execute(self) -> Generator:
-        morsels = self._producers[0]  # type:ignore
+    def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
+        from orso import DataFrame
 
-        if morsels is None:
+        if self.seen:
             return None
 
         if not (self._full or self._extended):
             # if it's not full or extended, just get the list of columns and their
             # types
-            yield _simple_collector(self._schema)
-            return
+            self.seen = True
+            return _simple_collector(self._schema)
 
-        if self._full and not self._extended:
+        if self._full or self._extended:
             # we're going to read the full table, so we can count stuff
-            dicts = _extended_collector(morsels.execute())
-            dicts = [self.rename_column(d, self._column_map) for d in dicts]
-            yield pyarrow.Table.from_pylist(dicts)
-            return
-
-        if self._extended:
-            # get everything we can reasonably get
-            dicts = _extended_collector(morsels.execute())
-            dicts = [self.rename_column(d, self._column_map) for d in dicts]
-            yield pyarrow.Table.from_pylist(dicts)
-            return
+
+            if morsel == EOS:
+                dicts = self.collector.to_dicts()
+                dicts = [self.rename_column(d, self._column_map) for d in dicts]
+                self.seen = True
+                return pyarrow.Table.from_pylist(dicts)
+
+            df = DataFrame.from_arrow(morsel)
+
+            if self.collector is None:
+                self.collector = df.profile
+            else:
+                self.collector += df.profile
+
+            return None
diff --git a/opteryx/operatorsv2/sort_node.py b/opteryx/operatorsv2/sort_node.py
index e685f12d7..1361dd704 100644
--- a/opteryx/operatorsv2/sort_node.py
+++ b/opteryx/operatorsv2/sort_node.py
@@ -51,7 +51,7 @@ def name(self):  # pragma: no cover
         return "Sort"
 
     def execute(self, morsel: Table) -> Table:
-        if morsel != EOS and morsel.num_rows > 0:
+        if morsel != EOS:
             self.morsels.append(morsel)
             return None
 
diff --git a/opteryx/operatorsv2/union_node.py b/opteryx/operatorsv2/union_node.py
index 34e102be9..ca268fc07 100644
--- a/opteryx/operatorsv2/union_node.py
+++ b/opteryx/operatorsv2/union_node.py
@@ -16,20 +16,21 @@
 This is a SQL Query Execution Plan Node.
 """
 
-from typing import Generator
+from pyarrow import Table
 
+from opteryx import EOS
 from opteryx.models import QueryProperties
-from opteryx.operators import BasePlanNode
-from opteryx.operators import OperatorType
 
+from . 
import BasePlanNode -class UnionNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU +class UnionNode(BasePlanNode): def __init__(self, properties: QueryProperties, **config): super().__init__(properties=properties) self.columns = config.get("columns", []) self.column_ids = [c.schema_column.identity for c in self.columns] + self.seen_first_eos = False + self.schema = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -43,18 +44,21 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: + def execute(self, morsel: Table) -> Table: """ Union needs to ensure the column names are the same and that coercible types are coerced. """ - schema = None - if self._producers: - for morsels in self._producers: - for morsel in morsels.execute(): - if schema is None: - schema = morsel.schema - else: - morsel = morsel.rename_columns(schema.names) - morsel = morsel.cast(schema) - yield morsel.select(self.column_ids) + if morsel == EOS and self.seen_first_eos: + return [EOS] + if morsel == EOS: + self.seen_first_eos = True + return None + + if self.schema is None: + self.schema = morsel.schema + else: + morsel = morsel.rename_columns(self.schema.names) + morsel = morsel.cast(self.schema) + + return morsel.select(self.column_ids) diff --git a/opteryx/planner/binder/__init__.py b/opteryx/planner/binder/__init__.py index 1a6f03ad3..269803ee9 100644 --- a/opteryx/planner/binder/__init__.py +++ b/opteryx/planner/binder/__init__.py @@ -91,7 +91,4 @@ def do_bind_phase(plan: LogicalPlan, connection=None, qid: str = None) -> Logica plan, _ = binder_visitor.traverse(plan, root_node[0], context=context) - # DEBUG: log ("AFTER BINDING") - # DEBUG: log (plan.draw()) - return plan diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 8d24c37c6..676cb9710 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -135,7 +135,7 @@ def optimize(self, plan: LogicalPlan) -> LogicalPlan: current_plan = plan for strategy in self.strategies: current_plan = self.traverse(current_plan, strategy) - # DEBUG: log ("AFTER COST OPTIMIZATION") + # DEBUG: log ("AFTER OPTIMIZATION") # DEBUG: log (current_plan.draw()) return current_plan diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 2de46489d..b962d9cdf 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -95,7 +95,7 @@ def __str__(self): if self.function == "GENERATE_SERIES": return f"GENERATE SERIES ({', '.join(format_expression(arg) for arg in self.args)}){' AS ' + self.alias if self.alias else ''}" if self.function == "VALUES": - return f"VALUES (({', '.join(self.columns)}) x {len(self.values)} AS {self.alias})" + return f"VALUES (({', '.join(c.value for c in self.columns)}) x {len(self.values)} AS {self.alias})" if self.function == "UNNEST": return f"UNNEST ({', '.join(format_expression(arg) for arg in self.args)}{' AS ' + self.unnest_target if self.unnest_target else ''})" if self.function == "HTTP": @@ -119,9 +119,10 @@ def __str__(self): if node_type == LogicalPlanStepType.Order: return f"ORDER BY ({', '.join(format_expression(item[0]) + (' DESC' if item[1] =='descending' else '') for item in self.order_by)})" if node_type == LogicalPlanStepType.Project: - order_by_indicator = " +" if 
self.order_by_columns else "" + order_by_indicator = f" + ({', '.join(format_expression(col) for col in self.order_by_columns)})" if self.order_by_columns else "" return f"PROJECT ({', '.join(format_expression(col) for col in self.columns)}){order_by_indicator}" if node_type == LogicalPlanStepType.Scan: + io_async = "ASYNC " if hasattr(self.connector, "async_read_blob") else "" date_range = "" if self.start_date == self.end_date and self.start_date is not None: date_range = f" FOR '{self.start_date}'" @@ -139,7 +140,7 @@ def __str__(self): limit = "" if self.limit: limit = f" LIMIT {self.limit}" - return f"READ ({self.relation}{alias}{date_range}{' WITH(' + ','.join(self.hints) + ')' if self.hints else ''}){columns}{predicates}{limit}" + return f"{io_async}READ ({self.relation}{alias}{date_range}{' WITH(' + ','.join(self.hints) + ')' if self.hints else ''}){columns}{predicates}{limit}" if node_type == LogicalPlanStepType.Set: return f"SET ({self.variable} TO {self.value.value})" if node_type == LogicalPlanStepType.Show: @@ -154,7 +155,7 @@ def __str__(self): if node_type == LogicalPlanStepType.Union: columns = "" if self.columns: - columns = " [" + ", ".join(c.qualified_name for c in self.columns) + "]" + columns = " [" + ", ".join(c.current_name for c in self.columns) + "]" return f"UNION {'' if self.modifier is None else self.modifier.upper()}{columns}" # fmt:on diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 69a28b2c5..b291850f9 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -15,6 +15,7 @@ from opteryx import operatorsv2 as operators from opteryx.exceptions import UnsupportedSyntaxError +from opteryx.models import LogicalColumn from opteryx.models import PhysicalPlan from opteryx.planner.logical_planner import LogicalPlanStepType @@ -43,7 +44,10 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: elif node_type == LogicalPlanStepType.Filter: node = operators.FilterNode(query_properties, filter=node_config["condition"]) elif node_type == LogicalPlanStepType.FunctionDataset: - node = operators.FunctionDatasetNode(query_properties, **node_config) + if node_config.get("function") != "UNNEST" or (len(node_config.get("args", [])) > 0 and not isinstance(node_config["args"][0], LogicalColumn)): + node = operators.FunctionDatasetNode(query_properties, **node_config) + else: + node = operators.NoOpNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.HeapSort: node = operators.HeapSortNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Join: diff --git a/opteryx/planner/sql_rewriter.py b/opteryx/planner/sql_rewriter.py index f67371c47..e9f4c2b02 100644 --- a/opteryx/planner/sql_rewriter.py +++ b/opteryx/planner/sql_rewriter.py @@ -286,14 +286,14 @@ def _temporal_extration_state_machine( open_count += 1 if comparable_part == ")": open_count -= 1 + if in_special_function and open_count == special_function_brackets: + in_special_function = False if relation == "": state = WAITING else: # function relations, like FAKE(234,234) need the items between the # brackets be be consumed state = FUNCTION_RELATION - elif in_special_function and open_count == special_function_brackets: - in_special_function = False if not in_special_function: if comparable_part in STOP_COLLECTING: @@ -310,7 +310,9 @@ def _temporal_extration_state_machine( transition.append(state) # based on what the state was and what it is now, do something - if transition == 
[TEMPORAL, TEMPORAL]: + if in_special_function: + pass + elif transition == [TEMPORAL, TEMPORAL]: temporal = (temporal + " " + part).strip() elif ( transition diff --git a/opteryx/third_party/travers/graph.py b/opteryx/third_party/travers/graph.py index 0c0e2262d..4ac779a4e 100644 --- a/opteryx/third_party/travers/graph.py +++ b/opteryx/third_party/travers/graph.py @@ -240,7 +240,7 @@ def depth_first_search( return tree - def outgoing_edges(self, source) -> List[Tuple]: + def outgoing_edges(self, source: str) -> List[Tuple]: """ Get the list of edges traversable from a given node. diff --git a/tests/plan_optimization/test_temporal_extraction.py b/tests/plan_optimization/test_temporal_extraction.py index 477d3360a..5933c353f 100644 --- a/tests/plan_optimization/test_temporal_extraction.py +++ b/tests/plan_optimization/test_temporal_extraction.py @@ -46,6 +46,7 @@ ("SELECT * FROM $planets\nFOR TODAY;", [('$planets', THIS_MORNING, TONIGHT)]), ("SELECT * FROM $planets\tFOR TODAY;", [('$planets', THIS_MORNING, TONIGHT)]), ("SELECT * FROM $planets -- FOR YESTERDAY\nFOR TODAY;", [('$planets', THIS_MORNING, TONIGHT)]), + ("SELECT * FROM $planets /* FOR YESTERDAY */ FOR TODAY;", [('$planets', THIS_MORNING, TONIGHT)]), ("SELECT * FROM $planets FOR TODAY WHERE name = ?", [('$planets', THIS_MORNING, TONIGHT)]), ("SELECT * FROM $planets FOR TODAY GROUP BY name", [('$planets', THIS_MORNING, TONIGHT)]), ("SELECT * FROM $planets FOR TODAY ORDER BY name", [('$planets', THIS_MORNING, TONIGHT)]), @@ -101,6 +102,7 @@ ("SELECT * FROM $planets FOR YESTERDAY WHERE id IN (SELECT * FROM $planets);", [('$planets', YESTERDAY, YESTERDAY.replace(hour=23, minute=59)), ('$planets', None, None)]), ("SELECT * FROM $planets WHERE id IN (SELECT * FROM $planets FOR YESTERDAY);", [('$planets', None, None), ('$planets', YESTERDAY, YESTERDAY.replace(hour=23, minute=59))]), ("SELECT * FROM $planets WHERE id IN (SELECT * FROM $planets);", [('$planets', None, None), ('$planets', None, None)]), + # FROM in functions ("SELECT EXTRACT(YEAR FROM birth_date) FROM $astronauts", [("$astronauts", None, None)]), ("SELECT SUBSTRING(name FROM 1 FOR 1) FROM $astronauts", [("$astronauts", None, None)]), @@ -111,6 +113,12 @@ ("SELECT TRIM ( 'foo' FROM 'foo' )", []), ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets", [("$planets", None, None)]), ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets FOR TODAY", [("$planets", THIS_MORNING, TONIGHT)]), + + # function in a 'special' function + ("SELECT VARCHAR(SUBSTRING(BLOB(birth_date) FROM -4)) FROM $astronauts", [("$astronauts", None, None)]), + ("SELECT VARCHAR(SUBSTRING(BLOB(birth_date) FROM -4)) FROM $astronauts FOR TODAY", [("$astronauts", THIS_MORNING, TONIGHT)]), + ("SELECT VARCHAR(SUBSTRING(BLOB(birth_date) FROM 1 FOR 1)) FROM $astronauts", [("$astronauts", None, None)]), + ("SELECT VARCHAR(SUBSTRING(BLOB(birth_date) FROM 1 FOR 1)) FROM $astronauts FOR TODAY", [("$astronauts", THIS_MORNING, TONIGHT)]), ] # fmt:on From 2639043c969ce6064947a299448f2375fc5ce554 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 2 Nov 2024 18:40:54 +0000 Subject: [PATCH 009/157] Opteryx Version 0.18.1 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 5de3a95c8..10b61b345 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 840 +__build__ = 842 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
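Note on the temporal extraction change above: the state machine in sql_rewriter.py tracks bracket depth, and the fix moves the in_special_function bookkeeping into the closing-bracket branch, so a ')' ends a special function before any relation-state handling runs. That is what stops the FOR inside SUBSTRING(name FROM 1 FOR 1) from being mistaken for a temporal FOR clause, even when the special function contains a nested call such as BLOB(birth_date). The sketch below illustrates the bracket-depth idea in isolation; temporal_for_positions and SPECIAL_FUNCTIONS are illustrative names, not the identifiers the rewriter actually uses.

SPECIAL_FUNCTIONS = {"SUBSTRING", "TRIM", "EXTRACT"}


def temporal_for_positions(tokens):
    """Indexes of FOR keywords that may start a temporal clause,
    skipping any FOR inside a special function's brackets."""
    positions = []
    in_special = False
    entry_depth = 0
    depth = 0
    previous = ""
    for index, token in enumerate(tokens):
        if token == "(":
            if not in_special and previous in SPECIAL_FUNCTIONS:
                in_special = True
                entry_depth = depth  # remember the depth we entered at
            depth += 1
        elif token == ")":
            depth -= 1
            # a closing bracket ends the special function, nesting included
            if in_special and depth == entry_depth:
                in_special = False
        elif token.upper() == "FOR" and not in_special:
            positions.append(index)
        previous = token.upper()
    return positions


tokens = "SELECT SUBSTRING ( BLOB ( birth_date ) FROM 1 FOR 1 ) FROM t FOR TODAY".split()
assert temporal_for_positions(tokens) == [14]

The depth == entry_depth test is what keeps nested brackets inside the special function safe, which is exactly the behaviour the new BLOB(...) test cases above exercise.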
From 5119fac1eb97821f468eb1902058769bcf68d2dd Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 7 Nov 2024 17:55:35 +0000 Subject: [PATCH 010/157] =?UTF-8?q?=E2=9C=A8=20#2061?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- opteryx/connectors/capabilities/cacheable.py | 25 ++-- opteryx/models/physical_plan.py | 73 +++++++++-- opteryx/operatorsv2/__init__.py | 1 + opteryx/operatorsv2/cross_join_node.py | 2 +- opteryx/operatorsv2/limit_node.py | 22 ++-- opteryx/operatorsv2/pyarrow_join_node.py | 118 ++++++++++++++++++ opteryx/operatorsv2/show_value_node.py | 9 +- opteryx/operatorsv2/sort_node.py | 2 +- opteryx/planner/binder/binder_visitor.py | 20 ++- .../strategies/distinct_pushdown.py | 6 +- .../strategies/limit_pushdown.py | 9 +- .../strategies/redundant_operators.py | 5 + opteryx/planner/physical_planner.py | 18 +-- .../test_shapes_and_errors_battery.py | 2 +- 14 files changed, 241 insertions(+), 71 deletions(-) create mode 100644 opteryx/operatorsv2/pyarrow_join_node.py diff --git a/opteryx/connectors/capabilities/cacheable.py b/opteryx/connectors/capabilities/cacheable.py index d996d646f..fcae2e957 100644 --- a/opteryx/connectors/capabilities/cacheable.py +++ b/opteryx/connectors/capabilities/cacheable.py @@ -129,17 +129,20 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): payload = await pool.read(read_buffer_ref) # type: ignore # If we found the file, see if we need to write it to the caches - if source != SOURCE_NOT_FOUND and evictions_remaining > 0: - if source != SOURCE_BUFFER_POOL and len(payload) < buffer_pool.size // 10: - # if we didn't get it from the buffer pool (origin or remote cache) we add it - evicted = buffer_pool.set(key, payload) - if evicted: - # if we're evicting items we just put in the cache, stop - if evicted in my_keys: - evictions_remaining = 0 - else: - evictions_remaining -= 1 - statistics.cache_evictions += 1 + if ( + not source in (SOURCE_NOT_FOUND, SOURCE_BUFFER_POOL) + and evictions_remaining > 0 + and len(payload) < buffer_pool.size // 10 + ): + # if we didn't get it from the buffer pool (origin or remote cache) we add it + evicted = buffer_pool.set(key, payload) + if evicted: + # if we're evicting items we just put in the cache, stop + if evicted in my_keys: + evictions_remaining = 0 + else: + evictions_remaining -= 1 + statistics.cache_evictions += 1 if source == SOURCE_ORIGIN and len(payload) < MAX_CACHEABLE_ITEM_SIZE: # If we read from the source, it's not in the remote cache diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index ca690c528..003820b04 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -24,6 +24,7 @@ import gc from typing import Any from typing import Generator +from typing import Optional from typing import Tuple from typing import Union @@ -221,12 +222,49 @@ def _inner_explain(node, depth): yield table - def push_executor(self) -> Tuple[Generator[pyarrow.Table, None, None], ResultType]: + def depth_first_search_flat( + self, node: Optional[str] = None, visited: Optional[set] = None + ) -> list: + """ + Returns a flat list representing the depth-first traversal of the graph with left/right ordering. + + We do this so we always evaluate the left side of a join before the right side. It technically + doesn't need the entire plan flattened DFS-wise, but this is what we are doing here to achieve + the outcome we're after. 
+ """ + if node is None: + node = self.get_exit_points()[0] + + if visited is None: + visited = set() + + visited.add(node) + + # Collect this node's information in a flat list format + traversal_list = [ + ( + node, + self[node], + ) + ] + + # Sort neighbors based on relationship to ensure left, right, then unlabelled order + neighbors = sorted(self.ingoing_edges(node), key=lambda x: (x[2] == "right", x[2] == "")) + + # Traverse each child, prioritizing left, then right, then unlabelled + for neighbor, _, _ in neighbors: + if neighbor not in visited: + child_list = self.depth_first_search_flat(neighbor, visited) + traversal_list.extend(child_list) + + return traversal_list + + def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: from opteryx.operatorsv2 import ExplainNode from opteryx.operatorsv2 import JoinNode + from opteryx.operatorsv2 import ReaderNode from opteryx.operatorsv2 import SetVariableNode - - return_type = ResultType.TABULAR + from opteryx.operatorsv2 import ShowValueNode # Validate query plan to ensure it's acyclic if not self.is_acyclic(): @@ -242,22 +280,23 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, None, None], ResultTyp head_node = self[head_nodes[0]] + # add the left/right labels to the edges coming into the joins joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] for nid, join in joins: - print(join, self.ingoing_edges(nid)) for s, t, r in self.breadth_first_search(nid, reverse=True): + source_parameters = self[s].parameters if set(join._left_relation).intersection( { - self[s].parameters.get("alias"), - self[s].parameters.get("relation"), + source_parameters.get("alias"), + source_parameters.get("relation"), } ): self.remove_edge(s, t, r) self.add_edge(s, t, "left") elif set(join._right_relation).intersection( { - self[s].parameters.get("alias"), - self[s].parameters.get("relation"), + source_parameters.get("alias"), + source_parameters.get("relation"), } ): self.remove_edge(s, t, r) @@ -267,15 +306,23 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, None, None], ResultTyp if isinstance(head_node, ExplainNode): yield self.explainv2(head_node.analyze), ResultType.TABULAR - if isinstance(head_node, SetVariableNode): + # Special case handling for 'Set' queries + elif isinstance(head_node, SetVariableNode): yield head_node(None), ResultType.NON_TABULAR + elif isinstance(head_node, ShowValueNode): + yield head_node(None), ResultType.TABULAR + def inner_execute(plan): - pump_nodes = plan.get_entry_points() - for pump_node in pump_nodes: - pump_instance = plan[pump_node] + # Get the pump nodes from the plan and execute them in order + pump_nodes = [ + (nid, node) + for nid, node in self.depth_first_search_flat() + if isinstance(node, ReaderNode) + ] + for pump_nid, pump_instance in pump_nodes: for morsel in pump_instance(None): - yield from plan.process_node(pump_node, morsel) + yield from plan.process_node(pump_nid, morsel) yield inner_execute(self), ResultType.TABULAR diff --git a/opteryx/operatorsv2/__init__.py b/opteryx/operatorsv2/__init__.py index bbcc9d679..326f7b31a 100644 --- a/opteryx/operatorsv2/__init__.py +++ b/opteryx/operatorsv2/__init__.py @@ -34,6 +34,7 @@ from .inner_join_node import InnerJoinNode from .inner_join_node_single import InnerJoinSingleNode from .limit_node import LimitNode # select the first N records +from .pyarrow_join_node import PyArrowJoinNode # from .metadata_writer_node import MetadataWriterNode # from .morsel_defragment_node import 
MorselDefragmentNode  # consolidate small morsels
diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py
index d79b6453a..4dcab1747 100644
--- a/opteryx/operatorsv2/cross_join_node.py
+++ b/opteryx/operatorsv2/cross_join_node.py
@@ -225,7 +225,7 @@ def _chunker(seq_1, seq_2, size):
     from opteryx.utils.arrow import align_tables
 
     at_least_once = False
-    left_schema = None
+    left_schema = left_morsel.schema
     right_schema = right.schema
 
     # Iterate through left table in chunks of size INTERNAL_BATCH_SIZE
diff --git a/opteryx/operatorsv2/limit_node.py b/opteryx/operatorsv2/limit_node.py
index 55528b7bf..c5065311d 100644
--- a/opteryx/operatorsv2/limit_node.py
+++ b/opteryx/operatorsv2/limit_node.py
@@ -29,12 +29,11 @@ class LimitNode(BasePlanNode):
     def __init__(self, properties: QueryProperties, **config):
         super().__init__(properties=properties)
-        self.limit = config.get("limit")
+        self.limit = config.get("limit", float("inf"))
         self.offset = config.get("offset", 0)
 
         self.remaining_rows = self.limit if self.limit is not None else float("inf")
         self.rows_left_to_skip = max(0, self.offset)
-        self.at_least_one = False
 
     @classmethod
     def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
@@ -62,16 +61,13 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
             )
             self.rows_left_to_skip = 0
 
-        if self.remaining_rows <= 0:
-            self.at_least_one = True
+        if self.remaining_rows <= 0 or morsel.num_rows == 0:
             return morsel.slice(offset=0, length=0)
 
-        if morsel.num_rows > 0:
-            if morsel.num_rows < self.remaining_rows:
-                self.remaining_rows -= morsel.num_rows
-                self.at_least_one = True
-                return morsel
-
-            else:
-                self.at_least_one = True
-                return morsel.slice(offset=0, length=self.remaining_rows)
+        if morsel.num_rows < self.remaining_rows:
+            self.remaining_rows -= morsel.num_rows
+            return morsel
+        else:
+            rows_to_slice = self.remaining_rows
+            self.remaining_rows = 0
+            return morsel.slice(offset=0, length=rows_to_slice)
diff --git a/opteryx/operatorsv2/pyarrow_join_node.py b/opteryx/operatorsv2/pyarrow_join_node.py
new file mode 100644
index 000000000..7856f28d0
--- /dev/null
+++ b/opteryx/operatorsv2/pyarrow_join_node.py
@@ -0,0 +1,118 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Join Node
+
+We have our own implementations of INNER and OUTER joins; this node uses PyArrow
+to implement the less-common ANTI and SEMI joins.
+"""
+
+import pyarrow
+
+from opteryx import EOS
+from opteryx.exceptions import UnsupportedSyntaxError
+from opteryx.models import QueryProperties
+
+from . 
import JoinNode + + +class PyArrowJoinNode(JoinNode): + def __init__(self, properties: QueryProperties, **config): + super().__init__(properties=properties) + self._join_type = config["type"] + self._on = config.get("on") + self._using = config.get("using") + + self._left_columns = config.get("left_columns") + self._left_relation = config.get("left_relation_names") + + self._right_columns = config.get("right_columns") + self._right_relation = config.get("right_relation_names") + + self.stream = "left" + self.left_buffer = [] + self.right_buffer = [] + self.left_relation = None + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return f"{self._join_type} Join" + + @property + def config(self): # pragma: no cover + from opteryx.managers.expression import format_expression + + if self._on: + return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" + if self._using: + return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" + return f"{self._join_type.upper()}" + + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + + # in place until #1295 resolved + if self._left_columns[0] not in self.left_relation.column_names: + self._right_columns, self._left_columns = ( + self._left_columns, + self._right_columns, + ) + + else: + self.left_buffer.append(morsel) + return None + + if morsel == EOS: + right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") + self.right_buffer.clear() + # do the join + try: + new_morsel = self.left_relation.join( + right_relation, + keys=self._left_columns, + right_keys=self._right_columns, + join_type=self._join_type, + coalesce_keys=self._using is not None, + ) + except pyarrow.ArrowInvalid as err: # pragma: no cover + last_token = str(err).split(" ")[-1] + column = None + for col in self.left_relation.columns: + if last_token == col.identity: + column = col.name + break + for col in right_relation.columns: + if last_token == col.identity: + column = col.name + break + if column: + raise UnsupportedSyntaxError( + f"Unable to ANTI/SEMI JOIN with unsupported column types in table, '{column}'." + ) from err + raise UnsupportedSyntaxError( + "Unable to ANTI/SEMI JOIN with unsupported column types in table." + ) from err + + return [new_morsel, EOS] + + else: + self.right_buffer.append(morsel) + return None diff --git a/opteryx/operatorsv2/show_value_node.py b/opteryx/operatorsv2/show_value_node.py index c889b66e1..c96adce7c 100644 --- a/opteryx/operatorsv2/show_value_node.py +++ b/opteryx/operatorsv2/show_value_node.py @@ -20,15 +20,14 @@ import pyarrow +from opteryx import EOS from opteryx.exceptions import SqlError from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . 
import ReaderNode
 
 
-class ShowValueNode(BasePlanNode):
-    operator_type = OperatorType.PRODUCER
 
+class ShowValueNode(ReaderNode):
     def __init__(self, properties: QueryProperties, **config):
         super().__init__(properties=properties)
@@ -54,7 +53,7 @@ def name(self):  # pragma: no cover
     def config(self):  # pragma: no cover
         return ""
 
-    def execute(self) -> Generator:
+    def execute(self, morsel) -> Generator:
         buffer = [{"name": self.key, "value": str(self.value)}]
         table = pyarrow.Table.from_pylist(buffer)
         yield table
diff --git a/opteryx/operatorsv2/sort_node.py b/opteryx/operatorsv2/sort_node.py
index 1361dd704..a8704129d 100644
--- a/opteryx/operatorsv2/sort_node.py
+++ b/opteryx/operatorsv2/sort_node.py
@@ -35,7 +35,7 @@ class SortNode(BasePlanNode):
     def __init__(self, properties: QueryProperties, **config):
         super().__init__(properties=properties)
-        self.order_by = config.get("order", [])
+        self.order_by = config.get("order_by", [])
         self.morsels = []
 
     @classmethod
diff --git a/opteryx/planner/binder/binder_visitor.py b/opteryx/planner/binder/binder_visitor.py
index 402de9eb7..ca1da30e5 100644
--- a/opteryx/planner/binder/binder_visitor.py
+++ b/opteryx/planner/binder/binder_visitor.py
@@ -32,6 +32,7 @@
 from opteryx.planner.binder.binder import merge_schemas
 from opteryx.planner.binder.binding_context import BindingContext
 from opteryx.planner.logical_planner import LogicalPlan
+from opteryx.planner.logical_planner import LogicalPlanStepType
 from opteryx.virtual_datasets import derived
 
 CAMEL_TO_SNAKE = re.compile(r"(?<!^)(?=[A-Z])")
diff --git a/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py
@@ ... @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
             return context
 
         if node.node_type in (
+            LogicalPlanStepType.Aggregate,
+            LogicalPlanStepType.AggregateAndGroup,
             LogicalPlanStepType.Join,
+            LogicalPlanStepType.Limit,
             LogicalPlanStepType.Scan,
-            LogicalPlanStepType.AggregateAndGroup,
-            LogicalPlanStepType.Aggregate,
             LogicalPlanStepType.Subquery,
+            LogicalPlanStepType.Union,
         ):
             # we don't push past here
             context.collected_distincts.clear()
diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py
index 61eb80937..ee0d778d0 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py
@@ -46,12 +46,13 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
             node.limit = limit_node.limit
             context.optimized_plan[context.node_id] = node
         elif node.node_type in (
-            LogicalPlanStepType.Join,
-            LogicalPlanStepType.Scan,
-            LogicalPlanStepType.AggregateAndGroup,
             LogicalPlanStepType.Aggregate,
+            LogicalPlanStepType.AggregateAndGroup,
+            LogicalPlanStepType.Distinct,
             LogicalPlanStepType.Filter,
+            LogicalPlanStepType.Join,
+            LogicalPlanStepType.Union,
             LogicalPlanStepType.Scan,
         ):
             # we don't push past here
             for limit_node in context.collected_limits:
diff --git a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
index 2c2bd205a..1d1c7a0bf 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
@@ -58,6 +58,11 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
 
         # Subqueries are useful for planning but not needed for execution
         if node.node_type == LogicalPlanStepType.Subquery:
+            alias = node.alias
+            for nid, _, _ in context.optimized_plan.ingoing_edges(context.node_id):
+                updated_node 
= context.optimized_plan[nid] + updated_node.alias = alias + context.optimized_plan.add_node(nid, updated_node) context.optimized_plan.remove_node(context.node_id, heal=True) self.statistics.optimization_remove_redundant_operators_subquery += 1 diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index b291850f9..8f0639897 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -30,9 +30,9 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: # fmt: off if node_type == LogicalPlanStepType.Aggregate: - node = operators.AggregateNode(query_properties, aggregates=node_config["aggregates"]) + node = operators.AggregateNode(query_properties, **{k:v for k,v in node_config.items() if k in ("aggregates", "all_relations")}) elif node_type == LogicalPlanStepType.AggregateAndGroup: - node = operators.AggregateAndGroupNode(query_properties, groups=node_config["groups"], aggregates=node_config["aggregates"], projection=node_config["projection"]) + node = operators.AggregateAndGroupNode(query_properties, **{k:v for k,v in node_config.items() if k in ("aggregates", "groups", "projection", "all_relations")}) # elif node_type == LogicalPlanStepType.Defragment: # node = operators.MorselDefragmentNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Distinct: @@ -42,7 +42,7 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: elif node_type == LogicalPlanStepType.Explain: node = operators.ExplainNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Filter: - node = operators.FilterNode(query_properties, filter=node_config["condition"]) + node = operators.FilterNode(query_properties, filter=node_config["condition"], **{k:v for k,v in node_config.items() if k in ("all_relations",)}) elif node_type == LogicalPlanStepType.FunctionDataset: if node_config.get("function") != "UNNEST" or (len(node_config.get("args", [])) > 0 and not isinstance(node_config["args"][0], LogicalColumn)): node = operators.FunctionDatasetNode(query_properties, **node_config) @@ -65,14 +65,14 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: # Pyarrow doesn't have a CROSS JOIN node = operators.CrossJoinNode(query_properties, **node_config) else: - # Use Pyarrow for all other joins - node = operators.JoinNode(query_properties, **node_config) + # Use Pyarrow for all other joins (right semi, right anti) + node = operators.PyArrowJoinNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Limit: - node = operators.LimitNode(query_properties, limit=node_config.get("limit"), offset=node_config.get("offset", 0)) + node = operators.LimitNode(query_properties, **{k:v for k,v in node_config.items() if k in ("limit", "offset", "all_relations")}) elif node_type == LogicalPlanStepType.Order: - node = operators.SortNode(query_properties, order=node_config["order_by"]) + node = operators.SortNode(query_properties, **{k:v for k,v in node_config.items() if k in ("order_by", "all_relations")}) elif node_type == LogicalPlanStepType.Project: - node = operators.ProjectionNode(query_properties, projection=logical_node.columns) + node = operators.ProjectionNode(query_properties, projection=logical_node.columns, **{k:v for k,v in node_config.items() if k in ("projection", "all_relations")}) elif node_type == LogicalPlanStepType.Scan: connector = node_config.get("connector") if connector and hasattr(connector, "async_read_blob"): @@ -83,7 +83,7 @@ def 
create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: node = operators.SetVariableNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Show: if node_config["object_type"] == "VARIABLE": - node = operators.ShowValueNode(query_properties, kind=node_config["items"][1], value=node_config["items"][1]) + node = operators.ShowValueNode(query_properties, kind=node_config["items"][1], value=node_config["items"][1], **node_config) elif node_config["object_type"] == "VIEW": node = operators.ShowCreateNode(query_properties, **node_config) else: diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index c092c9048..1f1056d21 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1499,7 +1499,7 @@ ("SELECT * FROM $planets AS P LEFT SEMI JOIN (SELECT id FROM $satellites WHERE name != 'Moon') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name != 'Earth';", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS G LEFT SEMI JOIN $satellites AS S ON S.id = G;", 10, 1, None), - ("EXPLAIN ANALYZE FORMAT JSON SELECT * FROM $planets AS a INNER JOIN (SELECT id FROM $planets) AS b USING (id);", 3, 3, None), + ("EXPLAIN ANALYZE FORMAT JSON SELECT * FROM $planets AS a INNER JOIN (SELECT id FROM $planets) AS b USING (id);", 3, 6, None), ("SELECT DISTINCT ON (planetId) planetId, name FROM $satellites ", 7, 2, None), ("SELECT 8 DIV 4", 1, 1, None), From 4765e1a83f122b817491b28c011c339f567d2dc6 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 7 Nov 2024 17:56:01 +0000 Subject: [PATCH 011/157] Opteryx Version 0.18.1 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 10b61b345..b052bba1f 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 842 +__build__ = 843 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
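Before the next patch, it is worth spelling out the execution protocol the reworked operators above share: nothing pulls from producer generators any more; instead each node's execute() is pushed one morsel at a time, an EOS sentinel marks the end of a stream, and stateful operators buffer input until they see it. The sketch below shows that protocol for a join; BufferingJoinSketch and the local EOS are illustrative stand-ins under those assumptions, not the real opteryx.EOS or BasePlanNode API.

import pyarrow

EOS = object()  # end-of-stream sentinel, standing in for opteryx.EOS


class BufferingJoinSketch:
    """Buffers the left stream until EOS, then joins each right morsel as it arrives."""

    def __init__(self, keys):
        self.keys = keys
        self.stream = "left"
        self.left_buffer = []
        self.left_relation = None

    def execute(self, morsel):
        if self.stream == "left":
            if morsel is EOS:
                # left side is complete; build one table and switch streams
                self.stream = "right"
                self.left_relation = pyarrow.concat_tables(self.left_buffer)
                self.left_buffer.clear()
            else:
                self.left_buffer.append(morsel)
            return None  # nothing to emit yet
        if morsel is EOS:
            return [EOS]  # propagate end-of-stream to downstream operators
        # each right-hand morsel is joined and emitted immediately
        return [self.left_relation.join(morsel, keys=self.keys, join_type="inner")]


left = pyarrow.table({"id": [1, 2], "name": ["a", "b"]})
right = pyarrow.table({"id": [2, 3], "value": [20, 30]})
node = BufferingJoinSketch(keys="id")
node.execute(left)          # buffered, returns None
node.execute(EOS)           # left stream complete, switch to right
print(node.execute(right))  # [joined pyarrow.Table with the matching row]
print(node.execute(EOS))    # [EOS]

This is why the join nodes above carry stream, left_buffer and right_buffer state, and why depth_first_search_flat orders the readers so that the left side of every join has seen its EOS before any right-hand morsel arrives.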
From 91933c255374075cdd8aa7990efe6db2f03ec250 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 7 Nov 2024 17:57:46 +0000 Subject: [PATCH 012/157] =?UTF-8?q?=E2=9C=A8=20push-based=20execution=20?= =?UTF-8?q?=20#2061?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- opteryx/__version__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index b052bba1f..acc993cff 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -27,9 +27,9 @@ class VersionStatus(Enum): _major = 0 -_minor = 18 -_revision = 1 -_status = VersionStatus.RELEASE +_minor = 19 +_revision = 0 +_status = VersionStatus.ALPHA __author__ = "@joocer" __version__ = f"{_major}.{_minor}.{_revision}" + ( From 2dcd589477e5ce1c573f2aac73d063d2ed04ac96 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 8 Nov 2024 12:01:41 +0000 Subject: [PATCH 013/157] #2061 --- opteryx/models/physical_plan.py | 20 ++----- .../operatorsv2/aggregate_and_group_node.py | 10 ++-- opteryx/operatorsv2/aggregate_node.py | 6 +-- opteryx/operatorsv2/async_read_node.py | 7 +-- opteryx/operatorsv2/cross_join_node.py | 52 +++++++++---------- opteryx/operatorsv2/distinct_node.py | 6 +-- opteryx/operatorsv2/exit_node.py | 6 +-- opteryx/operatorsv2/explain_node.py | 2 +- opteryx/operatorsv2/filter_node.py | 6 +-- opteryx/operatorsv2/function_dataset_node.py | 14 ++--- opteryx/operatorsv2/heap_sort_node.py | 8 +-- opteryx/operatorsv2/inner_join_node.py | 18 +++---- opteryx/operatorsv2/inner_join_node_single.py | 18 +++---- opteryx/operatorsv2/limit_node.py | 8 +-- opteryx/operatorsv2/noop_node.py | 4 +- opteryx/operatorsv2/outer_join_node.py | 18 +++---- opteryx/operatorsv2/projection_node.py | 8 +-- opteryx/operatorsv2/pyarrow_join_node.py | 18 +++---- opteryx/operatorsv2/set_variable_node.py | 11 ++-- opteryx/operatorsv2/show_columns_node.py | 14 ++--- opteryx/operatorsv2/show_create_node.py | 18 +++---- opteryx/operatorsv2/show_value_node.py | 10 ++-- opteryx/operatorsv2/sort_node.py | 6 +-- opteryx/operatorsv2/union_node.py | 6 +-- .../strategies/redundant_operators.py | 28 ++++++++-- opteryx/planner/physical_planner.py | 4 +- .../test_shapes_and_errors_battery.py | 2 +- 27 files changed, 168 insertions(+), 160 deletions(-) diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 003820b04..d683fcda1 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -264,7 +264,7 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] from opteryx.operatorsv2 import JoinNode from opteryx.operatorsv2 import ReaderNode from opteryx.operatorsv2 import SetVariableNode - from opteryx.operatorsv2 import ShowValueNode + from opteryx.operatorsv2 import ShowValueNode, ShowCreateNode # Validate query plan to ensure it's acyclic if not self.is_acyclic(): @@ -284,21 +284,11 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] for nid, join in joins: for s, t, r in self.breadth_first_search(nid, reverse=True): - source_parameters = self[s].parameters - if set(join._left_relation).intersection( - { - source_parameters.get("alias"), - source_parameters.get("relation"), - } - ): + source_relations = self[s].parameters.get("all_relations", set()) + if set(join._left_relation).intersection(source_relations): self.remove_edge(s, t, r) self.add_edge(s, t, "left") - elif 
set(join._right_relation).intersection( - { - source_parameters.get("alias"), - source_parameters.get("relation"), - } - ): + elif set(join._right_relation).intersection(source_relations): self.remove_edge(s, t, r) self.add_edge(s, t, "right") @@ -310,7 +300,7 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] elif isinstance(head_node, SetVariableNode): yield head_node(None), ResultType.NON_TABULAR - elif isinstance(head_node, ShowValueNode): + elif isinstance(head_node, (ShowValueNode, ShowCreateNode)): yield head_node(None), ResultType.TABULAR def inner_execute(plan): diff --git a/opteryx/operatorsv2/aggregate_and_group_node.py b/opteryx/operatorsv2/aggregate_and_group_node.py index a0f2c2b15..6cc40d281 100644 --- a/opteryx/operatorsv2/aggregate_and_group_node.py +++ b/opteryx/operatorsv2/aggregate_and_group_node.py @@ -52,11 +52,11 @@ class AggregateAndGroupDataObject(BasePlanDataObject): class AggregateAndGroupNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.groups = list(config["groups"]) - self.aggregates = list(config["aggregates"]) - projection = list(config["projection"]) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.groups = list(parameters["groups"]) + self.aggregates = list(parameters["aggregates"]) + projection = list(parameters["projection"]) # we're going to preload some of the evaluation diff --git a/opteryx/operatorsv2/aggregate_node.py b/opteryx/operatorsv2/aggregate_node.py index 606d4f232..a577f101c 100644 --- a/opteryx/operatorsv2/aggregate_node.py +++ b/opteryx/operatorsv2/aggregate_node.py @@ -187,10 +187,10 @@ class AggregateDataObject(BasePlanDataObject): class AggregateNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - self.aggregates = config.get("aggregates", []) + self.aggregates = parameters.get("aggregates", []) # get all the columns anywhere in the aggregates all_identifiers = [ diff --git a/opteryx/operatorsv2/async_read_node.py b/opteryx/operatorsv2/async_read_node.py index 8859d748d..16b288b10 100644 --- a/opteryx/operatorsv2/async_read_node.py +++ b/opteryx/operatorsv2/async_read_node.py @@ -34,6 +34,7 @@ from opteryx import EOS from opteryx import config from opteryx.exceptions import DataError +from opteryx.models import QueryProperties from opteryx.operators.base_plan_node import BasePlanDataObject from opteryx.shared import AsyncMemoryPool from opteryx.shared import MemoryPool @@ -73,12 +74,12 @@ class AsyncReaderDataObject(BasePlanDataObject): class AsyncReaderNode(ReaderNode): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, properties: QueryProperties, **parameters): + ReaderNode.__init__(self, properties=properties, **parameters) self.pool = MemoryPool(MAX_READ_BUFFER_CAPACITY, f"ReadBuffer <{self.parameters['alias']}>") self.do = AsyncReaderDataObject() - self.predicates = kwargs.get("predicates") + self.predicates = parameters.get("predicates") @classmethod def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py index 4dcab1747..fe0e8bd11 100644 --- 
a/opteryx/operatorsv2/cross_join_node.py +++ b/opteryx/operatorsv2/cross_join_node.py @@ -170,29 +170,27 @@ def _cross_join_unnest_column( def _cross_join_unnest_literal( - morsels: pyarrow.Table, source: Tuple, target_column: FlatColumn + morsel: pyarrow.Table, source: Tuple, target_column: FlatColumn ) -> Generator[pyarrow.Table, None, None]: joined_list_size = len(source) - # Loop through each morsel from the morsels execution - for left_morsel in morsels.execute(): - # Break the morsel into batches to avoid memory issues - for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): - left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) - block_size = left_block.num_rows + # Break the morsel into batches to avoid memory issues + for left_block in morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): + left_block = pyarrow.Table.from_batches([left_block], schema=morsel.schema) + block_size = left_block.num_rows - # Repeat each row in the table n times - repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) - appended_table = left_block.take(repeated_indices) + # Repeat each row in the table n times + repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) + appended_table = left_block.take(repeated_indices) - # Tile the array to match the new number of rows - tiled_array = numpy.tile(source, block_size) + # Tile the array to match the new number of rows + tiled_array = numpy.tile(source, block_size) - # Convert tiled_array to PyArrow array and append it to the table - array_column = pyarrow.array(tiled_array) - appended_table = appended_table.append_column(target_column.identity, array_column) + # Convert tiled_array to PyArrow array and append it to the table + array_column = pyarrow.array(tiled_array) + appended_table = appended_table.append_column(target_column.identity, array_column) - yield appended_table + yield appended_table def _cartesian_product(*arrays): @@ -273,19 +271,19 @@ class CrossJoinNode(JoinNode): Implements a SQL CROSS JOIN """ - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) - self.source = config.get("column") + self.source = parameters.get("column") - self._left_relation = config.get("left_relation_names") - self._right_relation = config.get("right_relation_names") + self._left_relation = parameters.get("left_relation_names") + self._right_relation = parameters.get("right_relation_names") # do we have unnest details? 
- self._unnest_column = config.get("unnest_column") - self._unnest_target = config.get("unnest_target") - self._filters = config.get("filters") - self._distinct = config.get("distinct", False) + self._unnest_column = parameters.get("unnest_column") + self._unnest_target = parameters.get("unnest_target") + self._filters = parameters.get("filters") + self._distinct = parameters.get("distinct", False) # handle variation in how the unnested column is represented if self._unnest_column: @@ -297,7 +295,7 @@ def __init__(self, properties: QueryProperties, **config): ): self._unnest_column.value = tuple([self._unnest_column.value]) - self._single_column = config.get("pre_update_columns", set()) == { + self._single_column = parameters.get("pre_update_columns", set()) == { self._unnest_target.identity, } @@ -330,7 +328,7 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if isinstance(self._unnest_column.value, tuple): return list( _cross_join_unnest_literal( - morsels=morsel, + morsel=morsel, source=self._unnest_column.value, target_column=self._unnest_target, ) diff --git a/opteryx/operatorsv2/distinct_node.py b/opteryx/operatorsv2/distinct_node.py index 0c4adaf3e..60cf76c2d 100644 --- a/opteryx/operatorsv2/distinct_node.py +++ b/opteryx/operatorsv2/distinct_node.py @@ -27,11 +27,11 @@ class DistinctNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): + def __init__(self, properties: QueryProperties, **parameters): from opteryx.compiled.structures import HashSet - super().__init__(properties=properties) - self._distinct_on = config.get("on") + BasePlanNode.__init__(self, properties=properties, **parameters) + self._distinct_on = parameters.get("on") if self._distinct_on: self._distinct_on = [col.schema_column.identity for col in self._distinct_on] self.hash_set = HashSet() diff --git a/opteryx/operatorsv2/exit_node.py b/opteryx/operatorsv2/exit_node.py index cac38c629..a428e955b 100644 --- a/opteryx/operatorsv2/exit_node.py +++ b/opteryx/operatorsv2/exit_node.py @@ -46,9 +46,9 @@ class ExitDataObject(BasePlanDataObject): class ExitNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.columns = config.get("columns", []) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.columns = parameters.get("columns", []) self.do = ExitDataObject(columns=self.columns) diff --git a/opteryx/operatorsv2/explain_node.py b/opteryx/operatorsv2/explain_node.py index 4aa6c3104..2b16067a2 100644 --- a/opteryx/operatorsv2/explain_node.py +++ b/opteryx/operatorsv2/explain_node.py @@ -27,7 +27,7 @@ class ExplainNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): - super().__init__(properties=properties) + BasePlanNode.__init__(self, properties=properties, **parameters) self._query_plan = parameters.get("query_plan") self.analyze = parameters.get("analyze", False) diff --git a/opteryx/operatorsv2/filter_node.py b/opteryx/operatorsv2/filter_node.py index a756c48dd..c4cff2e78 100644 --- a/opteryx/operatorsv2/filter_node.py +++ b/opteryx/operatorsv2/filter_node.py @@ -34,9 +34,9 @@ class FilterNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.filter = config.get("filter") + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.filter = 
parameters.get("filter") self.function_evaluations = get_all_nodes_of_type( self.filter, diff --git a/opteryx/operatorsv2/function_dataset_node.py b/opteryx/operatorsv2/function_dataset_node.py index 0252a3039..9ac8cf80a 100644 --- a/opteryx/operatorsv2/function_dataset_node.py +++ b/opteryx/operatorsv2/function_dataset_node.py @@ -85,17 +85,17 @@ def _http(**kwargs): class FunctionDatasetNode(ReaderNode): - def __init__(self, properties: QueryProperties, **config): + def __init__(self, properties: QueryProperties, **parameters): """ The Blob Reader Node is responsible for reading the relevant blobs and returning a Table/Relation. """ - super().__init__(properties=properties) - self.alias = config.get("alias") - self.function = config["function"] - self.parameters = config - self.columns = config.get("columns", []) - self.args = config.get("args", []) + ReaderNode.__init__(self, properties=properties, **parameters) + self.alias = parameters.get("alias") + self.function = parameters["function"] + self.parameters = parameters + self.columns = parameters.get("columns", []) + self.args = parameters.get("args", []) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover diff --git a/opteryx/operatorsv2/heap_sort_node.py b/opteryx/operatorsv2/heap_sort_node.py index 7efb101c1..782e8ab44 100644 --- a/opteryx/operatorsv2/heap_sort_node.py +++ b/opteryx/operatorsv2/heap_sort_node.py @@ -46,10 +46,10 @@ class HeapSortDataObject(BasePlanDataObject): class HeapSortNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.order_by = config.get("order_by", []) - self.limit: int = config.get("limit", -1) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.order_by = parameters.get("order_by", []) + self.limit: int = parameters.get("limit", -1) self.do = HeapSortDataObject(order_by=self.order_by, limit=self.limit) self.mapped_order = [] diff --git a/opteryx/operatorsv2/inner_join_node.py b/opteryx/operatorsv2/inner_join_node.py index 7ae59ef50..533a0060a 100644 --- a/opteryx/operatorsv2/inner_join_node.py +++ b/opteryx/operatorsv2/inner_join_node.py @@ -73,17 +73,17 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operatorsv2/inner_join_node_single.py b/opteryx/operatorsv2/inner_join_node_single.py index 53c5c8416..f2f45692c 100644 --- a/opteryx/operatorsv2/inner_join_node_single.py +++ 
b/opteryx/operatorsv2/inner_join_node_single.py @@ -159,17 +159,17 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinSingleNode(JoinNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operatorsv2/limit_node.py b/opteryx/operatorsv2/limit_node.py index c5065311d..993f85d9f 100644 --- a/opteryx/operatorsv2/limit_node.py +++ b/opteryx/operatorsv2/limit_node.py @@ -27,10 +27,10 @@ class LimitNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.limit = config.get("limit", float("inf")) - self.offset = config.get("offset", 0) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.limit = parameters.get("limit", float("inf")) + self.offset = parameters.get("offset", 0) self.remaining_rows = self.limit if self.limit is not None else float("inf") self.rows_left_to_skip = max(0, self.offset) diff --git a/opteryx/operatorsv2/noop_node.py b/opteryx/operatorsv2/noop_node.py index 8e9178884..b0c4bce8b 100644 --- a/opteryx/operatorsv2/noop_node.py +++ b/opteryx/operatorsv2/noop_node.py @@ -24,8 +24,8 @@ class NoOpNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover diff --git a/opteryx/operatorsv2/outer_join_node.py b/opteryx/operatorsv2/outer_join_node.py index df31ed00c..ccb80ff5f 100644 --- a/opteryx/operatorsv2/outer_join_node.py +++ b/opteryx/operatorsv2/outer_join_node.py @@ -249,17 +249,17 @@ def left_semi_join( class OuterJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") 
+ self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operatorsv2/projection_node.py b/opteryx/operatorsv2/projection_node.py index 7c9cb616d..35b890597 100644 --- a/opteryx/operatorsv2/projection_node.py +++ b/opteryx/operatorsv2/projection_node.py @@ -30,13 +30,13 @@ class ProjectionNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): + def __init__(self, properties: QueryProperties, **parameters): """ Attribute Projection, remove unwanted columns and performs column renames. """ - super().__init__(properties=properties) + BasePlanNode.__init__(self, properties=properties, **parameters) - projection = config["projection"] + config.get("order_by_columns", []) + projection = parameters["projection"] + parameters.get("order_by_columns", []) self.projection = [] for column in projection: @@ -46,7 +46,7 @@ def __init__(self, properties: QueryProperties, **config): column for column in projection if column.node_type != NodeType.IDENTIFIER ] - self.columns = config["projection"] + self.columns = parameters["projection"] @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover diff --git a/opteryx/operatorsv2/pyarrow_join_node.py b/opteryx/operatorsv2/pyarrow_join_node.py index 7856f28d0..02b5ed131 100644 --- a/opteryx/operatorsv2/pyarrow_join_node.py +++ b/opteryx/operatorsv2/pyarrow_join_node.py @@ -27,17 +27,17 @@ class PyArrowJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operatorsv2/set_variable_node.py b/opteryx/operatorsv2/set_variable_node.py index b730a6ded..02676434d 100644 --- a/opteryx/operatorsv2/set_variable_node.py +++ b/opteryx/operatorsv2/set_variable_node.py @@ -24,13 +24,12 @@ class SetVariableNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - self.variable = config.get("variable") - self.value = config.get("value") - - self.variables = config.get("variables") + self.variable = parameters.get("variable") + self.value = parameters.get("value") + self.variables = parameters.get("variables") @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover diff --git a/opteryx/operatorsv2/show_columns_node.py b/opteryx/operatorsv2/show_columns_node.py index d1961414b..3d57a8c21 100644 --- a/opteryx/operatorsv2/show_columns_node.py +++ 
b/opteryx/operatorsv2/show_columns_node.py @@ -44,12 +44,14 @@ def _simple_collector(schema): class ShowColumnsNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._full = config.get("full") - self._extended = config.get("extended") - self._schema = config.get("schema") - self._column_map = {c.schema_column.identity: c.source_column for c in config["columns"]} + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self._full = parameters.get("full") + self._extended = parameters.get("extended") + self._schema = parameters.get("schema") + self._column_map = { + c.schema_column.identity: c.source_column for c in parameters["columns"] + } self.collector = None self.seen = False diff --git a/opteryx/operatorsv2/show_create_node.py b/opteryx/operatorsv2/show_create_node.py index c33a5d415..5ca7c224c 100644 --- a/opteryx/operatorsv2/show_create_node.py +++ b/opteryx/operatorsv2/show_create_node.py @@ -23,18 +23,15 @@ from opteryx.exceptions import DatasetNotFoundError from opteryx.exceptions import UnsupportedSyntaxError from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode class ShowCreateNode(BasePlanNode): - operator_type = OperatorType.PRODUCER + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - - self.object_type = config.get("object_type") - self.object_name = config.get("object_name") + self.object_type = parameters.get("object_type") + self.object_name = parameters.get("object_name") @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -48,7 +45,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if self.object_type == "VIEW": from opteryx.planner.views import is_view from opteryx.planner.views import view_as_sql @@ -57,8 +54,7 @@ def execute(self) -> Generator: view_sql = view_as_sql(self.object_name) buffer = [{self.object_name: view_sql}] table = pyarrow.Table.from_pylist(buffer) - yield table - return + return table raise DatasetNotFoundError(self.object_name) diff --git a/opteryx/operatorsv2/show_value_node.py b/opteryx/operatorsv2/show_value_node.py index c96adce7c..f223363bb 100644 --- a/opteryx/operatorsv2/show_value_node.py +++ b/opteryx/operatorsv2/show_value_node.py @@ -28,12 +28,12 @@ class ShowValueNode(ReaderNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + ReaderNode.__init__(self, properties=properties, **parameters) - self.key = config.get("key") - self.kind = config.get("kind") - self.value = config.get("value") + self.key = parameters.get("key") + self.kind = parameters.get("kind") + self.value = parameters.get("value") if self.kind == "PARAMETER": if self.value[0] == "@": diff --git a/opteryx/operatorsv2/sort_node.py b/opteryx/operatorsv2/sort_node.py index a8704129d..12c399240 100644 --- a/opteryx/operatorsv2/sort_node.py +++ b/opteryx/operatorsv2/sort_node.py @@ -33,9 +33,9 @@ class SortNode(BasePlanNode): - def __init__(self, properties: 
QueryProperties, **config): - super().__init__(properties=properties) - self.order_by = config.get("order_by", []) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.order_by = parameters.get("order_by", []) self.morsels = [] @classmethod diff --git a/opteryx/operatorsv2/union_node.py b/opteryx/operatorsv2/union_node.py index ca268fc07..a59a07530 100644 --- a/opteryx/operatorsv2/union_node.py +++ b/opteryx/operatorsv2/union_node.py @@ -25,9 +25,9 @@ class UnionNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.columns = config.get("columns", []) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.columns = parameters.get("columns", []) self.column_ids = [c.schema_column.identity for c in self.columns] self.seen_first_eos = False self.schema = None diff --git a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py index 1d1c7a0bf..95c0e8f87 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py +++ b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py @@ -48,21 +48,41 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo if node.node_type == LogicalPlanStepType.Project: providers = context.pre_optimized_tree.ingoing_edges(context.node_id) if len(providers) == 1: - provider_node = context.pre_optimized_tree[providers[0][0]] + provider_nid = providers[0][0] + provider_node = context.pre_optimized_tree[provider_nid] if provider_node.node_type != LogicalPlanStepType.Subquery: provider_columns = {c.schema_column.identity for c in provider_node.columns} + # if the columns in the project are the same as the operator before it + # we don't need to project my_columns = {c.schema_column.identity for c in node.columns} if provider_columns == my_columns: + # we need to ensure we keep some of the context if not the step + source_node_alias = context.optimized_plan[context.node_id].alias + if provider_node.all_relations: + provider_node.all_relations.add(source_node_alias) + else: + provider_node.all_relations = {source_node_alias} + context.optimized_plan.add_node(provider_nid, provider_node) + # remove the node context.optimized_plan.remove_node(context.node_id, heal=True) self.statistics.optimization_remove_redundant_operators_project += 1 # Subqueries are useful for planning but not needed for execution + # We need to ensure the alias of the subquery is pushed if node.node_type == LogicalPlanStepType.Subquery: alias = node.alias - for nid, _, _ in context.optimized_plan.ingoing_edges(context.node_id): + nid = context.optimized_plan.ingoing_edges(context.node_id)[0][0] + updated_node = context.optimized_plan[nid] + # if we have multiple layers of subqueries, ignore everything other than the outermost + while updated_node.node_type == LogicalPlanStepType.Subquery: + nid = context.optimized_plan.ingoing_edges(nid)[0][0] updated_node = context.optimized_plan[nid] - updated_node.alias = alias - context.optimized_plan.add_node(nid, updated_node) + updated_node.alias = alias + if updated_node.all_relations: + updated_node.all_relations.add(alias) + else: + updated_node.all_relations = {alias} + context.optimized_plan.add_node(nid, updated_node) 
context.optimized_plan.remove_node(context.node_id, heal=True) self.statistics.optimization_remove_redundant_operators_subquery += 1 diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 8f0639897..177c6758d 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -44,7 +44,9 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: elif node_type == LogicalPlanStepType.Filter: node = operators.FilterNode(query_properties, filter=node_config["condition"], **{k:v for k,v in node_config.items() if k in ("all_relations",)}) elif node_type == LogicalPlanStepType.FunctionDataset: - if node_config.get("function") != "UNNEST" or (len(node_config.get("args", [])) > 0 and not isinstance(node_config["args"][0], LogicalColumn)): + if False and node_config.get("function") == "UNNEST": + node = operators.NoOpNode(query_properties, **node_config) + elif node_config.get("function") != "UNNEST" or len(node_config.get("args", [])) > 0 and not isinstance(node_config["args"][0], LogicalColumn): node = operators.FunctionDatasetNode(query_properties, **node_config) else: node = operators.NoOpNode(query_properties, **node_config) diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 1f1056d21..d3f0d81fd 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2059,7 +2059,7 @@ ("SELECT * FROM (SELECT * FROM $satellites LEFT JOIN (SELECT id AS pid, mass FROM $planets) AS p ON $satellites.planetId = p.pid) AS mapped WHERE mass > 1", 170, 10, None), ("SELECT * FROM (SELECT planetId, mass FROM $satellites LEFT JOIN $planets AS p ON $satellites.planetId = p.id) AS mapped WHERE mass > 1", 170, 2, None), ("SELECT * FROM $satellites LEFT JOIN $planets AS p ON $satellites.planetId = p.id WHERE mass > 1", 170, 28, None), - ("SELECT * FROM (SELECT p.id, mass FROM (SELECT * FROM $satellites) AS s LEFT JOIN $planets AS p ON s.planetId = p.id) AS mapped WHERE mass > 1", 171, 2, None), + ("SELECT * FROM (SELECT p.id, mass FROM (SELECT * FROM $satellites) AS s LEFT JOIN $planets AS p ON s.planetId = p.id) AS mapped WHERE mass > 1", 170, 2, None), ("SELECT * FROM (SELECT * FROM $satellites) AS s LEFT JOIN (SELECT id as pid, mass FROM $planets) AS p ON s.planetId = p.pid WHERE mass > 1", 170, 10, None), ("SELECT * FROM $satellites LEFT JOIN (SELECT * FROM (SELECT * FROM $planets) AS p) AS planets ON $satellites.planetId = planets.id WHERE mass > 1", 170, 28, None), ("SELECT * FROM (SELECT * FROM (SELECT p.id, mass FROM $satellites LEFT JOIN $planets AS p ON $satellites.planetId = p.id) AS joined) AS mapped WHERE mass > 1", 170, 2, None), From 6fbe9506686aff95b61ec259c39924b6b57e01ec Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 8 Nov 2024 12:02:06 +0000 Subject: [PATCH 014/157] Opteryx Version 0.19.0-alpha.846 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index acc993cff..24d0483db 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 843 +__build__ = 846 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
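A note on the pattern running through the operator diffs above: every node renames the **config keyword bundle to **parameters and swaps the implicit super().__init__(properties=properties) call for an explicit BasePlanNode.__init__(self, properties=properties, **parameters) (JoinNode.__init__ for the join nodes), so the base class now receives the full construction dict rather than the properties alone. A minimal sketch of that shape, using a simplified stand-in for BasePlanNode - the real base class carries more state, and its exact signature here is an assumption for illustration:

    class BasePlanNode:
        def __init__(self, *, properties, **parameters):
            # the base class sees every construction parameter, so it can
            # retain them for later use (to_dict/from_json hooks, EXPLAIN)
            self.properties = properties
            self.parameters = parameters

    class LimitNode(BasePlanNode):
        def __init__(self, properties, **parameters):
            # explicit base-class call, mirroring the diffs above
            BasePlanNode.__init__(self, properties=properties, **parameters)
            self.limit = parameters.get("limit", float("inf"))
            self.offset = parameters.get("offset", 0)

    node = LimitNode(properties={}, limit=10, offset=2)
    print(node.limit, node.offset, node.parameters)  # 10 2 {'limit': 10, 'offset': 2}

Forwarding **parameters keeps a single source of truth for a node's configuration, which matters once nodes are serialized and rebuilt (the from_json/to_dict hooks visible in these diffs) rather than only constructed once.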
From 45a21e421716fd473b44c07eadf247d328ba4d1e Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 8 Nov 2024 13:05:34 +0000 Subject: [PATCH 015/157] #2061 --- opteryx/models/physical_plan.py | 3 ++- opteryx/operatorsv2/cross_join_node.py | 6 ++++++ opteryx/operatorsv2/show_create_node.py | 3 +-- opteryx/planner/physical_planner.py | 7 +------ 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index d683fcda1..d4dc2b5b7 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -264,7 +264,8 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] from opteryx.operatorsv2 import JoinNode from opteryx.operatorsv2 import ReaderNode from opteryx.operatorsv2 import SetVariableNode - from opteryx.operatorsv2 import ShowValueNode, ShowCreateNode + from opteryx.operatorsv2 import ShowCreateNode + from opteryx.operatorsv2 import ShowValueNode # Validate query plan to ensure it's acyclic if not self.is_acyclic(): diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py index fe0e8bd11..490762ba2 100644 --- a/opteryx/operatorsv2/cross_join_node.py +++ b/opteryx/operatorsv2/cross_join_node.py @@ -306,6 +306,8 @@ def __init__(self, properties: QueryProperties, **parameters): self.right_relation = None self.hash_set = HashSet() + self.continue_executing = True + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -322,8 +324,12 @@ def config(self): # pragma: no cover return f"CROSS JOIN {filters}" def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if not self.continue_executing: + return None + if self._unnest_column is not None: if morsel == EOS: + self.continue_executing = False return EOS if isinstance(self._unnest_column.value, tuple): return list( diff --git a/opteryx/operatorsv2/show_create_node.py b/opteryx/operatorsv2/show_create_node.py index 5ca7c224c..d76d95d9b 100644 --- a/opteryx/operatorsv2/show_create_node.py +++ b/opteryx/operatorsv2/show_create_node.py @@ -16,13 +16,12 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator - import pyarrow from opteryx.exceptions import DatasetNotFoundError from opteryx.exceptions import UnsupportedSyntaxError from opteryx.models import QueryProperties + from . 
import BasePlanNode diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 177c6758d..320b90f0d 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -44,12 +44,7 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: elif node_type == LogicalPlanStepType.Filter: node = operators.FilterNode(query_properties, filter=node_config["condition"], **{k:v for k,v in node_config.items() if k in ("all_relations",)}) elif node_type == LogicalPlanStepType.FunctionDataset: - if False and node_config.get("function") == "UNNEST": - node = operators.NoOpNode(query_properties, **node_config) - elif node_config.get("function") != "UNNEST" or len(node_config.get("args", [])) > 0 and not isinstance(node_config["args"][0], LogicalColumn): - node = operators.FunctionDatasetNode(query_properties, **node_config) - else: - node = operators.NoOpNode(query_properties, **node_config) + node = operators.FunctionDatasetNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.HeapSort: node = operators.HeapSortNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Join: From db22cc5a2068c36345555e83cc9d71edc5f78255 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 8 Nov 2024 13:06:00 +0000 Subject: [PATCH 016/157] Opteryx Version 0.19.0-alpha.847 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 24d0483db..c342fb798 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 846 +__build__ = 847 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 0982787c4b633ced9802ff8e6a4bd038f4dc56a4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:28:40 +0000 Subject: [PATCH 017/157] Update typer requirement from ==0.11.* to ==0.13.* Updates the requirements on [typer](https://github.com/fastapi/typer) to permit the latest version. - [Release notes](https://github.com/fastapi/typer/releases) - [Changelog](https://github.com/fastapi/typer/blob/master/docs/release-notes.md) - [Commits](https://github.com/fastapi/typer/compare/0.11.0...0.13.0) --- updated-dependencies: - dependency-name: typer dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e3b440055..af93665f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,5 @@ psutil pyarrow>=12.0.1 pysimdjson requests -typer==0.11.* +typer==0.13.* aiohttp From bf02f4a5c5b6769ceaeba1c52317be0aa0893648 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:28:54 +0000 Subject: [PATCH 018/157] Bump duckdb-engine from 0.13.4 to 0.13.5 Bumps [duckdb-engine](https://github.com/Mause/duckdb_engine) from 0.13.4 to 0.13.5. - [Release notes](https://github.com/Mause/duckdb_engine/releases) - [Changelog](https://github.com/Mause/duckdb_engine/blob/main/CHANGELOG.md) - [Commits](https://github.com/Mause/duckdb_engine/compare/v0.13.4...v0.13.5) --- updated-dependencies: - dependency-name: duckdb-engine dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index 0a7b711e5..f631694e7 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -19,6 +19,6 @@ sqlalchemy pymysql psycopg2-binary duckdb==1.1.2 # 1040 -duckdb-engine==0.13.4 # 1040 +duckdb-engine==0.13.5 # 1040 setuptools_rust \ No newline at end of file From f664dc8f1cdb2f702d2aad19681ba467d79d2e0c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:29:21 +0000 Subject: [PATCH 019/157] Bump duckdb from 1.1.2 to 1.1.3 Bumps [duckdb](https://github.com/duckdb/duckdb) from 1.1.2 to 1.1.3. - [Release notes](https://github.com/duckdb/duckdb/releases) - [Changelog](https://github.com/duckdb/duckdb/blob/main/tools/release-pip.py) - [Commits](https://github.com/duckdb/duckdb/compare/v1.1.2...v1.1.3) --- updated-dependencies: - dependency-name: duckdb dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index 0a7b711e5..65812c5ff 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -18,7 +18,7 @@ firestore sqlalchemy pymysql psycopg2-binary -duckdb==1.1.2 # 1040 +duckdb==1.1.3 # 1040 duckdb-engine==0.13.4 # 1040 setuptools_rust \ No newline at end of file From b8702ff53a09f6307c5633dd1b75179e38970050 Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 11 Nov 2024 20:33:04 +0000 Subject: [PATCH 020/157] #2061 --- opteryx/connectors/cql_connector.py | 1 + opteryx/managers/expression/__init__.py | 2 +- opteryx/models/physical_plan.py | 42 ++++++++++++------- opteryx/operatorsv2/async_read_node.py | 1 + opteryx/operatorsv2/limit_node.py | 2 +- opteryx/operatorsv2/outer_join_node.py | 10 ++--- opteryx/operatorsv2/read_node.py | 4 ++ opteryx/planner/binder/binder_visitor.py | 22 +++++----- .../strategies/limit_pushdown.py | 1 + .../strategies/predicate_pushdown.py | 32 +++++++++++--- .../split_conjunctive_predicates.py | 1 + .../logical_planner/logical_planner.py | 1 + .../test_limit_pushdown_postgres.py | 2 +- .../test_limit_pushdown_sqlite.py | 2 +- tests/query_execution/test_execution_plan.py | 4 +- tests/query_execution/test_execution_tree.py | 4 +- .../test_shapes_and_errors_battery.py | 4 ++ tests/storage/test_blob_gcs.py | 2 +- .../storage/test_collection_gcs_firestore.py | 2 +- 19 files changed, 93 insertions(+), 46 deletions(-) diff --git a/opteryx/connectors/cql_connector.py b/opteryx/connectors/cql_connector.py index 1d5f0347b..e53bb421a 100644 --- a/opteryx/connectors/cql_connector.py +++ b/opteryx/connectors/cql_connector.py @@ -113,6 +113,7 @@ def read_dataset( # type:ignore columns: list = None, predicates: list = None, chunk_size: int = INITIAL_CHUNK_SIZE, # type:ignore + limit: int = None, ) -> Generator[pyarrow.Table, None, None]: # type:ignore self.chunk_size = chunk_size diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index e6c76d8ce..ac4307fe9 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -96,7 +96,7 @@ class NodeType(int, Enum): OrsoTypes.STRUCT: numpy.dtype("O"), OrsoTypes.TIMESTAMP: numpy.dtype("datetime64[us]"), # [290301 BC, 294241 AD] OrsoTypes.TIME: numpy.dtype("O"), - 
OrsoTypes.VARCHAR: numpy.unicode_(), + OrsoTypes.VARCHAR: numpy.dtype("U"), OrsoTypes.NULL: numpy.dtype("O"), } diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index d4dc2b5b7..958cdc2c3 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -212,8 +212,13 @@ def _inner_explain(node, depth): if analyze: # we don't want the results, just the details from the plan temp = None - for temp in self.push_executor(): - pass + head_node = self.get_exit_points()[0] + query_head, _, _ = self.ingoing_edges(head_node)[0] + results = self.push_executor(query_head) + if results is not None: + results_generator, _ = next(results, ([], None)) + for temp in results_generator: + pass del temp plan = list(_inner_explain(head[0], 1)) @@ -259,7 +264,9 @@ def depth_first_search_flat( return traversal_list - def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: + def push_executor( + self, head_node=None + ) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: from opteryx.operatorsv2 import ExplainNode from opteryx.operatorsv2 import JoinNode from opteryx.operatorsv2 import ReaderNode @@ -279,7 +286,8 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] f"Query plan has {len(head_nodes)} heads, expected exactly 1." ) - head_node = self[head_nodes[0]] + if head_node is None: + head_node = self[head_nodes[0]] # add the left/right labels to the edges coming into the joins joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] @@ -304,18 +312,20 @@ def push_executor(self) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType] elif isinstance(head_node, (ShowValueNode, ShowCreateNode)): yield head_node(None), ResultType.TABULAR - def inner_execute(plan): - # Get the pump nodes from the plan and execute them in order - pump_nodes = [ - (nid, node) - for nid, node in self.depth_first_search_flat() - if isinstance(node, ReaderNode) - ] - for pump_nid, pump_instance in pump_nodes: - for morsel in pump_instance(None): - yield from plan.process_node(pump_nid, morsel) - - yield inner_execute(self), ResultType.TABULAR + else: + + def inner_execute(plan): + # Get the pump nodes from the plan and execute them in order + pump_nodes = [ + (nid, node) + for nid, node in self.depth_first_search_flat() + if isinstance(node, ReaderNode) + ] + for pump_nid, pump_instance in pump_nodes: + for morsel in pump_instance(None): + yield from plan.process_node(pump_nid, morsel) + + yield inner_execute(self), ResultType.TABULAR def process_node(self, nid, morsel): from opteryx.operatorsv2 import ReaderNode diff --git a/opteryx/operatorsv2/async_read_node.py b/opteryx/operatorsv2/async_read_node.py index 16b288b10..792936a0d 100644 --- a/opteryx/operatorsv2/async_read_node.py +++ b/opteryx/operatorsv2/async_read_node.py @@ -189,6 +189,7 @@ def execute(self, morsel) -> Generator: self.statistics.blobs_read += 1 self.records_out += morsel.num_rows + self.statistics.rows_read += morsel.num_rows self.bytes_out += morsel.nbytes yield morsel diff --git a/opteryx/operatorsv2/limit_node.py b/opteryx/operatorsv2/limit_node.py index 993f85d9f..20b204829 100644 --- a/opteryx/operatorsv2/limit_node.py +++ b/opteryx/operatorsv2/limit_node.py @@ -54,7 +54,7 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if self.rows_left_to_skip > 0: if self.rows_left_to_skip >= morsel.num_rows: self.rows_left_to_skip -= morsel.num_rows - return None + return morsel.slice(offset=0, length=0) else: 
morsel = morsel.slice( offset=self.rows_left_to_skip, length=morsel.num_rows - self.rows_left_to_skip diff --git a/opteryx/operatorsv2/outer_join_node.py b/opteryx/operatorsv2/outer_join_node.py index ccb80ff5f..191d43c21 100644 --- a/opteryx/operatorsv2/outer_join_node.py +++ b/opteryx/operatorsv2/outer_join_node.py @@ -85,11 +85,11 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu left_indexes.clear() right_indexes.clear() - if len(left_indexes) > 0: - table = align_tables(right_relation, left_relation, list(right_indexes), list(left_indexes)) - yield table - left_indexes.clear() - right_indexes.clear() + # this may return an empty table each time - fix later + table = align_tables(right_relation, left_relation, list(right_indexes), list(left_indexes)) + yield table + left_indexes.clear() + right_indexes.clear() def full_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]): diff --git a/opteryx/operatorsv2/read_node.py b/opteryx/operatorsv2/read_node.py index aa1505568..e81bcb7eb 100644 --- a/opteryx/operatorsv2/read_node.py +++ b/opteryx/operatorsv2/read_node.py @@ -149,6 +149,9 @@ def __init__(self, properties: QueryProperties, **parameters): if len(self.hints) != 0: self.statistics.add_message("All HINTS are currently ignored") + self.statistics.rows_read += 0 + self.statistics.columns_read += 0 + def to_dict(self) -> dict: return { "identity": f"read-{self.identity}", @@ -213,6 +216,7 @@ def execute(self, morsel) -> Generator: self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock self.statistics.blobs_read += 1 self.records_out += morsel.num_rows + self.statistics.rows_read += morsel.num_rows self.bytes_out += morsel.nbytes yield morsel start_clock = time.monotonic_ns() diff --git a/opteryx/planner/binder/binder_visitor.py b/opteryx/planner/binder/binder_visitor.py index ca1da30e5..151b7aa12 100644 --- a/opteryx/planner/binder/binder_visitor.py +++ b/opteryx/planner/binder/binder_visitor.py @@ -1086,16 +1086,18 @@ def traverse( return_node, context = self.visit_node(graph[node], context=context) # We keep track of the relations which are 'visible' along each branch - return_node.all_relations = { - value for value in [return_node.relation, return_node.alias] if value is not None - } - # subqueries change the context of the query - if return_node.node_type not in (LogicalPlanStepType.Subquery, LogicalPlanStepType.Union): - children = graph.ingoing_edges(node) - for plan_node_id, _, _ in children: - plan_node = graph[plan_node_id] - if plan_node.all_relations: - return_node.all_relations.update(plan_node.all_relations) + if return_node.all_relations is None: + return_node.all_relations = set() # Initialize as an empty set if None + + return_node.all_relations.update( + {value for value in [return_node.relation, return_node.alias] if value is not None} + ) + + children = graph.ingoing_edges(node) + for plan_node_id, _, _ in children: + plan_node = graph[plan_node_id] + if plan_node.all_relations: + return_node.all_relations.update(plan_node.all_relations) return_node = self.post_bind(return_node) graph[node] = return_node diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py index ee0d778d0..34251ee52 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py @@ -51,6 +51,7 @@ def visit(self, node: LogicalPlanNode, 
context: OptimizerContext) -> OptimizerCo LogicalPlanStepType.Distinct, LogicalPlanStepType.Filter, LogicalPlanStepType.Join, + LogicalPlanStepType.Order, LogicalPlanStepType.Union, LogicalPlanStepType.Scan, ): diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py index ddcaa1884..cd5a17062 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py @@ -54,6 +54,32 @@ def _add_condition(existing_condition, new_condition): return _and +def _is_pushable_predicate(predicate) -> bool: + """ + There's a few restrictions on what predicates can be pushed, most of the complex ones are + pushing filters into joins + """ + if len(get_all_nodes_of_type(predicate, (NodeType.AGGREGATOR,))) > 0: + return False + + identifiers = get_all_nodes_of_type(predicate, (NodeType.IDENTIFIER,)) + if len(identifiers) < 2: + return True + if len(identifiers) > 2: + return False + if len(identifiers) == 2 and (identifiers[0].source == identifiers[1].source): + return False + if len(identifiers) == 2 and predicate.value != "Eq": + return False + if predicate.left.node_type not in (NodeType.LITERAL, NodeType.IDENTIFIER): + return False + if predicate.right.node_type not in (NodeType.LITERAL, NodeType.IDENTIFIER): + return False + if predicate.right.node_type == predicate.left.node_type: + return False + return True + + class PredicatePushdownStrategy(OptimizationStrategy): def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext: if not context.optimized_plan: @@ -81,11 +107,7 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo elif node.node_type == LogicalPlanStepType.Filter: # collect predicates we can probably push - if ( - len(node.relations) > 0 - and not get_all_nodes_of_type(node.condition, (NodeType.AGGREGATOR,)) - and len(get_all_nodes_of_type(node.condition, (NodeType.IDENTIFIER,))) == 1 - ): + if _is_pushable_predicate(node.condition): # record where the node was, so we can put it back node.nid = context.node_id node.plan_path = context.optimized_plan.trace_to_root(context.node_id) diff --git a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py index 777865bb8..3cb598411 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py +++ b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py @@ -78,6 +78,7 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo if col.schema_column is not None: sources.extend(col.schema_column.origin) new_node.relations = set(sources) + new_node.all_relations = node.all_relations new_nodes.append(new_node) else: new_nodes = [node] diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index b962d9cdf..49c70115a 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -1146,6 +1146,7 @@ def build_expression_tree(relation, dnf_list): filter_node = LogicalPlanNode( node_type=LogicalPlanStepType.Filter, condition=expression_tree, # Use the built expression tree + all_relations={node.relation, node.alias}, ) logical_plan.insert_node_after(random_string(), filter_node, nid) diff --git 
a/tests/plan_optimization/test_limit_pushdown_postgres.py b/tests/plan_optimization/test_limit_pushdown_postgres.py index b0e309786..bb310825c 100644 --- a/tests/plan_optimization/test_limit_pushdown_postgres.py +++ b/tests/plan_optimization/test_limit_pushdown_postgres.py @@ -36,7 +36,7 @@ ("SELECT name FROM (SELECT * FROM pg.planets) AS S LIMIT 3", 3), ] -@pytest.mark.parametrize("query, expected_columns", STATEMENTS) +@pytest.mark.parametrize("query, expected_rows", STATEMENTS) def test_postgres_limit_pushdown(query, expected_rows): cur = opteryx.query(query) cur.materialize() diff --git a/tests/plan_optimization/test_limit_pushdown_sqlite.py b/tests/plan_optimization/test_limit_pushdown_sqlite.py index 81a8cf20e..199c9aa33 100644 --- a/tests/plan_optimization/test_limit_pushdown_sqlite.py +++ b/tests/plan_optimization/test_limit_pushdown_sqlite.py @@ -32,7 +32,7 @@ ("SELECT name FROM (SELECT * FROM sqlite.planets) AS S LIMIT 3", 3), ] -@pytest.mark.parametrize("query, expected_columns", STATEMENTS) +@pytest.mark.parametrize("query, expected_rows", STATEMENTS) def test_sqlite_limit_pushdown(query, expected_rows): cur = opteryx.query(query) cur.materialize() diff --git a/tests/query_execution/test_execution_plan.py b/tests/query_execution/test_execution_plan.py index 21f3616e5..431c3ec21 100644 --- a/tests/query_execution/test_execution_plan.py +++ b/tests/query_execution/test_execution_plan.py @@ -8,14 +8,14 @@ sys.path.insert(1, os.path.join(sys.path[0], "../..")) -from opteryx.models import ExecutionTree +from opteryx.models import PhysicalPlan def test_linear_execution_tree(): """ Test an execution tree where each item has no more than one incoming edge """ - tree = ExecutionTree() + tree = PhysicalPlan() tree.add_node("p", print) tree.add_node("m", max) tree.add_edge("p", "m") diff --git a/tests/query_execution/test_execution_tree.py b/tests/query_execution/test_execution_tree.py index cafa582d5..92059475d 100644 --- a/tests/query_execution/test_execution_tree.py +++ b/tests/query_execution/test_execution_tree.py @@ -7,11 +7,11 @@ sys.path.insert(1, os.path.join(sys.path[0], "../..")) -from opteryx.models.physical_plan import ExecutionTree +from opteryx.models.physical_plan import PhysicalPlan def test_execution_tree(): - et = ExecutionTree() + et = PhysicalPlan() et.add_node("a", None) et.add_node("b", None) et.add_edge("a", "b", "forwards") diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index d3f0d81fd..479d338b1 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1764,6 +1764,10 @@ ("SELECT SUBSTRING(name FROM 1 FOR 1) FROM $astronauts", 357, 1, None), ("SELECT SUBSTRING(name FROM -1 FOR 1) FROM $astronauts", 357, 1, None), + ("SELECT * FROM $planets LEFT JOIN $satellites USING(id) WHERE False", 0, 27, None), + ("SELECT * FROM (SELECT * FROM $planets WHERE False) AS S LEFT JOIN $satellites USING(id)", 0, 27, None), + ("SELECT * FROM $planets LEFT JOIN (SELECT * FROM $satellites WHERE False) AS S USING(id)", 9, 27, None), + # Edge Case with Empty Joins ("SELECT * FROM $planets LEFT JOIN (SELECT id FROM $satellites WHERE planetId < 0) AS S ON $planets.id = S.id", 9, 21, None), # Handling NULL Comparisons in WHERE Clause diff --git a/tests/storage/test_blob_gcs.py b/tests/storage/test_blob_gcs.py index 0f7b876f9..48f99f6bb 100644 --- a/tests/storage/test_blob_gcs.py +++ b/tests/storage/test_blob_gcs.py @@ -56,7 +56,7 @@ query=f"SELECT 
name, kepler_name FROM {BUCKET_NAME}.exoplanets AS exoplanets INNER JOIN $planets AS planets ON rowid = id LIMIT 5", expected_rowcount=5, expected_columncount=2, - stats={"columns_read": 2}, + stats={"columns_read": 4}, ), ] diff --git a/tests/storage/test_collection_gcs_firestore.py b/tests/storage/test_collection_gcs_firestore.py index 28a24a89b..88edf8200 100644 --- a/tests/storage/test_collection_gcs_firestore.py +++ b/tests/storage/test_collection_gcs_firestore.py @@ -104,5 +104,5 @@ def test_predicate_pushdown_multiple_mixed(): if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests - + test_predicate_pushdown_multiple_equals() run_tests() From fc271c2cd59e889aff802d584dfc260a2ce99bfa Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 12 Nov 2024 19:51:18 +0000 Subject: [PATCH 021/157] HOUSEKEEPING --- opteryx/__version__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index b1508580a..cceb9e1e8 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -25,6 +25,8 @@ class VersionStatus(Enum): BETA = "beta" RELEASE = "release" + +_major = 0 _minor = 19 _revision = 0 _status = VersionStatus.ALPHA From 12ad01c9292965a770c05d451a06e73fe9334566 Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 12 Nov 2024 19:51:42 +0000 Subject: [PATCH 022/157] Opteryx Version 0.19.0-alpha.856 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index cceb9e1e8..612027a68 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 854 +__build__ = 856 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
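Two changes in the patch above interact: LimitNode now returns morsel.slice(offset=0, length=0) instead of None while it is still swallowing OFFSET rows, and left_join now always yields the aligned table even when it is empty. In the push-based engine this keeps downstream operators supplied with zero-row morsels that still carry the correct schema. A sketch of the OFFSET-skipping behaviour, assuming pyarrow tables as morsels; the skip_offset helper and its state dict are illustrative stand-ins for the node's rows_left_to_skip attribute, not Opteryx API:

    import pyarrow

    def skip_offset(morsel: pyarrow.Table, state: dict) -> pyarrow.Table:
        # consume rows until the OFFSET is exhausted, then pass the rest through
        to_skip = state["rows_left_to_skip"]
        if to_skip >= morsel.num_rows:
            # swallow the whole morsel but emit an empty, correctly-typed table
            state["rows_left_to_skip"] -= morsel.num_rows
            return morsel.slice(offset=0, length=0)
        state["rows_left_to_skip"] = 0
        return morsel.slice(offset=to_skip, length=morsel.num_rows - to_skip)

    table = pyarrow.table({"id": list(range(10))})
    state = {"rows_left_to_skip": 4}
    morsels = [table.slice(0, 3), table.slice(3, 7)]
    print([skip_offset(m, state).num_rows for m in morsels])  # [0, 6]

The first morsel (3 rows) is consumed entirely and an empty table is emitted in its place; the second loses one more row and passes the remaining 6 through.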
From 73020989af02452e40e22d870e7fad499f415eb5 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 14 Nov 2024 22:25:21 +0000 Subject: [PATCH 023/157] #2061 --- .github/workflows/regression_suite.yaml | 1 + .github/workflows/regression_suite_arm.yaml | 1 + .../workflows/regression_suite_mac_ARM.yaml | 3 +- .../workflows/regression_suite_mac_x86.yaml | 3 +- .../workflows/regression_suite_windows.yaml | 3 +- .gitignore | 1 + opteryx/managers/expression/__init__.py | 3 +- .../strategies/predicate_pushdown.py | 32 +++---------------- requirements.txt | 1 - 9 files changed, 16 insertions(+), 32 deletions(-) diff --git a/.github/workflows/regression_suite.yaml b/.github/workflows/regression_suite.yaml index 7ac949aa2..99e07b5b9 100644 --- a/.github/workflows/regression_suite.yaml +++ b/.github/workflows/regression_suite.yaml @@ -74,6 +74,7 @@ jobs: DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' + EXPERIMENTAL_EXECUTION_ENGINE: 'true' - name: Check Coverage run: python -m coverage report --include=opteryx/** --fail-under=90 -m diff --git a/.github/workflows/regression_suite_arm.yaml b/.github/workflows/regression_suite_arm.yaml index bfe8d03ad..ac0472cc1 100644 --- a/.github/workflows/regression_suite_arm.yaml +++ b/.github/workflows/regression_suite_arm.yaml @@ -57,3 +57,4 @@ jobs: DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' + EXPERIMENTAL_EXECUTION_ENGINE: 'true' diff --git a/.github/workflows/regression_suite_mac_ARM.yaml b/.github/workflows/regression_suite_mac_ARM.yaml index baf738b27..abb278d74 100644 --- a/.github/workflows/regression_suite_mac_ARM.yaml +++ b/.github/workflows/regression_suite_mac_ARM.yaml @@ -63,4 +63,5 @@ jobs: MEMCACHED_SERVER: 'localhost:11211' DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' - TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' \ No newline at end of file + TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' + EXPERIMENTAL_EXECUTION_ENGINE: 'true' diff --git a/.github/workflows/regression_suite_mac_x86.yaml b/.github/workflows/regression_suite_mac_x86.yaml index d96c46b08..134c1c3eb 100644 --- a/.github/workflows/regression_suite_mac_x86.yaml +++ b/.github/workflows/regression_suite_mac_x86.yaml @@ -52,4 +52,5 @@ jobs: MEMCACHED_SERVER: 'localhost:11211' DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' - TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' \ No newline at end of file + TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' + EXPERIMENTAL_EXECUTION_ENGINE: 'true' diff --git a/.github/workflows/regression_suite_windows.yaml b/.github/workflows/regression_suite_windows.yaml index ed07985b2..bdcd35585 100644 --- a/.github/workflows/regression_suite_windows.yaml +++ b/.github/workflows/regression_suite_windows.yaml @@ -53,4 +53,5 @@ jobs: MEMCACHED_SERVER: 'localhost:11211' DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' - TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' \ No newline at end of file + TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' + EXPERIMENTAL_EXECUTION_ENGINE: 'true' diff --git a/.gitignore b/.gitignore index c417d7687..5954c8869 100644 --- a/.gitignore +++ b/.gitignore @@ -178,3 +178,4 @@ space_missions.parquet **.del **.disabled **.psv +planets.parquet diff --git a/opteryx/managers/expression/__init__.py 
b/opteryx/managers/expression/__init__.py index ac4307fe9..e252b9374 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -343,8 +343,9 @@ def evaluate_and_append(expressions, table: Table): if table.num_rows > 0: new_column = evaluate_statement(statement, table) else: + # we make all unknown fields int64s, this can be cast to _most_ other types new_column = numpy.array( - [], dtype=ORSO_TO_NUMPY_MAP.get(statement.schema_column.type, numpy.str_) + [], dtype=ORSO_TO_NUMPY_MAP.get(statement.schema_column.type, numpy.int64) ) new_column = pyarrow.array(new_column) diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py index cd5a17062..ddcaa1884 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py @@ -54,32 +54,6 @@ def _add_condition(existing_condition, new_condition): return _and -def _is_pushable_predicate(predicate) -> bool: - """ - There's a few restrictions on what predicates can be pushed, most of the complex ones are - pushing filters into joins - """ - if len(get_all_nodes_of_type(predicate, (NodeType.AGGREGATOR,))) > 0: - return False - - identifiers = get_all_nodes_of_type(predicate, (NodeType.IDENTIFIER,)) - if len(identifiers) < 2: - return True - if len(identifiers) > 2: - return False - if len(identifiers) == 2 and (identifiers[0].source == identifiers[1].source): - return False - if len(identifiers) == 2 and predicate.value != "Eq": - return False - if predicate.left.node_type not in (NodeType.LITERAL, NodeType.IDENTIFIER): - return False - if predicate.right.node_type not in (NodeType.LITERAL, NodeType.IDENTIFIER): - return False - if predicate.right.node_type == predicate.left.node_type: - return False - return True - - class PredicatePushdownStrategy(OptimizationStrategy): def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext: if not context.optimized_plan: @@ -107,7 +81,11 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo elif node.node_type == LogicalPlanStepType.Filter: # collect predicates we can probably push - if _is_pushable_predicate(node.condition): + if ( + len(node.relations) > 0 + and not get_all_nodes_of_type(node.condition, (NodeType.AGGREGATOR,)) + and len(get_all_nodes_of_type(node.condition, (NodeType.IDENTIFIER,))) == 1 + ): # record where the node was, so we can put it back node.nid = context.node_id node.plan_path = context.optimized_plan.trace_to_root(context.node_id) diff --git a/requirements.txt b/requirements.txt index af93665f3..1ea4297ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ psutil pyarrow>=12.0.1 pysimdjson requests -typer==0.13.* aiohttp From 42a964ec1ec30d85be5ce34f02309498b48844e3 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 14 Nov 2024 22:25:45 +0000 Subject: [PATCH 024/157] Opteryx Version 0.19.0-alpha.857 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 612027a68..ea099b8f1 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 856 +__build__ = 857 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
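The expression change in the patch above swaps the fallback dtype for columns evaluated against an empty table from a string type to numpy.int64, with the in-diff comment that int64 "can be cast to _most_ other types". A small demonstration of why that default is convenient; the casts below are ordinary pyarrow behaviour, and the placeholder construction mirrors, in simplified form, what evaluate_and_append does:

    import numpy
    import pyarrow

    # build the kind of zero-row placeholder column the diff creates for
    # statements evaluated against an empty table
    empty = pyarrow.array(numpy.array([], dtype=numpy.int64))
    print(empty.type)                                # int64
    print(empty.cast(pyarrow.float64()).type)        # widens to double
    print(empty.cast(pyarrow.string()).type)         # renders to string
    print(empty.cast(pyarrow.timestamp("us")).type)  # epoch-based cast to timestamp

Zero-row morsels are now routine in this series (LimitNode emits them while skipping OFFSET rows, for example), so the type these placeholder columns carry has to survive whatever cast a downstream operator later asks for.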
From b14b0c69428a0afec99e8556b59cd46ca18e29e3 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 15 Nov 2024 00:13:28 +0000 Subject: [PATCH 025/157] #2061 --- opteryx/config.py | 3 - opteryx/models/physical_plan.py | 163 +------- opteryx/operators/__init__.py | 8 +- opteryx/operators/aggregate_and_group_node.py | 114 +++--- opteryx/operators/aggregate_node.py | 87 ++-- opteryx/operators/async_read_node.py | 17 +- opteryx/operators/base_plan_node.py | 73 ++-- opteryx/operators/cross_join_node.py | 363 +++++++++-------- opteryx/operators/distinct_node.py | 53 +-- opteryx/operators/exit_node.py | 58 ++- opteryx/operators/explain_node.py | 19 +- opteryx/operators/filter_node.py | 70 ++-- opteryx/operators/function_dataset_node.py | 29 +- opteryx/operators/heap_sort_node.py | 158 ++++---- opteryx/operators/inner_join_node.py | 91 +++-- opteryx/operators/inner_join_node_single.py | 89 +++-- opteryx/operators/join_node.py | 97 ----- opteryx/operators/limit_node.py | 50 ++- opteryx/operators/noop_node.py | 20 +- opteryx/operators/outer_join_node.py | 219 +++++------ opteryx/operators/projection_node.py | 35 +- .../pyarrow_join_node.py | 0 opteryx/operators/read_node.py | 24 +- opteryx/operators/set_variable_node.py | 21 +- opteryx/operators/show_columns_node.py | 86 ++-- opteryx/operators/show_create_node.py | 21 +- opteryx/operators/show_value_node.py | 19 +- opteryx/operators/sort_node.py | 41 +- opteryx/operators/union_node.py | 42 +- opteryx/operatorsv2/__init__.py | 53 --- .../operatorsv2/aggregate_and_group_node.py | 153 -------- opteryx/operatorsv2/aggregate_node.py | 256 ------------ opteryx/operatorsv2/async_read_node.py | 213 ---------- opteryx/operatorsv2/base_plan_node.py | 119 ------ .../bench/#information_schema_node.py | 186 --------- .../operatorsv2/bench/#show_databases_node.py | 79 ---- opteryx/operatorsv2/cross_join_node.py | 370 ------------------ opteryx/operatorsv2/distinct_node.py | 73 ---- opteryx/operatorsv2/exit_node.py | 108 ----- opteryx/operatorsv2/explain_node.py | 48 --- opteryx/operatorsv2/filter_node.py | 81 ---- opteryx/operatorsv2/function_dataset_node.py | 151 ------- opteryx/operatorsv2/heap_sort_node.py | 139 ------- opteryx/operatorsv2/inner_join_node.py | 134 ------- opteryx/operatorsv2/inner_join_node_single.py | 220 ----------- opteryx/operatorsv2/limit_node.py | 73 ---- opteryx/operatorsv2/noop_node.py | 44 --- opteryx/operatorsv2/outer_join_node.py | 324 --------------- opteryx/operatorsv2/projection_node.py | 71 ---- opteryx/operatorsv2/read_node.py | 228 ----------- opteryx/operatorsv2/set_variable_node.py | 48 --- opteryx/operatorsv2/show_columns_node.py | 102 ----- opteryx/operatorsv2/show_create_node.py | 60 --- opteryx/operatorsv2/show_value_node.py | 59 --- opteryx/operatorsv2/sort_node.py | 100 ----- opteryx/operatorsv2/union_node.py | 64 --- opteryx/planner/__init__.py | 6 +- opteryx/planner/physical_planner.py | 2 +- opteryx/planner/temporary_physical_planner.py | 119 ------ pyproject.toml | 2 +- tests/misc/test_cli.py | 6 +- 61 files changed, 891 insertions(+), 4870 deletions(-) delete mode 100644 opteryx/operators/join_node.py rename opteryx/{operatorsv2 => operators}/pyarrow_join_node.py (100%) delete mode 100644 opteryx/operatorsv2/__init__.py delete mode 100644 opteryx/operatorsv2/aggregate_and_group_node.py delete mode 100644 opteryx/operatorsv2/aggregate_node.py delete mode 100644 opteryx/operatorsv2/async_read_node.py delete mode 100644 opteryx/operatorsv2/base_plan_node.py delete mode 100644 
opteryx/operatorsv2/bench/#information_schema_node.py delete mode 100644 opteryx/operatorsv2/bench/#show_databases_node.py delete mode 100644 opteryx/operatorsv2/cross_join_node.py delete mode 100644 opteryx/operatorsv2/distinct_node.py delete mode 100644 opteryx/operatorsv2/exit_node.py delete mode 100644 opteryx/operatorsv2/explain_node.py delete mode 100644 opteryx/operatorsv2/filter_node.py delete mode 100644 opteryx/operatorsv2/function_dataset_node.py delete mode 100644 opteryx/operatorsv2/heap_sort_node.py delete mode 100644 opteryx/operatorsv2/inner_join_node.py delete mode 100644 opteryx/operatorsv2/inner_join_node_single.py delete mode 100644 opteryx/operatorsv2/limit_node.py delete mode 100644 opteryx/operatorsv2/noop_node.py delete mode 100644 opteryx/operatorsv2/outer_join_node.py delete mode 100644 opteryx/operatorsv2/projection_node.py delete mode 100644 opteryx/operatorsv2/read_node.py delete mode 100644 opteryx/operatorsv2/set_variable_node.py delete mode 100644 opteryx/operatorsv2/show_columns_node.py delete mode 100644 opteryx/operatorsv2/show_create_node.py delete mode 100644 opteryx/operatorsv2/show_value_node.py delete mode 100644 opteryx/operatorsv2/sort_node.py delete mode 100644 opteryx/operatorsv2/union_node.py delete mode 100644 opteryx/planner/temporary_physical_planner.py diff --git a/opteryx/config.py b/opteryx/config.py index 297ba68c0..28940bd14 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -172,9 +172,6 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]: DATA_CATALOG_CONFIGURATION: Optional[str] = get("DATA_CATALOG_CONFIGURATION") """Data Catalog configuration, different catalogs have different config formats.""" -EXPERIMENTAL_EXECUTION_ENGINE: bool = bool(get("EXPERIMENTAL_EXECUTION_ENGINE", False)) -"""Use the experimental/incomplete generation 2 execution engine.""" - # GCP project ID - for Google Cloud Data GCP_PROJECT_ID: str = get("GCP_PROJECT_ID") # don't try to raise the priority of the server process diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 958cdc2c3..050b6af93 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -21,12 +21,10 @@ """ -import gc from typing import Any from typing import Generator from typing import Optional from typing import Tuple -from typing import Union import pyarrow @@ -43,114 +41,7 @@ class PhysicalPlan(Graph): complex code which is the planner from the tree that describes the plan. """ - def execute(self) -> Generator[Tuple[Union[pyarrow.Table, Any], ResultType], None, None]: - if config.EXPERIMENTAL_EXECUTION_ENGINE: - return self.push_executor() - return self.legacy_executor() - - def legacy_executor( - self, - ) -> Generator[Tuple[Union[pyarrow.Table, Any], ResultType], None, None]: - """ - Implements a 'pull' model execution engine, pulling records starting from - the last stage (head) of the query plan, and working backwards towards the first stage. - - Yields: - tuple: The first element is the result (either tabular data or a - NonTabularResult object). The second element is a ResultType enum, - indicating the type of the result. - """ - from opteryx.models import NonTabularResult - from opteryx.operators import ExplainNode - - def map_operators_to_producers(nodes: list) -> None: - """ - Walks through the query plan, linking each operator node with its data producers. - - Parameters: - nodes: list - List of operator nodes in the query plan. 
- """ - - for node in nodes: - producers = self.ingoing_edges(node) - operator = self[node] - - if len(producers) == 1: - # If there is only one producer, set it directly - operator.set_producers([self[producers[0][0]]]) - elif len(producers) == 2 and hasattr(operator, "_left_relation"): - left_producer = None - right_producer = None - - left_relation = operator._left_relation - right_relation = operator._right_relation - for source, target, relation in producers: - for s, t, r in self.breadth_first_search(source, reverse=True) + [ - (source, target, relation) - ]: - if set(left_relation).intersection( - { - self[s].parameters.get("alias"), - self[s].parameters.get("relation"), - } - ): - left_producer = self[source] - elif set(right_relation).intersection( - { - self[s].parameters.get("alias"), - self[s].parameters.get("relation"), - } - ): - right_producer = self[source] - - if left_producer and right_producer: - operator.set_producers([left_producer, right_producer]) - else: - # Default to setting producers as in the current method if left and right cannot be determined - operator.set_producers([self[src_node[0]] for src_node in producers]) - else: - # Handle cases with more than two producers if applicable - operator.set_producers([self[src_node[0]] for src_node in producers]) - - # Recursively process the producers - map_operators_to_producers([src_node[0] for src_node in producers]) - - # Validate query plan to ensure it's acyclic - if not self.is_acyclic(): - raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") - - # Retrieve the tail of the query plan, which should ideally be a single head node - head_nodes = list(set(self.get_exit_points())) - - if len(head_nodes) != 1: - raise InvalidInternalStateError( - f"Query plan has {len(head_nodes)} heads, expected exactly 1." 
- ) - - head_node = head_nodes[0] - - # Special case handling for 'Explain' queries - if isinstance(self[head_node], ExplainNode): - yield self.explain(), ResultType.TABULAR - return - - # Link operators with their producers - map_operators_to_producers([head_node]) - - # Execute the head node's operation - operator = self[head_node] - gc.disable() - results = operator.execute() - gc.enable() - - # If the results are non-tabular, handle them accordingly - if isinstance(results, NonTabularResult): - yield results, ResultType.NON_TABULAR - else: - yield results, ResultType.TABULAR - - def explain(self) -> Generator[pyarrow.Table, None, None]: + def explainv2(self, analyze: bool) -> Generator[pyarrow.Table, None, None]: from opteryx import operators def _inner_explain(node, depth): @@ -163,35 +54,6 @@ def _inner_explain(node, depth): yield from _inner_explain(operator_name[0], depth) continue elif isinstance(operator, operators.BasePlanNode): - yield { - "operator": operator.name, - "config": operator.config, - "depth": depth, - } - yield from _inner_explain(operator_name[0], depth + 1) - - head = list(dict.fromkeys(self.get_exit_points())) - if len(head) != 1: # pragma: no cover - raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.") - plan = list(_inner_explain(head[0], 1)) - - table = pyarrow.Table.from_pylist(plan) - - yield table - - def explainv2(self, analyze: bool) -> Generator[pyarrow.Table, None, None]: - from opteryx import operatorsv2 - - def _inner_explain(node, depth): - incoming_operators = self.ingoing_edges(node) - for operator_name in incoming_operators: - operator = self[operator_name[0]] - if isinstance( - operator, (operatorsv2.ExitNode, operatorsv2.ExplainNode) - ): # Skip ExitNode - yield from _inner_explain(operator_name[0], depth) - continue - elif isinstance(operator, operatorsv2.BasePlanNode): record = { "tree": depth, "operator": operator.name, @@ -214,7 +76,7 @@ def _inner_explain(node, depth): temp = None head_node = self.get_exit_points()[0] query_head, _, _ = self.ingoing_edges(head_node)[0] - results = self.push_executor(query_head) + results = self.execute(query_head) if results is not None: results_generator, _ = next(results, ([], None)) for temp in results_generator: @@ -264,15 +126,13 @@ def depth_first_search_flat( return traversal_list - def push_executor( - self, head_node=None - ) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: - from opteryx.operatorsv2 import ExplainNode - from opteryx.operatorsv2 import JoinNode - from opteryx.operatorsv2 import ReaderNode - from opteryx.operatorsv2 import SetVariableNode - from opteryx.operatorsv2 import ShowCreateNode - from opteryx.operatorsv2 import ShowValueNode + def execute(self, head_node=None) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: + from opteryx.operators import ExplainNode + from opteryx.operators import JoinNode + from opteryx.operators import ReaderNode + from opteryx.operators import SetVariableNode + from opteryx.operators import ShowCreateNode + from opteryx.operators import ShowValueNode # Validate query plan to ensure it's acyclic if not self.is_acyclic(): @@ -328,7 +188,7 @@ def inner_execute(plan): yield inner_execute(self), ResultType.TABULAR def process_node(self, nid, morsel): - from opteryx.operatorsv2 import ReaderNode + from opteryx.operators import ReaderNode node = self[nid] @@ -363,6 +223,3 @@ def sensors(self): def __del__(self): pass - - -# print(self.sensors()) diff --git a/opteryx/operators/__init__.py 
b/opteryx/operators/__init__.py index 931910e19..5ffb5a7cf 100644 --- a/opteryx/operators/__init__.py +++ b/opteryx/operators/__init__.py @@ -1,3 +1,5 @@ +# isort: skip + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. + from .base_plan_node import BasePlanDataObject # isort: skip -from .base_plan_node import BasePlanNode # isort: skip -from .base_plan_node import OperatorType # isort: skip +from .base_plan_node import BasePlanNode, JoinNode # isort: skip from .aggregate_and_group_node import AggregateAndGroupNode # Group is always followed by aggregate from .aggregate_node import AGGREGATORS @@ -31,8 +33,8 @@ # from .information_schema_node import InformationSchemaNode # information_schema from .inner_join_node import InnerJoinNode from .inner_join_node_single import InnerJoinSingleNode -from .join_node import JoinNode from .limit_node import LimitNode # select the first N records +from .pyarrow_join_node import PyArrowJoinNode # from .metadata_writer_node import MetadataWriterNode # from .morsel_defragment_node import MorselDefragmentNode # consolidate small morsels diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index 81ccf2bfb..6cc40d281 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -21,25 +21,24 @@ """ -import time from dataclasses import dataclass -from typing import Generator import numpy import pyarrow from orso.types import OrsoTypes +from opteryx import EOS from opteryx.managers.expression import NodeType from opteryx.managers.expression import evaluate_and_append from opteryx.managers.expression import get_all_nodes_of_type from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.operators.aggregate_node import build_aggregations from opteryx.operators.aggregate_node import extract_evaluations from opteryx.operators.aggregate_node import project from opteryx.operators.base_plan_node import BasePlanDataObject +from . 
import BasePlanNode + @dataclass class AggregateAndGroupDataObject(BasePlanDataObject): @@ -53,13 +52,11 @@ class AggregateAndGroupDataObject(BasePlanDataObject): class AggregateAndGroupNode(BasePlanNode): - operator_type = OperatorType.BLOCKING - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.groups = list(config["groups"]) - self.aggregates = list(config["aggregates"]) - projection = list(config["projection"]) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.groups = list(parameters["groups"]) + self.aggregates = list(parameters["aggregates"]) + projection = list(parameters["projection"]) # we're going to preload some of the evaluation @@ -91,6 +88,8 @@ def __init__(self, properties: QueryProperties, **config): self.do = AggregateAndGroupDataObject() + self.buffer = [] + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -105,55 +104,50 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Group" - def execute(self) -> Generator[pyarrow.Table, None, None]: - morsels = self._producers[0] # type:ignore - - # merge all the morsels together into one table, selecting only the columns - # we're pretty sure we're going to use - this will fail for datasets - # larger than memory - table = pyarrow.concat_tables( - project(morsels.execute(), self.all_identifiers), - promote_options="permissive", - ) - - # Allow grouping by functions by evaluating them first - start_time = time.time_ns() - table = evaluate_and_append(self.evaluatable_nodes, table) - table = evaluate_and_append(self.groups, table) + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + # merge all the morsels together into one table, selecting only the columns + # we're pretty sure we're going to use - this will fail for datasets + # larger than memory + table = pyarrow.concat_tables( + self.buffer, + promote_options="permissive", + ) + # do the group by and aggregates + table = table.combine_chunks() + groups = table.group_by(self.group_by_columns) + groups = groups.aggregate(self.aggregate_functions) + + # do the secondary activities for ARRAY_AGG + for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.AGGREGATOR,)): + if node.value == "ARRAY_AGG" and node.order or node.limit: + # rip the column out of the table + column_name = self.column_map[node.schema_column.identity] + column_def = groups.field(column_name) # this is used + column = groups.column(column_name).to_pylist() + groups = groups.drop([column_name]) + if node.order: + column = [sorted(c, reverse=bool(node.order[0][1])) for c in column] + if node.limit: + column = [c[: node.limit] for c in column] + # put the new column into the table + groups = groups.append_column(column_def, [column]) + + # project to the desired column names from the pyarrow names + groups = groups.select(list(self.column_map.values()) + self.group_by_columns) + groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) + + return [groups, EOS] + + morsel = project(morsel, self.all_identifiers) # Add a "*" column, this is an int because when a bool it miscounts - if "*" not in table.column_names: - table = table.append_column( - "*", [numpy.full(shape=table.num_rows, fill_value=1, dtype=numpy.int8)] + if "*" not in morsel.column_names: + morsel = morsel.append_column( + "*", 
[numpy.ones(shape=morsel.num_rows, dtype=numpy.bool_)] ) - self.statistics.time_evaluating += time.time_ns() - start_time - - start_time = time.time_ns() - - # do the group by and aggregates - table = table.combine_chunks() - groups = table.group_by(self.group_by_columns) - groups = groups.aggregate(self.aggregate_functions) - - # do the secondary activities for ARRAY_AGG - for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.AGGREGATOR,)): - if node.value == "ARRAY_AGG" and node.order or node.limit: - # rip the column out of the table - column_name = self.column_map[node.schema_column.identity] - column_def = groups.field(column_name) # this is used - column = groups.column(column_name).to_pylist() - groups = groups.drop([column_name]) - if node.order: - column = [sorted(c, reverse=bool(node.order[0][1])) for c in column] - if node.limit: - column = [c[: node.limit] for c in column] - # put the new column into the table - groups = groups.append_column(column_def, [column]) - - # project to the desired column names from the pyarrow names - groups = groups.select(list(self.column_map.values()) + self.group_by_columns) - groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) - - self.statistics.time_grouping += time.time_ns() - start_time - - yield groups + if self.evaluatable_nodes: + morsel = evaluate_and_append(self.evaluatable_nodes, morsel) + morsel = evaluate_and_append(self.groups, morsel) + + self.buffer.append(morsel) diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 935d517a8..0d6d75143 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -18,23 +18,21 @@ This node performs aggregates without performing groupings. """ -import time from dataclasses import dataclass -from typing import Generator -from typing import List import numpy import pyarrow +from opteryx import EOS from opteryx.exceptions import UnsupportedSyntaxError from opteryx.managers.expression import NodeType from opteryx.managers.expression import evaluate_and_append from opteryx.managers.expression import get_all_nodes_of_type from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.operators.base_plan_node import BasePlanDataObject +from . 
import BasePlanNode + COUNT_STAR: str = "COUNT(*)" # use the aggregators from pyarrow @@ -76,12 +74,12 @@ def _is_count_star(aggregates): def _count_star(morsel_promise, column_name): - count = sum(morsel.num_rows for morsel in morsel_promise.execute()) + count = sum(morsel.num_rows for morsel in morsel_promise) table = pyarrow.Table.from_pylist([{column_name: count}]) - yield table + return table -def project(table: pyarrow.Table, column_names: List) -> pyarrow.Table: +def project(table: pyarrow.Table, column_names: list) -> pyarrow.Table: row_count = table.num_rows if len(column_names) > 0: return table.select(dict.fromkeys(column_names)) @@ -188,12 +186,10 @@ class AggregateDataObject(BasePlanDataObject): class AggregateNode(BasePlanNode): - operator_type = OperatorType.BLOCKING - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - self.aggregates = config.get("aggregates", []) + self.aggregates = parameters.get("aggregates", []) # get all the columns anywhere in the aggregates all_identifiers = [ @@ -208,6 +204,7 @@ def __init__(self, properties: QueryProperties, **config): self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) self.do = AggregateDataObject() + self.buffer = [] @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -221,46 +218,36 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Aggregation" - def execute(self) -> Generator[pyarrow.Table, None, None]: - morsels = self._producers[0] # type:ignore - if isinstance(morsels, pyarrow.Table): - morsels = (morsels,) + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + if _is_count_star(self.aggregates): + return _count_star( + morsel_promise=self.buffer, + column_name=self.aggregates[0].schema_column.identity, + ) - if _is_count_star(self.aggregates): - yield from _count_star( - morsel_promise=morsels, - column_name=self.aggregates[0].schema_column.identity, - ) - return - - # merge all the morsels together into one table, selecting only the columns - # we're pretty sure we're going to use - this will fail for datasets - # larger than memory until we implement some form of partitioning - table = pyarrow.concat_tables( - project(morsels.execute(), self.all_identifiers), promote_options="none" - ) - - # Allow grouping by functions by evaluating them first - start_time = time.time_ns() - if self.evaluatable_nodes: - table = evaluate_and_append(self.evaluatable_nodes, table) - - # Add a "*" column, this is an int because when a bool it miscounts - if "*" not in table.column_names: - table = table.append_column( - "*", [numpy.full(shape=table.num_rows, fill_value=1, dtype=numpy.int8)] - ) - self.statistics.time_evaluating += time.time_ns() - start_time + # merge all the morsels together into one table, selecting only the columns + # we're pretty sure we're going to use - this will fail for datasets + # larger than memory until we implement some form of partitioning + table = pyarrow.concat_tables(self.buffer, promote_options="none") + + # Allow grouping by functions by evaluating them first + if self.evaluatable_nodes: + table = evaluate_and_append(self.evaluatable_nodes, table) - start_time = time.time_ns() + # Add a "*" column, this is an int because when a bool it miscounts + if "*" not in table.column_names: + table = 
table.append_column( + "*", [numpy.full(shape=table.num_rows, fill_value=1, dtype=numpy.int8)] + ) - # we're not a group_by - we're aggregating without grouping - aggregates = _non_group_aggregates(self.aggregates, table) - del table + # we're not a group_by - we're aggregating without grouping + aggregates = _non_group_aggregates(self.aggregates, table) + del table - # name the aggregate fields and add them to the Columns data - aggregates = aggregates.select(list(self.column_map.keys())) + # name the aggregate fields and add them to the Columns data + aggregates = aggregates.select(list(self.column_map.keys())) - self.statistics.time_aggregating += time.time_ns() - start_time + return [aggregates, EOS] - yield aggregates + self.buffer.append(project(morsel, self.all_identifiers)) diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index eed73bc56..792936a0d 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -31,14 +31,16 @@ import pyarrow.parquet from orso.schema import convert_orso_schema_to_arrow_schema +from opteryx import EOS from opteryx import config from opteryx.exceptions import DataError +from opteryx.models import QueryProperties from opteryx.operators.base_plan_node import BasePlanDataObject -from opteryx.operators.read_node import ReaderNode from opteryx.shared import AsyncMemoryPool from opteryx.shared import MemoryPool from opteryx.utils.file_decoders import get_decoder +from .read_node import ReaderNode from .read_node import normalize_morsel from .read_node import struct_to_jsonb @@ -72,18 +74,18 @@ class AsyncReaderDataObject(BasePlanDataObject): class AsyncReaderNode(ReaderNode): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, properties: QueryProperties, **parameters): + ReaderNode.__init__(self, properties=properties, **parameters) self.pool = MemoryPool(MAX_READ_BUFFER_CAPACITY, f"ReadBuffer <{self.parameters['alias']}>") self.do = AsyncReaderDataObject() - self.predicates = kwargs.get("predicates") + self.predicates = parameters.get("predicates") @classmethod def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover raise NotImplementedError() - def execute(self) -> Generator: + def execute(self, morsel) -> Generator: from opteryx import system_statistics """Perform this step, time how long is spent doing work""" @@ -186,8 +188,9 @@ def execute(self) -> Generator: arrow_schema = morsel.schema self.statistics.blobs_read += 1 + self.records_out += morsel.num_rows self.statistics.rows_read += morsel.num_rows - self.statistics.bytes_processed += morsel.nbytes + self.bytes_out += morsel.nbytes yield morsel except Exception as err: @@ -206,3 +209,5 @@ def execute(self) -> Generator: yield pyarrow.Table.from_arrays( [pyarrow.array([]) for _ in arrow_schema], schema=arrow_schema ) + + yield EOS diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index b8d917f46..7e025efaf 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -11,30 +11,19 @@ # limitations under the License. 
+import time from dataclasses import dataclass -from enum import Enum -from enum import auto -from typing import Generator from typing import Optional import pyarrow from orso.tools import random_string -from opteryx.models import QueryProperties -from opteryx.models import QueryStatistics - - -class OperatorType(int, Enum): - PRODUCER = auto() - PASSTHRU = auto() - BLOCKING = auto() - _UNKNOWN = auto() +from opteryx import EOS @dataclass class BasePlanDataObject: operation: Optional[str] = None - operator_type: OperatorType = OperatorType._UNKNOWN query_id: str = None identity: str = None @@ -47,22 +36,27 @@ def __post_init__(self): class BasePlanNode: - _producers = None - operator_type = OperatorType._UNKNOWN - - def __init__(self, properties: QueryProperties, **parameters): + def __init__(self, *, properties, **parameters): """ This is the base class for nodes in the execution plan. The initializer accepts a QueryStatistics node which is populated by different nodes differently to record what happened during the query execution. """ - self.properties = properties + from opteryx.models import QueryProperties + from opteryx.models import QueryStatistics + + self.properties: QueryProperties = properties + self.statistics: QueryStatistics = QueryStatistics(properties.qid) self.parameters = parameters - self.statistics = QueryStatistics(properties.qid) self.execution_time = 0 self.identity = random_string() self.do: Optional[BasePlanDataObject] = None + self.calls = 0 + self.records_in = 0 + self.bytes_in = 0 + self.records_out = 0 + self.bytes_out = 0 def to_json(self) -> bytes: # pragma: no cover import orjson @@ -75,9 +69,6 @@ def to_json(self) -> bytes: # pragma: no cover def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() - def set_producers(self, producers): - self._producers = producers - def config(self) -> str: return "" @@ -88,5 +79,41 @@ def name(self): # pragma: no cover """ return "no name" - def execute(self) -> Generator[pyarrow.Table, None, None]: # pragma: no cover + @property + def node_type(self) -> str: + return self.name + + def __str__(self) -> str: + return f"{self.name} {self.sensors()}" + + def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: no cover pass + + def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: + if morsel is not None and morsel != EOS: + self.records_in += morsel.num_rows + self.bytes_in += morsel.nbytes + self.calls += 1 + + start_time = time.monotonic_ns() + result = self.execute(morsel) + + self.execution_time += time.monotonic_ns() - start_time + if result is not None and result != EOS and hasattr(result, "num_rows"): + self.records_out += result.num_rows + self.bytes_out += result.nbytes + return result + + def sensors(self): + return { + "calls": self.calls, + "execution_time": self.execution_time, + "records_in": self.records_in, + "records_out": self.records_out, + "bytes_in": self.bytes_in, + "bytes_out": self.bytes_out, + } + + +class JoinNode(BasePlanNode): + pass diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index f6f9c5e6b..490762ba2 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -19,7 +19,6 @@ here rather than calling the join() functions """ -import time from dataclasses import dataclass from typing import Generator from typing import Set @@ -29,13 +28,15 @@ import pyarrow from orso.schema import FlatColumn +from opteryx import EOS +from 
opteryx.compiled.structures import HashSet from opteryx.managers.expression import NodeType -from opteryx.models import Node +from opteryx.models import LogicalColumn from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.operators.base_plan_node import BasePlanDataObject +from . import JoinNode + INTERNAL_BATCH_SIZE: int = 7500 # config MAX_JOIN_SIZE: int = 1000 # config MORSEL_SIZE_BYTES: int = 16 * 1024 * 1024 @@ -43,14 +44,15 @@ def _cross_join_unnest_column( - morsels: BasePlanNode = None, - source: Node = None, + *, + morsel: pyarrow.Table = None, + source: LogicalColumn = None, target_column: FlatColumn = None, conditions: Set = None, - statistics=None, distinct: bool = False, single_column: bool = False, -) -> Generator[pyarrow.Table, None, None]: + hash_set=None, +) -> pyarrow.Table: """ Perform a cross join on an unnested column of pyarrow tables. @@ -64,11 +66,8 @@ def _cross_join_unnest_column( """ from opteryx.compiled.cross_join import build_filtered_rows_indices_and_column from opteryx.compiled.cross_join import build_rows_indices_and_column - from opteryx.compiled.structures import HashSet from opteryx.compiled.structures import list_distinct - hash_set = HashSet() - # Check if the source node type is an identifier, raise error otherwise if source.node_type != NodeType.IDENTIFIER: raise NotImplementedError("Can only CROSS JOIN UNNEST on a column") @@ -77,91 +76,78 @@ def _cross_join_unnest_column( at_least_once = False single_column_collector = [] - # Loop through each morsel from the morsels execution - for left_morsel in morsels.execute(): - start = time.monotonic_ns() - # Break the morsel into batches to avoid memory issues - for left_block in left_morsel.to_batches(max_chunksize=batch_size): - new_block = None - # Fetch the data of the column to be unnested - column_data = left_block[source.schema_column.identity] - - # Filter out null values - valid_offsets = column_data.is_valid() - column_data = column_data.drop_null() - if len(column_data) == 0: - continue - left_block = left_block.filter(valid_offsets) - - # Build indices and new column data - if conditions is None: - indices, new_column_data = build_rows_indices_and_column( - column_data.to_numpy(False) - ) + # Break the morsel into batches to avoid memory issues + for left_block in morsel.to_batches(max_chunksize=batch_size): + new_block = None + # Fetch the data of the column to be unnested + column_data = left_block[source.schema_column.identity] + + # Filter out null values + valid_offsets = column_data.is_valid() + column_data = column_data.drop_null() + if len(column_data) == 0: + continue + left_block = left_block.filter(valid_offsets) + + # Build indices and new column data + if conditions is None: + indices, new_column_data = build_rows_indices_and_column(column_data.to_numpy(False)) + else: + indices, new_column_data = build_filtered_rows_indices_and_column( + column_data.to_numpy(False), conditions + ) + + if single_column and distinct and indices.size > 0: + # if the unnest target is the only field in the SELECT and we're DISTINCTING + indices = numpy.array(indices, dtype=numpy.int32) + new_column_data, indices, hash_set = list_distinct(new_column_data, indices, hash_set) + + if len(indices) > 0: + if single_column: + single_column_collector.extend(new_column_data) + if len(single_column_collector) > INTERNAL_BATCH_SIZE: + schema = pyarrow.schema( + [ + pyarrow.field( + name=target_column.identity, 
type=target_column.arrow_field.type + ) + ] + ) + arrow_array = pyarrow.array(single_column_collector) + if arrow_array.type != target_column.arrow_field.type: + arrow_array = arrow_array.cast(target_column.arrow_field.type) + new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) + single_column_collector.clear() + del arrow_array + yield new_block + at_least_once = True else: - indices, new_column_data = build_filtered_rows_indices_and_column( - column_data.to_numpy(False), conditions - ) + # Rebuild the block with the new column data if we have any rows to build for - if single_column and distinct and indices.size > 0: - # if the unnest target is the only field in the SELECT and we're DISTINCTING - indices = numpy.array(indices, dtype=numpy.int32) - new_column_data, indices, hash_set = list_distinct( - new_column_data, indices, hash_set - ) + total_rows = len(indices) # Both arrays have the same length + block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) + block_size = int(block_size // 1000) * 1000 + + for start_block in range(0, total_rows, block_size): + # Compute the end index for the current chunk + end_block = min(start_block + block_size, total_rows) + + # Slice the current chunk of indices and new_column_data + indices_chunk = indices[start_block:end_block] + new_column_data_chunk = new_column_data[start_block:end_block] + + # Create a new block using the chunk of indices + indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) + new_block = left_block.take(indices_chunk) + new_block = pyarrow.Table.from_batches([new_block], schema=morsel.schema) - if len(indices) > 0: - if single_column: - single_column_collector.extend(new_column_data) - if len(single_column_collector) > INTERNAL_BATCH_SIZE: - schema = pyarrow.schema( - [ - pyarrow.field( - name=target_column.identity, type=target_column.arrow_field.type - ) - ] - ) - arrow_array = pyarrow.array(single_column_collector) - if arrow_array.type != target_column.arrow_field.type: - arrow_array = arrow_array.cast(target_column.arrow_field.type) - new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) - single_column_collector.clear() - del arrow_array - statistics.time_cross_join_unnest += time.monotonic_ns() - start - yield new_block - start = time.monotonic_ns() - at_least_once = True - else: - # Rebuild the block with the new column data if we have any rows to build for - - total_rows = len(indices) # Both arrays have the same length - block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) - block_size = int(block_size // 1000) * 1000 - - for start_block in range(0, total_rows, block_size): - # Compute the end index for the current chunk - end_block = min(start_block + block_size, total_rows) - - # Slice the current chunk of indices and new_column_data - indices_chunk = indices[start_block:end_block] - new_column_data_chunk = new_column_data[start_block:end_block] - - # Create a new block using the chunk of indices - indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) - new_block = left_block.take(indices_chunk) - new_block = pyarrow.Table.from_batches( - [new_block], schema=left_morsel.schema - ) - - # Append the corresponding chunk of new_column_data to the block - new_block = new_block.append_column( - target_column.identity, pyarrow.array(new_column_data_chunk) - ) - - statistics.time_cross_join_unnest += time.monotonic_ns() - start - yield new_block - at_least_once = True - start = time.monotonic_ns() + # Append the corresponding chunk of 
new_column_data to the block + new_block = new_block.append_column( + target_column.identity, pyarrow.array(new_column_data_chunk) + ) + + yield new_block + at_least_once = True if single_column_collector: schema = pyarrow.schema( @@ -171,48 +157,40 @@ def _cross_join_unnest_column( if arrow_array.type != target_column.arrow_field.type: arrow_array = arrow_array.cast(target_column.arrow_field.type) new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) - statistics.time_cross_join_unnest += time.monotonic_ns() - start yield new_block at_least_once = True - start = time.monotonic_ns() if not at_least_once: # Create an empty table with the new schema - schema = left_morsel.schema + schema = morsel.schema new_column = pyarrow.field(target_column.identity, pyarrow.string()) new_schema = pyarrow.schema(list(schema) + [new_column]) new_block = pyarrow.Table.from_batches([], schema=new_schema) - statistics.time_cross_join_unnest += time.monotonic_ns() - start yield new_block def _cross_join_unnest_literal( - morsels: BasePlanNode, source: Tuple, target_column: FlatColumn, statistics + morsel: pyarrow.Table, source: Tuple, target_column: FlatColumn ) -> Generator[pyarrow.Table, None, None]: joined_list_size = len(source) - # Loop through each morsel from the morsels execution - for left_morsel in morsels.execute(): - start = time.monotonic_ns() - # Break the morsel into batches to avoid memory issues - for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): - left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) - block_size = left_block.num_rows + # Break the morsel into batches to avoid memory issues + for left_block in morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): + left_block = pyarrow.Table.from_batches([left_block], schema=morsel.schema) + block_size = left_block.num_rows - # Repeat each row in the table n times - repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) - appended_table = left_block.take(repeated_indices) + # Repeat each row in the table n times + repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) + appended_table = left_block.take(repeated_indices) - # Tile the array to match the new number of rows - tiled_array = numpy.tile(source, block_size) + # Tile the array to match the new number of rows + tiled_array = numpy.tile(source, block_size) - # Convert tiled_array to PyArrow array and append it to the table - array_column = pyarrow.array(tiled_array) - appended_table = appended_table.append_column(target_column.identity, array_column) + # Convert tiled_array to PyArrow array and append it to the table + array_column = pyarrow.array(tiled_array) + appended_table = appended_table.append_column(target_column.identity, array_column) - statistics.time_cross_join_unnest += time.monotonic_ns() - start - yield appended_table - start = time.monotonic_ns() + yield appended_table def _cartesian_product(*arrays): @@ -226,7 +204,7 @@ def _cartesian_product(*arrays): return numpy.hsplit(arr.reshape(-1, array_count), array_count) -def _cross_join(left, right, statistics): +def _cross_join(left_morsel, right): """ A cross join is the cartesian product of two tables - this usually isn't very useful, but it does allow you to the theta joins (non-equi joins) @@ -245,35 +223,29 @@ def _chunker(seq_1, seq_2, size): from opteryx.utils.arrow import align_tables at_least_once = False - left_schema = None + left_schema = left_morsel.schema right_schema = right.schema - for left_morsel in 
left.execute(): - if left_schema is None: - left_schema = left_morsel.schema - start = time.monotonic_ns() - # Iterate through left table in chunks of size INTERNAL_BATCH_SIZE - for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): - # Convert the chunk to a table to retain column names - left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) - - # Create an array of row indices for each table - left_array = numpy.arange(left_block.num_rows, dtype=numpy.int64) - right_array = numpy.arange(right.num_rows, dtype=numpy.int64) - - # Calculate the cartesian product of the two arrays of row indices - left_align, right_align = _cartesian_product(left_array, right_array) - - # Further break down the result into manageable chunks of size MAX_JOIN_SIZE - for left_chunk, right_chunk in _chunker(left_align, right_align, MAX_JOIN_SIZE): - # Align the tables using the specified chunks of row indices - table = align_tables(left_block, right, left_chunk.flatten(), right_chunk.flatten()) - - # Yield the resulting table to the caller - statistics.time_cross_join += time.monotonic_ns() - start - yield table - at_least_once = True - start = time.monotonic_ns() + # Iterate through left table in chunks of size INTERNAL_BATCH_SIZE + for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): + # Convert the chunk to a table to retain column names + left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) + + # Create an array of row indices for each table + left_array = numpy.arange(left_block.num_rows, dtype=numpy.int64) + right_array = numpy.arange(right.num_rows, dtype=numpy.int64) + + # Calculate the cartesian product of the two arrays of row indices + left_align, right_align = _cartesian_product(left_array, right_array) + + # Further break down the result into manageable chunks of size MAX_JOIN_SIZE + for left_chunk, right_chunk in _chunker(left_align, right_align, MAX_JOIN_SIZE): + # Align the tables using the specified chunks of row indices + table = align_tables(left_block, right, left_chunk.flatten(), right_chunk.flatten()) + + # Yield the resulting table to the caller + yield table + at_least_once = True if not at_least_once: fields = [pyarrow.field(name=f.name, type=f.type) for f in right_schema] + [ @@ -294,26 +266,24 @@ class CrossJoinDataObject(BasePlanDataObject): _distinct: bool = False -class CrossJoinNode(BasePlanNode): +class CrossJoinNode(JoinNode): """ Implements a SQL CROSS JOIN """ - operator_type = OperatorType.PASSTHRU + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) + self.source = parameters.get("column") - self.source = config.get("column") - - self._left_relation = config.get("left_relation_names") - self._right_relation = config.get("right_relation_names") + self._left_relation = parameters.get("left_relation_names") + self._right_relation = parameters.get("right_relation_names") # do we have unnest details? 
- self._unnest_column = config.get("unnest_column") - self._unnest_target = config.get("unnest_target") - self._filters = config.get("filters") - self._distinct = config.get("distinct", False) + self._unnest_column = parameters.get("unnest_column") + self._unnest_target = parameters.get("unnest_target") + self._filters = parameters.get("filters") + self._distinct = parameters.get("distinct", False) # handle variation in how the unnested column is represented if self._unnest_column: @@ -325,10 +295,19 @@ def __init__(self, properties: QueryProperties, **config): ): self._unnest_column.value = tuple([self._unnest_column.value]) - self._single_column = config.get("pre_update_columns", set()) == { + self._single_column = parameters.get("pre_update_columns", set()) == { self._unnest_target.identity, } + self.stream = "left" + self.left_buffer = [] + self.right_buffer = [] + self.left_relation = None + self.right_relation = None + self.hash_set = HashSet() + + self.continue_executing = True + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -344,30 +323,48 @@ def config(self): # pragma: no cover filters = f"({self._unnest_target.name} IN ({', '.join(self._filters)}))" return f"CROSS JOIN {filters}" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - if self._unnest_column is None: - right_table = pyarrow.concat_tables(right_node.execute(), promote_options="none") # type:ignore - yield from _cross_join(left_node, right_table, self.statistics) - - elif isinstance(self._unnest_column.value, tuple): - yield from _cross_join_unnest_literal( - morsels=left_node, - source=self._unnest_column.value, - target_column=self._unnest_target, - statistics=self.statistics, - ) - else: - if hasattr(left_node, "function") and left_node.function == "UNNEST": - left_node = right_node - yield from _cross_join_unnest_column( - morsels=left_node, - source=self._unnest_column, - target_column=self._unnest_target, - conditions=self._filters, - statistics=self.statistics, - distinct=self._distinct, - single_column=self._single_column, + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if not self.continue_executing: + return None + + if self._unnest_column is not None: + if morsel == EOS: + self.continue_executing = False + return EOS + if isinstance(self._unnest_column.value, tuple): + return list( + _cross_join_unnest_literal( + morsel=morsel, + source=self._unnest_column.value, + target_column=self._unnest_target, + ) + ) + return list( + _cross_join_unnest_column( + morsel=morsel, + source=self._unnest_column, + target_column=self._unnest_target, + conditions=self._filters, + hash_set=self.hash_set, + distinct=self._distinct, + single_column=self._single_column, + ) ) + + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + else: + self.left_buffer.append(morsel) + return None + + if self.stream == "right": + if morsel == EOS: + right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none") # type:ignore + self.right_buffer = None + return list(_cross_join(self.left_relation, right_table)) + else: + self.right_buffer.append(morsel) + return None diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index d38afa829..60cf76c2d 100644 --- a/opteryx/operators/distinct_node.py +++ 
b/opteryx/operators/distinct_node.py @@ -18,25 +18,20 @@ This Node eliminates duplicate records. """ -import time -from typing import Generator - -import pyarrow -import pyarrow.compute +from pyarrow import Table +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class DistinctNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - def __init__(self, properties: QueryProperties, **config): +class DistinctNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): from opteryx.compiled.structures import HashSet - super().__init__(properties=properties) - self._distinct_on = config.get("on") + BasePlanNode.__init__(self, properties=properties, **parameters) + self._distinct_on = parameters.get("on") if self._distinct_on: self._distinct_on = [col.schema_column.identity for col in self._distinct_on] self.hash_set = HashSet() @@ -53,7 +48,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Distinction" - def execute(self) -> Generator[pyarrow.Table, None, None]: + def execute(self, morsel: Table) -> Table: from opteryx.compiled.structures import distinct # We create a HashSet outside the distinct call, this allows us to pass @@ -63,22 +58,16 @@ def execute(self) -> Generator[pyarrow.Table, None, None]: # Being able to run morsel-by-morsel means if we have a LIMIT clause, we can # limit processing - morsels = self._producers[0] # type:ignore - at_least_one = False - - for morsel in morsels.execute(): - start = time.monotonic_ns() - unique_indexes, self.hash_set = distinct( - morsel, columns=self._distinct_on, seen_hashes=self.hash_set - ) - - if len(unique_indexes) > 0: - distinct_table = morsel.take(unique_indexes) - self.statistics.time_distincting += time.monotonic_ns() - start - yield distinct_table - at_least_one = True - elif not at_least_one: - distinct_table = morsel.slice(0, 0) - self.statistics.time_distincting += time.monotonic_ns() - start - yield distinct_table - at_least_one = True + if morsel == EOS: + return EOS + + unique_indexes, self.hash_set = distinct( + morsel, columns=self._distinct_on, seen_hashes=self.hash_set + ) + + if len(unique_indexes) > 0: + distinct_table = morsel.take(unique_indexes) + return distinct_table + else: + distinct_table = morsel.slice(0, 0) + return distinct_table diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 2de2271db..a428e955b 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -24,20 +24,21 @@ This node doesn't do any calculations, it is a pure Projection. """ -import time from dataclasses import dataclass from dataclasses import field -from typing import Generator from typing import List +from pyarrow import Table + +from opteryx import EOS from opteryx.exceptions import AmbiguousIdentifierError from opteryx.exceptions import InvalidInternalStateError from opteryx.models import LogicalColumn from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.operators.base_plan_node import BasePlanDataObject +from . 
import BasePlanNode + @dataclass class ExitDataObject(BasePlanDataObject): @@ -45,11 +46,9 @@ class ExitDataObject(BasePlanDataObject): class ExitNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.columns = config.get("columns", []) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.columns = parameters.get("columns", []) self.do = ExitDataObject(columns=self.columns) @@ -65,10 +64,10 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Exit" - def execute(self) -> Generator: - start = time.monotonic_ns() - morsels = self._producers[0] # type:ignore - at_least_one = False + def execute(self, morsel: Table) -> Table: + # Exit doesn't return EOS + if morsel == EOS: + return None final_columns = [] final_names = [] @@ -93,28 +92,17 @@ def execute(self) -> Generator: # else: final_names.append(column.qualified_name) - self.statistics.time_exiting += time.monotonic_ns() - start - for morsel in morsels.execute(): - start = time.monotonic_ns() - if not set(final_columns).issubset(morsel.column_names): # pragma: no cover - mapping = {name: int_name for name, int_name in zip(final_columns, final_names)} - missing_references = { - mapping.get(ref): ref for ref in final_columns if ref not in morsel.column_names - } + if not set(final_columns).issubset(morsel.column_names): # pragma: no cover + mapping = {name: int_name for name, int_name in zip(final_columns, final_names)} + missing_references = { + mapping.get(ref): ref for ref in final_columns if ref not in morsel.column_names + } - raise InvalidInternalStateError( - f"The following fields were not in the resultset - {', '.join(missing_references.keys())}" - ) - - morsel = morsel.select(final_columns) - morsel = morsel.rename_columns(final_names) - - self.statistics.time_exiting += time.monotonic_ns() - start - yield morsel - at_least_one = True - start = time.monotonic_ns() + raise InvalidInternalStateError( + f"The following fields were not in the resultset - {', '.join(missing_references.keys())}" + ) - if not at_least_one: - from orso import DataFrame + morsel = morsel.select(final_columns) + morsel = morsel.rename_columns(final_names) - yield DataFrame(schema=final_names).arrow() + return morsel diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index 389589c46..2b16067a2 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -18,19 +18,18 @@ This writes out a query plan """ -from typing import Generator +from pyarrow import Table from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . 
import BasePlanNode -class ExplainNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._query_plan = config.get("query_plan") +class ExplainNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self._query_plan = parameters.get("query_plan") + self.analyze = parameters.get("analyze", False) @property def name(self): # pragma: no cover @@ -44,6 +43,6 @@ def config(self): def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() - def execute(self) -> Generator: + def execute(self, morsel: Table) -> Table: if self._query_plan: - yield from self._query_plan.explain() + return self._query_plan.explain(self.analyze) diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index c96045aba..c4cff2e78 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -18,12 +18,10 @@ This node is responsible for applying filters to datasets. """ -import time -from typing import Generator - import numpy import pyarrow +from opteryx import EOS from opteryx.exceptions import SqlError from opteryx.managers.expression import NodeType from opteryx.managers.expression import evaluate @@ -31,16 +29,14 @@ from opteryx.managers.expression import format_expression from opteryx.managers.expression import get_all_nodes_of_type from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class FilterNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.filter = config.get("filter") +class FilterNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.filter = parameters.get("filter") self.function_evaluations = get_all_nodes_of_type( self.filter, @@ -59,39 +55,27 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Filter" - def execute(self) -> Generator: - morsels = self._producers[0] # type:ignore - schema = None - at_least_one = False - - for morsel in morsels.execute(): - if schema is None: - schema = morsel.schema + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + return EOS - if morsel.num_rows == 0: - continue + if morsel.num_rows == 0: + return morsel - start_selection = time.time_ns() + if self.function_evaluations: morsel = evaluate_and_append(self.function_evaluations, morsel) - mask = evaluate(self.filter, morsel) - self.statistics.time_evaluating += time.time_ns() - start_selection - - if not isinstance(mask, pyarrow.lib.BooleanArray): - try: - mask = pyarrow.array(mask, type=pyarrow.bool_()) - except Exception as err: # nosec - raise SqlError( - f"Unable to filter on expression '{format_expression(self.filter)} {err}'." 
- ) - mask = numpy.nonzero(mask)[0] - - self.statistics.time_selecting += time.time_ns() - start_selection - - # if there's no matching rows, just drop the morsel - if mask.size > 0 and not numpy.all(mask is None): - yield morsel.take(pyarrow.array(mask)) - at_least_one = True - - # we need to send something to the next operator, send an empty table - if not at_least_one: - yield pyarrow.Table.from_arrays([[] for i in schema.names], schema=schema) + mask = evaluate(self.filter, morsel) + + if not isinstance(mask, pyarrow.lib.BooleanArray): + try: + mask = pyarrow.array(mask, type=pyarrow.bool_()) + except Exception as err: # nosec + raise SqlError( + f"Unable to filter on expression '{format_expression(self.filter)} {err}'." + ) + mask = numpy.nonzero(mask)[0] + + # if there's no matching rows, just drop the morsel + if mask.size > 0 and not numpy.all(mask is None): + return morsel.take(pyarrow.array(mask)) + return morsel.slice(0, 0) diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index 15b75f9eb..9ac8cf80a 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -23,13 +23,14 @@ import pyarrow +from opteryx import EOS from opteryx.exceptions import SqlError from opteryx.managers.expression import NodeType from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.utils import series +from .read_node import ReaderNode + def _generate_series(**kwargs): value_array = series.generate_series(*kwargs["args"]) @@ -83,20 +84,18 @@ def _http(**kwargs): } -class FunctionDatasetNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - - def __init__(self, properties: QueryProperties, **config): +class FunctionDatasetNode(ReaderNode): + def __init__(self, properties: QueryProperties, **parameters): """ The Blob Reader Node is responsible for reading the relevant blobs and returning a Table/Relation. """ - super().__init__(properties=properties) - self.alias = config.get("alias") - self.function = config["function"] - self.parameters = config - self.columns = config.get("columns", []) - self.args = config.get("args", []) + ReaderNode.__init__(self, properties=properties, **parameters) + self.alias = parameters.get("alias") + self.function = parameters["function"] + self.parameters = parameters + self.columns = parameters.get("columns", []) + self.args = parameters.get("args", []) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -125,7 +124,7 @@ def name(self): # pragma: no cover def can_push_selection(self): return False - def execute(self) -> Generator: + def execute(self, morsel) -> Generator: try: start_time = time.time_ns() data = DATASET_FUNCTIONS[self.function](**self.parameters) # type:ignore @@ -144,7 +143,9 @@ def execute(self) -> Generator: else: table = data - self.statistics.rows_read += table.num_rows + self.records_out += table.num_rows + self.bytes_out += table.nbytes self.statistics.columns_read += len(table.column_names) yield table + yield EOS diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index b089038fa..782e8ab44 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -24,21 +24,20 @@ sorting smaller chunks over and over again. 
""" -import time from dataclasses import dataclass -from typing import Generator import numpy import pyarrow import pyarrow.compute from pyarrow import concat_tables +from opteryx import EOS from opteryx.exceptions import ColumnNotFoundError from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.operators.base_plan_node import BasePlanDataObject +from . import BasePlanNode + @dataclass class HeapSortDataObject(BasePlanDataObject): @@ -47,14 +46,27 @@ class HeapSortDataObject(BasePlanDataObject): class HeapSortNode(BasePlanNode): - operator_type = OperatorType.BLOCKING - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.order_by = config.get("order_by", []) - self.limit: int = config.get("limit", -1) + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.order_by = parameters.get("order_by", []) + self.limit: int = parameters.get("limit", -1) self.do = HeapSortDataObject(order_by=self.order_by, limit=self.limit) + self.mapped_order = [] + self.table = None + + for column, direction in self.order_by: + try: + self.mapped_order.append( + ( + column.schema_column.identity, + direction, + ) + ) + except ColumnNotFoundError as cnfe: + raise ColumnNotFoundError( + f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" + ) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -70,80 +82,58 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Heap Sort" - def execute(self) -> Generator[pyarrow.Table, None, None]: # pragma: no cover - table = None - morsels = self._producers[0] # type:ignore - - mapped_order = [] - - for column, direction in self.order_by: - try: - mapped_order.append( - ( - column.schema_column.identity, - direction, - ) - ) - except ColumnNotFoundError as cnfe: - raise ColumnNotFoundError( - f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. 
{cnfe}"
-                )
-
-        for morsel in morsels.execute():
-            start_time = time.time_ns()
+    def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
+        if morsel == EOS:
+            return [self.table, EOS]
+
+        if self.table:
+            # Concatenate the accumulated table with the new morsel
+            self.table = concat_tables([self.table, morsel], promote_options="permissive")
+        else:
+            self.table = morsel
+
+        # Determine if any columns are string-based
+        use_pyarrow_sort = any(
+            pyarrow.types.is_string(self.table.column(column_name).type)
+            or pyarrow.types.is_binary(self.table.column(column_name).type)
+            for column_name, _ in self.mapped_order
+        )

-            if table:
-                # Concatenate the accumulated table with the new morsel
-                table = concat_tables([table, morsel], promote_options="permissive")
+        # strings are sorted faster using pyarrow, single columns faster using compute
+        if len(self.mapped_order) == 1 and use_pyarrow_sort:
+            column_name, sort_direction = self.mapped_order[0]
+            column = self.table.column(column_name)
+            if sort_direction == "ascending":
+                sort_indices = pyarrow.compute.sort_indices(column)
             else:
-                table = morsel
-
-            # Determine if any columns are string-based
-            use_pyarrow_sort = any(
-                pyarrow.types.is_string(table.column(column_name).type)
-                or pyarrow.types.is_binary(table.column(column_name).type)
-                for column_name, _ in mapped_order
-            )
-
-            # strings are sorted faster using pyarrow, single columns faster using compute
-            if len(mapped_order) == 1 and use_pyarrow_sort:
-                column_name, sort_direction = mapped_order[0]
-                column = table.column(column_name)
-                if sort_direction == "ascending":
-                    sort_indices = pyarrow.compute.sort_indices(column)
-                else:
-                    sort_indices = pyarrow.compute.sort_indices(column)[::-1]
-                table = table.take(sort_indices[: self.limit])
-            # strings are sorted faster using pyarrow
-            elif use_pyarrow_sort:
-                table = table.sort_by(mapped_order).slice(offset=0, length=self.limit)
-            # single column sort using numpy
-            elif len(mapped_order) == 1:
-                # Single-column sort using mergesort to take advantage of partially sorted data
-                column_name, sort_direction = mapped_order[0]
-                column = table.column(column_name).to_numpy()
-                if sort_direction == "ascending":
-                    sort_indices = numpy.argsort(column)
-                else:
-                    sort_indices = numpy.argsort(column)[::-1]  # Reverse for descending
-                # Slice the sorted table
-                table = table.take(sort_indices[: self.limit])
-            # multi column sort using numpy
+                sort_indices = pyarrow.compute.sort_indices(column)[::-1]
+            self.table = self.table.take(sort_indices[: self.limit])
+        # strings are sorted faster using pyarrow
+        elif use_pyarrow_sort:
+            self.table = self.table.sort_by(self.mapped_order).slice(offset=0, length=self.limit)
+        # single column sort using numpy
+        elif len(self.mapped_order) == 1:
+            # Single-column sort using mergesort to take advantage of partially sorted data
+            column_name, sort_direction = self.mapped_order[0]
+            column = self.table.column(column_name).to_numpy()
+            if sort_direction == "ascending":
+                sort_indices = numpy.argsort(column)
             else:
-                # Multi-column sort using lexsort
-                columns_for_sorting = []
-                directions = []
-                for column_name, sort_direction in mapped_order:
-                    column = table.column(column_name).to_numpy()
-                    columns_for_sorting.append(column)
-                    directions.append(1 if sort_direction == "ascending" else -1)
-
-                sort_indices = numpy.lexsort(
-                    [col[::direction] for col, direction in zip(columns_for_sorting, directions)]
-                )
-                # Slice the sorted table
-                table = table.take(sort_indices[: self.limit])
-
-            self.statistics.time_heap_sorting += 
time.time_ns() - start_time - - yield table + sort_indices = numpy.argsort(column)[::-1] # Reverse for descending + # Slice the sorted table + self.table = self.table.take(sort_indices[: self.limit]) + # multi column sort using numpy + else: + # Multi-column sort using lexsort + columns_for_sorting = [] + directions = [] + for column_name, sort_direction in self.mapped_order: + column = self.table.column(column_name).to_numpy() + columns_for_sorting.append(column) + directions.append(1 if sort_direction == "ascending" else -1) + + sort_indices = numpy.lexsort( + [col[::direction] for col, direction in zip(columns_for_sorting, directions)] + ) + # Slice the sorted table + self.table = self.table.take(sort_indices[: self.limit]) diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index a73caa784..533a0060a 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -31,17 +31,16 @@ pyarrow_ops implementation which was a variation of a sort-merge join. """ -import time -from typing import Generator - import pyarrow +from pyarrow import Table +from opteryx import EOS from opteryx.compiled.structures.hash_table import hash_join_map from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.utils.arrow import align_tables +from . import JoinNode + def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table): """ @@ -73,20 +72,22 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c return align_tables(right_relation, left_relation, right_indexes, left_indexes) -class InnerJoinNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU +class InnerJoinNode(JoinNode): + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self.stream = "left" + self.left_buffer = [] + self.left_hash = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -100,30 +101,34 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - left_relation = pyarrow.concat_tables(left_node.execute(), promote_options="none") - # in place until #1295 resolved - if self._left_columns[0] not in left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - start = time.monotonic_ns() - left_hash = hash_join_map(left_relation, self._left_columns) - - self.statistics.time_inner_join += time.monotonic_ns() - start - for 
morsel in right_node.execute(): - start = time.monotonic_ns() - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=left_hash, - ) - self.statistics.time_inner_join += time.monotonic_ns() - start - yield new_morsel + def execute(self, morsel: Table) -> Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + + # in place until #1295 resolved + if self._left_columns[0] not in self.left_relation.column_names: + self._right_columns, self._left_columns = ( + self._left_columns, + self._right_columns, + ) + + self.left_hash = hash_join_map(self.left_relation, self._left_columns) + else: + self.left_buffer.append(morsel) + return None + + if morsel == EOS: + return EOS + + # do the join + new_morsel = inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=morsel, + join_columns=self._right_columns, + hash_table=self.left_hash, + ) + + return new_morsel diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index 2b1b99ed0..f2f45692c 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -20,19 +20,17 @@ the generic INNER JOIN. """ -import time -from typing import Generator - import numpy import pyarrow from pyarrow import compute +from opteryx import EOS from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.utils.arrow import align_tables +from . 
import JoinNode + def preprocess_left(relation, join_columns): """ @@ -160,20 +158,22 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c return align_tables(right_relation, left_relation, right_indexes, left_indexes) -class InnerJoinSingleNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU +class InnerJoinSingleNode(JoinNode): + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self.stream = "left" + self.left_buffer = [] + self.left_hash = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -187,29 +187,34 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - left_relation = pyarrow.concat_tables(left_node.execute(), promote_options="none") - # in place until #1295 resolved - if self._left_columns[0] not in left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - start = time.monotonic_ns() - left_hash = preprocess_left(left_relation, self._left_columns) - self.statistics.time_inner_join += time.monotonic_ns() - start - for morsel in right_node.execute(): - start = time.monotonic_ns() - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=left_hash, - ) - self.statistics.time_inner_join += time.monotonic_ns() - start - yield new_morsel + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + + # in place until #1295 resolved + if self._left_columns[0] not in self.left_relation.column_names: + self._right_columns, self._left_columns = ( + self._left_columns, + self._right_columns, + ) + + self.left_hash = preprocess_left(self.left_relation, self._left_columns) + else: + self.left_buffer.append(morsel) + return None + + if morsel == EOS: + return EOS + + # do the join + new_morsel = inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=morsel, + join_columns=self._right_columns, + hash_table=self.left_hash, + ) + + return new_morsel diff --git a/opteryx/operators/join_node.py b/opteryx/operators/join_node.py deleted file mode 100644 index 1bd2dbc1f..000000000 --- a/opteryx/operators/join_node.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you 
may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Join Node - -We have our own implementations of INNER and OUTER joins, this uses PyArrow -to implement less-common joins of ANTI and SEMI joins. -""" - -from typing import Generator - -import pyarrow - -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType - - -class JoinNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") - - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") - - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return f"{self._join_type} Join" - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - if self._on: - return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" - if self._using: - return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" - return f"{self._join_type.upper()}" - - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - left_table = pyarrow.concat_tables(left_node.execute(), promote_options="none") - right_table = pyarrow.concat_tables(right_node.execute(), promote_options="none") - - try: - new_morsel = left_table.join( - right_table, - keys=self._left_columns, - right_keys=self._right_columns, - join_type=self._join_type, - coalesce_keys=self._using is not None, - ) - except pyarrow.ArrowInvalid as err: # pragma: no cover - last_token = str(err).split(" ")[-1] - column = None - for col in left_node.columns: - if last_token == col.identity: - column = col.name - break - for col in right_node.columns: - if last_token == col.identity: - column = col.name - break - if column: - raise UnsupportedSyntaxError( - f"Unable to ANTI/SEMI JOIN with unsupported column types in table, '{column}'." - ) from err - raise UnsupportedSyntaxError( - "Unable to ANTI/SEMI JOIN with unsupported column types in table." - ) from err - - yield new_morsel diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index bac99f50f..20b204829 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -18,24 +18,22 @@ This Node performs the LIMIT and the OFFSET steps """ -import time -from typing import Generator - import pyarrow +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType -from opteryx.utils import arrow + +from . 
import BasePlanNode class LimitNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.limit = parameters.get("limit", float("inf")) + self.offset = parameters.get("offset", 0) - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.limit = config.get("limit") - self.offset = config.get("offset", 0) + self.remaining_rows = self.limit if self.limit is not None else float("inf") + self.rows_left_to_skip = max(0, self.offset) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -49,9 +47,27 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return str(self.limit) + " OFFSET " + str(self.offset) - def execute(self) -> Generator[pyarrow.Table, None, None]: - morsels = self._producers[0] # type:ignore - start_time = time.monotonic_ns() - limited = arrow.limit_records(morsels.execute(), limit=self.limit, offset=self.offset) - self.statistics.time_limiting += time.monotonic_ns() - start_time - return limited # type: ignore + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + return EOS + + if self.rows_left_to_skip > 0: + if self.rows_left_to_skip >= morsel.num_rows: + self.rows_left_to_skip -= morsel.num_rows + return morsel.slice(offset=0, length=0) + else: + morsel = morsel.slice( + offset=self.rows_left_to_skip, length=morsel.num_rows - self.rows_left_to_skip + ) + self.rows_left_to_skip = 0 + + if self.remaining_rows <= 0 or morsel.num_rows == 0: + return morsel.slice(offset=0, length=0) + + if morsel.num_rows < self.remaining_rows: + self.remaining_rows -= morsel.num_rows + return morsel + else: + rows_to_slice = self.remaining_rows + self.remaining_rows = 0 + return morsel.slice(offset=0, length=rows_to_slice) diff --git a/opteryx/operators/noop_node.py b/opteryx/operators/noop_node.py index 6ff91cb77..b0c4bce8b 100644 --- a/opteryx/operators/noop_node.py +++ b/opteryx/operators/noop_node.py @@ -16,18 +16,16 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator +from pyarrow import Table from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class NoOpNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) +class NoOpNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -41,8 +39,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: - # nodes generally have 0 (scan), 1 (most) or 2 (join, union) producers - if self._producers: - for morsels in self._producers: - yield from morsels.execute() + def execute(self, morsel: Table) -> Table: + print("NOOP was called") + return [morsel] diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index b96c9b03f..191d43c21 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -23,18 +23,17 @@ popular SEMI and ANTI joins we leave to PyArrow for now. 
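For intuition, the hash-join pattern all of these variants share, as a
minimal sketch over plain Python dicts (illustrative only; `key` is a
hypothetical single join column, rows are dicts, and the real code below
builds row-index lists against PyArrow tables instead):

    def left_outer_join(left_rows, right_rows, key):
        # build: map each right-side key to the rows that carry it
        probe = {}
        for r in right_rows:
            probe.setdefault(r[key], []).append(r)
        # probe: every left row is emitted at least once, with None
        # standing in for the null-padded right side of unmatched rows
        for l in left_rows:
            for r in probe.get(l[key], [None]):
                yield l, r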
""" -import time -from typing import Generator from typing import List import pyarrow +from opteryx import EOS from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType from opteryx.utils.arrow import align_tables +from . import JoinNode + def left_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]): """ @@ -60,47 +59,41 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu left_indexes: deque = deque() right_indexes: deque = deque() - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") - if len(set(left_columns) & set(right_relation.column_names)) > 0: left_columns, right_columns = right_columns, left_columns right_hash = hash_join_map(right_relation, right_columns) + left_hash = hash_join_map(left_relation, left_columns) - for left_batch in left_relation.execute(): - left_hash = hash_join_map(left_batch, left_columns) - for hash_value, left_rows in left_hash.hash_table.items(): - right_rows = right_hash.get(hash_value) - if right_rows: - for l in left_rows: - for r in right_rows: - left_indexes.append(l) - right_indexes.append(r) - else: - for l in left_rows: + for hash_value, left_rows in left_hash.hash_table.items(): + right_rows = right_hash.get(hash_value) + if right_rows: + for l in left_rows: + for r in right_rows: left_indexes.append(l) - right_indexes.append(None) - - if len(left_indexes) > 50_000: - table = align_tables( - right_relation, left_batch, list(right_indexes), list(left_indexes) - ) - yield table - left_indexes.clear() - right_indexes.clear() + right_indexes.append(r) + else: + for l in left_rows: + left_indexes.append(l) + right_indexes.append(None) - if len(left_indexes) > 0: + if len(left_indexes) > 50_000: table = align_tables( - right_relation, left_batch, list(right_indexes), list(left_indexes) + right_relation, left_relation, list(right_indexes), list(left_indexes) ) yield table left_indexes.clear() right_indexes.clear() + # this may return an empty table each time - fix later + table = align_tables(right_relation, left_relation, list(right_indexes), list(left_indexes)) + yield table + left_indexes.clear() + right_indexes.clear() + def full_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]): chunk_size = 1000 - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() @@ -110,7 +103,6 @@ def full_join(left_relation, right_relation, left_columns: List[str], right_colu left_indexes = [] right_indexes = [] - left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none") left_values = left_relation.select(left_columns).itercolumns() for i, value_tuple in enumerate(zip(*left_values)): rows = hash_table.get(hash(value_tuple)) @@ -152,7 +144,6 @@ def right_join(left_relation, right_relation, left_columns: List[str], right_col pyarrow.Table: A chunk of the result of the RIGHT JOIN operation. 
""" chunk_size = 1000 - left_relation = pyarrow.concat_tables(left_relation.execute(), promote_options="none") hash_table = HashTable() non_null_left_values = left_relation.select(left_columns).itercolumns() @@ -160,26 +151,25 @@ def right_join(left_relation, right_relation, left_columns: List[str], right_col hash_table.insert(hash(value_tuple), i) # Iterate over the right_relation in chunks - right_batches = right_relation.execute() - for right_batch in right_batches: - for right_chunk in right_batch.to_batches(chunk_size): - left_indexes = [] - right_indexes = [] - - right_values = right_chunk.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*right_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: - left_indexes.extend(rows) - right_indexes.extend([i] * len(rows)) - else: - left_indexes.append(None) - right_indexes.append(i) - - # Yield the aligned chunk - # we intentionally swap them to the other calls so we're building a table - # not a record batch (what the chunk is) - yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) + + for right_chunk in right_relation.to_batches(chunk_size): + left_indexes = [] + right_indexes = [] + + right_values = right_chunk.select(right_columns).itercolumns() + for i, value_tuple in enumerate(zip(*right_values)): + rows = hash_table.get(hash(value_tuple)) + if rows: + left_indexes.extend(rows) + right_indexes.extend([i] * len(rows)) + else: + left_indexes.append(None) + right_indexes.append(i) + + # Yield the aligned chunk + # we intentionally swap them to the other calls so we're building a table + # not a record batch (what the chunk is) + yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) def left_anti_join( @@ -200,30 +190,23 @@ def left_anti_join( Returns: A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. """ - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") - hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() for i, value_tuple in enumerate(zip(*non_null_right_values)): hash_table.insert(hash(value_tuple), i) - at_least_once = False - # Iterate over the left_relation in chunks - for left_batch in left_relation.execute(): - left_indexes = [] - left_values = left_batch.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if not rows: # Only include left rows that have no match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_batch.take(left_indexes) - at_least_once = True + left_indexes = [] + left_values = left_relation.select(left_columns).itercolumns() + for i, value_tuple in enumerate(zip(*left_values)): + rows = hash_table.get(hash(value_tuple)) + if not rows: # Only include left rows that have no match in the right table + left_indexes.append(i) - if not at_least_once: - yield left_batch.slice(0, 0) + # Filter the left_chunk based on the anti join condition + if left_indexes: + yield left_relation.take(left_indexes) + else: + yield left_relation.slice(0, 0) def left_semi_join( @@ -244,47 +227,44 @@ def left_semi_join( Returns: A pyarrow.Table containing the result of the LEFT SEMI JOIN operation. 
""" - right_relation = pyarrow.concat_tables(right_relation.execute(), promote_options="none") hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() for i, value_tuple in enumerate(zip(*non_null_right_values)): hash_table.insert(hash(value_tuple), i) - at_least_once = False - # Iterate over the left_relation in chunks - for left_batch in left_relation.execute(): - left_indexes = [] - left_values = left_batch.select(left_columns).itercolumns() - - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: # Only include left rows that have a match in the right table - left_indexes.append(i) + left_indexes = [] + left_values = left_relation.select(left_columns).itercolumns() - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_batch.take(left_indexes) - at_least_once = True + for i, value_tuple in enumerate(zip(*left_values)): + rows = hash_table.get(hash(value_tuple)) + if rows: # Only include left rows that have a match in the right table + left_indexes.append(i) - if not at_least_once: - yield left_batch.slice(0, 0) + # Filter the left_chunk based on the anti join condition + if left_indexes: + yield left_relation.take(left_indexes) + else: + yield left_relation.slice(0, 0) -class OuterJoinNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU +class OuterJoinNode(JoinNode): + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self._join_type = parameters["type"] + self._on = parameters.get("on") + self._using = parameters.get("using") - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._join_type = config["type"] - self._on = config.get("on") - self._using = config.get("using") + self._left_columns = parameters.get("left_columns") + self._left_relation = parameters.get("left_relation_names") - self._left_columns = config.get("left_columns") - self._left_relation = config.get("left_relation_names") + self._right_columns = parameters.get("right_columns") + self._right_relation = parameters.get("right_relation_names") - self._right_columns = config.get("right_columns") - self._right_relation = config.get("right_relation_names") + self.stream = "left" + self.left_buffer = [] + self.right_buffer = [] + self.left_relation = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -304,22 +284,35 @@ def config(self): # pragma: no cover return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" return f"{self._join_type.upper()}" - def execute(self) -> Generator: - left_node = self._producers[0] # type:ignore - right_node = self._producers[1] # type:ignore - - join_provider = providers.get(self._join_type) - - start = time.monotonic_ns() - for morsel in join_provider( - left_relation=left_node, - right_relation=right_node, - left_columns=self._left_columns, - right_columns=self._right_columns, - ): - self.statistics.time_outer_join += time.monotonic_ns() - start - yield morsel - start = time.monotonic_ns() + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if self.stream == "left": + if morsel == EOS: + self.stream = "right" + self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") + self.left_buffer.clear() + else: + self.left_buffer.append(morsel) + return None + + if self.stream == "right": + if morsel == EOS: + 
right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") + self.right_buffer.clear() + + join_provider = providers.get(self._join_type) + + return list( + join_provider( + left_relation=self.left_relation, + right_relation=right_relation, + left_columns=self._left_columns, + right_columns=self._right_columns, + ) + ) + [EOS] + + else: + self.right_buffer.append(morsel) + return None providers = { diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 069535ce0..35b890597 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -19,26 +19,24 @@ that performs column renames. """ -import time -from typing import Generator +import pyarrow +from opteryx import EOS from opteryx.managers.expression import NodeType from opteryx.managers.expression import evaluate_and_append from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class ProjectionNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - def __init__(self, properties: QueryProperties, **config): +class ProjectionNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): """ Attribute Projection, remove unwanted columns and performs column renames. """ - super().__init__(properties=properties) + BasePlanNode.__init__(self, properties=properties, **parameters) - projection = config["projection"] + config.get("order_by_columns", []) + projection = parameters["projection"] + parameters.get("order_by_columns", []) self.projection = [] for column in projection: @@ -48,7 +46,7 @@ def __init__(self, properties: QueryProperties, **config): column for column in projection if column.node_type != NodeType.IDENTIFIER ] - self.columns = config["projection"] + self.columns = parameters["projection"] @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -64,15 +62,10 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Projection" - def execute(self) -> Generator: - morsels = self._producers[0] # type:ignore - - for morsel in morsels.execute(): - # If any of the columns need evaluating, we need to do that here - start_time = time.time_ns() - morsel = evaluate_and_append(self.evaluations, morsel) - self.statistics.time_evaluating += time.time_ns() - start_time - - morsel = morsel.select(self.projection) + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + if morsel == EOS: + return EOS - yield morsel + # If any of the columns need evaluating, we need to do that here + morsel = evaluate_and_append(self.evaluations, morsel) + return morsel.select(self.projection) diff --git a/opteryx/operatorsv2/pyarrow_join_node.py b/opteryx/operators/pyarrow_join_node.py similarity index 100% rename from opteryx/operatorsv2/pyarrow_join_node.py rename to opteryx/operators/pyarrow_join_node.py diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 71e055860..e81bcb7eb 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -27,9 +27,10 @@ from orso.schema import RelationSchema from orso.schema import convert_orso_schema_to_arrow_schema +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType + +from . 
import BasePlanNode def struct_to_jsonb(table: pyarrow.Table) -> pyarrow.Table: @@ -132,10 +133,9 @@ def merge_schemas( class ReaderNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - def __init__(self, properties: QueryProperties, **parameters): - super().__init__(properties=properties, **parameters) + BasePlanNode.__init__(self, properties=properties, **parameters) + self.start_date = parameters.get("start_date") self.end_date = parameters.get("end_date") self.hints = parameters.get("hints", []) @@ -149,6 +149,9 @@ def __init__(self, properties: QueryProperties, **parameters): if len(self.hints) != 0: self.statistics.add_message("All HINTS are currently ignored") + self.statistics.rows_read += 0 + self.statistics.columns_read += 0 + def to_dict(self) -> dict: return { "identity": f"read-{self.identity}", @@ -186,13 +189,9 @@ def config(self): f"{' WITH(' + ','.join(self.parameters.get('hints')) + ')' if self.parameters.get('hints') else ''})" ) - def execute(self) -> Generator: + def execute(self, morsel) -> Generator: """Perform this step, time how long is spent doing work""" - self.statistics.blobs_read += 0 - self.statistics.rows_read += 0 - self.statistics.bytes_processed += 0 - morsel = None orso_schema = self.schema orso_schema_cols = [] @@ -216,11 +215,14 @@ def execute(self) -> Generator: self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock self.statistics.blobs_read += 1 + self.records_out += morsel.num_rows self.statistics.rows_read += morsel.num_rows - self.statistics.bytes_processed += morsel.nbytes + self.bytes_out += morsel.nbytes yield morsel start_clock = time.monotonic_ns() if morsel: self.statistics.columns_read += morsel.num_columns else: self.statistics.columns_read += len(orso_schema.columns) + + yield EOS diff --git a/opteryx/operators/set_variable_node.py b/opteryx/operators/set_variable_node.py index 8d55e0284..02676434d 100644 --- a/opteryx/operators/set_variable_node.py +++ b/opteryx/operators/set_variable_node.py @@ -16,25 +16,20 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator - from opteryx.constants import QueryStatus from opteryx.models import NonTabularResult from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType - -class SetVariableNode(BasePlanNode): - operator_type = OperatorType.PRODUCER +from . 
import BasePlanNode - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.variable = config.get("variable") - self.value = config.get("value") +class SetVariableNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - self.variables = config.get("variables") + self.variable = parameters.get("variable") + self.value = parameters.get("value") + self.variables = parameters.get("variables") @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -48,6 +43,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return f"{self.variable} TO {self.value}" - def execute(self) -> Generator: + def execute(self, morsel) -> NonTabularResult: self.variables[self.variable] = self.value return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py index 247ac6159..3d57a8c21 100644 --- a/opteryx/operators/show_columns_node.py +++ b/opteryx/operators/show_columns_node.py @@ -18,13 +18,12 @@ Gives information about a dataset's columns """ -from typing import Generator - import pyarrow +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType + +from . import BasePlanNode def _simple_collector(schema): @@ -41,39 +40,20 @@ def _simple_collector(schema): } buffer.append(new_row) - table = pyarrow.Table.from_pylist(buffer) - return table - - -def _extended_collector(morsels): - """ - Collect summary statistics about each column - - We use orso, which means converting to an orso DataFrame and then converting back - to a PyArrow table. 
- """ - import orso - - profile = None - for morsel in morsels: - df = orso.DataFrame.from_arrow(morsel) - if profile is None: - profile = df.profile - else: - profile += df.profile - - return profile.to_dicts() + return pyarrow.Table.from_pylist(buffer) class ShowColumnsNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self._full = config.get("full") - self._extended = config.get("extended") - self._schema = config.get("schema") - self._column_map = {c.schema_column.identity: c.source_column for c in config["columns"]} + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self._full = parameters.get("full") + self._extended = parameters.get("extended") + self._schema = parameters.get("schema") + self._column_map = { + c.schema_column.identity: c.source_column for c in parameters["columns"] + } + self.collector = None + self.seen = False @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -91,28 +71,32 @@ def rename_column(self, dic: dict, renames) -> dict: dic["name"] = renames[dic["name"]] return dic - def execute(self) -> Generator: - morsels = self._producers[0] # type:ignore + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + from orso import DataFrame - if morsels is None: + if self.seen: return None if not (self._full or self._extended): # if it's not full or extended, do just get the list of columns and their # types - yield _simple_collector(self._schema) - return + self.seen = True + return _simple_collector(self._schema) - if self._full and not self._extended: + if self._full or self._extended: # we're going to read the full table, so we can count stuff - dicts = _extended_collector(morsels.execute()) - dicts = [self.rename_column(d, self._column_map) for d in dicts] - yield pyarrow.Table.from_pylist(dicts) - return - - if self._extended: - # get everything we can reasonable get - dicts = _extended_collector(morsels.execute()) - dicts = [self.rename_column(d, self._column_map) for d in dicts] - yield pyarrow.Table.from_pylist(dicts) - return + + if morsel == EOS: + dicts = self.collector.to_dicts() + dicts = [self.rename_column(d, self._column_map) for d in dicts] + self.seen = True + return pyarrow.Table.from_pylist(dicts) + + df = DataFrame.from_arrow(morsel) + + if self.collector is None: + self.collector = df.profile + else: + self.collector += df.profile + + return None diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index c33a5d415..d76d95d9b 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -16,25 +16,21 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator - import pyarrow from opteryx.exceptions import DatasetNotFoundError from opteryx.exceptions import UnsupportedSyntaxError from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . 
import BasePlanNode -class ShowCreateNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) +class ShowCreateNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) - self.object_type = config.get("object_type") - self.object_name = config.get("object_name") + self.object_type = parameters.get("object_type") + self.object_name = parameters.get("object_name") @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -48,7 +44,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: + def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if self.object_type == "VIEW": from opteryx.planner.views import is_view from opteryx.planner.views import view_as_sql @@ -57,8 +53,7 @@ def execute(self) -> Generator: view_sql = view_as_sql(self.object_name) buffer = [{self.object_name: view_sql}] table = pyarrow.Table.from_pylist(buffer) - yield table - return + return table raise DatasetNotFoundError(self.object_name) diff --git a/opteryx/operators/show_value_node.py b/opteryx/operators/show_value_node.py index c889b66e1..f223363bb 100644 --- a/opteryx/operators/show_value_node.py +++ b/opteryx/operators/show_value_node.py @@ -20,21 +20,20 @@ import pyarrow +from opteryx import EOS from opteryx.exceptions import SqlError from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import ReaderNode -class ShowValueNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) +class ShowValueNode(ReaderNode): + def __init__(self, properties: QueryProperties, **parameters): + ReaderNode.__init__(self, properties=properties, **parameters) - self.key = config.get("key") - self.kind = config.get("kind") - self.value = config.get("value") + self.key = parameters.get("key") + self.kind = parameters.get("kind") + self.value = parameters.get("value") if self.kind == "PARAMETER": if self.value[0] == "@": @@ -54,7 +53,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: + def execute(self, morsel) -> Generator: buffer = [{"name": self.key, "value": str(self.value)}] table = pyarrow.Table.from_pylist(buffer) yield table diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index c8db89da6..12c399240 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -18,27 +18,25 @@ This node orders a dataset """ -import time -from typing import Generator - import numpy from orso.types import OrsoTypes +from pyarrow import Table from pyarrow import concat_tables +from opteryx import EOS from opteryx.exceptions import ColumnNotFoundError from opteryx.exceptions import UnsupportedSyntaxError from opteryx.managers.expression import NodeType from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . 
import BasePlanNode -class SortNode(BasePlanNode): - operator_type = OperatorType.BLOCKING - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.order_by = config.get("order", []) +class SortNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.order_by = parameters.get("order_by", []) + self.morsels = [] @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -52,15 +50,14 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Sort" - def execute(self) -> Generator: - morsels = self._producers[0] # type:ignore - morsels = morsels.execute() - morsels = tuple(morsels) - mapped_order = [] + def execute(self, morsel: Table) -> Table: + if morsel != EOS: + self.morsels.append(morsel) + return None - table = concat_tables(morsels, promote_options="permissive") + table = concat_tables(self.morsels, promote_options="permissive") - start_time = time.time_ns() + mapped_order = [] for column, direction in self.order_by: if column.node_type == NodeType.FUNCTION: @@ -70,10 +67,7 @@ def execute(self) -> Generator: if column.value in ("RANDOM", "RAND"): new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows)) table = table.take(new_order) - self.statistics.time_ordering = time.time_ns() - start_time - - yield table - return + return table raise UnsupportedSyntaxError( "`ORDER BY` only supports `RAND()` as a functional sort order." @@ -103,7 +97,4 @@ def execute(self) -> Generator: f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" ) - table = table.sort_by(mapped_order) - self.statistics.time_ordering = time.time_ns() - start_time - - yield table + return [table.sort_by(mapped_order), EOS] diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index 34e102be9..a59a07530 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -16,20 +16,21 @@ This is a SQL Query Execution Plan Node. """ -from typing import Generator +from pyarrow import Table +from opteryx import EOS from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType +from . import BasePlanNode -class UnionNode(BasePlanNode): - operator_type = OperatorType.PASSTHRU - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - self.columns = config.get("columns", []) +class UnionNode(BasePlanNode): + def __init__(self, properties: QueryProperties, **parameters): + BasePlanNode.__init__(self, properties=properties, **parameters) + self.columns = parameters.get("columns", []) self.column_ids = [c.schema_column.identity for c in self.columns] + self.seen_first_eos = False + self.schema = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -43,18 +44,21 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self) -> Generator: + def execute(self, morsel: Table) -> Table: """ Union needs to ensure the column names are the same and that coercible types are coerced. 
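        A sketch of that coercion with made-up tables (illustrative only):
        the first morsel's schema wins, and each later morsel is renamed and
        cast to it before being emitted:

            import pyarrow

            a = pyarrow.table({"x": pyarrow.array([1, 2], type=pyarrow.int64())})
            b = pyarrow.table({"y": pyarrow.array([3.0], type=pyarrow.float64())})

            schema = a.schema                                  # first morsel wins
            b = b.rename_columns(schema.names).cast(schema)    # y -> x, float -> int
            unioned = pyarrow.concat_tables([a, b])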
""" - schema = None - if self._producers: - for morsels in self._producers: - for morsel in morsels.execute(): - if schema is None: - schema = morsel.schema - else: - morsel = morsel.rename_columns(schema.names) - morsel = morsel.cast(schema) - yield morsel.select(self.column_ids) + if morsel == EOS and self.seen_first_eos: + return [EOS] + if morsel == EOS: + self.seen_first_eos = True + return None + + if self.schema is None: + self.schema = morsel.schema + else: + morsel = morsel.rename_columns(self.schema.names) + morsel = morsel.cast(self.schema) + + return morsel.select(self.column_ids) diff --git a/opteryx/operatorsv2/__init__.py b/opteryx/operatorsv2/__init__.py deleted file mode 100644 index 326f7b31a..000000000 --- a/opteryx/operatorsv2/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# isort: skip - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from .base_plan_node import BasePlanDataObject # isort: skip -from .base_plan_node import BasePlanNode, JoinNode # isort: skip - -from .aggregate_and_group_node import AggregateAndGroupNode # Group is always followed by aggregate -from .aggregate_node import AGGREGATORS -from .aggregate_node import AggregateNode # aggregate data -from .async_read_node import AsyncReaderNode - -# from .build_statistics_node import BuildStatisticsNode # Analyze Tables -from .cross_join_node import CrossJoinNode # CROSS JOIN -from .distinct_node import DistinctNode # remove duplicate records -from .exit_node import ExitNode -from .explain_node import ExplainNode # EXPLAIN queries -from .filter_node import FilterNode # filter unwanted rows -from .function_dataset_node import FunctionDatasetNode # Dataset Constructors -from .heap_sort_node import HeapSortNode # Heap - -# from .information_schema_node import InformationSchemaNode # information_schema -from .inner_join_node import InnerJoinNode -from .inner_join_node_single import InnerJoinSingleNode -from .limit_node import LimitNode # select the first N records -from .pyarrow_join_node import PyArrowJoinNode - -# from .metadata_writer_node import MetadataWriterNode -# from .morsel_defragment_node import MorselDefragmentNode # consolidate small morsels -from .noop_node import NoOpNode # No Operation -from .outer_join_node import OuterJoinNode -from .projection_node import ProjectionNode # remove unwanted columns including renames -from .read_node import ReaderNode -from .set_variable_node import SetVariableNode -from .show_columns_node import ShowColumnsNode # column details -from .show_create_node import ShowCreateNode # SHOW CREATE VIEW - -# from .show_databases_node import ShowDatabasesNode # SHOW DATABASES -# from .show_functions_node import ShowFunctionsNode # supported functions -from .show_value_node import ShowValueNode # display node for SHOW -from .sort_node import SortNode # order by selected columns -from .union_node import UnionNode diff --git a/opteryx/operatorsv2/aggregate_and_group_node.py b/opteryx/operatorsv2/aggregate_and_group_node.py deleted file mode 100644 index 6cc40d281..000000000 --- 
a/opteryx/operatorsv2/aggregate_and_group_node.py +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Grouping Node - -This is a SQL Query Execution Plan Node. - -This is the grouping node, it is always followed by the aggregation node, but -the aggregation node doesn't need the grouping node. - - -""" - -from dataclasses import dataclass - -import numpy -import pyarrow -from orso.types import OrsoTypes - -from opteryx import EOS -from opteryx.managers.expression import NodeType -from opteryx.managers.expression import evaluate_and_append -from opteryx.managers.expression import get_all_nodes_of_type -from opteryx.models import QueryProperties -from opteryx.operators.aggregate_node import build_aggregations -from opteryx.operators.aggregate_node import extract_evaluations -from opteryx.operators.aggregate_node import project -from opteryx.operators.base_plan_node import BasePlanDataObject - -from . import BasePlanNode - - -@dataclass -class AggregateAndGroupDataObject(BasePlanDataObject): - groups: list = None - aggregates: list = None - all_identifiers: list = None - evaluatable_nodes: list = None - group_by_columns: list = None - column_map: list = None - aggregate_functions: list = None - - -class AggregateAndGroupNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.groups = list(parameters["groups"]) - self.aggregates = list(parameters["aggregates"]) - projection = list(parameters["projection"]) - - # we're going to preload some of the evaluation - - # Replace offset based GROUP BYs with their column - self.groups = [ - ( - group - if not (group.node_type == NodeType.LITERAL and group.type == OrsoTypes.INTEGER) - else projection[group.value - 1] - ) - for group in self.groups - ] - - # get all the columns anywhere in the groups or aggregates - all_identifiers = [ - node.schema_column.identity - for node in get_all_nodes_of_type( - self.groups + self.aggregates, select_nodes=(NodeType.IDENTIFIER,) - ) - ] - self.all_identifiers = list(dict.fromkeys(all_identifiers)) - - # Get any functions we need to execute before aggregating - self.evaluatable_nodes = extract_evaluations(self.aggregates) - - # get the aggregated groupings and functions - self.group_by_columns = list({node.schema_column.identity for node in self.groups}) - self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) - - self.do = AggregateAndGroupDataObject() - - self.buffer = [] - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - return f"AGGREGATE ({', '.join(format_expression(col) for col in self.aggregates)}) GROUP BY ({', '.join(format_expression(col) for col in self.groups)})" - - @property - def name(self): # pragma: no cover - return "Group" - - def execute(self, morsel: 
pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - # merge all the morsels together into one table, selecting only the columns - # we're pretty sure we're going to use - this will fail for datasets - # larger than memory - table = pyarrow.concat_tables( - self.buffer, - promote_options="permissive", - ) - - # do the group by and aggregates - table = table.combine_chunks() - groups = table.group_by(self.group_by_columns) - groups = groups.aggregate(self.aggregate_functions) - - # do the secondary activities for ARRAY_AGG - for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.AGGREGATOR,)): - if node.value == "ARRAY_AGG" and node.order or node.limit: - # rip the column out of the table - column_name = self.column_map[node.schema_column.identity] - column_def = groups.field(column_name) # this is used - column = groups.column(column_name).to_pylist() - groups = groups.drop([column_name]) - if node.order: - column = [sorted(c, reverse=bool(node.order[0][1])) for c in column] - if node.limit: - column = [c[: node.limit] for c in column] - # put the new column into the table - groups = groups.append_column(column_def, [column]) - - # project to the desired column names from the pyarrow names - groups = groups.select(list(self.column_map.values()) + self.group_by_columns) - groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) - - return [groups, EOS] - - morsel = project(morsel, self.all_identifiers) - # Add a "*" column, this is an int because when a bool it miscounts - if "*" not in morsel.column_names: - morsel = morsel.append_column( - "*", [numpy.ones(shape=morsel.num_rows, dtype=numpy.bool_)] - ) - if self.evaluatable_nodes: - morsel = evaluate_and_append(self.evaluatable_nodes, morsel) - morsel = evaluate_and_append(self.groups, morsel) - - self.buffer.append(morsel) diff --git a/opteryx/operatorsv2/aggregate_node.py b/opteryx/operatorsv2/aggregate_node.py deleted file mode 100644 index a577f101c..000000000 --- a/opteryx/operatorsv2/aggregate_node.py +++ /dev/null @@ -1,256 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Aggregation Node - -This is a SQL Query Execution Plan Node. - -This node performs aggregates without performing groupings. -""" - -from dataclasses import dataclass - -import numpy -import pyarrow - -from opteryx import EOS -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.managers.expression import NodeType -from opteryx.managers.expression import evaluate_and_append -from opteryx.managers.expression import get_all_nodes_of_type -from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject - -from . 
import BasePlanNode - -COUNT_STAR: str = "COUNT(*)" - -# use the aggregators from pyarrow -AGGREGATORS = { - "ALL": "all", - "ANY": "any", - "APPROXIMATE_MEDIAN": "approximate_median", - "ARRAY_AGG": "hash_list", - "COUNT": "count", # counts only non-nulls - "COUNT_DISTINCT": "count_distinct", - "DISTINCT": "distinct", # fated - "LIST": "hash_list", # fated - "MAX": "max", - "MAXIMUM": "max", # alias - "MEAN": "mean", - "AVG": "mean", # alias - "AVERAGE": "mean", # alias - "MIN": "min", - "MINIMUM": "min", # alias - "MIN_MAX": "min_max", - "ONE": "hash_one", - "ANY_VALUE": "hash_one", - "PRODUCT": "product", - "STDDEV": "stddev", - "SUM": "sum", - "VARIANCE": "variance", -} - - -def _is_count_star(aggregates): - """ - Is the SELECT clause `SELECT COUNT(*)` with no GROUP BY - """ - if len(aggregates) != 1: - return False - if aggregates[0].value != "COUNT": - return False - return aggregates[0].parameters[0].node_type == NodeType.WILDCARD - - -def _count_star(morsel_promise, column_name): - count = sum(morsel.num_rows for morsel in morsel_promise) - table = pyarrow.Table.from_pylist([{column_name: count}]) - return table - - -def project(tables, column_names): - for table in tables: - row_count = table.num_rows - if len(column_names) > 0: - yield table.select(dict.fromkeys(column_names)) - else: - # if we can't find the column, add a placeholder column - yield pyarrow.Table.from_pydict({"*": numpy.full(row_count, 1, dtype=numpy.int8)}) - - -def build_aggregations(aggregators): - column_map = {} - aggs = [] - - if not isinstance(aggregators, list): - aggregators = [aggregators] - - for root in aggregators: - for aggregator in get_all_nodes_of_type(root, select_nodes=(NodeType.AGGREGATOR,)): - field_node = aggregator.parameters[0] - count_options = None - - if field_node.node_type == NodeType.WILDCARD: - field_name = "*" - # count * counts nulls - count_options = pyarrow.compute.CountOptions(mode="all") - else: - field_name = field_node.schema_column.identity - function = AGGREGATORS[aggregator.value] - # if the array agg is distinct, base off that function instead - if aggregator.value == "ARRAY_AGG" and aggregator.distinct: - function = "distinct" - aggs.append((field_name, function, count_options)) - column_map[aggregator.schema_column.identity] = f"{field_name}_{function}".replace( - "_hash_", "_" - ) - - return column_map, aggs - - -def _non_group_aggregates(aggregates, table): - """ - If we're not doing a GROUP BY, we're just doing aggregations; pyarrow's - group-by aggregate functionality doesn't apply here. So we do the calculation - ourselves, which is relatively straightforward as it's the entire table we're - summarizing.
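- - For example, a query like SELECT SUM(x), MAX(y) FROM t (illustrative names) is - computed here by applying pyarrow.compute.sum and pyarrow.compute.max to the - whole of each column and assembling the scalar results into a single-row table.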
- """ - - result = {} - - for aggregate in aggregates: - if aggregate.node_type in (NodeType.AGGREGATOR,): - column_node = aggregate.parameters[0] - if column_node.node_type == NodeType.LITERAL: - raw_column_values = numpy.full(table.num_rows, column_node.value) - elif ( - aggregate.value == "COUNT" - and aggregate.parameters[0].node_type == NodeType.WILDCARD - ): - result[aggregate.schema_column.identity] = table.num_rows - continue - else: - raw_column_values = table[column_node.schema_column.identity].to_numpy() - aggregate_function_name = AGGREGATORS[aggregate.value] - # this maps a string which is the function name to that function on the - # pyarrow.compute module - if not hasattr(pyarrow.compute, aggregate_function_name): - raise UnsupportedSyntaxError( - f"Aggregate `{aggregate.value}` can only be used with GROUP BY" - ) - aggregate_function = getattr(pyarrow.compute, aggregate_function_name) - aggregate_column_value = aggregate_function(raw_column_values).as_py() - result[aggregate.schema_column.identity] = aggregate_column_value - - return pyarrow.Table.from_pylist([result]) - - -def extract_evaluations(aggregates): - # extract any inner evaluations, like the IIF in SUM(IIF(x, 1, 0)) - - all_evaluatable_nodes = get_all_nodes_of_type( - aggregates, - select_nodes=( - NodeType.FUNCTION, - NodeType.BINARY_OPERATOR, - NodeType.COMPARISON_OPERATOR, - NodeType.LITERAL, - ), - ) - - evaluatable_nodes = [] - for node in all_evaluatable_nodes: - aggregators = get_all_nodes_of_type(node, select_nodes=(NodeType.AGGREGATOR,)) - if len(aggregators) == 0: - evaluatable_nodes.append(node) - - return evaluatable_nodes - - -@dataclass -class AggregateDataObject(BasePlanDataObject): - aggregates: list = None - all_identifiers: list = None - evaluatable_nodes: list = None - column_map: list = None - aggregate_functions: list = None - - -class AggregateNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - - self.aggregates = parameters.get("aggregates", []) - - # get all the columns anywhere in the aggregates - all_identifiers = [ - node.schema_column.identity - for node in get_all_nodes_of_type(self.aggregates, select_nodes=(NodeType.IDENTIFIER,)) - ] - self.all_identifiers = list(dict.fromkeys(all_identifiers)) - - # Get any functions we need to execute before aggregating - self.evaluatable_nodes = extract_evaluations(self.aggregates) - - self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) - - self.do = AggregateDataObject() - self.buffer = [] - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return str(self.aggregates) - - @property - def name(self): # pragma: no cover - return "Aggregation" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - if _is_count_star(self.aggregates): - return _count_star( - morsel_promise=self.buffer, - column_name=self.aggregates[0].schema_column.identity, - ) - - # merge all the morsels together into one table, selecting only the columns - # we're pretty sure we're going to use - this will fail for datasets - # larger than memory until we implement some form of partitioning - table = pyarrow.concat_tables( - project(self.buffer, self.all_identifiers), promote_options="none" - ) - - # Allow grouping by functions by evaluating them first - if self.evaluatable_nodes: - table = 
evaluate_and_append(self.evaluatable_nodes, table) - - # Add a "*" column, this is an int because when a bool it miscounts - if "*" not in table.column_names: - table = table.append_column( - "*", [numpy.full(shape=table.num_rows, fill_value=1, dtype=numpy.int8)] - ) - - # we're not a group_by - we're aggregating without grouping - aggregates = _non_group_aggregates(self.aggregates, table) - del table - - # name the aggregate fields and add them to the Columns data - aggregates = aggregates.select(list(self.column_map.keys())) - - return [aggregates, EOS] - - self.buffer.append(morsel) diff --git a/opteryx/operatorsv2/async_read_node.py b/opteryx/operatorsv2/async_read_node.py deleted file mode 100644 index 792936a0d..000000000 --- a/opteryx/operatorsv2/async_read_node.py +++ /dev/null @@ -1,213 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Async Scanner Node - -This is the SQL Query Execution Plan Node responsible for the reading of data. - -It wraps different internal readers (e.g. GCP Blob reader, SQL Reader), -normalizes the data into the format for internal processing. -""" - -import asyncio -import queue -import threading -import time -from dataclasses import dataclass -from typing import Generator - -import aiohttp -import pyarrow -import pyarrow.parquet -from orso.schema import convert_orso_schema_to_arrow_schema - -from opteryx import EOS -from opteryx import config -from opteryx.exceptions import DataError -from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject -from opteryx.shared import AsyncMemoryPool -from opteryx.shared import MemoryPool -from opteryx.utils.file_decoders import get_decoder - -from .read_node import ReaderNode -from .read_node import normalize_morsel -from .read_node import struct_to_jsonb - -CONCURRENT_READS = config.CONCURRENT_READS -MAX_READ_BUFFER_CAPACITY = config.MAX_READ_BUFFER_CAPACITY - - -async def fetch_data(blob_names, pool, reader, reply_queue, statistics): - semaphore = asyncio.Semaphore(CONCURRENT_READS) - session = aiohttp.ClientSession() - - async def fetch_and_process(blob_name): - async with semaphore: - start_per_blob = time.monotonic_ns() - reference = await reader( - blob_name=blob_name, pool=pool, session=session, statistics=statistics - ) - reply_queue.put((blob_name, reference)) # Put data onto the queue - statistics.time_reading_blobs += time.monotonic_ns() - start_per_blob - - tasks = (fetch_and_process(blob) for blob in blob_names) - - await asyncio.gather(*tasks) - reply_queue.put(None) - await session.close() - - -@dataclass -class AsyncReaderDataObject(BasePlanDataObject): - pass - - -class AsyncReaderNode(ReaderNode): - def __init__(self, properties: QueryProperties, **parameters): - ReaderNode.__init__(self, properties=properties, **parameters) - self.pool = MemoryPool(MAX_READ_BUFFER_CAPACITY, f"ReadBuffer <{self.parameters['alias']}>") - - self.do = AsyncReaderDataObject() - self.predicates = parameters.get("predicates") - - @classmethod - def from_dict(cls, 
dic: dict) -> "AsyncReaderNode": # pragma: no cover - raise NotImplementedError() - - def execute(self, morsel) -> Generator: - from opteryx import system_statistics - - """Perform this step, time how long is spent doing work""" - orso_schema = self.parameters["schema"] - reader = self.parameters["connector"] - - orso_schema_cols = [] - for col in orso_schema.columns: - if col.identity in [c.schema_column.identity for c in self.columns]: - orso_schema_cols.append(col) - orso_schema.columns = orso_schema_cols - - self.statistics.columns_read = len(orso_schema.columns) - - blob_names = reader.partition_scheme.get_blobs_in_partition( - start_date=reader.start_date, - end_date=reader.end_date, - blob_list_getter=reader.get_list_of_blob_names, - prefix=reader.dataset, - predicates=self.predicates, - ) - - if len(blob_names) == 0: - # if we don't have any matching blobs, create an empty dataset - # TODO: rewrite - from orso import DataFrame - - as_arrow = DataFrame(rows=[], schema=orso_schema).arrow() - renames = [orso_schema.column(col).identity for col in as_arrow.column_names] - as_arrow = as_arrow.rename_columns(renames) - yield as_arrow - - data_queue: queue.Queue = queue.Queue() - - loop = asyncio.new_event_loop() - read_thread = threading.Thread( - target=lambda: loop.run_until_complete( - fetch_data( - blob_names, - AsyncMemoryPool(self.pool), - reader.async_read_blob, - data_queue, - self.statistics, - ) - ), - daemon=True, - ) - read_thread.start() - - morsel = None - arrow_schema = None - - while True: - try: - # Attempt to get an item with a timeout. - item = data_queue.get(timeout=0.1) - except queue.Empty: - # Increment stall count if the queue is empty. - self.statistics.stalls_reading_from_read_buffer += 1 - system_statistics.io_wait_seconds += 0.1 - continue # Skip the rest of the loop and try to get an item again. - - if item is None: - # Break out of the loop if the item is None, indicating a termination condition. - break - - blob_name, reference = item - - decoder = get_decoder(blob_name) - - try: - # the sync readers include the decode time as part of the read time - try: - # This pool is being used by async processes in another thread, using - # zero copy versions occasionally results in data getting corrupted - # due to a read-after-free type error - start = time.monotonic_ns() - blob_bytes = self.pool.read_and_release(reference, zero_copy=False) - decoded = decoder( - blob_bytes, projection=self.columns, selection=self.predicates - ) - except Exception as err: - from pyarrow import ArrowInvalid - - if isinstance(err, ArrowInvalid) and "No match for" in str(err): - raise DataError( - f"Unable to read blob {blob_name} - this error is likely caused by a blob having a significantly different schema to previously handled blobs, or to the data catalog." 
- ) - raise DataError(f"Unable to read blob {blob_name} - error {err}") from err - self.statistics.time_reading_blobs += time.monotonic_ns() - start - num_rows, _, morsel = decoded - self.statistics.rows_seen += num_rows - - morsel = struct_to_jsonb(morsel) - morsel = normalize_morsel(orso_schema, morsel) - - if arrow_schema: - morsel = morsel.cast(arrow_schema) - else: - arrow_schema = morsel.schema - - self.statistics.blobs_read += 1 - self.records_out += morsel.num_rows - self.statistics.rows_read += morsel.num_rows - self.bytes_out += morsel.nbytes - - yield morsel - except Exception as err: - self.statistics.add_message(f"failed to read {blob_name}") - self.statistics.failed_reads += 1 - import warnings - - warnings.warn(f"failed to read {blob_name} - {err}") - - # Ensure the thread is closed - read_thread.join() - - if morsel is None: - self.statistics.empty_datasets += 1 - arrow_schema = convert_orso_schema_to_arrow_schema(orso_schema, use_identities=True) - yield pyarrow.Table.from_arrays( - [pyarrow.array([]) for _ in arrow_schema], schema=arrow_schema - ) - - yield EOS diff --git a/opteryx/operatorsv2/base_plan_node.py b/opteryx/operatorsv2/base_plan_node.py deleted file mode 100644 index 7e025efaf..000000000 --- a/opteryx/operatorsv2/base_plan_node.py +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import time -from dataclasses import dataclass -from typing import Optional - -import pyarrow -from orso.tools import random_string - -from opteryx import EOS - - -@dataclass -class BasePlanDataObject: - operation: Optional[str] = None - query_id: str = None - identity: str = None - - def __post_init__(self): - # Perform actions after initialization - if self.identity is None: - self.identity = random_string() - if self.operation is None: - self.operation = self.__class__.__name__.replace("DataObject", "Node") - - -class BasePlanNode: - def __init__(self, *, properties, **parameters): - """ - This is the base class for nodes in the execution plan. - - The initializer accepts a QueryStatistics node which is populated by different nodes - differently to record what happened during the query execution. 
- """ - from opteryx.models import QueryProperties - from opteryx.models import QueryStatistics - - self.properties: QueryProperties = properties - self.statistics: QueryStatistics = QueryStatistics(properties.qid) - self.parameters = parameters - self.execution_time = 0 - self.identity = random_string() - self.do: Optional[BasePlanDataObject] = None - self.calls = 0 - self.records_in = 0 - self.bytes_in = 0 - self.records_out = 0 - self.bytes_out = 0 - - def to_json(self) -> bytes: # pragma: no cover - import orjson - - from opteryx.utils import dataclass_to_dict - - return orjson.dumps(dataclass_to_dict(self.do)) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - def config(self) -> str: - return "" - - @property - def name(self): # pragma: no cover - """ - Friendly Name of this node - """ - return "no name" - - @property - def node_type(self) -> str: - return self.name - - def __str__(self) -> str: - return f"{self.name} {self.sensors()}" - - def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: no cover - pass - - def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: - if morsel is not None and morsel != EOS: - self.records_in += morsel.num_rows - self.bytes_in += morsel.nbytes - self.calls += 1 - - start_time = time.monotonic_ns() - result = self.execute(morsel) - - self.execution_time += time.monotonic_ns() - start_time - if result is not None and result != EOS and hasattr(result, "num_rows"): - self.records_out += result.num_rows - self.bytes_out += result.nbytes - return result - - def sensors(self): - return { - "calls": self.calls, - "execution_time": self.execution_time, - "records_in": self.records_in, - "records_out": self.records_out, - "bytes_in": self.bytes_in, - "bytes_out": self.bytes_out, - } - - -class JoinNode(BasePlanNode): - pass diff --git a/opteryx/operatorsv2/bench/#information_schema_node.py b/opteryx/operatorsv2/bench/#information_schema_node.py deleted file mode 100644 index 97cc5f847..000000000 --- a/opteryx/operatorsv2/bench/#information_schema_node.py +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Collection Reader Node - -This is a SQL Query Execution Plan Node. - -This Node primarily is used for reading NoSQL sources like MongoDB and Firestore. 
-""" - -import datetime -from typing import Iterable - -import pyarrow - -from opteryx.exceptions import DatasetNotFoundError -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType - - -def information_schema_routines(): - schema = { - "specific_name": None, - "routine_catalog": None, - "routine_schema": None, - "routine_name": None, - "routine_type": None, - "data_type": None, - "character_maximum_length": None, - "character_octet_length": None, - "numeric_precision": None, - "numeric_scale": None, - "datetime_precision": None, - "character_set_name": None, - "collation_name": None, - "dtd_identifier": None, - "routine_body": None, - "routine_definition": None, - "external_name": None, - "external_language": None, - "parameter_style": None, - "is_deterministic": None, - "sql_data_access": None, - "sql_path": None, - "security_type": None, - "created": None, - "last_altered": None, - "sql_mode": None, - "routine_comment": None, - "definer": None, - "character_set_client": None, - "collation_connection": None, - "database_collation": None, - } - - buffer = [schema] - - table = pyarrow.Table.from_pylist(buffer) - table = Columns.create_table_metadata( - table=table, - expected_rows=len(buffer), - name="information_schema_routines", - table_aliases=[], - disposition="calculated", - path="information_schema_routines", - ) - - return table - - -def information_schema_views(): - schema = { - "table_catalog": None, - "table_schema": None, - "table_name": None, - "view_definition": None, - "check_option": "NONE", - "is_updatable": "NO", - "definer": None, - "security_type": None, - "character_set_client": None, - "collation_connection": None, - } - - buffer = [schema] - - table = pyarrow.Table.from_pylist(buffer) - table = Columns.create_table_metadata( - table=table, - expected_rows=len(buffer), - name="information_schema_views", - table_aliases=[], - disposition="calculated", - path="information_schema_views", - ) - - return table - - -def information_schema_tables(): - schema = { - "table_catalog": "opteryx", - "table_schema": None, - "table_name": "$planets", - "table_type": "SYSTEM VIEW", - "engine": "Interal", - "version": "0", - "row_format": "fIXED", - "table_rows": 0, - "avg_row_length": 0, - "data_length": 0, - "max_data_length": 0, - "index_length": 0, - "data_free": 0, - "auto_increment": 0, - "create_time": datetime.datetime.utcnow(), - "update_time": datetime.datetime.utcnow(), - "check_time": datetime.datetime.utcnow(), - "table_collation": None, - "checksum": 0, - "create_options": None, - "table_comment": None, - } - - buffer = [schema] - - table = pyarrow.Table.from_pylist(buffer) - table = Columns.create_table_metadata( - table=table, - expected_rows=len(buffer), - name="information_schema_tables", - table_aliases=[], - disposition="calculated", - path="information_schema_tables", - ) - - return table - - -class InformationSchemaNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - - self._alias = config.get("alias") - self._dataset = config["dataset"].lower() - - # pushed down selection/filter - self._selection = config.get("selection") - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - if self._alias: - return f"{self._dataset} => {self._alias}" - return 
f"{self._dataset}" - - @property - def name(self): # pragma: no cover - return "Information Schema Reader" - - def execute(self) -> Iterable: - if self._dataset == "information_schema.tables": - yield information_schema_tables() - elif self._dataset == "information_schema.views": - yield information_schema_views() - elif self._dataset == "information_schema.routines": - yield information_schema_routines() - else: - raise DatasetNotFoundError(dataset=self._dataset) - return diff --git a/opteryx/operatorsv2/bench/#show_databases_node.py b/opteryx/operatorsv2/bench/#show_databases_node.py deleted file mode 100644 index 6dc7e3500..000000000 --- a/opteryx/operatorsv2/bench/#show_databases_node.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Show Stores Node - -This is a SQL Query Execution Plan Node. -""" - -from typing import Iterable - -import pyarrow - -from opteryx.models import QueryProperties -from opteryx.operators import BasePlanNode -from opteryx.operators import OperatorType - - -class ShowDatabasesNode(BasePlanNode): - operator_type = OperatorType.PRODUCER - - def __init__(self, properties: QueryProperties, **config): - super().__init__(properties=properties) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Show Databases" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self) -> Iterable: - from opteryx.connectors import _storage_prefixes - - buffer = [ - { - "Database": "" if s == "_" else s, # type: ignore - "Connector": str(c["connector"].__name__), # type: ignore - "Remove_Prefix": c["remove_prefix"], # type: ignore - "Type": str(c["connector"].mro()[1].__name__[4:-14]), # type: ignore - } - for s, c in _storage_prefixes.items() - if isinstance(c, dict) - ] - buffer.append( - { - "Database": "opteryx", # type: ignore - "Connector": "Internal", # type: ignore - "Remove_Prefix": True, # type: ignore - "Type": "Internal", # type: ignore - } - ) - - table = pyarrow.Table.from_pylist(buffer) - table = Columns.create_table_metadata( - table=table, - expected_rows=len(buffer), - name="show_stores", - table_aliases=[], - disposition="calculated", - path="show_stores", - ) - - yield table - return diff --git a/opteryx/operatorsv2/cross_join_node.py b/opteryx/operatorsv2/cross_join_node.py deleted file mode 100644 index 490762ba2..000000000 --- a/opteryx/operatorsv2/cross_join_node.py +++ /dev/null @@ -1,370 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Cross Join Node - -This is a SQL Query Execution Plan Node. - -This performs a CROSS JOIN - CROSS JOIN is not natively supported by PyArrow so this is written -here rather than calling the join() functions -""" - -from dataclasses import dataclass -from typing import Generator -from typing import Set -from typing import Tuple - -import numpy -import pyarrow -from orso.schema import FlatColumn - -from opteryx import EOS -from opteryx.compiled.structures import HashSet -from opteryx.managers.expression import NodeType -from opteryx.models import LogicalColumn -from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject - -from . import JoinNode - -INTERNAL_BATCH_SIZE: int = 7500 # config -MAX_JOIN_SIZE: int = 1000 # config -MORSEL_SIZE_BYTES: int = 16 * 1024 * 1024 -CROSS_JOIN_UNNEST_BATCH_SIZE = 10000 - - -def _cross_join_unnest_column( - *, - morsel: pyarrow.Table = None, - source: LogicalColumn = None, - target_column: FlatColumn = None, - conditions: Set = None, - distinct: bool = False, - single_column: bool = False, - hash_set=None, -) -> pyarrow.Table: - """ - Perform a cross join on an unnested column of pyarrow tables. - - Args: - morsels: An iterable of `pyarrow.Table` objects to be cross joined. - source: The source node indicating the column. - target_column: The column to be unnested. - - Returns: - A generator that yields the resulting `pyarrow.Table` objects. - """ - from opteryx.compiled.cross_join import build_filtered_rows_indices_and_column - from opteryx.compiled.cross_join import build_rows_indices_and_column - from opteryx.compiled.structures import list_distinct - - # Check if the source node type is an identifier, raise error otherwise - if source.node_type != NodeType.IDENTIFIER: - raise NotImplementedError("Can only CROSS JOIN UNNEST on a column") - - batch_size: int = INTERNAL_BATCH_SIZE - at_least_once = False - single_column_collector = [] - - # Break the morsel into batches to avoid memory issues - for left_block in morsel.to_batches(max_chunksize=batch_size): - new_block = None - # Fetch the data of the column to be unnested - column_data = left_block[source.schema_column.identity] - - # Filter out null values - valid_offsets = column_data.is_valid() - column_data = column_data.drop_null() - if len(column_data) == 0: - continue - left_block = left_block.filter(valid_offsets) - - # Build indices and new column data - if conditions is None: - indices, new_column_data = build_rows_indices_and_column(column_data.to_numpy(False)) - else: - indices, new_column_data = build_filtered_rows_indices_and_column( - column_data.to_numpy(False), conditions - ) - - if single_column and distinct and indices.size > 0: - # if the unnest target is the only field in the SELECT and we're DISTINCTING - indices = numpy.array(indices, dtype=numpy.int32) - new_column_data, indices, hash_set = list_distinct(new_column_data, indices, hash_set) - - if len(indices) > 0: - if single_column: - single_column_collector.extend(new_column_data) - if len(single_column_collector) > INTERNAL_BATCH_SIZE: - schema = pyarrow.schema( - [ - pyarrow.field( - name=target_column.identity, type=target_column.arrow_field.type - ) - ] - ) - arrow_array = pyarrow.array(single_column_collector) - if arrow_array.type != target_column.arrow_field.type: - arrow_array = arrow_array.cast(target_column.arrow_field.type) - new_block = 
pyarrow.Table.from_arrays([arrow_array], schema=schema) - single_column_collector.clear() - del arrow_array - yield new_block - at_least_once = True - else: - # Rebuild the block with the new column data if we have any rows to build for - - total_rows = len(indices) # Both arrays have the same length - block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) - block_size = int(block_size // 1000) * 1000 - - for start_block in range(0, total_rows, block_size): - # Compute the end index for the current chunk - end_block = min(start_block + block_size, total_rows) - - # Slice the current chunk of indices and new_column_data - indices_chunk = indices[start_block:end_block] - new_column_data_chunk = new_column_data[start_block:end_block] - - # Create a new block using the chunk of indices - indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) - new_block = left_block.take(indices_chunk) - new_block = pyarrow.Table.from_batches([new_block], schema=morsel.schema) - - # Append the corresponding chunk of new_column_data to the block - new_block = new_block.append_column( - target_column.identity, pyarrow.array(new_column_data_chunk) - ) - - yield new_block - at_least_once = True - - if single_column_collector: - schema = pyarrow.schema( - [pyarrow.field(name=target_column.identity, type=target_column.arrow_field.type)] - ) - arrow_array = pyarrow.array(single_column_collector) - if arrow_array.type != target_column.arrow_field.type: - arrow_array = arrow_array.cast(target_column.arrow_field.type) - new_block = pyarrow.Table.from_arrays([arrow_array], schema=schema) - yield new_block - at_least_once = True - - if not at_least_once: - # Create an empty table with the new schema - schema = morsel.schema - new_column = pyarrow.field(target_column.identity, pyarrow.string()) - new_schema = pyarrow.schema(list(schema) + [new_column]) - new_block = pyarrow.Table.from_batches([], schema=new_schema) - yield new_block - - -def _cross_join_unnest_literal( - morsel: pyarrow.Table, source: Tuple, target_column: FlatColumn -) -> Generator[pyarrow.Table, None, None]: - joined_list_size = len(source) - - # Break the morsel into batches to avoid memory issues - for left_block in morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): - left_block = pyarrow.Table.from_batches([left_block], schema=morsel.schema) - block_size = left_block.num_rows - - # Repeat each row in the table n times - repeated_indices = numpy.repeat(numpy.arange(block_size), joined_list_size) - appended_table = left_block.take(repeated_indices) - - # Tile the array to match the new number of rows - tiled_array = numpy.tile(source, block_size) - - # Convert tiled_array to PyArrow array and append it to the table - array_column = pyarrow.array(tiled_array) - appended_table = appended_table.append_column(target_column.identity, array_column) - - yield appended_table - - -def _cartesian_product(*arrays): - """ - Cartesian product of arrays creates every combination of the elements in the arrays - """ - array_count = len(arrays) - arr = numpy.empty([len(array) for array in arrays] + [array_count], dtype=numpy.int64) - for i, array in enumerate(numpy.ix_(*arrays)): - arr[..., i] = array - return numpy.hsplit(arr.reshape(-1, array_count), array_count) - - -def _cross_join(left_morsel, right): - """ - A cross join is the cartesian product of two tables - this usually isn't very - useful, but it does allow you to do theta joins (non-equi joins) - """ - - def _chunker(seq_1, seq_2, size): - """ - Chunk two equal-length
iterables into chunks of at most `size` items - - This returns a generator. - """ - return ( - (seq_1[pos : pos + size], seq_2[pos : pos + size]) for pos in range(0, len(seq_1), size) - ) - - from opteryx.utils.arrow import align_tables - - at_least_once = False - left_schema = left_morsel.schema - right_schema = right.schema - - # Iterate through left table in chunks of size INTERNAL_BATCH_SIZE - for left_block in left_morsel.to_batches(max_chunksize=INTERNAL_BATCH_SIZE): - # Convert the chunk to a table to retain column names - left_block = pyarrow.Table.from_batches([left_block], schema=left_morsel.schema) - - # Create an array of row indices for each table - left_array = numpy.arange(left_block.num_rows, dtype=numpy.int64) - right_array = numpy.arange(right.num_rows, dtype=numpy.int64) - - # Calculate the cartesian product of the two arrays of row indices - left_align, right_align = _cartesian_product(left_array, right_array) - - # Further break down the result into manageable chunks of size MAX_JOIN_SIZE - for left_chunk, right_chunk in _chunker(left_align, right_align, MAX_JOIN_SIZE): - # Align the tables using the specified chunks of row indices - table = align_tables(left_block, right, left_chunk.flatten(), right_chunk.flatten()) - - # Yield the resulting table to the caller - yield table - at_least_once = True - - if not at_least_once: - fields = [pyarrow.field(name=f.name, type=f.type) for f in right_schema] + [ - pyarrow.field(name=f.name, type=f.type) for f in left_schema - ] - combined_schemas = pyarrow.schema(fields) - yield pyarrow.Table.from_arrays( - [pyarrow.array([]) for _ in combined_schemas], schema=combined_schemas - ) - - -@dataclass -class CrossJoinDataObject(BasePlanDataObject): - source: str = None - _unnest_column: str = None - _unnest_target: str = None - _filters: str = None - _distinct: bool = False - - -class CrossJoinNode(JoinNode): - """ - Implements a SQL CROSS JOIN - """ - - def __init__(self, properties: QueryProperties, **parameters): - JoinNode.__init__(self, properties=properties, **parameters) - - self.source = parameters.get("column") - - self._left_relation = parameters.get("left_relation_names") - self._right_relation = parameters.get("right_relation_names") - - # do we have unnest details? 
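- # For example, CROSS JOIN UNNEST((1, 2, 3)) AS n arrives with a literal - # tuple as the unnest column and each left row is repeated once per element; - # a bare literal such as UNNEST(1) is wrapped into a one-element tuple below - # so the same code path handles it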
- self._unnest_column = parameters.get("unnest_column") - self._unnest_target = parameters.get("unnest_target") - self._filters = parameters.get("filters") - self._distinct = parameters.get("distinct", False) - - # handle variation in how the unnested column is represented - if self._unnest_column: - if self._unnest_column.node_type == NodeType.NESTED: - self._unnest_column = self._unnest_column.centre - # if we have a literal that's not a tuple, wrap it - if self._unnest_column.node_type == NodeType.LITERAL and not isinstance( - self._unnest_column.value, tuple - ): - self._unnest_column.value = tuple([self._unnest_column.value]) - - self._single_column = parameters.get("pre_update_columns", set()) == { - self._unnest_target.identity, - } - - self.stream = "left" - self.left_buffer = [] - self.right_buffer = [] - self.left_relation = None - self.right_relation = None - self.hash_set = HashSet() - - self.continue_executing = True - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Cross Join" - - @property - def config(self): # pragma: no cover - filters = "" - if self._filters: - filters = f"({self._unnest_target.name} IN ({', '.join(self._filters)}))" - return f"CROSS JOIN {filters}" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if not self.continue_executing: - return None - - if self._unnest_column is not None: - if morsel == EOS: - self.continue_executing = False - return EOS - if isinstance(self._unnest_column.value, tuple): - return list( - _cross_join_unnest_literal( - morsel=morsel, - source=self._unnest_column.value, - target_column=self._unnest_target, - ) - ) - return list( - _cross_join_unnest_column( - morsel=morsel, - source=self._unnest_column, - target_column=self._unnest_target, - conditions=self._filters, - hash_set=self.hash_set, - distinct=self._distinct, - single_column=self._single_column, - ) - ) - - if self.stream == "left": - if morsel == EOS: - self.stream = "right" - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - else: - self.left_buffer.append(morsel) - return None - - if self.stream == "right": - if morsel == EOS: - right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none") # type:ignore - self.right_buffer = None - return list(_cross_join(self.left_relation, right_table)) - else: - self.right_buffer.append(morsel) - return None diff --git a/opteryx/operatorsv2/distinct_node.py b/opteryx/operatorsv2/distinct_node.py deleted file mode 100644 index 60cf76c2d..000000000 --- a/opteryx/operatorsv2/distinct_node.py +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Distinct Node - -This is a SQL Query Execution Plan Node. - -This Node eliminates duplicate records. -""" - -from pyarrow import Table - -from opteryx import EOS -from opteryx.models import QueryProperties - -from . 
import BasePlanNode - - -class DistinctNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - from opteryx.compiled.structures import HashSet - - BasePlanNode.__init__(self, properties=properties, **parameters) - self._distinct_on = parameters.get("on") - if self._distinct_on: - self._distinct_on = [col.schema_column.identity for col in self._distinct_on] - self.hash_set = HashSet() - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return "" - - @property - def name(self): # pragma: no cover - return "Distinction" - - def execute(self, morsel: Table) -> Table: - from opteryx.compiled.structures import distinct - - # We create a HashSet outside the distinct call, this allows us to pass - # the hash to each run of the distinct which means we don't need to concat - # all of the tables together to return a result. - # - # Being able to run morsel-by-morsel means if we have a LIMIT clause, we can - # limit processing - - if morsel == EOS: - return EOS - - unique_indexes, self.hash_set = distinct( - morsel, columns=self._distinct_on, seen_hashes=self.hash_set - ) - - if len(unique_indexes) > 0: - distinct_table = morsel.take(unique_indexes) - return distinct_table - else: - distinct_table = morsel.slice(0, 0) - return distinct_table diff --git a/opteryx/operatorsv2/exit_node.py b/opteryx/operatorsv2/exit_node.py deleted file mode 100644 index a428e955b..000000000 --- a/opteryx/operatorsv2/exit_node.py +++ /dev/null @@ -1,108 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Exit Node - -This is a SQL Query Execution Plan Node. - -This does the final preparation before returning results to users. - -This does two things that the projection node doesn't do: - - renames columns from the internal names - - removes all columns not being returned to the user - -This node doesn't do any calculations, it is a pure Projection. -""" - -from dataclasses import dataclass -from dataclasses import field -from typing import List - -from pyarrow import Table - -from opteryx import EOS -from opteryx.exceptions import AmbiguousIdentifierError -from opteryx.exceptions import InvalidInternalStateError -from opteryx.models import LogicalColumn -from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject - -from . 
import BasePlanNode - - -@dataclass -class ExitDataObject(BasePlanDataObject): - columns: List[LogicalColumn] = field(default_factory=list) - - -class ExitNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.columns = parameters.get("columns", []) - - self.do = ExitDataObject(columns=self.columns) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return None - - @property - def name(self): # pragma: no cover - return "Exit" - - def execute(self, morsel: Table) -> Table: - # Exit doesn't return EOS - if morsel == EOS: - return None - - final_columns = [] - final_names = [] - for column in self.columns: - final_columns.append(column.schema_column.identity) - final_names.append(column.current_name) - - if len(final_columns) != len(set(final_columns)): # pragma: no cover - from collections import Counter - - duplicates = [column for column, count in Counter(final_columns).items() if count > 1] - matches = {a for a, b in zip(final_names, final_columns) if b in duplicates} - raise AmbiguousIdentifierError( - message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`" - ) - - if len(set(final_names)) != len(final_names): # we have duplicate names - final_names = [] - for column in self.columns: - # if column.schema_column.origin: - # final_names.append(f"{column.schema_column.origin[0]}.{column.current_name}") - # else: - final_names.append(column.qualified_name) - - if not set(final_columns).issubset(morsel.column_names): # pragma: no cover - mapping = {name: int_name for name, int_name in zip(final_columns, final_names)} - missing_references = { - mapping.get(ref): ref for ref in final_columns if ref not in morsel.column_names - } - - raise InvalidInternalStateError( - f"The following fields were not in the resultset - {', '.join(missing_references.keys())}" - ) - - morsel = morsel.select(final_columns) - morsel = morsel.rename_columns(final_names) - - return morsel diff --git a/opteryx/operatorsv2/explain_node.py b/opteryx/operatorsv2/explain_node.py deleted file mode 100644 index 2b16067a2..000000000 --- a/opteryx/operatorsv2/explain_node.py +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Explain Node - -This is a SQL Query Execution Plan Node. - -This writes out a query plan -""" - -from pyarrow import Table - -from opteryx.models import QueryProperties - -from . 
import BasePlanNode - - -class ExplainNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self._query_plan = parameters.get("query_plan") - self.analyze = parameters.get("analyze", False) - - @property - def name(self): # pragma: no cover - return "Explain" - - @property # pragma: no cover - def config(self): - return "" - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - def execute(self, morsel: Table) -> Table: - if self._query_plan: - return self._query_plan.explain(self.analyze) diff --git a/opteryx/operatorsv2/filter_node.py b/opteryx/operatorsv2/filter_node.py deleted file mode 100644 index c4cff2e78..000000000 --- a/opteryx/operatorsv2/filter_node.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Selection Node - -This is a SQL Query Execution Plan Node. - -This node is responsible for applying filters to datasets. -""" - -import numpy -import pyarrow - -from opteryx import EOS -from opteryx.exceptions import SqlError -from opteryx.managers.expression import NodeType -from opteryx.managers.expression import evaluate -from opteryx.managers.expression import evaluate_and_append -from opteryx.managers.expression import format_expression -from opteryx.managers.expression import get_all_nodes_of_type -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class FilterNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.filter = parameters.get("filter") - - self.function_evaluations = get_all_nodes_of_type( - self.filter, - select_nodes=(NodeType.FUNCTION,), - ) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return format_expression(self.filter) - - @property - def name(self): # pragma: no cover - return "Filter" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - return EOS - - if morsel.num_rows == 0: - return morsel - - if self.function_evaluations: - morsel = evaluate_and_append(self.function_evaluations, morsel) - mask = evaluate(self.filter, morsel) - - if not isinstance(mask, pyarrow.lib.BooleanArray): - try: - mask = pyarrow.array(mask, type=pyarrow.bool_()) - except Exception as err: # nosec - raise SqlError( - f"Unable to filter on expression '{format_expression(self.filter)} {err}'." 
- ) - mask = numpy.nonzero(mask)[0] - - # if there are no matching rows, just drop the morsel - if mask.size > 0 and not numpy.all(mask is None): - return morsel.take(pyarrow.array(mask)) - return morsel.slice(0, 0) diff --git a/opteryx/operatorsv2/function_dataset_node.py b/opteryx/operatorsv2/function_dataset_node.py deleted file mode 100644 index 9ac8cf80a..000000000 --- a/opteryx/operatorsv2/function_dataset_node.py +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Function Dataset Node - -This is a SQL Query Execution Plan Node. - -This Node creates datasets based on function calls like VALUES and UNNEST. -""" - -import time -from typing import Generator - -import pyarrow - -from opteryx import EOS -from opteryx.exceptions import SqlError -from opteryx.managers.expression import NodeType -from opteryx.models import QueryProperties -from opteryx.utils import series - -from .read_node import ReaderNode - - -def _generate_series(**kwargs): - value_array = series.generate_series(*kwargs["args"]) - column_name = kwargs["columns"][0].schema_column.identity - return pyarrow.Table.from_arrays([value_array], [column_name]) - - -def _unnest(**kwargs): - """unnest converts a list into rows""" - if kwargs["args"][0].node_type == NodeType.NESTED: - list_items = [kwargs["args"][0].centre.value] - else: - list_items = kwargs["args"][0].value - column_name = kwargs["columns"][0].schema_column.identity - - return pyarrow.Table.from_arrays([list_items], [column_name]) - - -def _values(**parameters): - columns = [col.schema_column.identity for col in parameters["columns"]] - values_array = parameters["values"] - return [{columns[i]: value.value for i, value in enumerate(values)} for values in values_array] - - -def _fake_data(**kwargs): - from orso.faker import generate_fake_data - - rows = kwargs["rows"] - schema = kwargs["schema"] - for column in schema.columns: - column.name = column.identity - return generate_fake_data(schema, rows) - - -def _http(**kwargs): - aliases = kwargs.get("schema") - data = kwargs.get("data") - - renames = [aliases.column(column).identity for column in data.column_names] - data = data.rename_columns(renames) - - return data - - -DATASET_FUNCTIONS = { - "FAKE": _fake_data, - "GENERATE_SERIES": _generate_series, - "UNNEST": _unnest, - "VALUES": _values, - "HTTP": _http, -} - - -class FunctionDatasetNode(ReaderNode): - def __init__(self, properties: QueryProperties, **parameters): - """ - The Function Dataset Node is responsible for constructing the relevant dataset - and returning a Table/Relation.
- """ - ReaderNode.__init__(self, properties=properties, **parameters) - self.alias = parameters.get("alias") - self.function = parameters["function"] - self.parameters = parameters - self.columns = parameters.get("columns", []) - self.args = parameters.get("args", []) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - if self.function == "FAKE": - return f"FAKE ({', '.join(format_expression(arg) for arg in self.args)}{' AS ' + self.alias if self.alias else ''})" - if self.function == "GENERATE_SERIES": - return f"GENERATE SERIES ({', '.join(format_expression(arg) for arg in self.args)}){' AS ' + self.alias if self.alias else ''}" - if self.function == "VALUES": - return f"VALUES (({', '.join(self.columns)}) x {len(self.values)} AS {self.alias})" - if self.function == "UNNEST": - return f"UNNEST ({', '.join(format_expression(arg) for arg in self.args)}{' AS ' + self.parameters.get('unnest_target', '')})" - if self.function == "HTTP": - return f"HTTP ({self.url}) AS {self.alias}" - - @property - def name(self): # pragma: no cover - return "Dataset Constructor" - - @property - def can_push_selection(self): - return False - - def execute(self, morsel) -> Generator: - try: - start_time = time.time_ns() - data = DATASET_FUNCTIONS[self.function](**self.parameters) # type:ignore - self.statistics.time_evaluate_dataset += time.time_ns() - start_time - except TypeError as err: # pragma: no cover - if str(err).startswith("_unnest() takes 2"): - raise SqlError( - "UNNEST expects a literal list in paranthesis, or a field name as a parameter." - ) - raise err - - if isinstance(data, list): - table = pyarrow.Table.from_pylist(data) - elif hasattr(data, "arrow"): - table = data.arrow() - else: - table = data - - self.records_out += table.num_rows - self.bytes_out += table.nbytes - self.statistics.columns_read += len(table.column_names) - - yield table - yield EOS diff --git a/opteryx/operatorsv2/heap_sort_node.py b/opteryx/operatorsv2/heap_sort_node.py deleted file mode 100644 index 782e8ab44..000000000 --- a/opteryx/operatorsv2/heap_sort_node.py +++ /dev/null @@ -1,139 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Heap Sort Node - -This is a SQL Query Execution Plan Node. - -This node orders a dataset, note that Heap Sort in this instance isn't the heap sort -algorithm, it is an approach where a heap of n items (the limit) is maintained as the -data passes through the operator. Because we are working with chunks, we build small -batches which we order and then discard the excess items. - -This is faster, particularly when working with large datasets even though we're now -sorting smaller chunks over and over again. 
-""" - -from dataclasses import dataclass - -import numpy -import pyarrow -import pyarrow.compute -from pyarrow import concat_tables - -from opteryx import EOS -from opteryx.exceptions import ColumnNotFoundError -from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject - -from . import BasePlanNode - - -@dataclass -class HeapSortDataObject(BasePlanDataObject): - order_by: list = None - limit: int = -1 - - -class HeapSortNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.order_by = parameters.get("order_by", []) - self.limit: int = parameters.get("limit", -1) - - self.do = HeapSortDataObject(order_by=self.order_by, limit=self.limit) - self.mapped_order = [] - self.table = None - - for column, direction in self.order_by: - try: - self.mapped_order.append( - ( - column.schema_column.identity, - direction, - ) - ) - except ColumnNotFoundError as cnfe: - raise ColumnNotFoundError( - f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" - ) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return f"LIMIT = {self.limit} ORDER = " + ", ".join( - f"{i[0].value} {i[1][0:3].upper()}" for i in self.order_by - ) - - @property - def name(self): # pragma: no cover - return "Heap Sort" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - return [self.table, EOS] - - if self.table: - # Concatenate the accumulated table with the new morsel - self.table = concat_tables([self.table, morsel], promote_options="permissive") - else: - self.table = morsel - - # Determine if any columns are string-based - use_pyarrow_sort = any( - pyarrow.types.is_string(self.table.column(column_name).type) - or pyarrow.types.is_binary(self.table.column(column_name).type) - for column_name, _ in self.mapped_order - ) - - # strings are sorted faster user pyarrow, single columns faster using compute - if len(self.mapped_order) == 1 and use_pyarrow_sort: - column_name, sort_direction = self.mapped_order[0] - column = self.table.column(column_name) - if sort_direction == "ascending": - sort_indices = pyarrow.compute.sort_indices(column) - else: - sort_indices = pyarrow.compute.sort_indices(column)[::-1] - self.table = self.table.take(sort_indices[: self.limit]) - # strings are sorted faster using pyarrow - elif use_pyarrow_sort: - self.table = self.table.sort_by(self.mapped_order).slice(offset=0, length=self.limit) - # single column sort using numpy - elif len(self.mapped_order) == 1: - # Single-column sort using mergesort to take advantage of partially sorted data - column_name, sort_direction = self.mapped_order[0] - column = self.table.column(column_name).to_numpy() - if sort_direction == "ascending": - sort_indices = numpy.argsort(column) - else: - sort_indices = numpy.argsort(column)[::-1] # Reverse for descending - # Slice the sorted table - self.table = self.table.take(sort_indices[: self.limit]) - # multi column sort using numpy - else: - # Multi-column sort using lexsort - columns_for_sorting = [] - directions = [] - for column_name, sort_direction in self.mapped_order: - column = self.table.column(column_name).to_numpy() - columns_for_sorting.append(column) - directions.append(1 if sort_direction == "ascending" else -1) - - sort_indices = numpy.lexsort( - [col[::direction] for col, 
direction in zip(columns_for_sorting, directions)]
-            )
-            # Slice the sorted table
-            self.table = self.table.take(sort_indices[: self.limit])
diff --git a/opteryx/operatorsv2/inner_join_node.py b/opteryx/operatorsv2/inner_join_node.py
deleted file mode 100644
index 533a0060a..000000000
--- a/opteryx/operatorsv2/inner_join_node.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Inner Join Node
-
-This is a SQL Query Execution Plan Node.
-
-PyArrow has a good LEFT JOIN implementation, but it errors when the
-relations being joined contain STRUCT or ARRAY columns. This is true
-for all of the JOIN types; however, we've only written our own INNER
-and LEFT JOINs.
-
-It is comparable in performance to the PyArrow INNER JOIN; in benchmarks
-sometimes native is faster, sometimes PyArrow is faster. Generally
-PyArrow is more forgiving when the relations are the "wrong" way around
-(unoptimized order) but native is faster for well-ordered relations. As
-we intend to take steps to help ensure relations are well-ordered, this
-should work in our favour.
-
-This is a hash join, completely rewritten from the earlier
-pyarrow_ops implementation, which was a variation of a sort-merge join.
-"""
-
-import pyarrow
-from pyarrow import Table
-
-from opteryx import EOS
-from opteryx.compiled.structures.hash_table import hash_join_map
-from opteryx.models import QueryProperties
-from opteryx.utils.arrow import align_tables
-
-from . import JoinNode
-
-
-def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table):
-    """
-    Perform an INNER JOIN using a preprocessed hash table from the left relation.
-
-    Parameters:
-        left_relation: The preprocessed left pyarrow.Table.
-        right_relation: The right pyarrow.Table to join.
-        join_columns: A list of column names to join on.
-        hash_table: The preprocessed hash table from the left table.
-
-    Returns:
-        A pyarrow.Table of the aligned matching rows from the left and right relations.
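The build/probe pattern behind this function can be sketched with a plain dict standing in for the compiled hash table; the names here are illustrative, not the Opteryx API:

from collections import defaultdict

def hash_inner_join(left_rows, right_rows, key):
    # Build phase: map each join-key value to the left row indices holding it.
    build = defaultdict(list)
    for i, row in enumerate(left_rows):
        build[row[key]].append(i)

    # Probe phase: for each right row, emit one index pair per matching left row.
    pairs = []
    for j, row in enumerate(right_rows):
        for i in build.get(row[key], ()):
            pairs.append((i, j))
    return pairs

left = [{"id": 1}, {"id": 2}, {"id": 2}]
right = [{"id": 2}, {"id": 3}]
print(hash_inner_join(left, right, "id"))  # [(1, 0), (2, 0)]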
- """ - left_indexes = [] - right_indexes = [] - - right_hash = hash_join_map(right_relation, join_columns) - - for h, right_rows in right_hash.hash_table.items(): - left_rows = hash_table.get(h) - if left_rows is None: - continue - for l in left_rows: - for r in right_rows: - left_indexes.append(l) - right_indexes.append(r) - - return align_tables(right_relation, left_relation, right_indexes, left_indexes) - - -class InnerJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **parameters): - JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - - self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") - - self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") - - self.stream = "left" - self.left_buffer = [] - self.left_hash = None - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Inner Join" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel: Table) -> Table: - if self.stream == "left": - if morsel == EOS: - self.stream = "right" - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - - # in place until #1295 resolved - if self._left_columns[0] not in self.left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - self.left_hash = hash_join_map(self.left_relation, self._left_columns) - else: - self.left_buffer.append(morsel) - return None - - if morsel == EOS: - return EOS - - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=self.left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=self.left_hash, - ) - - return new_morsel diff --git a/opteryx/operatorsv2/inner_join_node_single.py b/opteryx/operatorsv2/inner_join_node_single.py deleted file mode 100644 index f2f45692c..000000000 --- a/opteryx/operatorsv2/inner_join_node_single.py +++ /dev/null @@ -1,220 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Inner Join Node (Single Condition) - -This is a SQL Query Execution Plan Node. - -We have a generic Inner Join node, this is optimized for single conditions in the -Inner Join, this is currently only used for INTEGERS and is about 25% faster than -the generic INNER JOIN. -""" - -import numpy -import pyarrow -from pyarrow import compute - -from opteryx import EOS -from opteryx.compiled.structures import HashTable -from opteryx.models import QueryProperties -from opteryx.utils.arrow import align_tables - -from . import JoinNode - - -def preprocess_left(relation, join_columns): - """ - Convert a PyArrow array to an array of bytes. 
- - Parameters: - array (pyarrow.Array): The input PyArrow array. - - Returns: - numpy.ndarray: A numpy array of bytes representing the values in the input array. - """ - ht = HashTable() - - array = relation.column(join_columns[0]) - - if isinstance(array, pyarrow.ChunkedArray): - array = array.combine_chunks() - - num_rows = len(array) - # Access the null bitmap buffer - null_bitmap = array.buffers()[0] - - if null_bitmap is not None: - null_array = [((byte >> bit) & 1) for byte in null_bitmap for bit in range(8)][:num_rows] - else: - null_array = numpy.ones(num_rows, dtype=bool) - - value_offset_map = numpy.where(null_array)[0] - non_null_array = array.filter(compute.is_valid(array)) - - if pyarrow.types.is_integer(array.type): - for i, val in enumerate(non_null_array.to_numpy()): - ht.insert(val, value_offset_map[i]) - - elif pyarrow.types.is_fixed_size_binary(array.type) or pyarrow.types.is_floating( - array.type - ): # pragma: no cover - # Access the data buffer directly for fixed-width types - data_buffer = array.buffers()[1] - item_size = array.type.bit_width // 8 - - for i in range(num_rows): - if null_array[i]: - start = i * item_size - end = start + item_size - value_bytes = data_buffer[start:end].to_pybytes() - ht.insert(hash(value_bytes), i) - - elif pyarrow.types.is_binary(array.type) or pyarrow.types.is_string(array.type): - for i, val in enumerate(array): - if null_array[i]: - ht.insert(hash(val), i) - - else: - raise TypeError(f"Unsupported column type: {array.type}") - - return ht - - -def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_columns, hash_table): - """ - Perform an INNER JOIN using a preprocessed hash table from the left relation. - - Parameters: - left_relation: The preprocessed left pyarrow.Table. - right_relation: The right pyarrow.Table to join. - join_columns: A list of column names to join on. - hash_table: The preprocessed hash table from the left table. - - Returns: - A tuple containing lists of matching row indices from the left and right relations. 
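The null handling this function performs by decoding the validity bitmap buffer by hand can be cross-checked with public pyarrow and numpy calls; a minimal sketch (variable names mirror the code above but are otherwise illustrative):

import numpy
import pyarrow
from pyarrow import compute

array = pyarrow.array([10, None, 30, None, 50])

# Boolean mask of valid (non-null) positions; equivalent to decoding
# the validity bitmap manually.
valid_mask = compute.is_valid(array)

# Positions of the valid values, analogous to value_offset_map.
value_offset_map = numpy.nonzero(valid_mask.to_numpy(zero_copy_only=False))[0]
print(value_offset_map.tolist())  # [0, 2, 4]

# The valid values themselves, analogous to non_null_array.
print(array.filter(valid_mask).to_pylist())  # [10, 30, 50]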
- """ - left_indexes = [] - right_indexes = [] - - array = right_relation.column(join_columns[0]) - - if isinstance(array, pyarrow.ChunkedArray): - array = array.combine_chunks() - - num_rows = len(array) - # Access the null bitmap buffer - null_bitmap = array.buffers()[0] - - if null_bitmap is not None: - null_array = [((byte >> bit) & 1) for byte in null_bitmap for bit in range(8)][:num_rows] - else: - null_array = numpy.ones(num_rows, dtype=bool) - - value_offset_map = numpy.where(null_array)[0] - non_null_array = array.filter(compute.is_valid(array)) - - if pyarrow.types.is_integer(array.type): - for i, val in enumerate(non_null_array.to_numpy()): - rows = hash_table.get(val) - if rows: - left_indexes.extend(rows) - right_indexes.extend([value_offset_map[i]] * len(rows)) - - elif pyarrow.types.is_fixed_size_binary(array.type) or pyarrow.types.is_floating( - array.type - ): # pragma: no cover - # Access the data buffer directly for fixed-width types - data_buffer = array.buffers()[1] - item_size = array.type.bit_width // 8 - - for i in range(num_rows): - if null_array[i]: - start = i * item_size - end = start + item_size - value_bytes = data_buffer[start:end].to_pybytes() - rows = hash_table.get(hash(value_bytes)) - if rows: - left_indexes.extend(rows) - right_indexes.extend([i] * len(rows)) - - if pyarrow.types.is_binary(array.type) or pyarrow.types.is_string(array.type): - for i, val in enumerate(array): - if null_array[i]: - rows = hash_table.get(hash(val)) - if rows: - left_indexes.extend(rows) - right_indexes.extend([i] * len(rows)) - - return align_tables(right_relation, left_relation, right_indexes, left_indexes) - - -class InnerJoinSingleNode(JoinNode): - def __init__(self, properties: QueryProperties, **parameters): - JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - - self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") - - self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") - - self.stream = "left" - self.left_buffer = [] - self.left_hash = None - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Inner Join (Single)" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if self.stream == "left": - if morsel == EOS: - self.stream = "right" - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - - # in place until #1295 resolved - if self._left_columns[0] not in self.left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - self.left_hash = preprocess_left(self.left_relation, self._left_columns) - else: - self.left_buffer.append(morsel) - return None - - if morsel == EOS: - return EOS - - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=self.left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=self.left_hash, - ) - - return new_morsel diff --git a/opteryx/operatorsv2/limit_node.py b/opteryx/operatorsv2/limit_node.py deleted file mode 100644 index 20b204829..000000000 --- a/opteryx/operatorsv2/limit_node.py +++ /dev/null @@ -1,73 
+0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Limit Node - -This is a SQL Query Execution Plan Node. - -This Node performs the LIMIT and the OFFSET steps -""" - -import pyarrow - -from opteryx import EOS -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class LimitNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.limit = parameters.get("limit", float("inf")) - self.offset = parameters.get("offset", 0) - - self.remaining_rows = self.limit if self.limit is not None else float("inf") - self.rows_left_to_skip = max(0, self.offset) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "LIMIT" - - @property - def config(self): # pragma: no cover - return str(self.limit) + " OFFSET " + str(self.offset) - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - return EOS - - if self.rows_left_to_skip > 0: - if self.rows_left_to_skip >= morsel.num_rows: - self.rows_left_to_skip -= morsel.num_rows - return morsel.slice(offset=0, length=0) - else: - morsel = morsel.slice( - offset=self.rows_left_to_skip, length=morsel.num_rows - self.rows_left_to_skip - ) - self.rows_left_to_skip = 0 - - if self.remaining_rows <= 0 or morsel.num_rows == 0: - return morsel.slice(offset=0, length=0) - - if morsel.num_rows < self.remaining_rows: - self.remaining_rows -= morsel.num_rows - return morsel - else: - rows_to_slice = self.remaining_rows - self.remaining_rows = 0 - return morsel.slice(offset=0, length=rows_to_slice) diff --git a/opteryx/operatorsv2/noop_node.py b/opteryx/operatorsv2/noop_node.py deleted file mode 100644 index b0c4bce8b..000000000 --- a/opteryx/operatorsv2/noop_node.py +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -No Operation - -This is a SQL Query Execution Plan Node. -""" - -from pyarrow import Table - -from opteryx.models import QueryProperties - -from . 
import BasePlanNode
-
-
-class NoOpNode(BasePlanNode):
-    def __init__(self, properties: QueryProperties, **parameters):
-        BasePlanNode.__init__(self, properties=properties, **parameters)
-
-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
-    @property
-    def name(self):  # pragma: no cover
-        return "NoOp"
-
-    @property
-    def config(self):  # pragma: no cover
-        return ""
-
-    def execute(self, morsel: Table) -> Table:
-        print("NOOP was called")
-        return [morsel]
diff --git a/opteryx/operatorsv2/outer_join_node.py b/opteryx/operatorsv2/outer_join_node.py
deleted file mode 100644
index 191d43c21..000000000
--- a/opteryx/operatorsv2/outer_join_node.py
+++ /dev/null
@@ -1,324 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Outer Join Node
-
-This is a SQL Query Execution Plan Node.
-
-PyArrow has LEFT/RIGHT/FULL OUTER JOIN implementations, but they error when the
-relations being joined contain STRUCT or ARRAY columns, so we've written our own
-OUTER JOIN implementations.
-
-We also have our own INNER JOIN implementations; it's really just the less
-popular SEMI and ANTI joins we leave to PyArrow for now.
-"""
-
-from typing import List
-
-import pyarrow
-
-from opteryx import EOS
-from opteryx.compiled.structures import HashTable
-from opteryx.models import QueryProperties
-from opteryx.utils.arrow import align_tables
-
-from . import JoinNode
-
-
-def left_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]):
-    """
-    Perform a LEFT JOIN.
-
-    This implementation ensures that all rows from the left table are included in the result set,
-    with rows from the right table matched where possible, and columns from the right table
-    filled with NULLs where no match is found.
-
-    Parameters:
-        left_relation (pyarrow.Table): The left pyarrow.Table to join.
-        right_relation (pyarrow.Table): The right pyarrow.Table to join.
-        left_columns (list of str): Column names from the left table to join on.
-        right_columns (list of str): Column names from the right table to join on.
-
-    Returns:
-        A pyarrow.Table containing the result of the LEFT JOIN operation.
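The null-padding behaviour described above can be sketched over plain lists; a hypothetical helper, not the module's API:

def left_join_indices(left_keys, right_keys):
    # Index pairs for a LEFT JOIN; None marks an unmatched right side.
    probe = {}
    for j, key in enumerate(right_keys):
        probe.setdefault(key, []).append(j)

    pairs = []
    for i, key in enumerate(left_keys):
        matches = probe.get(key)
        if matches:
            pairs.extend((i, j) for j in matches)
        else:
            pairs.append((i, None))  # right-hand columns become NULL here
    return pairs

print(left_join_indices(["a", "b"], ["b", "b", "c"]))
# [(0, None), (1, 0), (1, 1)]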
- """ - from collections import deque - - from opteryx.compiled.structures.hash_table import hash_join_map - - left_indexes: deque = deque() - right_indexes: deque = deque() - - if len(set(left_columns) & set(right_relation.column_names)) > 0: - left_columns, right_columns = right_columns, left_columns - - right_hash = hash_join_map(right_relation, right_columns) - left_hash = hash_join_map(left_relation, left_columns) - - for hash_value, left_rows in left_hash.hash_table.items(): - right_rows = right_hash.get(hash_value) - if right_rows: - for l in left_rows: - for r in right_rows: - left_indexes.append(l) - right_indexes.append(r) - else: - for l in left_rows: - left_indexes.append(l) - right_indexes.append(None) - - if len(left_indexes) > 50_000: - table = align_tables( - right_relation, left_relation, list(right_indexes), list(left_indexes) - ) - yield table - left_indexes.clear() - right_indexes.clear() - - # this may return an empty table each time - fix later - table = align_tables(right_relation, left_relation, list(right_indexes), list(left_indexes)) - yield table - left_indexes.clear() - right_indexes.clear() - - -def full_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]): - chunk_size = 1000 - - hash_table = HashTable() - non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) - - left_indexes = [] - right_indexes = [] - - left_values = left_relation.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: - right_indexes.extend(rows) - left_indexes.extend([i] * len(rows)) - else: - right_indexes.append(None) - left_indexes.append(i) - - for i in range(right_relation.num_rows): - if i not in right_indexes: - right_indexes.append(i) - left_indexes.append(None) - - for i in range(0, len(left_indexes), chunk_size): - chunk_left_indexes = left_indexes[i : i + chunk_size] - chunk_right_indexes = right_indexes[i : i + chunk_size] - - # Align this chunk and add the resulting table to our list - yield align_tables(right_relation, left_relation, chunk_right_indexes, chunk_left_indexes) - - -def right_join(left_relation, right_relation, left_columns: List[str], right_columns: List[str]): - """ - Perform a RIGHT JOIN. - - This implementation ensures that all rows from the right table are included in the result set, - with rows from the left table matched where possible, and columns from the left table - filled with NULLs where no match is found. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - right_relation (pyarrow.Table): The right pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_columns (list of str): Column names from the right table to join on. - - Yields: - pyarrow.Table: A chunk of the result of the RIGHT JOIN operation. 
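A generic sketch of the chunked probing pattern used here (build the hash table once, probe in fixed-size slices, yield as you go); the chunk size and names are illustrative:

def probe_in_chunks(build_keys, probe_keys, chunk_size=1000):
    # Build the hash table once over the buffered side.
    table = {}
    for i, key in enumerate(build_keys):
        table.setdefault(key, []).append(i)

    # Probe in fixed-size slices so only one chunk of output is
    # materialized at a time.
    for start in range(0, len(probe_keys), chunk_size):
        chunk = probe_keys[start : start + chunk_size]
        yield [(table.get(key), start + offset) for offset, key in enumerate(chunk)]

for chunk in probe_in_chunks(["a", "b"], ["b", "c", "a"], chunk_size=2):
    print(chunk)
# [([1], 0), (None, 1)]
# [([0], 2)]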
- """ - chunk_size = 1000 - - hash_table = HashTable() - non_null_left_values = left_relation.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_left_values)): - hash_table.insert(hash(value_tuple), i) - - # Iterate over the right_relation in chunks - - for right_chunk in right_relation.to_batches(chunk_size): - left_indexes = [] - right_indexes = [] - - right_values = right_chunk.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*right_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: - left_indexes.extend(rows) - right_indexes.extend([i] * len(rows)) - else: - left_indexes.append(None) - right_indexes.append(i) - - # Yield the aligned chunk - # we intentionally swap them to the other calls so we're building a table - # not a record batch (what the chunk is) - yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) - - -def left_anti_join( - left_relation, right_relation, left_columns: List[str], right_columns: List[str] -): - """ - Perform a LEFT ANTI JOIN. - - This implementation ensures that all rows from the left table are included in the result set, - where there are no matching rows in the right table based on the join columns. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - right_relation (pyarrow.Table): The right pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_columns (list of str): Column names from the right table to join on. - - Returns: - A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. - """ - hash_table = HashTable() - non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) - - left_indexes = [] - left_values = left_relation.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if not rows: # Only include left rows that have no match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_relation.take(left_indexes) - else: - yield left_relation.slice(0, 0) - - -def left_semi_join( - left_relation, right_relation, left_columns: List[str], right_columns: List[str] -): - """ - Perform a LEFT SEMI JOIN. - - This implementation ensures that all rows from the left table that have a matching row in the right table - based on the join columns are included in the result set. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - right_relation (pyarrow.Table): The right pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_columns (list of str): Column names from the right table to join on. - - Returns: - A pyarrow.Table containing the result of the LEFT SEMI JOIN operation. 
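Because SEMI and ANTI joins only test membership and never produce right-hand columns, a set is sufficient; a minimal sketch over plain lists (names are illustrative):

def semi_and_anti_indices(left_keys, right_keys):
    # Membership is all that matters, so no index pairs are built.
    right_seen = set(right_keys)
    semi = [i for i, key in enumerate(left_keys) if key in right_seen]
    anti = [i for i, key in enumerate(left_keys) if key not in right_seen]
    return semi, anti

print(semi_and_anti_indices(["a", "b", "c"], ["b", "d"]))  # ([1], [0, 2])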
- """ - - hash_table = HashTable() - non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) - - left_indexes = [] - left_values = left_relation.select(left_columns).itercolumns() - - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: # Only include left rows that have a match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_relation.take(left_indexes) - else: - yield left_relation.slice(0, 0) - - -class OuterJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **parameters): - JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - - self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") - - self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") - - self.stream = "left" - self.left_buffer = [] - self.right_buffer = [] - self.left_relation = None - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return self._join_type - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - if self._on: - return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" - if self._using: - return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" - return f"{self._join_type.upper()}" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if self.stream == "left": - if morsel == EOS: - self.stream = "right" - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - else: - self.left_buffer.append(morsel) - return None - - if self.stream == "right": - if morsel == EOS: - right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") - self.right_buffer.clear() - - join_provider = providers.get(self._join_type) - - return list( - join_provider( - left_relation=self.left_relation, - right_relation=right_relation, - left_columns=self._left_columns, - right_columns=self._right_columns, - ) - ) + [EOS] - - else: - self.right_buffer.append(morsel) - return None - - -providers = { - "left outer": left_join, - "full outer": full_join, - "right outer": right_join, - "left anti": left_anti_join, - "left semi": left_semi_join, -} diff --git a/opteryx/operatorsv2/projection_node.py b/opteryx/operatorsv2/projection_node.py deleted file mode 100644 index 35b890597..000000000 --- a/opteryx/operatorsv2/projection_node.py +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
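The two-phase streaming protocol the join operators above share (buffer the left input until EOS, then probe with each right morsel) can be sketched independently of Arrow; EOS here is a hypothetical sentinel and the "join" is a toy cross product:

EOS = object()  # hypothetical end-of-stream sentinel

class BufferingJoin:
    def __init__(self):
        self.stream = "left"
        self.left_buffer = []

    def execute(self, morsel):
        if self.stream == "left":
            if morsel is EOS:
                # Left side is complete: materialize it and switch streams.
                self.stream = "right"
                self.left_relation = [row for m in self.left_buffer for row in m]
                self.left_buffer.clear()
            else:
                self.left_buffer.append(morsel)
            return None
        if morsel is EOS:
            return EOS
        # Probe phase: combine each buffered left row with the right morsel.
        return [(l, r) for l in self.left_relation for r in morsel]

j = BufferingJoin()
j.execute([1, 2])
j.execute(EOS)             # buffer the left side, then switch streams
print(j.execute(["x"]))    # [(1, 'x'), (2, 'x')]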
- -""" -Projection Node - -This is a SQL Query Execution Plan Node. - -This Node eliminates columns that are not needed in a Relation. This is also the Node -that performs column renames. -""" - -import pyarrow - -from opteryx import EOS -from opteryx.managers.expression import NodeType -from opteryx.managers.expression import evaluate_and_append -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class ProjectionNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - """ - Attribute Projection, remove unwanted columns and performs column renames. - """ - BasePlanNode.__init__(self, properties=properties, **parameters) - - projection = parameters["projection"] + parameters.get("order_by_columns", []) - - self.projection = [] - for column in projection: - self.projection.append(column.schema_column.identity) - - self.evaluations = [ - column for column in projection if column.node_type != NodeType.IDENTIFIER - ] - - self.columns = parameters["projection"] - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - return ", ".join(format_expression(col) for col in self.columns) - - @property - def name(self): # pragma: no cover - return "Projection" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if morsel == EOS: - return EOS - - # If any of the columns need evaluating, we need to do that here - morsel = evaluate_and_append(self.evaluations, morsel) - return morsel.select(self.projection) diff --git a/opteryx/operatorsv2/read_node.py b/opteryx/operatorsv2/read_node.py deleted file mode 100644 index e81bcb7eb..000000000 --- a/opteryx/operatorsv2/read_node.py +++ /dev/null @@ -1,228 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Read Node - -This is the SQL Query Execution Plan Node responsible for the reading of data. - -It wraps different internal readers (e.g. GCP Blob reader, SQL Reader), -normalizes the data into the format for internal processing. -""" - -import time -from typing import Generator - -import orjson -import pyarrow -from orso.schema import RelationSchema -from orso.schema import convert_orso_schema_to_arrow_schema - -from opteryx import EOS -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -def struct_to_jsonb(table: pyarrow.Table) -> pyarrow.Table: - """ - Converts any STRUCT columns in a PyArrow Table to JSON strings and replaces them - in the same column position. - - Parameters: - table (pa.Table): The PyArrow Table to process. - - Returns: - pa.Table: A new PyArrow Table with STRUCT columns converted to JSON strings. 
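A minimal sketch of the conversion this docstring describes, assuming pyarrow and orjson as the module above does (the example column is hypothetical):

import orjson
import pyarrow

table = pyarrow.table({"payload": pyarrow.array([{"a": 1}, None, {"a": 2}])})

column = table.column("payload")
# Serialize each struct row; orjson.dumps returns bytes, matching the
# binary destination type, and nulls stay null.
json_strings = [orjson.dumps(row.as_py()) if row.is_valid else None for row in column]
json_array = pyarrow.array(json_strings, type=pyarrow.binary())

table = table.drop_columns("payload").append_column("payload", json_array)
print(table.column("payload").to_pylist())  # [b'{"a":1}', None, b'{"a":2}']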
- """ - for i in range(table.num_columns): - field = table.schema.field(i) - - # Check if the column is a STRUCT - if pyarrow.types.is_struct(field.type): - # Convert each row in the STRUCT column to a JSON string - json_strings = [ - orjson.dumps(row.as_py()) if row.is_valid else None for row in table.column(i) - ] - json_array = pyarrow.array(json_strings, type=pyarrow.binary()) - - # Drop the original STRUCT column - table = table.drop_columns(field.name) - - # Insert the new JSON column at the same position - table = table.add_column( - i, pyarrow.field(name=field.name, type=pyarrow.binary()), json_array - ) - - return table - - -def normalize_morsel(schema: RelationSchema, morsel: pyarrow.Table) -> pyarrow.Table: - if len(schema.columns) == 0 and morsel.column_names != ["*"]: - one_column = pyarrow.array([True] * morsel.num_rows, type=pyarrow.bool_()) - morsel = morsel.append_column("*", one_column) - return morsel.select(["*"]) - - # rename columns for internal use - target_column_names = [] - # columns in the data but not in the schema, droppable - droppable_columns = [] - - # Find which columns to drop and which columns we already have - for i, column in enumerate(morsel.column_names): - column_name = schema.find_column(column) - if column_name is None: - droppable_columns.append(i) - else: - target_column_names.append(str(column_name)) - - # Remove from the end otherwise we'll remove the wrong columns after we've removed one - droppable_columns.reverse() - for droppable in droppable_columns: - morsel = morsel.remove_column(droppable) - - # remane columns to the internal names (identities) - morsel = morsel.rename_columns(target_column_names) - - # add columns we don't have, populate with nulls but try to get the correct type - for column in schema.columns: - if column.identity not in target_column_names: - null_column = pyarrow.array([None] * morsel.num_rows, type=column.arrow_field.type) - field = pyarrow.field(name=column.identity, type=column.arrow_field.type) - morsel = morsel.append_column(field, null_column) - - # ensure the columns are in the right order - return morsel.select([col.identity for col in schema.columns]) - - -def merge_schemas( - hypothetical_schema: RelationSchema, observed_schema: pyarrow.Schema -) -> pyarrow.schema: - """ - Using the hypothetical schema as the base, replace with fields from the observed schema - which are a Decimal type. 
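A minimal sketch of this merge rule using only public pyarrow calls; the field names and types are illustrative:

import pyarrow

hypothetical = pyarrow.schema([("price", pyarrow.float64()), ("qty", pyarrow.int64())])
observed = pyarrow.schema([("price", pyarrow.decimal128(10, 2)), ("qty", pyarrow.int64())])

# Start from the hypothetical schema, then overwrite any field the
# reader actually observed as a decimal (or list) type.
fields = {field.name: field for field in hypothetical}
for field in observed:
    if pyarrow.types.is_decimal(field.type) or pyarrow.types.is_list(field.type):
        fields[field.name] = field

merged = pyarrow.schema(list(fields.values()))
print(merged.field("price").type)  # decimal128(10, 2)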
- """ - # convert the Orso schema to an Arrow schema - hypothetical_arrow_schema = convert_orso_schema_to_arrow_schema(hypothetical_schema, True) - - # Convert the hypothetical schema to a dictionary for easy modification - schema_dict = {field.name: field for field in hypothetical_arrow_schema} - - # Iterate through fields in the observed schema - for observed_field in observed_schema: - # Check if the field is of type Decimal or List/Array - if pyarrow.types.is_decimal(observed_field.type) or pyarrow.types.is_list( - observed_field.type - ): - # Replace or add the field to the schema dictionary - schema_dict[observed_field.name] = observed_field - - # Create a new schema from the updated dictionary of fields - merged_schema = pyarrow.schema(list(schema_dict.values())) - - return merged_schema - - -class ReaderNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - - self.start_date = parameters.get("start_date") - self.end_date = parameters.get("end_date") - self.hints = parameters.get("hints", []) - self.columns = parameters.get("columns", []) - self.predicates = parameters.get("predicates", []) - - self.connector = parameters.get("connector") - self.schema = parameters.get("schema") - self.limit = parameters.get("limit") - - if len(self.hints) != 0: - self.statistics.add_message("All HINTS are currently ignored") - - self.statistics.rows_read += 0 - self.statistics.columns_read += 0 - - def to_dict(self) -> dict: - return { - "identity": f"read-{self.identity}", - "opterator": "ReadNode", - "schema": self.columns, - "projection": self.columns, - "filters": self.predicates, - } - - @classmethod - def from_dict(cls, dic: dict) -> "BasePlanNode": - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - """friendly name for this step""" - return "Read" - - @property - def config(self): - """Additional details for this step""" - date_range = "" - if self.parameters.get("start_date") == self.parameters.get("end_date"): - if self.parameters.get("start_date") is not None: - date_range = f" FOR '{self.parameters.get('start_date')}'" - else: - date_range = ( - f" FOR '{self.parameters.get('start_date')}' TO '{self.parameters.get('end_date')}'" - ) - return ( - f"{self.connector.__type__} " - f"({self.parameters.get('relation')}" - f"{' AS ' + self.parameters.get('alias') if self.parameters.get('alias') else ''}" - f"{date_range}" - f"{' WITH(' + ','.join(self.parameters.get('hints')) + ')' if self.parameters.get('hints') else ''})" - ) - - def execute(self, morsel) -> Generator: - """Perform this step, time how long is spent doing work""" - - morsel = None - orso_schema = self.schema - orso_schema_cols = [] - for col in orso_schema.columns: - if col.identity in [c.schema_column.identity for c in self.columns]: - orso_schema_cols.append(col) - orso_schema.columns = orso_schema_cols - arrow_schema = None - start_clock = time.monotonic_ns() - reader = self.connector.read_dataset( - columns=self.columns, predicates=self.predicates, limit=self.limit - ) - for morsel in reader: - # try to make each morsel have the same schema - morsel = struct_to_jsonb(morsel) - morsel = normalize_morsel(orso_schema, morsel) - if arrow_schema is None: - arrow_schema = merge_schemas(self.schema, morsel.schema) - if arrow_schema.names: - morsel = morsel.cast(arrow_schema) - - self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock - self.statistics.blobs_read += 1 - self.records_out 
+= morsel.num_rows - self.statistics.rows_read += morsel.num_rows - self.bytes_out += morsel.nbytes - yield morsel - start_clock = time.monotonic_ns() - if morsel: - self.statistics.columns_read += morsel.num_columns - else: - self.statistics.columns_read += len(orso_schema.columns) - - yield EOS diff --git a/opteryx/operatorsv2/set_variable_node.py b/opteryx/operatorsv2/set_variable_node.py deleted file mode 100644 index 02676434d..000000000 --- a/opteryx/operatorsv2/set_variable_node.py +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Set Variables Node - -This is a SQL Query Execution Plan Node. -""" - -from opteryx.constants import QueryStatus -from opteryx.models import NonTabularResult -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class SetVariableNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - - self.variable = parameters.get("variable") - self.value = parameters.get("value") - self.variables = parameters.get("variables") - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Set Variables" - - @property - def config(self): # pragma: no cover - return f"{self.variable} TO {self.value}" - - def execute(self, morsel) -> NonTabularResult: - self.variables[self.variable] = self.value - return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operatorsv2/show_columns_node.py b/opteryx/operatorsv2/show_columns_node.py deleted file mode 100644 index 3d57a8c21..000000000 --- a/opteryx/operatorsv2/show_columns_node.py +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Show Columns Node - -This is a SQL Query Execution Plan Node. - -Gives information about a dataset's columns -""" - -import pyarrow - -from opteryx import EOS -from opteryx.models import QueryProperties - -from . 
import BasePlanNode - - -def _simple_collector(schema): - """ - We've been given the schema, so just translate to a table - """ - buffer = [] - for column in schema.columns: - new_row = { - "name": column.name, - "type": column.type, - "nullable": column.nullable, - "aliases": column.aliases, - } - buffer.append(new_row) - - return pyarrow.Table.from_pylist(buffer) - - -class ShowColumnsNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self._full = parameters.get("full") - self._extended = parameters.get("extended") - self._schema = parameters.get("schema") - self._column_map = { - c.schema_column.identity: c.source_column for c in parameters["columns"] - } - self.collector = None - self.seen = False - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Show Columns" - - @property - def config(self): # pragma: no cover - return "" - - def rename_column(self, dic: dict, renames) -> dict: - dic["name"] = renames[dic["name"]] - return dic - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - from orso import DataFrame - - if self.seen: - return None - - if not (self._full or self._extended): - # if it's not full or extended, do just get the list of columns and their - # types - self.seen = True - return _simple_collector(self._schema) - - if self._full or self._extended: - # we're going to read the full table, so we can count stuff - - if morsel == EOS: - dicts = self.collector.to_dicts() - dicts = [self.rename_column(d, self._column_map) for d in dicts] - self.seen = True - return pyarrow.Table.from_pylist(dicts) - - df = DataFrame.from_arrow(morsel) - - if self.collector is None: - self.collector = df.profile - else: - self.collector += df.profile - - return None diff --git a/opteryx/operatorsv2/show_create_node.py b/opteryx/operatorsv2/show_create_node.py deleted file mode 100644 index d76d95d9b..000000000 --- a/opteryx/operatorsv2/show_create_node.py +++ /dev/null @@ -1,60 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Show Create Node - -This is a SQL Query Execution Plan Node. -""" - -import pyarrow - -from opteryx.exceptions import DatasetNotFoundError -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import QueryProperties - -from . 
import BasePlanNode - - -class ShowCreateNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - - self.object_type = parameters.get("object_type") - self.object_name = parameters.get("object_name") - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Show" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: - if self.object_type == "VIEW": - from opteryx.planner.views import is_view - from opteryx.planner.views import view_as_sql - - if is_view(self.object_name): - view_sql = view_as_sql(self.object_name) - buffer = [{self.object_name: view_sql}] - table = pyarrow.Table.from_pylist(buffer) - return table - - raise DatasetNotFoundError(self.object_name) - - raise UnsupportedSyntaxError("Invalid SHOW statement") diff --git a/opteryx/operatorsv2/show_value_node.py b/opteryx/operatorsv2/show_value_node.py deleted file mode 100644 index f223363bb..000000000 --- a/opteryx/operatorsv2/show_value_node.py +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Show Variables Node - -This is a SQL Query Execution Plan Node. -""" - -from typing import Generator - -import pyarrow - -from opteryx import EOS -from opteryx.exceptions import SqlError -from opteryx.models import QueryProperties - -from . import ReaderNode - - -class ShowValueNode(ReaderNode): - def __init__(self, properties: QueryProperties, **parameters): - ReaderNode.__init__(self, properties=properties, **parameters) - - self.key = parameters.get("key") - self.kind = parameters.get("kind") - self.value = parameters.get("value") - - if self.kind == "PARAMETER": - if self.value[0] == "@": - raise SqlError("PARAMETERS cannot start with '@'") - self.key = self.value - self.value = properties.variables[self.value] - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Show Value" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel) -> Generator: - buffer = [{"name": self.key, "value": str(self.value)}] - table = pyarrow.Table.from_pylist(buffer) - yield table diff --git a/opteryx/operatorsv2/sort_node.py b/opteryx/operatorsv2/sort_node.py deleted file mode 100644 index 12c399240..000000000 --- a/opteryx/operatorsv2/sort_node.py +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sort Node - -This is a SQL Query Execution Plan Node. - -This node orders a dataset -""" - -import numpy -from orso.types import OrsoTypes -from pyarrow import Table -from pyarrow import concat_tables - -from opteryx import EOS -from opteryx.exceptions import ColumnNotFoundError -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.managers.expression import NodeType -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class SortNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.order_by = parameters.get("order_by", []) - self.morsels = [] - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def config(self): # pragma: no cover - return ", ".join([f"{i[0].value} {i[1][0:3].upper()}" for i in self.order_by]) - - @property - def name(self): # pragma: no cover - return "Sort" - - def execute(self, morsel: Table) -> Table: - if morsel != EOS: - self.morsels.append(morsel) - return None - - table = concat_tables(self.morsels, promote_options="permissive") - - mapped_order = [] - - for column, direction in self.order_by: - if column.node_type == NodeType.FUNCTION: - # ORDER BY RAND() shuffles the results - # we create a random list, sort that then take the rows from the - # table in that order - this is faster than ordering the data - if column.value in ("RANDOM", "RAND"): - new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows)) - table = table.take(new_order) - return table - - raise UnsupportedSyntaxError( - "`ORDER BY` only supports `RAND()` as a functional sort order." - ) - - elif column.node_type == NodeType.LITERAL and column.type == OrsoTypes.INTEGER: - # we have an index rather than a column name, it's a natural - # number but the list of column names is zero-based, so we - # subtract one - column_name = table.column_names[int(column.value) - 1] - mapped_order.append( - ( - column_name, - direction, - ) - ) - else: - try: - mapped_order.append( - ( - column.schema_column.identity, - direction, - ) - ) - except ColumnNotFoundError as cnfe: # pragma: no cover - raise ColumnNotFoundError( - f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" - ) - - return [table.sort_by(mapped_order), EOS] diff --git a/opteryx/operatorsv2/union_node.py b/opteryx/operatorsv2/union_node.py deleted file mode 100644 index a59a07530..000000000 --- a/opteryx/operatorsv2/union_node.py +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Union Node - -This is a SQL Query Execution Plan Node. -""" - -from pyarrow import Table - -from opteryx import EOS -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class UnionNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - self.columns = parameters.get("columns", []) - self.column_ids = [c.schema_column.identity for c in self.columns] - self.seen_first_eos = False - self.schema = None - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "Union" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel: Table) -> Table: - """ - Union needs to ensure the column names are the same and that - coercible types are coerced. - """ - if morsel == EOS and self.seen_first_eos: - return [EOS] - if morsel == EOS: - self.seen_first_eos = True - return None - - if self.schema is None: - self.schema = morsel.schema - else: - morsel = morsel.rename_columns(self.schema.names) - morsel = morsel.cast(self.schema) - - return morsel.select(self.column_ids) diff --git a/opteryx/planner/__init__.py b/opteryx/planner/__init__.py index c8604c66c..ff1cec9f2 100644 --- a/opteryx/planner/__init__.py +++ b/opteryx/planner/__init__.py @@ -136,7 +136,6 @@ def query_planner( from opteryx.planner.logical_planner import do_logical_planning_phase from opteryx.planner.physical_planner import create_physical_plan from opteryx.planner.sql_rewriter import do_sql_rewrite - from opteryx.planner.temporary_physical_planner import create_legacy_physical_plan from opteryx.third_party import sqloxide # SQL Rewriter extracts temporal filters @@ -201,9 +200,6 @@ def query_planner( # before we write the new optimizer and execution engine, convert to a V1 plan start = time.monotonic_ns() query_properties = QueryProperties(qid=qid, variables=connection.context.variables) - if config.EXPERIMENTAL_EXECUTION_ENGINE: - physical_plan = create_physical_plan(optimized_plan, query_properties) - else: - physical_plan = create_legacy_physical_plan(optimized_plan, query_properties) + physical_plan = create_physical_plan(optimized_plan, query_properties) statistics.time_planning_physical_planner += time.monotonic_ns() - start yield physical_plan diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 320b90f0d..3233a6e7c 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -13,7 +13,7 @@ from orso.schema import OrsoTypes -from opteryx import operatorsv2 as operators +from opteryx import operators as operators from opteryx.exceptions import UnsupportedSyntaxError from opteryx.models import LogicalColumn from opteryx.models import PhysicalPlan diff --git a/opteryx/planner/temporary_physical_planner.py b/opteryx/planner/temporary_physical_planner.py deleted file mode 100644 index 6d51b150a..000000000 --- a/opteryx/planner/temporary_physical_planner.py +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This is a temporary step, which takes logical plans from the V2 planner -and converts them to modified-V1 physical plans. - -This should look different when the operators are rewritten for the -Gen 2 execution engine (a later piece of work) -""" - -from orso.schema import OrsoTypes - -from opteryx import operators -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import PhysicalPlan -from opteryx.planner.logical_planner import LogicalPlanStepType - - -def create_legacy_physical_plan(logical_plan, query_properties) -> PhysicalPlan: - plan = PhysicalPlan() - - for nid, logical_node in logical_plan.nodes(data=True): - node_type = logical_node.node_type - node_config = logical_node.properties - node: operators.BasePlanNode = None - - # fmt: off - if node_type == LogicalPlanStepType.Aggregate: - node = operators.AggregateNode(query_properties, aggregates=node_config["aggregates"]) - elif node_type == LogicalPlanStepType.AggregateAndGroup: - node = operators.AggregateAndGroupNode(query_properties, groups=node_config["groups"], aggregates=node_config["aggregates"], projection=node_config["projection"]) - # elif node_type == LogicalPlanStepType.Defragment: - # node = operators.MorselDefragmentNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Distinct: - node = operators.DistinctNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Exit: - node = operators.ExitNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Explain: - node = operators.ExplainNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Filter: - node = operators.FilterNode(query_properties, filter=node_config["condition"]) - elif node_type == LogicalPlanStepType.FunctionDataset: - node = operators.FunctionDatasetNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.HeapSort: - node = operators.HeapSortNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Join: - if node_config.get("type") == "inner": - # We use our own implementation of INNER JOIN - # We have optimized VARCHAR version - if len(node_config["left_columns"]) == 1 and node_config["columns"][0].schema_column.type == OrsoTypes.VARCHAR: - node = operators.InnerJoinSingleNode(query_properties, **node_config) - else: - node = operators.InnerJoinNode(query_properties, **node_config) - elif node_config.get("type") in ("left outer", "full outer", "right outer", "left anti", "left semi"): - # We use out own implementation of OUTER JOINS - node = operators.OuterJoinNode(query_properties, **node_config) - elif node_config.get("type") == "cross join": - # Pyarrow doesn't have a CROSS JOIN - node = operators.CrossJoinNode(query_properties, **node_config) - else: - # Use Pyarrow for all other joins - node = operators.JoinNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Limit: - node = operators.LimitNode(query_properties, limit=node_config.get("limit"), offset=node_config.get("offset", 0)) - elif node_type == LogicalPlanStepType.Order: - node = 
operators.SortNode(query_properties, order=node_config["order_by"]) - elif node_type == LogicalPlanStepType.Project: - node = operators.ProjectionNode(query_properties, projection=logical_node.columns) - elif node_type == LogicalPlanStepType.Scan: - connector = node_config.get("connector") - if connector and hasattr(connector, "async_read_blob"): - node = operators.AsyncReaderNode(query_properties, **node_config) - else: - node = operators.ReaderNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Set: - node = operators.SetVariableNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Show: - if node_config["object_type"] == "VARIABLE": - node = operators.ShowValueNode(query_properties, kind=node_config["items"][1], value=node_config["items"][1]) - elif node_config["object_type"] == "VIEW": - node = operators.ShowCreateNode(query_properties, **node_config) - else: - raise UnsupportedSyntaxError(f"Unsupported SHOW type '{node_config['object_type']}'") - elif node_type == LogicalPlanStepType.ShowColumns: - node = operators.ShowColumnsNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Subquery: - node = operators.NoOpNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Union: - node = operators.UnionNode(query_properties, **node_config) - else: # pragma: no cover - raise Exception(f"something unexpected happed - {node_type.name}") - # fmt: on - - # DEBUG: from opteryx.exceptions import InvalidInternalStateError - # DEBUG: - # DEBUG: try: - # DEBUG: config = node.to_json() - ## DEBUG: print(config) - # DEBUG: except Exception as err: - # DEBUG: message = f"Internal Error - node '{node}' unable to be serialized" - # DEBUG: print(message) - ## DEBUG: raise InvalidInternalStateError(message) - - plan.add_node(nid, node) - - for source, destination, relation in logical_plan.edges(): - plan.add_edge(source, destination, relation) - - return plan diff --git a/pyproject.toml b/pyproject.toml index 5a4d777a2..764e40f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ fast = true [tool.isort] profile = "black" -extend_skip_glob = ["tests/**", "*.pyx", "testdata/**", "**/operatorsv2/__init__.py"] +extend_skip_glob = ["tests/**", "*.pyx", "testdata/**", "**/operators/__init__.py"] skip_gitignore = true line_length = 100 multi_line_output = 9 diff --git a/tests/misc/test_cli.py b/tests/misc/test_cli.py index 6d6863183..64f90487f 100644 --- a/tests/misc/test_cli.py +++ b/tests/misc/test_cli.py @@ -7,11 +7,12 @@ def run_cli(args): """Helper function to run the CLI and return the result.""" result = subprocess.run( - [sys.executable, "opteryx"] + args, + [sys.executable, "opteryx/__main__.py"] + args, capture_output=True, text=True, timeout=5 ) + return result @@ -59,8 +60,9 @@ def test_table_width(): def test_column_width(): """Test the CLI when no SQL is provided, expecting an error.""" result = run_cli(["--no-color", "--max_col_width", "4", "SELECT * FROM $planets"]) + output = result.stdout assert result.returncode == 0 - assert '│ Merc │' in result.stdout + assert '│ Merc │' in output, output def test_unknown_param(): """Test the CLI when no SQL is provided, expecting an error.""" From 131baa210d3545fd877ae683693d368aa6730978 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 15 Nov 2024 00:13:58 +0000 Subject: [PATCH 026/157] Opteryx Version 0.19.0-alpha.858 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py 
b/opteryx/__version__.py index ea099b8f1..971ac3397 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 857 +__build__ = 858 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 38f7a4829d409871c61011c42a836e28ebe9e221 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 15 Nov 2024 22:04:02 +0000 Subject: [PATCH 027/157] #2061 --- tests/misc/test_cli.py | 5 ++++- tests/tools.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/misc/test_cli.py b/tests/misc/test_cli.py index 64f90487f..4ba5420c6 100644 --- a/tests/misc/test_cli.py +++ b/tests/misc/test_cli.py @@ -6,8 +6,11 @@ def run_cli(args): """Helper function to run the CLI and return the result.""" + from tests.tools import find_file + + path = find_file("**/__main__.py") result = subprocess.run( - [sys.executable, "opteryx/__main__.py"] + args, + [sys.executable, path] + args, capture_output=True, text=True, timeout=5 diff --git a/tests/tools.py b/tests/tools.py index 39dd21c51..0086a1ff6 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -47,6 +47,7 @@ def test_example(): import platform from functools import wraps +from typing import Optional def is_arm(): # pragma: no cover @@ -179,6 +180,13 @@ def wrapper(*args, **kwargs): return decorate +def find_file(path: str) -> Optional[str]: + import glob + + matches = glob.iglob(path) + return next(matches, None) + + def download_file(url: str, path: str): # pragma: no cover """ Download a file from a given URL and save it to a specified path. From 4fc92483383c435eb1dc5d02be33a0f5cf8005dc Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 15 Nov 2024 22:04:46 +0000 Subject: [PATCH 028/157] Opteryx Version 0.19.0-alpha.859 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index ea099b8f1..49b707ffd 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 857 +__build__ = 859 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
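A short note on PATCH 027 above: it makes the CLI tests independent of the working directory by locating the entry point with the new find_file helper instead of hard-coding "opteryx/__main__.py". The standalone sketch below shows the intended usage; the pattern string and the subprocess call mirror the test code, but the script itself is illustrative only. One caveat worth knowing: glob only expands "**" across directory levels when recursive=True is passed, so the sketch enables it explicitly.

    import glob
    import subprocess
    import sys
    from typing import Optional


    def find_file(pattern: str) -> Optional[str]:
        # iglob is lazy; next(..., None) returns the first match, or None
        # when nothing matches, which the caller must handle
        matches = glob.iglob(pattern, recursive=True)
        return next(matches, None)


    if __name__ == "__main__":
        entry_point = find_file("**/__main__.py")
        if entry_point is None:
            sys.exit("could not locate __main__.py")
        # invoke the CLI the same way tests/misc/test_cli.py does
        result = subprocess.run(
            [sys.executable, entry_point, "--no-color", "SELECT 1"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        print(result.returncode, result.stdout)
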
From 8573fbd7b5143de8e68e3756c79f64d9c0a2350c Mon Sep 17 00:00:00 2001
From: joocer
Date: Fri, 15 Nov 2024 22:19:22 +0000
Subject: [PATCH 029/157] #2061

---
 opteryx/__main__.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/opteryx/__main__.py b/opteryx/__main__.py
index 949edc33b..177b31327 100644
--- a/opteryx/__main__.py
+++ b/opteryx/__main__.py
@@ -23,13 +23,14 @@ import threading
 import time
 
-import opteryx
-from opteryx.exceptions import MissingSqlStatement
-from opteryx.utils.sql import clean_statement
-from opteryx.utils.sql import remove_comments
-
 sys.path.insert(1, os.path.join(sys.path[0], ".."))
 
+if True:
+    import opteryx
+    from opteryx.exceptions import MissingSqlStatement
+    from opteryx.utils.sql import clean_statement
+    from opteryx.utils.sql import remove_comments
+
 if readline:
     pass
 

From e69741b59ebd8d88b09233d20db4653b6ea76e74 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 15 Nov 2024 22:19:46 +0000
Subject: [PATCH 030/157] Opteryx Version 0.19.0-alpha.860

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 49b707ffd..f67a632e1 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 859
+__build__ = 860
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From e474686808c209891a08352c52612fb6fb000cbb Mon Sep 17 00:00:00 2001
From: joocer
Date: Fri, 15 Nov 2024 22:29:08 +0000
Subject: [PATCH 031/157] #2061

---
 testdata/astronauts/astronauts.parquet | Bin 0 -> 34448 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 testdata/astronauts/astronauts.parquet

diff --git a/testdata/astronauts/astronauts.parquet b/testdata/astronauts/astronauts.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..506c4319f5978db584e1e3ca12391b1290828fc0
GIT binary patch
literal 34448
[... 34448 bytes of base85-encoded parquet data omitted ...]

literal 0
HcmV?d00001

From 0e873c93f5a47b5406de99ab1162774cdc7fd9d8 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 15 Nov 2024 22:29:35 +0000
Subject: [PATCH 032/157] Opteryx Version 0.19.0-alpha.861

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index f67a632e1..ca0c35a39 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 860
+__build__ = 861
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
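The `if True:` block introduced in PATCH 029 above looks odd in isolation, so a brief explanation may help: the sys.path.insert call has to run before the opteryx imports so the repository checkout shadows any pip-installed copy, and indenting the imports under a conditional generally stops import-sorting tools from hoisting them back above the path manipulation, since such tools leave imports inside a block where they are. A minimal sketch of the pattern, assuming a script that lives one directory below the package root:

    import os
    import sys

    # must execute before the package imports are attempted, so the local
    # checkout is found ahead of any installed opteryx
    sys.path.insert(1, os.path.join(sys.path[0], ".."))

    if True:  # fence: keeps these imports below the path manipulation
        import opteryx
        from opteryx.exceptions import MissingSqlStatement

    print(opteryx.__version__)
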
From fecf7dffc5b952bdf68e9a9da74f951a3b861b00 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 15 Nov 2024 22:45:05 +0000 Subject: [PATCH 033/157] Opteryx Version 0.19.0-alpha.862 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index ca0c35a39..1a9f02e9a 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 861 +__build__ = 862 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6e42f5aa402e82ca19f91f7bfaf3ce0bdd5cf0cd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Nov 2024 10:07:01 +0000 Subject: [PATCH 034/157] Update sqlparser requirement from 0.51.0 to 0.52.0 Updates the requirements on [sqlparser](https://github.com/apache/datafusion-sqlparser-rs) to permit the latest version. - [Changelog](https://github.com/apache/datafusion-sqlparser-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/apache/datafusion-sqlparser-rs/commits) --- updated-dependencies: - dependency-name: sqlparser dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 851d380dc..950a56e9f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,5 +14,5 @@ pythonize = "0.20" serde = "1.0.171" [dependencies.sqlparser] -version = "0.51.0" +version = "0.52.0" features = ["serde", "visitor"] \ No newline at end of file From 6bf5d705ef76bcc1192241d62604f19d17358eda Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 22 Nov 2024 21:16:54 +0000 Subject: [PATCH 035/157] #2105 --- Cargo.toml | 9 +- opteryx/managers/expression/ops.py | 59 ++++++++++++ opteryx/models/serial_engine.py | 7 -- .../logical_planner/logical_planner.py | 12 +-- .../logical_planner_builders.py | 8 ++ opteryx/third_party/sqloxide/__init__.py | 3 +- src/lib.rs | 45 +++++++-- src/sqloxide.rs | 93 ------------------- .../test_shapes_and_errors_battery.py | 16 ++++ 9 files changed, 132 insertions(+), 120 deletions(-) delete mode 100644 opteryx/models/serial_engine.py delete mode 100644 src/sqloxide.rs diff --git a/Cargo.toml b/Cargo.toml index 950a56e9f..0130e0db2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,17 +2,20 @@ name = "compute" version = "0.1.0" authors = ["@joocer"] -edition = "2018" +edition = "2021" [lib] name = "compute" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.20.3", features = ["extension-module", "abi3-py39"] } -pythonize = "0.20" +pythonize = "0.22" serde = "1.0.171" +[dependencies.pyo3] +version = "0.22" +features = ["extension-module"] + [dependencies.sqlparser] version = "0.52.0" features = ["serde", "visitor"] \ No newline at end of file diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index 474a961fd..58615fc16 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -32,6 +32,12 @@ def filter_operations(arr, left_type, operator, value, right_type): "AnyOpGtEq", "AnyOpLt", "AnyOpLtEq", + "AnyOpLike", + "AnyOpNotLike", + "AnyOpILike", + "AnyOpNotILike", + "AnyOpRLike", + "AnyOpNotRLike", "AllOpEq", "AllOpNotEq", "AtArrow", @@ -170,6 +176,59 @@ def _inner_filter_operations(arr, operator, value): if operator == "AllOpNotEq": return list_ops.cython_allop_neq(arr[0], value) + if operator == "AnyOpLike": + patterns = value[0] + return numpy.array( + [ + 
None + if row is None + else any(compute.match_like(row, pattern).true_count > 0 for pattern in patterns) + for row in arr + ], + dtype=bool, + ) + if operator == "AnyOpNotLike": + patterns = value[0] + matches = numpy.array( + [ + None + if row is None + else any(compute.match_like(row, pattern).true_count > 0 for pattern in patterns) + for row in arr + ], + dtype=bool, + ) + return numpy.invert(matches) + if operator == "AnyOpILike": + patterns = value[0] + return numpy.array( + [ + None + if row is None + else any( + compute.match_like(row, pattern, ignore_case=True).true_count > 0 + for pattern in patterns + ) + for row in arr + ], + dtype=bool, + ) + if operator == "AnyOpNotILike": + patterns = value[0] + matches = numpy.array( + [ + None + if row is None + else any( + compute.match_like(row, pattern, ignore_case=True).true_count > 0 + for pattern in patterns + ) + for row in arr + ], + dtype=bool, + ) + return numpy.invert(matches) + if operator == "AtQuestion": element = value[0] diff --git a/opteryx/models/serial_engine.py b/opteryx/models/serial_engine.py deleted file mode 100644 index 9a7dc5d05..000000000 --- a/opteryx/models/serial_engine.py +++ /dev/null @@ -1,7 +0,0 @@ -import gc - -import pyarrow - -from opteryx.constants import ResultType -from opteryx.exceptions import InvalidInternalStateError -from opteryx.third_party.travers import Graph diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 49c70115a..510e34d3d 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -79,13 +79,13 @@ def __str__(self): # fmt:off node_type = self.node_type if node_type == LogicalPlanStepType.AggregateAndGroup: - return f"AGGREGATE ({', '.join(format_expression(col) for col in self.aggregates)}) GROUP BY ({', '.join(format_expression(col) for col in self.groups)})" + return f"AGGREGATE [{', '.join(format_expression(col) for col in self.aggregates)}] GROUP BY [{', '.join(format_expression(col) for col in self.groups)}]" if node_type == LogicalPlanStepType.Aggregate: - return f"AGGREGATE ({', '.join(format_expression(col) for col in self.aggregates)})" + return f"AGGREGATE [{', '.join(format_expression(col) for col in self.aggregates)}]" if node_type == LogicalPlanStepType.Distinct: distinct_on = "" if self.on is not None: - distinct_on = f" ON ({','.join(format_expression(col) for col in self.on)})" + distinct_on = f" ON [{','.join(format_expression(col) for col in self.on)}]" return f"DISTINCT{distinct_on}" if node_type == LogicalPlanStepType.Explain: return f"EXPLAIN{' ANALYZE' if self.analyze else ''}{(' (' + self.format + ')') if self.format else ''}" @@ -111,16 +111,16 @@ def __str__(self): return f"{self.type.upper()} JOIN{distinct} (USING {','.join(map(format_expression, self.using))}){filters}" return f"{self.type.upper()}{distinct} {filters}" if node_type == LogicalPlanStepType.HeapSort: - return f"HEAP SORT (LIMIT {self.limit}, ORDER BY {', '.join(format_expression(item[0]) + (' DESC' if item[1] =='descending' else '') for item in self.order_by)})" + return f"HEAP SORT (LIMIT {self.limit}, ORDER BY [{', '.join(format_expression(item[0]) + (' DESC' if item[1] =='descending' else '') for item in self.order_by)}])" if node_type == LogicalPlanStepType.Limit: limit_str = f"LIMIT ({self.limit})" if self.limit is not None else "" offset_str = f" OFFSET ({self.offset})" if self.offset is not None else "" return (limit_str + offset_str).strip() if 
node_type == LogicalPlanStepType.Order: - return f"ORDER BY ({', '.join(format_expression(item[0]) + (' DESC' if item[1] =='descending' else '') for item in self.order_by)})" + return f"ORDER BY [{', '.join(format_expression(item[0]) + (' DESC' if item[1] =='descending' else '') for item in self.order_by)}]" if node_type == LogicalPlanStepType.Project: order_by_indicator = f" + ({', '.join(format_expression(col) for col in self.order_by_columns)})" if self.order_by_columns else "" - return f"PROJECT ({', '.join(format_expression(col) for col in self.columns)}){order_by_indicator}" + return f"PROJECT [{', '.join(format_expression(col) for col in self.columns)}]{order_by_indicator}" if node_type == LogicalPlanStepType.Scan: io_async = "ASYNC " if hasattr(self.connector, "async_read_blob") else "" date_range = "" diff --git a/opteryx/planner/logical_planner/logical_planner_builders.py b/opteryx/planner/logical_planner/logical_planner_builders.py index a52c69e9a..7d853b5b0 100644 --- a/opteryx/planner/logical_planner/logical_planner_builders.py +++ b/opteryx/planner/logical_planner/logical_planner_builders.py @@ -549,10 +549,18 @@ def pattern_match(branch, alias: Optional[List[str]] = None, key=None): negated = branch["negated"] left = build(branch["expr"]) right = build(branch["pattern"]) + is_any = branch.get("any", False) if key in ("PGRegexMatch", "SimilarTo"): key = "RLike" if negated: key = f"Not{key}" + if is_any: + key = f"AnyOp{key}" + if right.node_type == NodeType.NESTED: + right = right.centre + if right.type != OrsoTypes.ARRAY: + right.value = (right.value,) + right.type = OrsoTypes.ARRAY return Node( NodeType.COMPARISON_OPERATOR, value=key, diff --git a/opteryx/third_party/sqloxide/__init__.py b/opteryx/third_party/sqloxide/__init__.py index aa2c47b68..36517ff0e 100644 --- a/opteryx/third_party/sqloxide/__init__.py +++ b/opteryx/third_party/sqloxide/__init__.py @@ -8,7 +8,6 @@ """ from opteryx.compute import parse_sql -from opteryx.compute import restore_ast # Explicitly define the API of this module for external consumers -__all__ = ["parse_sql", "restore_ast"] +__all__ = ["parse_sql"] diff --git a/src/lib.rs b/src/lib.rs index 02b0f0a48..420a4cd3f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,45 @@ +use pythonize::pythonize; - - +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::wrap_pyfunction; - -mod sqloxide; -use sqloxide::{restore_ast, parse_sql}; +use sqlparser::dialect::dialect_from_str; +use sqlparser::dialect::*; +use sqlparser::parser::Parser; + + +/// Function to parse SQL statements from a string. Returns a list with +/// one item per query statement. 
+///
+/// Available `dialects`: https://github.com/sqlparser-rs/sqlparser-rs/blob/main/src/dialect/mod.rs#L189-L206
+#[pyfunction]
+#[pyo3(text_signature = "(sql, dialect)")]
+fn parse_sql(py: Python, sql: String, dialect: String) -> PyResult<PyObject> {
+    let chosen_dialect = dialect_from_str(dialect).unwrap_or_else(|| {
+        println!("The dialect you chose was not recognized, falling back to 'generic'");
+        Box::new(GenericDialect {})
+    });
+    let parse_result = Parser::parse_sql(&*chosen_dialect, &sql);
+
+    let output = match parse_result {
+        Ok(statements) => pythonize(py, &statements).map_err(|e| {
+            let msg = e.to_string();
+            PyValueError::new_err(format!("Python object serialization failed.\n\t{msg}"))
+        })?,
+        Err(e) => {
+            let msg = e.to_string();
+            return Err(PyValueError::new_err(format!(
+                "Query parsing failed.\n\t{msg}"
+            )));
+        }
+    };
+
+    Ok(output.into())
+}
 
 #[pymodule]
-fn compute(_py: Python, m: &PyModule) -> PyResult<()> {
+fn compute(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(parse_sql, m)?)?;
-    m.add_function(wrap_pyfunction!(restore_ast, m)?)?;
-
     Ok(())
 }
\ No newline at end of file
diff --git a/src/sqloxide.rs b/src/sqloxide.rs
deleted file mode 100644
index bf2a5d11c..000000000
--- a/src/sqloxide.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-use pythonize::pythonize;
-
-use pyo3::exceptions::PyValueError;
-use pyo3::prelude::*;
-
-use pythonize::PythonizeError;
-
-use sqlparser::ast::Statement;
-use sqlparser::dialect::*;
-use sqlparser::parser::Parser;
-
-fn string_to_dialect(dialect: &str) -> Box<dyn Dialect> {
-    match dialect.to_lowercase().as_str() {
-        "ansi" => Box::new(AnsiDialect {}),
-        "bigquery" | "bq" => Box::new(BigQueryDialect {}),
-        "clickhouse" => Box::new(ClickHouseDialect {}),
-        "generic" => Box::new(GenericDialect {}),
-        "hive" => Box::new(HiveDialect {}),
-        "ms" | "mssql" => Box::new(MsSqlDialect {}),
-        "mysql" => Box::new(MySqlDialect {}),
-        "postgres" => Box::new(PostgreSqlDialect {}),
-        "redshift" => Box::new(RedshiftSqlDialect {}),
-        "snowflake" => Box::new(SnowflakeDialect {}),
-        "sqlite" => Box::new(SQLiteDialect {}),
-        _ => {
-            println!("The dialect you chose was not recognized, falling back to 'generic'");
-            Box::new(GenericDialect {})
-        }
-    }
-}
-
-/// Function to parse SQL statements from a string. Returns a list with
-/// one item per query statement.
-///
-/// Available `dialects`:
-/// - generic
-/// - ansi
-/// - hive
-/// - ms (mssql)
-/// - mysql
-/// - postgres
-/// - snowflake
-/// - sqlite
-/// - clickhouse
-/// - redshift
-/// - bigquery (bq)
-///
-#[pyfunction]
-#[pyo3(text_signature = "(sql, dialect)")]
-pub fn parse_sql(py: Python, sql: &str, dialect: &str) -> PyResult<PyObject> {
-    let chosen_dialect = string_to_dialect(dialect);
-    let parse_result = Parser::parse_sql(&*chosen_dialect, sql);
-
-    let output = match parse_result {
-        Ok(statements) => {
-            pythonize(py, &statements).map_err(|e| {
-                let msg = e.to_string();
-                PyValueError::new_err(format!("Python object serialization failed.\n\t{msg}"))
-            })?
-        }
-        Err(e) => {
-            let msg = e.to_string();
-            return Err(PyValueError::new_err(format!(
-                "Query parsing failed.\n\t{msg}"
-            )));
-        }
-    };
-
-    Ok(output)
-}
-
-/// This utility function allows reconstituing a modified AST back into list of SQL queries.
-#[pyfunction]
-#[pyo3(text_signature = "(ast)")]
-pub fn restore_ast(_py: Python, ast: &PyAny) -> PyResult<Vec<String>> {
-    let parse_result: Result<Vec<Statement>, PythonizeError> = pythonize::depythonize(ast);
-
-    let output = match parse_result {
-        Ok(statements) => statements,
-        Err(e) => {
-            let msg = e.to_string();
-            return Err(PyValueError::new_err(format!(
-                "Query serialization failed.\n\t{msg}"
-            )));
-        }
-    };
-
-    Ok(output
-        .iter()
-        .map(std::string::ToString::to_string)
-        .collect::<Vec<String>>())
-}
-
diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py
index 479d338b1..7c26aeeac 100644
--- a/tests/sql_battery/test_shapes_and_errors_battery.py
+++ b/tests/sql_battery/test_shapes_and_errors_battery.py
@@ -1805,6 +1805,22 @@
     # Aggregate Functions with HAVING Clause
     ("SELECT name, COUNT(*) AS count FROM $satellites GROUP BY name HAVING count > 1", 0, 2, None),
 
+    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY '%apoll%'", 0, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY '%apoll%'", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%apoll%')", 0, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 0, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 0, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None),
 
     # ****************************************************************************************
 

From bdbabc33d06353e9da0cf384301f60d68ba70837 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 22 Nov 2024 21:18:08 +0000
Subject: [PATCH 036/157] Opteryx Version 0.19.0-alpha.865

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 1a9f02e9a..8494118f2 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 862
+__build__ = 865
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
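PATCH 035 above threads LIKE ANY / ILIKE ANY support through three layers: the sqlparser 0.52 upgrade surfaces the ANY flag on pattern-match AST nodes, the logical planner builders rewrite a bare pattern into an ARRAY literal, and the new AnyOp*Like branches in ops.py test every pattern against every element of an array column. The standalone sketch below re-creates the matching step with the same pyarrow.compute.match_like call used in the AnyOpILike branch; the sample data is invented for illustration, and the real operator additionally handles the NOT variants by inverting the resulting mask.

    import numpy
    import pyarrow.compute as compute

    # one row per astronaut; each row holds a list of mission names (or None)
    missions = numpy.array(
        [["Apollo 11", "Gemini 8"], ["STS-1"], None],
        dtype=object,
    )
    patterns = ("%apoll%",)  # the planner wraps a lone pattern into a tuple

    # a row matches when any pattern LIKE-matches any element of the row
    matches = numpy.array(
        [
            None
            if row is None
            else any(
                compute.match_like(row, pattern, ignore_case=True).true_count > 0
                for pattern in patterns
            )
            for row in missions
        ],
        dtype=bool,
    )
    print(matches)  # ILIKE ANY '%apoll%' -> [ True False False]
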
From 4381a33a85b91b7d2265bd153d6ff0085265761d Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 24 Nov 2024 21:33:12 +0000 Subject: [PATCH 037/157] #2100 --- opteryx/models/physical_plan.py | 289 ++++++++++-------- opteryx/models/query_statistics.py | 3 + opteryx/operators/aggregate_and_group_node.py | 5 +- opteryx/operators/aggregate_node.py | 8 +- opteryx/operators/async_read_node.py | 2 - opteryx/operators/base_plan_node.py | 31 +- opteryx/operators/cross_join_node.py | 35 ++- opteryx/operators/distinct_node.py | 7 +- opteryx/operators/exit_node.py | 5 +- opteryx/operators/explain_node.py | 2 +- opteryx/operators/filter_node.py | 11 +- opteryx/operators/function_dataset_node.py | 1 - opteryx/operators/heap_sort_node.py | 3 +- opteryx/operators/inner_join_node.py | 7 +- opteryx/operators/inner_join_node_single.py | 7 +- opteryx/operators/limit_node.py | 12 +- opteryx/operators/noop_node.py | 2 +- opteryx/operators/outer_join_node.py | 19 +- opteryx/operators/projection_node.py | 4 +- opteryx/operators/pyarrow_join_node.py | 7 +- opteryx/operators/read_node.py | 2 - opteryx/operators/show_columns_node.py | 11 +- opteryx/operators/sort_node.py | 9 +- opteryx/operators/union_node.py | 10 +- .../test_shapes_and_errors_battery.py | 30 +- 25 files changed, 294 insertions(+), 228 deletions(-) diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 050b6af93..94eb95270 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -10,94 +10,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -The Execution Tree is the Graph which defines a Query Plan. - -The execution tree contains functionality to: - -- build and define the plan -- execute the plan -- manipulate the plan - -""" - +from queue import Empty +from queue import Queue +from threading import Lock +from threading import Thread from typing import Any from typing import Generator from typing import Optional from typing import Tuple -import pyarrow - from opteryx import EOS -from opteryx import config from opteryx.constants import ResultType from opteryx.exceptions import InvalidInternalStateError from opteryx.third_party.travers import Graph +morsel_lock = Lock() +active_tasks: int = 0 + class PhysicalPlan(Graph): """ - The execution tree is defined separately to the planner to simplify the - complex code which is the planner from the tree that describes the plan. + The execution tree is defined separately from the planner to simplify the + complex code that is the planner from the tree that describes the plan. 
""" - def explainv2(self, analyze: bool) -> Generator[pyarrow.Table, None, None]: - from opteryx import operators - - def _inner_explain(node, depth): - incoming_operators = self.ingoing_edges(node) - for operator_name in incoming_operators: - operator = self[operator_name[0]] - if isinstance( - operator, (operators.ExitNode, operators.ExplainNode) - ): # Skip ExitNode - yield from _inner_explain(operator_name[0], depth) - continue - elif isinstance(operator, operators.BasePlanNode): - record = { - "tree": depth, - "operator": operator.name, - "config": operator.config, - } - if analyze: - record["time_ms"] = operator.execution_time / 1e6 - record["records_in"] = operator.records_in - record["records_out"] = operator.records_out - yield record - yield from _inner_explain(operator_name[0], depth + 1) - - head = list(dict.fromkeys(self.get_exit_points())) - if len(head) != 1: # pragma: no cover - raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.") - - # for EXPLAIN ANALYZE, we execute the query and report statistics - if analyze: - # we don't want the results, just the details from the plan - temp = None - head_node = self.get_exit_points()[0] - query_head, _, _ = self.ingoing_edges(head_node)[0] - results = self.execute(query_head) - if results is not None: - results_generator, _ = next(results, ([], None)) - for temp in results_generator: - pass - del temp - - plan = list(_inner_explain(head[0], 1)) - - table = pyarrow.Table.from_pylist(plan) - - yield table - def depth_first_search_flat( self, node: Optional[str] = None, visited: Optional[set] = None ) -> list: """ Returns a flat list representing the depth-first traversal of the graph with left/right ordering. - - We do this so we always evaluate the left side of a join before the right side. It technically - doesn't need the entire plan flattened DFS-wise, but this is what we are doing here to achieve - the outcome we're after. """ if node is None: node = self.get_exit_points()[0] @@ -106,19 +47,11 @@ def depth_first_search_flat( visited = set() visited.add(node) - - # Collect this node's information in a flat list format - traversal_list = [ - ( - node, - self[node], - ) - ] + traversal_list = [(node, self[node])] # Sort neighbors based on relationship to ensure left, right, then unlabelled order neighbors = sorted(self.ingoing_edges(node), key=lambda x: (x[2] == "right", x[2] == "")) - # Traverse each child, prioritizing left, then right, then unlabelled for neighbor, _, _ in neighbors: if neighbor not in visited: child_list = self.depth_first_search_flat(neighbor, visited) @@ -126,7 +59,7 @@ def depth_first_search_flat( return traversal_list - def execute(self, head_node=None) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]: + def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any]: from opteryx.operators import ExplainNode from opteryx.operators import JoinNode from opteryx.operators import ReaderNode @@ -134,13 +67,58 @@ def execute(self, head_node=None) -> Tuple[Generator[pyarrow.Table, Any, Any], R from opteryx.operators import ShowCreateNode from opteryx.operators import ShowValueNode - # Validate query plan to ensure it's acyclic + morsel_accounting = {nid: 0 for nid in self.nodes()} # Total morsels received by each node + node_exhaustion = {nid: False for nid in self.nodes()} # Exhaustion state of each node + + def mark_node_exhausted(node_id): + """ + Mark a node as exhausted and propagate exhaustion downstream. 
+ """ + if node_exhaustion[node_id]: + return # Node is already marked as exhausted + + global active_tasks + node_exhaustion[node_id] = True + # print("+", node_id) + + # Notify downstream nodes + for _, downstream_node, _ in self.outgoing_edges(node_id): + # Check if all parents of downstream_node are exhausted + if all( + node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(downstream_node) + ): + work_queue.put((downstream_node, EOS)) # None signals exhaustion + active_tasks += 1 + morsel_accounting[node_id] += 1 + + def update_morsel_accounting(node_id, morsel_count_change: int): + """ + Updates the morsel accounting for a node and checks for exhaustion. + + Parameters: + node_id (str): The ID of the node to update. + morsel_count_change (int): The change in morsel count (+1 for increment, -1 for decrement). + + Returns: + None + """ + with morsel_lock: + morsel_accounting[node_id] += morsel_count_change + # print(">", node_id, morsel_accounting[node_id], morsel_count_change, self[node_id].name) + + # Check if the node is exhausted + if morsel_accounting[node_id] <= 0: # No more pending morsels for this node + # Ensure all parent nodes are exhausted + all_parents_exhausted = all( + node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(node_id) + ) + if all_parents_exhausted: + mark_node_exhausted(node_id) + if not self.is_acyclic(): raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") - # Retrieve the tail of the query plan, which should ideally be a single head node head_nodes = list(set(self.get_exit_points())) - if len(head_nodes) != 1: raise InvalidInternalStateError( f"Query plan has {len(head_nodes)} heads, expected exactly 1." @@ -149,77 +127,118 @@ def execute(self, head_node=None) -> Tuple[Generator[pyarrow.Table, Any, Any], R if head_node is None: head_node = self[head_nodes[0]] - # add the left/right labels to the edges coming into the joins - joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] - for nid, join in joins: - for s, t, r in self.breadth_first_search(nid, reverse=True): - source_relations = self[s].parameters.get("all_relations", set()) - if set(join._left_relation).intersection(source_relations): - self.remove_edge(s, t, r) - self.add_edge(s, t, "left") - elif set(join._right_relation).intersection(source_relations): - self.remove_edge(s, t, r) - self.add_edge(s, t, "right") - # Special case handling for 'Explain' queries if isinstance(head_node, ExplainNode): - yield self.explainv2(head_node.analyze), ResultType.TABULAR - - # Special case handling for 'Set' queries - elif isinstance(head_node, SetVariableNode): - yield head_node(None), ResultType.NON_TABULAR + yield self.explain(head_node.analyze), ResultType.TABULAR - elif isinstance(head_node, (ShowValueNode, ShowCreateNode)): + elif isinstance(head_node, (SetVariableNode, ShowValueNode, ShowCreateNode)): yield head_node(None), ResultType.TABULAR else: + # Work queue for worker tasks + work_queue = Queue() + # Response queue for results sent back to the engine + response_queue = Queue() + num_workers = 2 + workers = [] + + def worker_process(): + """ + Worker thread: Processes tasks from the work queue and sends results to the response queue. 
+ """ + while True: + task = work_queue.get() + if task is None: + break + + node_id, morsel = task + if morsel_accounting[node_id] is False: + print("RUNNING AN EXHAUSTED NODE") + operator = self[node_id] + results = operator(morsel) + + for result in results: + # Send results back to the response queue + response_queue.put((node_id, result)) + + update_morsel_accounting(node_id, -1) + + work_queue.task_done() + + # Launch worker threads + for _ in range(num_workers): + worker = Thread(target=worker_process) + worker.daemon = True + worker.start() + workers.append(worker) def inner_execute(plan): - # Get the pump nodes from the plan and execute them in order + # Identify pump nodes + global active_tasks + pump_nodes = [ (nid, node) for nid, node in self.depth_first_search_flat() if isinstance(node, ReaderNode) ] + + # Main engine loop processes pump nodes and coordinates work for pump_nid, pump_instance in pump_nodes: for morsel in pump_instance(None): - yield from plan.process_node(pump_nid, morsel) - - yield inner_execute(self), ResultType.TABULAR + # Initial morsels pushed to the work queue + # Determine downstream operators + next_nodes = [target for _, target, _ in self.outgoing_edges(pump_nid)] + for downstream_node in next_nodes: + # Queue tasks for downstream operators + work_queue.put((downstream_node, morsel)) + active_tasks += 1 + update_morsel_accounting(downstream_node, +1) + + # Pump is exhausted after emitting all morsels + mark_node_exhausted(pump_nid) + + # Process results from the response queue + def should_stop(): + all_nodes_exhausted = all(node_exhaustion.values()) + queues_empty = work_queue.empty() and response_queue.empty() + all_nodes_inactive = active_tasks <= 0 + # print(node_exhaustion.values(), all(node_exhaustion.values()), work_queue.empty(), response_queue.empty(), active_tasks) + return all_nodes_exhausted and queues_empty and all_nodes_inactive + + while not should_stop(): + # Wait for results from workers + try: + node_id, result = response_queue.get(timeout=0.1) + except Empty: + continue + + # Handle EOS + if result is None: + active_tasks -= 1 + continue + + # Determine downstream operators + downstream_nodes = [target for _, target, _ in self.outgoing_edges(node_id)] + if len(downstream_nodes) == 0: + # print("YIELD") + yield result + else: + for downstream_node in downstream_nodes: + # Queue tasks for downstream operators + active_tasks += 1 + work_queue.put((downstream_node, result)) + update_morsel_accounting(downstream_node, +1) - def process_node(self, nid, morsel): - from opteryx.operators import ReaderNode + # decrement _after_ we've done the work relation to handling the task + active_tasks -= 1 - node = self[nid] + # print("DONE!", node_exhaustion, work_queue.empty(), response_queue.empty()) - if isinstance(node, ReaderNode): - children = (t for s, t, r in self.outgoing_edges(nid)) - for child in children: - results = self.process_node(child, morsel) - results = list(results) - yield from results - else: - results = node(morsel) - if results is None: - return None - if not isinstance(results, list): - results = [results] - if morsel == EOS and not any(r == EOS for r in results): - results.append(EOS) - for result in results: - if result is not None: - children = [t for s, t, r in self.outgoing_edges(nid)] - for child in children: - yield from self.process_node(child, result) - if len(children) == 0 and result != EOS: - yield result + for worker in workers: + work_queue.put(None) - def sensors(self): - readings = {} - for nid in 
self.nodes(): - node = self[nid] - readings[node.identity] = node.sensors() - return readings + # Wait for all workers to complete + for worker in workers: + worker.join() - def __del__(self): - pass + yield inner_execute(self), ResultType.TABULAR diff --git a/opteryx/models/query_statistics.py b/opteryx/models/query_statistics.py index 1d9ee5e6f..24a771fb1 100644 --- a/opteryx/models/query_statistics.py +++ b/opteryx/models/query_statistics.py @@ -36,6 +36,9 @@ def __setattr__(self, attr, value): else: self._stats[attr] = value + def increase(self, attr: str, amount: float): + self._stats[attr] += amount + def add_message(self, message: str): """collect warnings""" if "messages" not in self._stats: diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index 6cc40d281..900ce180b 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -104,7 +104,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Group" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table): if morsel == EOS: # merge all the morsels together into one table, selecting only the columns # we're pretty sure we're going to use - this will fail for datasets @@ -138,7 +138,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: groups = groups.select(list(self.column_map.values()) + self.group_by_columns) groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) - return [groups, EOS] + yield groups + return morsel = project(morsel, self.all_identifiers) # Add a "*" column, this is an int because when a bool it miscounts diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 0d6d75143..81142d930 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -221,10 +221,11 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: if _is_count_star(self.aggregates): - return _count_star( + yield _count_star( morsel_promise=self.buffer, column_name=self.aggregates[0].schema_column.identity, ) + return # merge all the morsels together into one table, selecting only the columns # we're pretty sure we're going to use - this will fail for datasets @@ -248,6 +249,9 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: # name the aggregate fields and add them to the Columns data aggregates = aggregates.select(list(self.column_map.keys())) - return [aggregates, EOS] + yield aggregates + + return self.buffer.append(project(morsel, self.all_identifiers)) + yield None diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 792936a0d..7096f68d2 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -209,5 +209,3 @@ def execute(self, morsel) -> Generator: yield pyarrow.Table.from_arrays( [pyarrow.array([]) for _ in arrow_schema], schema=arrow_schema ) - - yield EOS diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 7e025efaf..0bff180c4 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -95,14 +95,29 @@ def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: self.bytes_in += morsel.nbytes self.calls += 1 - start_time = time.monotonic_ns() - result = self.execute(morsel) - - self.execution_time += 
time.monotonic_ns() - start_time - if result is not None and result != EOS and hasattr(result, "num_rows"): - self.records_out += result.num_rows - self.bytes_out += result.nbytes - return result + generator = self.execute(morsel) # Initialize the generator + + while True: + try: + # Time the production of the next result + start_time = time.monotonic_ns() + result = next(generator) # Retrieve the next item from the generator + self.execution_time += time.monotonic_ns() - start_time + self.statistics.increase( + "time_" + self.name.lower(), time.monotonic_ns() - start_time + ) + + # Update metrics for valid results + if result is not None and result != EOS and hasattr(result, "num_rows"): + self.records_out += result.num_rows + self.bytes_out += result.nbytes + + # Yield the result to the consumer + yield result + + except StopIteration: + # Break the loop when the generator is exhausted + break def sensors(self): return { diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 490762ba2..5a0ed5268 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -330,26 +330,25 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if self._unnest_column is not None: if morsel == EOS: self.continue_executing = False - return EOS + return if isinstance(self._unnest_column.value, tuple): - return list( - _cross_join_unnest_literal( - morsel=morsel, - source=self._unnest_column.value, - target_column=self._unnest_target, - ) - ) - return list( - _cross_join_unnest_column( + yield from _cross_join_unnest_literal( morsel=morsel, - source=self._unnest_column, + source=self._unnest_column.value, target_column=self._unnest_target, - conditions=self._filters, - hash_set=self.hash_set, - distinct=self._distinct, - single_column=self._single_column, ) + return + + yield from _cross_join_unnest_column( + morsel=morsel, + source=self._unnest_column, + target_column=self._unnest_target, + conditions=self._filters, + hash_set=self.hash_set, + distinct=self._distinct, + single_column=self._single_column, ) + return if self.stream == "left": if morsel == EOS: @@ -358,13 +357,13 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: self.left_buffer.clear() else: self.left_buffer.append(morsel) - return None + yield None if self.stream == "right": if morsel == EOS: right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none") # type:ignore self.right_buffer = None - return list(_cross_join(self.left_relation, right_table)) + yield from _cross_join(self.left_relation, right_table) else: self.right_buffer.append(morsel) - return None + yield None diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 60cf76c2d..2a1478f9f 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -59,7 +59,8 @@ def execute(self, morsel: Table) -> Table: # limit processing if morsel == EOS: - return EOS + yield EOS + return unique_indexes, self.hash_set = distinct( morsel, columns=self._distinct_on, seen_hashes=self.hash_set @@ -67,7 +68,7 @@ def execute(self, morsel: Table) -> Table: if len(unique_indexes) > 0: distinct_table = morsel.take(unique_indexes) - return distinct_table + yield distinct_table else: distinct_table = morsel.slice(0, 0) - return distinct_table + yield distinct_table diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index a428e955b..c12889394 100644 --- a/opteryx/operators/exit_node.py +++ 
b/opteryx/operators/exit_node.py @@ -67,7 +67,8 @@ def name(self): # pragma: no cover def execute(self, morsel: Table) -> Table: # Exit doesn't return EOS if morsel == EOS: - return None + yield None + return final_columns = [] final_names = [] @@ -105,4 +106,4 @@ def execute(self, morsel: Table) -> Table: morsel = morsel.select(final_columns) morsel = morsel.rename_columns(final_names) - return morsel + yield morsel diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index 2b16067a2..c52e7035e 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -45,4 +45,4 @@ def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover def execute(self, morsel: Table) -> Table: if self._query_plan: - return self._query_plan.explain(self.analyze) + yield self._query_plan.explain(self.analyze) diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index c4cff2e78..1346d4a60 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -57,10 +57,12 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - return EOS + yield EOS + return if morsel.num_rows == 0: - return morsel + yield morsel + return if self.function_evaluations: morsel = evaluate_and_append(self.function_evaluations, morsel) @@ -77,5 +79,6 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: # if there's no matching rows, just drop the morsel if mask.size > 0 and not numpy.all(mask is None): - return morsel.take(pyarrow.array(mask)) - return morsel.slice(0, 0) + yield morsel.take(pyarrow.array(mask)) + else: + yield morsel.slice(0, 0) diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index 9ac8cf80a..ce2912d9b 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -148,4 +148,3 @@ def execute(self, morsel) -> Generator: self.statistics.columns_read += len(table.column_names) yield table - yield EOS diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 782e8ab44..3a5a1d1fd 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -84,7 +84,8 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - return [self.table, EOS] + yield self.table + return if self.table: # Concatenate the accumulated table with the new morsel diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 533a0060a..5d5ef74a4 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -118,10 +118,11 @@ def execute(self, morsel: Table) -> Table: self.left_hash = hash_join_map(self.left_relation, self._left_columns) else: self.left_buffer.append(morsel) - return None + yield None + return if morsel == EOS: - return EOS + return # do the join new_morsel = inner_join_with_preprocessed_left_side( @@ -131,4 +132,4 @@ def execute(self, morsel: Table) -> Table: hash_table=self.left_hash, ) - return new_morsel + yield new_morsel diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index f2f45692c..2187e7701 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -204,10 +204,11 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: self.left_hash = 
preprocess_left(self.left_relation, self._left_columns) else: self.left_buffer.append(morsel) - return None + yield None + return if morsel == EOS: - return EOS + return # do the join new_morsel = inner_join_with_preprocessed_left_side( @@ -217,4 +218,4 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: hash_table=self.left_hash, ) - return new_morsel + yield new_morsel diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index 20b204829..b4e4a40ae 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -49,12 +49,13 @@ def config(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - return EOS + return if self.rows_left_to_skip > 0: if self.rows_left_to_skip >= morsel.num_rows: self.rows_left_to_skip -= morsel.num_rows - return morsel.slice(offset=0, length=0) + yield morsel.slice(offset=0, length=0) + return else: morsel = morsel.slice( offset=self.rows_left_to_skip, length=morsel.num_rows - self.rows_left_to_skip @@ -62,12 +63,13 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: self.rows_left_to_skip = 0 if self.remaining_rows <= 0 or morsel.num_rows == 0: - return morsel.slice(offset=0, length=0) + yield morsel.slice(offset=0, length=0) + yield if morsel.num_rows < self.remaining_rows: self.remaining_rows -= morsel.num_rows - return morsel + yield morsel else: rows_to_slice = self.remaining_rows self.remaining_rows = 0 - return morsel.slice(offset=0, length=rows_to_slice) + yield morsel.slice(offset=0, length=rows_to_slice) diff --git a/opteryx/operators/noop_node.py b/opteryx/operators/noop_node.py index b0c4bce8b..5b327d593 100644 --- a/opteryx/operators/noop_node.py +++ b/opteryx/operators/noop_node.py @@ -41,4 +41,4 @@ def config(self): # pragma: no cover def execute(self, morsel: Table) -> Table: print("NOOP was called") - return [morsel] + yield morsel diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 191d43c21..479dc98ef 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -292,7 +292,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: self.left_buffer.clear() else: self.left_buffer.append(morsel) - return None + yield None + return if self.stream == "right": if morsel == EOS: @@ -301,18 +302,16 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: join_provider = providers.get(self._join_type) - return list( - join_provider( - left_relation=self.left_relation, - right_relation=right_relation, - left_columns=self._left_columns, - right_columns=self._right_columns, - ) - ) + [EOS] + yield from join_provider( + left_relation=self.left_relation, + right_relation=right_relation, + left_columns=self._left_columns, + right_columns=self._right_columns, + ) else: self.right_buffer.append(morsel) - return None + yield None providers = { diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 35b890597..8a858fe87 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -64,8 +64,8 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - return EOS + return # If any of the columns need evaluating, we need to do that here morsel = evaluate_and_append(self.evaluations, morsel) - return morsel.select(self.projection) + yield morsel.select(self.projection) diff --git a/opteryx/operators/pyarrow_join_node.py 
b/opteryx/operators/pyarrow_join_node.py index 02b5ed131..6592c25a8 100644 --- a/opteryx/operators/pyarrow_join_node.py +++ b/opteryx/operators/pyarrow_join_node.py @@ -78,7 +78,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: else: self.left_buffer.append(morsel) - return None + yield None + return if morsel == EOS: right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") @@ -111,8 +112,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: "Unable to ANTI/SEMI JOIN with unsupported column types in table." ) from err - return [new_morsel, EOS] + yield new_morsel else: self.right_buffer.append(morsel) - return None + yield None diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index e81bcb7eb..69d5b0d21 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -224,5 +224,3 @@ def execute(self, morsel) -> Generator: self.statistics.columns_read += morsel.num_columns else: self.statistics.columns_read += len(orso_schema.columns) - - yield EOS diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py index 3d57a8c21..7388babc8 100644 --- a/opteryx/operators/show_columns_node.py +++ b/opteryx/operators/show_columns_node.py @@ -75,13 +75,15 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: from orso import DataFrame if self.seen: - return None + yield None + return if not (self._full or self._extended): # if it's not full or extended, do just get the list of columns and their # types self.seen = True - return _simple_collector(self._schema) + yield _simple_collector(self._schema) + return if self._full or self._extended: # we're going to read the full table, so we can count stuff @@ -90,7 +92,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: dicts = self.collector.to_dicts() dicts = [self.rename_column(d, self._column_map) for d in dicts] self.seen = True - return pyarrow.Table.from_pylist(dicts) + yield pyarrow.Table.from_pylist(dicts) + return df = DataFrame.from_arrow(morsel) @@ -99,4 +102,4 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: else: self.collector += df.profile - return None + yield None diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index 12c399240..c943afa6a 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -53,7 +53,8 @@ def name(self): # pragma: no cover def execute(self, morsel: Table) -> Table: if morsel != EOS: self.morsels.append(morsel) - return None + yield None + return table = concat_tables(self.morsels, promote_options="permissive") @@ -67,7 +68,8 @@ def execute(self, morsel: Table) -> Table: if column.value in ("RANDOM", "RAND"): new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows)) table = table.take(new_order) - return table + yield table + return raise UnsupportedSyntaxError( "`ORDER BY` only supports `RAND()` as a functional sort order." @@ -97,4 +99,5 @@ def execute(self, morsel: Table) -> Table: f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" ) - return [table.sort_by(mapped_order), EOS] + yield table.sort_by(mapped_order) + yield EOS diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index a59a07530..c8623f69d 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -50,15 +50,15 @@ def execute(self, morsel: Table) -> Table: coercible types are coerced. 
""" if morsel == EOS and self.seen_first_eos: - return [EOS] - if morsel == EOS: + return + elif morsel == EOS: self.seen_first_eos = True - return None + yield None - if self.schema is None: + elif self.schema is None: self.schema = morsel.schema else: morsel = morsel.rename_columns(self.schema.names) morsel = morsel.cast(self.schema) - return morsel.select(self.column_ids) + yield morsel.select(self.column_ids) diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 7c26aeeac..9a6d81698 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1813,14 +1813,28 @@ ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 'mission')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 0, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 0, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 357, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 357, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 320, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ()", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 37, 2, None), 
+ ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 37, 2, None), # **************************************************************************************** From e8fc67eedeac79d07b5c01a339013c0e8bbfb351 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 24 Nov 2024 21:33:38 +0000 Subject: [PATCH 038/157] Opteryx Version 0.19.0-alpha.866 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 8494118f2..7f11af185 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 865 +__build__ = 866 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 3cafd9295d6ba4f2f63f9c0283430fda8d01eaf5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:08:21 +0000 Subject: [PATCH 039/157] Bump duckdb-engine from 0.13.5 to 0.13.6 Bumps [duckdb-engine](https://github.com/Mause/duckdb_engine) from 0.13.5 to 0.13.6. - [Release notes](https://github.com/Mause/duckdb_engine/releases) - [Changelog](https://github.com/Mause/duckdb_engine/blob/main/CHANGELOG.md) - [Commits](https://github.com/Mause/duckdb_engine/compare/v0.13.5...v0.13.6) --- updated-dependencies: - dependency-name: duckdb-engine dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index 260cd088f..7887e997b 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -19,6 +19,6 @@ sqlalchemy pymysql psycopg2-binary duckdb==1.1.3 # 1040 -duckdb-engine==0.13.5 # 1040 +duckdb-engine==0.13.6 # 1040 setuptools_rust \ No newline at end of file From a10cb5d58bef713af42a2ef12ae8abf0e7b0c058 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:21:47 +0000 Subject: [PATCH 040/157] Update pyo3 requirement from 0.22 to 0.23 Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. 
- [Release notes](https://github.com/pyo3/pyo3/releases)
- [Changelog](https://github.com/PyO3/pyo3/blob/v0.22.6/CHANGELOG.md)
- [Commits](https://github.com/pyo3/pyo3/compare/v0.22.0...v0.22.6)

---
updated-dependencies:
- dependency-name: pyo3
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0130e0db2..4ffeb02c2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ pythonize = "0.22"
 serde = "1.0.171"
 
 [dependencies.pyo3]
-version = "0.22"
+version = "0.23"
 features = ["extension-module"]

From 453c3ae137480cb554ca54b91b0aa7800038df7b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 25 Nov 2024 10:21:49 +0000
Subject: [PATCH 041/157] Update pythonize requirement from 0.22 to 0.23

Updates the requirements on [pythonize](https://github.com/davidhewitt/pythonize) to permit the latest version.
- [Release notes](https://github.com/davidhewitt/pythonize/releases)
- [Changelog](https://github.com/davidhewitt/pythonize/blob/main/CHANGELOG.md)
- [Commits](https://github.com/davidhewitt/pythonize/compare/v0.22.0...v0.22.0)

---
updated-dependencies:
- dependency-name: pythonize
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0130e0db2..441717efa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,7 @@ name = "compute"
 crate-type = ["cdylib"]
 
 [dependencies]
-pythonize = "0.22"
+pythonize = "0.23"
 serde = "1.0.171"
 
 [dependencies.pyo3]

From 7571a46da4a16198655b8d24743abf7a933c6ff2 Mon Sep 17 00:00:00 2001
From: joocer
Date: Wed, 27 Nov 2024 08:34:13 +0000
Subject: [PATCH 042/157] #2111

---
 Cargo.toml                                    |  4 +-
 opteryx/models/physical_plan.py               | 78 ++++++++++++++++---
 opteryx/operators/aggregate_and_group_node.py |  1 +
 opteryx/operators/sort_node.py                |  1 +
 .../strategies/constant_folding.py            | 18 +++++
 5 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0130e0db2..86a558dfb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,11 +9,11 @@ name = "compute"
 crate-type = ["cdylib"]
 
 [dependencies]
-pythonize = "0.22"
+pythonize = "0.23"
 serde = "1.0.171"
 
 [dependencies.pyo3]
-version = "0.22"
+version = "0.23"
 features = ["extension-module"]
 
 [dependencies.sqlparser]

diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py
index 94eb95270..2638a540a 100644
--- a/opteryx/models/physical_plan.py
+++ b/opteryx/models/physical_plan.py
@@ -24,9 +24,17 @@
 from opteryx.exceptions import InvalidInternalStateError
 from opteryx.third_party.travers import Graph
 
+import pyarrow
+
 morsel_lock = Lock()
+active_task_lock = Lock()
 active_tasks: int = 0
 
+def active_tasks_increment(value: int):
+    global active_tasks
+    with active_task_lock:
+        active_tasks += value
+
 
 class PhysicalPlan(Graph):
     """
@@ -59,6 +67,55 @@ def depth_first_search_flat(
 
         return traversal_list
 
+    def explain(self, analyze: bool) -> Generator[pyarrow.Table, None, None]:
+        from opteryx import operators
+
+        def _inner_explain(node, depth):
+            incoming_operators = self.ingoing_edges(node)
+            for operator_name in incoming_operators:
+                operator = self[operator_name[0]]
+                if isinstance(
+                    operator, (operators.ExitNode, operators.ExplainNode)
+                ):  # Skip Exit and Explain nodes
+                    yield from _inner_explain(operator_name[0], depth)
+                    continue
+                elif isinstance(operator, 
operators.BasePlanNode): + record = { + "tree": depth, + "operator": operator.name, + "config": operator.config, + } + if analyze: + record["time_ms"] = operator.execution_time / 1e6 + record["records_in"] = operator.records_in + record["records_out"] = operator.records_out + yield record + yield from _inner_explain(operator_name[0], depth + 1) + + head = list(dict.fromkeys(self.get_exit_points())) + if len(head) != 1: # pragma: no cover + raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.") + + # for EXPLAIN ANALYZE, we execute the query and report statistics + if analyze: + # we don't want the results, just the details from the plan + temp = None + head_node = self.get_exit_points()[0] + query_head, _, _ = self.ingoing_edges(head_node)[0] + results = self.execute(query_head) + if results is not None: + results_generator, _ = next(results, ([], None)) + for temp in results_generator: + pass + del temp + + plan = list(_inner_explain(head[0], 1)) + + table = pyarrow.Table.from_pylist(plan) + print(table) + return table + + def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any]: from opteryx.operators import ExplainNode from opteryx.operators import JoinNode @@ -77,9 +134,8 @@ def mark_node_exhausted(node_id): if node_exhaustion[node_id]: return # Node is already marked as exhausted - global active_tasks node_exhaustion[node_id] = True - # print("+", node_id) + print("+", node_id, self[node_id].name) # Notify downstream nodes for _, downstream_node, _ in self.outgoing_edges(node_id): @@ -87,8 +143,8 @@ def mark_node_exhausted(node_id): if all( node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(downstream_node) ): - work_queue.put((downstream_node, EOS)) # None signals exhaustion - active_tasks += 1 + work_queue.put((downstream_node, EOS)) # EOS signals exhaustion + active_tasks_increment(+1) morsel_accounting[node_id] += 1 def update_morsel_accounting(node_id, morsel_count_change: int): @@ -139,7 +195,7 @@ def update_morsel_accounting(node_id, morsel_count_change: int): work_queue = Queue() # Response queue for results sent back to the engine response_queue = Queue() - num_workers = 2 + num_workers = 1 workers = [] def worker_process(): @@ -191,7 +247,7 @@ def inner_execute(plan): for downstream_node in next_nodes: # Queue tasks for downstream operators work_queue.put((downstream_node, morsel)) - active_tasks += 1 + active_tasks_increment(+1) update_morsel_accounting(downstream_node, +1) # Pump is exhausted after emitting all morsels @@ -202,7 +258,7 @@ def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) queues_empty = work_queue.empty() and response_queue.empty() all_nodes_inactive = active_tasks <= 0 - # print(node_exhaustion.values(), all(node_exhaustion.values()), work_queue.empty(), response_queue.empty(), active_tasks) + print(node_exhaustion.values(), all(node_exhaustion.values()), work_queue.empty(), response_queue.empty(), active_tasks) return all_nodes_exhausted and queues_empty and all_nodes_inactive while not should_stop(): @@ -213,8 +269,8 @@ def should_stop(): continue # Handle EOS - if result is None: - active_tasks -= 1 + if result is None or result == EOS: + active_tasks_increment(-1) continue # Determine downstream operators @@ -225,12 +281,12 @@ def should_stop(): else: for downstream_node in downstream_nodes: # Queue tasks for downstream operators - active_tasks += 1 + active_tasks_increment(+1) work_queue.put((downstream_node, result)) update_morsel_accounting(downstream_node, +1) # 
decrement _after_ we've done the work related to handling the task
-                active_tasks -= 1
+                active_tasks_increment(-1)
 
             # print("DONE!", node_exhaustion, work_queue.empty(), response_queue.empty())
 
             for worker in workers:
                 work_queue.put(None)

diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py
index 900ce180b..c5463a529 100644
--- a/opteryx/operators/aggregate_and_group_node.py
+++ b/opteryx/operators/aggregate_and_group_node.py
@@ -139,6 +139,7 @@ def execute(self, morsel: pyarrow.Table):
             groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns)
 
             yield groups
+            yield EOS
             return
 
         morsel = project(morsel, self.all_identifiers)

diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py
index c943afa6a..c6ede6fe5 100644
--- a/opteryx/operators/sort_node.py
+++ b/opteryx/operators/sort_node.py
@@ -69,6 +69,7 @@ def execute(self, morsel: Table) -> Table:
                 new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows))
                 table = table.take(new_order)
                 yield table
+                yield EOS
                 return
 
             raise UnsupportedSyntaxError(

diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
index 8dc449a40..21ac51b77 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
@@ -137,6 +137,24 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node:
             node.schema_column = root.schema_column
             return node
 
+    if root.node_type == NodeType.COMPARISON_OPERATOR:
+        if (
+            root.value in ("Like", "Ilike")
+            and root.left.node_type == NodeType.IDENTIFIER
+            and root.right.node_type == NodeType.LITERAL
+            and root.right.value == "%"
+        ):
+            # column LIKE '%' is True
+            node = Node(node_type=NodeType.UNARY_OPERATOR)
+            node.type = OrsoTypes.BOOLEAN
+            node.value = "IsNotNull"
+            node.schema_column = root.schema_column
+            node.centre = root.left
+            node.query_column = root.query_column
+            statistics.optimization_constant_fold_reduce += 1
+            return node
+
+
     if root.node_type in {NodeType.AND, NodeType.OR, NodeType.XOR}:
         # try to fold each side of logical operators
         root.left = fold_constants(root.left, statistics)

From 0e95cca3d01ac34f72d058728ccc1c7a0cc80ef2 Mon Sep 17 00:00:00 2001
From: XB500
Date: Wed, 27 Nov 2024 08:34:46 +0000
Subject: [PATCH 043/157] Opteryx Version 0.19.0-alpha.871

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 7f11af185..98bf044f3 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 866
+__build__ = 871
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
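The #2113 patch that follows replaces the per-element Python loops in if_null and null_if with vectorized NumPy masking. A minimal, standalone sketch of the pattern (illustrative only: the real Opteryx code routes null detection through its own _is_null helper, and the sample values here are made up):

    import numpy

    values = numpy.array([1.0, numpy.nan, 3.0])
    replacement = numpy.array([9.0, 9.0, 9.0])

    # Boolean mask of nulls; numpy.isnan stands in for Opteryx's _is_null
    mask = numpy.isnan(values)

    # numpy.where(mask, a, b) selects a[i] where mask[i] is True, else b[i],
    # in a single pass with no Python-level loop
    result = numpy.where(mask, replacement, values)

    print(result)  # [1. 9. 3.]

The same one-pass selection drives null_if, with the mask coming from an element-wise equality test instead of a null check.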
From 54eb59928b735cdec41be62e2d9d5eb4fe59337e Mon Sep 17 00:00:00 2001
From: joocer
Date: Wed, 27 Nov 2024 09:08:41 +0000
Subject: [PATCH 044/157] #2113

---
 opteryx/functions/other_functions.py | 34 +++++++++++++++++++++++-----
 opteryx/operators/projection_node.py |  1 +
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py
index b8145b13e..e3a0b097f 100644
--- a/opteryx/functions/other_functions.py
+++ b/opteryx/functions/other_functions.py
@@ -130,17 +130,39 @@ def if_null(values, replacement):
     if isinstance(values, list):
         values = numpy.array(values)
 
-    response = values.copy()  # Create a copy of the array to avoid modifying the original
+    # Create a mask for null values
     is_null_array = _is_null(values)
-    for index, is_null in enumerate(is_null_array):
-        if is_null:
-            response[index] = replacement[index]
-    return response
+
+    # Use NumPy's where function to vectorize the operation
+    return numpy.where(is_null_array, replacement, values)
 
 
 def null_if(col1, col2):
-    return [None if a == b else a for a, b in zip(col1, col2)]
+    """
+    Parameters:
+        col1: Union[numpy.ndarray, list]
+            The first input array.
+        col2: Union[numpy.ndarray, list]
+            The second input array.
+
+    Returns:
+        numpy.ndarray
+            An array where elements from col1 are replaced with None if they match the corresponding elements in col2.
+    """
+    if isinstance(col1, pyarrow.Array):
+        col1 = col1.to_numpy(False)
+    if isinstance(col1, list):
+        col1 = numpy.array(col1)
+    if isinstance(col2, pyarrow.Array):
+        col2 = col2.to_numpy(False)
+    if isinstance(col2, list):
+        col2 = numpy.array(col2)
+
+    # Create a mask where elements in col1 are equal to col2
+    mask = col1 == col2
+
+    # Return None where the mask is True, else col1
+    return numpy.where(mask, None, col1)
 
 
 def cosine_similarity(arr, val):

diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py
index 8a858fe87..213c56b10 100644
--- a/opteryx/operators/projection_node.py
+++ b/opteryx/operators/projection_node.py
@@ -64,6 +64,7 @@ def name(self):  # pragma: no cover
 
     def execute(self, morsel: pyarrow.Table) -> pyarrow.Table:
         if morsel == EOS:
+            yield EOS
             return
 
         # If any of the columns need evaluating, we need to do that here

From 2c6e25b62d882957fdea30b25d2f8f1a5b4dc09d Mon Sep 17 00:00:00 2001
From: joocer
Date: Wed, 27 Nov 2024 20:41:48 +0000
Subject: [PATCH 045/157] #2100

---
 opteryx/functions/other_functions.py          |  2 +-
 opteryx/models/physical_plan.py               | 16 +++++++++++-----
 opteryx/operators/aggregate_and_group_node.py |  2 +-
 opteryx/operators/cross_join_node.py          |  6 ++++--
 opteryx/operators/distinct_node.py            |  2 +-
 opteryx/operators/filter_node.py              |  2 +-
 opteryx/operators/heap_sort_node.py           |  2 ++
 opteryx/operators/inner_join_node.py          |  1 +
 opteryx/operators/inner_join_node_single.py   |  1 +
 opteryx/operators/limit_node.py               |  4 ++--
 opteryx/operators/projection_node.py          |  2 +-
 opteryx/operators/sort_node.py                |  2 --
 opteryx/operators/union_node.py               |  1 +
 .../strategies/constant_folding.py            |  1 -
 14 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py
index e3a0b097f..dc39e2ada 100644
--- a/opteryx/functions/other_functions.py
+++ b/opteryx/functions/other_functions.py
@@ -138,7 +138,7 @@ def if_null(values, replacement):
 
 
 def null_if(col1, col2):
-    """ 
+    """
     Parameters:
         col1: Union[numpy.ndarray, list]
            The first input array.
diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 2638a540a..aa664f885 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -19,17 +19,18 @@ from typing import Optional from typing import Tuple +import pyarrow + from opteryx import EOS from opteryx.constants import ResultType from opteryx.exceptions import InvalidInternalStateError from opteryx.third_party.travers import Graph -import pyarrow - morsel_lock = Lock() active_task_lock = Lock() active_tasks: int = 0 + def active_tasks_increment(value: int): global active_tasks with active_task_lock: @@ -115,7 +116,6 @@ def _inner_explain(node, depth): print(table) return table - def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any]: from opteryx.operators import ExplainNode from opteryx.operators import JoinNode @@ -135,7 +135,7 @@ def mark_node_exhausted(node_id): return # Node is already marked as exhausted node_exhaustion[node_id] = True - print("+", node_id, self[node_id].name) +# print("+", node_id, self[node_id].name) # Notify downstream nodes for _, downstream_node, _ in self.outgoing_edges(node_id): @@ -258,7 +258,13 @@ def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) queues_empty = work_queue.empty() and response_queue.empty() all_nodes_inactive = active_tasks <= 0 - print(node_exhaustion.values(), all(node_exhaustion.values()), work_queue.empty(), response_queue.empty(), active_tasks) +# print( +# node_exhaustion.values(), +# all(node_exhaustion.values()), +# work_queue.empty(), +# response_queue.empty(), +# active_tasks, +# ) return all_nodes_exhausted and queues_empty and all_nodes_inactive while not should_stop(): diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index c5463a529..28ad741f3 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -139,7 +139,6 @@ def execute(self, morsel: pyarrow.Table): groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) yield groups - yield EOS return morsel = project(morsel, self.all_identifiers) @@ -153,3 +152,4 @@ def execute(self, morsel: pyarrow.Table): morsel = evaluate_and_append(self.groups, morsel) self.buffer.append(morsel) + yield None diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 5a0ed5268..f2a4b08f8 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -325,11 +325,13 @@ def config(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if not self.continue_executing: - return None + yield None + return if self._unnest_column is not None: if morsel == EOS: self.continue_executing = False + yield None return if isinstance(self._unnest_column.value, tuple): yield from _cross_join_unnest_literal( @@ -366,4 +368,4 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: yield from _cross_join(self.left_relation, right_table) else: self.right_buffer.append(morsel) - yield None + yield None diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 2a1478f9f..384bef653 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -59,7 +59,7 @@ def execute(self, morsel: Table) -> Table: # limit processing if morsel == EOS: - yield EOS + yield None return unique_indexes, self.hash_set = distinct( diff --git a/opteryx/operators/filter_node.py 
b/opteryx/operators/filter_node.py index 1346d4a60..ff8940aad 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -57,7 +57,7 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - yield EOS + yield None return if morsel.num_rows == 0: diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 3a5a1d1fd..24c419a71 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -138,3 +138,5 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: ) # Slice the sorted table self.table = self.table.take(sort_indices[: self.limit]) + + yield None diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 5d5ef74a4..d655ef494 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -122,6 +122,7 @@ def execute(self, morsel: Table) -> Table: return if morsel == EOS: + yield None return # do the join diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index 2187e7701..b21e97aa7 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -208,6 +208,7 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: return if morsel == EOS: + yield None return # do the join diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index b4e4a40ae..5db185dea 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -49,6 +49,7 @@ def config(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: + yield None return if self.rows_left_to_skip > 0: @@ -64,9 +65,8 @@ def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if self.remaining_rows <= 0 or morsel.num_rows == 0: yield morsel.slice(offset=0, length=0) - yield - if morsel.num_rows < self.remaining_rows: + elif morsel.num_rows < self.remaining_rows: self.remaining_rows -= morsel.num_rows yield morsel else: diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 213c56b10..4f150ea72 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -64,7 +64,7 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: if morsel == EOS: - yield EOS + yield None return # If any of the columns need evaluating, we need to do that here diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index c6ede6fe5..87e471ceb 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -69,7 +69,6 @@ def execute(self, morsel: Table) -> Table: new_order = numpy.argsort(numpy.random.uniform(size=table.num_rows)) table = table.take(new_order) yield table - yield EOS return raise UnsupportedSyntaxError( @@ -101,4 +100,3 @@ def execute(self, morsel: Table) -> Table: ) yield table.sort_by(mapped_order) - yield EOS diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index c8623f69d..430c5f785 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -50,6 +50,7 @@ def execute(self, morsel: Table) -> Table: coercible types are coerced. 
""" if morsel == EOS and self.seen_first_eos: + yield None return elif morsel == EOS: self.seen_first_eos = True diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 21ac51b77..473d76837 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -154,7 +154,6 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: statistics.optimization_constant_fold_reduce += 1 return node - if root.node_type in {NodeType.AND, NodeType.OR, NodeType.XOR}: # try to fold each side of logical operators root.left = fold_constants(root.left, statistics) From b590f4478186159f8db84ae583b0c8b5570a7cdf Mon Sep 17 00:00:00 2001 From: XB500 Date: Wed, 27 Nov 2024 20:42:14 +0000 Subject: [PATCH 046/157] Opteryx Version 0.19.0-alpha.874 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 98bf044f3..6f71a88ce 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 871 +__build__ = 874 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d35e5b9069da4985d71d7712fecad5c757143d99 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 29 Nov 2024 00:02:52 +0000 Subject: [PATCH 047/157] #2100 --- opteryx/compiled/structures/memory_pool.pyx | 3 +- opteryx/config.py | 3 + opteryx/connectors/capabilities/cacheable.py | 1 + opteryx/models/physical_plan.py | 71 +++++++++++++------- opteryx/operators/async_read_node.py | 1 - opteryx/operators/base_plan_node.py | 16 +++-- opteryx/operators/function_dataset_node.py | 1 - opteryx/operators/read_node.py | 1 - opteryx/operators/sort_node.py | 2 + opteryx/virtual_datasets/statistics.py | 8 +-- 10 files changed, 62 insertions(+), 45 deletions(-) diff --git a/opteryx/compiled/structures/memory_pool.pyx b/opteryx/compiled/structures/memory_pool.pyx index df84add3f..5dfa26447 100644 --- a/opteryx/compiled/structures/memory_pool.pyx +++ b/opteryx/compiled/structures/memory_pool.pyx @@ -158,8 +158,7 @@ cdef class MemoryPool: self._level2_compaction() segment_index = self._find_free_segment(len_data) if segment_index == -1: - self.failed_commits += 1 - return None # No space available + raise MemoryError("Unable to create segment in bufferpool") segment = self.free_segments[segment_index] self.free_segments.erase(self.free_segments.begin() + segment_index) diff --git a/opteryx/config.py b/opteryx/config.py index 28940bd14..aeb1a7878 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -166,6 +166,9 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]: CONCURRENT_READS: int = int(get("CONCURRENT_READS", 4)) """Number of read workers per data source.""" +CONCURRENT_WORKERS: int = int(get("CONCURRENT_WORKERS", 2)) +"""Number of worker threads created to execute queries.""" + DATA_CATALOG_PROVIDER: str = get("DATA_CATALOG_PROVIDER") """Data Catalog provider.""" diff --git a/opteryx/connectors/capabilities/cacheable.py b/opteryx/connectors/capabilities/cacheable.py index fcae2e957..30e170232 100644 --- a/opteryx/connectors/capabilities/cacheable.py +++ b/opteryx/connectors/capabilities/cacheable.py @@ -147,6 +147,7 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): if source == SOURCE_ORIGIN and len(payload) < 
MAX_CACHEABLE_ITEM_SIZE: # If we read from the source, it's not in the remote cache remote_cache.set(key, payload) + system_statistics.remote_cache_commits += 1 else: statistics.cache_oversize += 1 diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index aa664f885..d7a10486f 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -22,6 +22,7 @@ import pyarrow from opteryx import EOS +from opteryx.config import CONCURRENT_WORKERS from opteryx.constants import ResultType from opteryx.exceptions import InvalidInternalStateError from opteryx.third_party.travers import Graph @@ -135,7 +136,7 @@ def mark_node_exhausted(node_id): return # Node is already marked as exhausted node_exhaustion[node_id] = True -# print("+", node_id, self[node_id].name) + print("+", node_id, self[node_id].name) # Notify downstream nodes for _, downstream_node, _ in self.outgoing_edges(node_id): @@ -143,7 +144,7 @@ def mark_node_exhausted(node_id): if all( node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(downstream_node) ): - work_queue.put((downstream_node, EOS)) # EOS signals exhaustion + work_queue.put((node_id, EOS)) # EOS signals exhaustion active_tasks_increment(+1) morsel_accounting[node_id] += 1 @@ -160,14 +161,27 @@ def update_morsel_accounting(node_id, morsel_count_change: int): """ with morsel_lock: morsel_accounting[node_id] += morsel_count_change - # print(">", node_id, morsel_accounting[node_id], morsel_count_change, self[node_id].name) + print( + ">", + node_id, + morsel_accounting[node_id], + morsel_count_change, + self[node_id].name, + ) + + if morsel_accounting[node_id] < 0: + raise InvalidInternalStateError("Node input and output count in invalid state.") # Check if the node is exhausted - if morsel_accounting[node_id] <= 0: # No more pending morsels for this node + if morsel_accounting[node_id] == 0: # No more pending morsels for this node # Ensure all parent nodes are exhausted all_parents_exhausted = all( node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(node_id) ) + print( + self.ingoing_edges(node_id)[0][0], + node_exhaustion[self.ingoing_edges(node_id)[0][0]], + ) if all_parents_exhausted: mark_node_exhausted(node_id) @@ -195,7 +209,7 @@ def update_morsel_accounting(node_id, morsel_count_change: int): work_queue = Queue() # Response queue for results sent back to the engine response_queue = Queue() - num_workers = 1 + num_workers = CONCURRENT_WORKERS workers = [] def worker_process(): @@ -208,8 +222,6 @@ def worker_process(): break node_id, morsel = task - if morsel_accounting[node_id] is False: - print("RUNNING AN EXHAUSTED NODE") operator = self[node_id] results = operator(morsel) @@ -258,13 +270,13 @@ def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) queues_empty = work_queue.empty() and response_queue.empty() all_nodes_inactive = active_tasks <= 0 -# print( -# node_exhaustion.values(), -# all(node_exhaustion.values()), -# work_queue.empty(), -# response_queue.empty(), -# active_tasks, -# ) + print( + list(node_exhaustion.values()), + all(node_exhaustion.values()), + work_queue.empty(), + response_queue.empty(), + active_tasks, + ) return all_nodes_exhausted and queues_empty and all_nodes_inactive while not should_stop(): @@ -274,27 +286,34 @@ def should_stop(): except Empty: continue - # Handle EOS - if result is None or result == EOS: + # if a thread threw a error, we get them in the main + # thread here, we just reraise the error here + if isinstance(result, Exception): + raise 
Exception(f"{node_id} - {self[node_id]}") from result + + # Handle Empty responses + if result is None: active_tasks_increment(-1) continue # Determine downstream operators downstream_nodes = [target for _, target, _ in self.outgoing_edges(node_id)] - if len(downstream_nodes) == 0: - # print("YIELD") - yield result - else: - for downstream_node in downstream_nodes: - # Queue tasks for downstream operators - active_tasks_increment(+1) - work_queue.put((downstream_node, result)) - update_morsel_accounting(downstream_node, +1) + if len(downstream_nodes) == 0: # Exit node + if result is not None: + yield result # Emit the morsel immediately + active_tasks_increment(-1) # Mark the task as completed + continue + + for downstream_node in downstream_nodes: + # Queue tasks for downstream operators + active_tasks_increment(+1) + work_queue.put((downstream_node, result)) + update_morsel_accounting(downstream_node, +1) # decrement _after_ we've done the work relation to handling the task active_tasks_increment(-1) - # print("DONE!", node_exhaustion, work_queue.empty(), response_queue.empty()) + print("DONE!", node_exhaustion, work_queue.empty(), response_queue.empty()) for worker in workers: work_queue.put(None) diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 7096f68d2..81dbd1b61 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -31,7 +31,6 @@ import pyarrow.parquet from orso.schema import convert_orso_schema_to_arrow_schema -from opteryx import EOS from opteryx import config from opteryx.exceptions import DataError from opteryx.models import QueryProperties diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 0bff180c4..cd815092e 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -90,25 +90,25 @@ def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: pass def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: - if morsel is not None and morsel != EOS: + if hasattr(morsel, "num_rows"): self.records_in += morsel.num_rows self.bytes_in += morsel.nbytes self.calls += 1 - generator = self.execute(morsel) # Initialize the generator + # set up the execution of the operator + generator = self.execute(morsel) while True: try: # Time the production of the next result start_time = time.monotonic_ns() result = next(generator) # Retrieve the next item from the generator - self.execution_time += time.monotonic_ns() - start_time - self.statistics.increase( - "time_" + self.name.lower(), time.monotonic_ns() - start_time - ) + execution_time = time.monotonic_ns() - start_time + self.execution_time += execution_time + self.statistics.increase("time_" + self.name.lower(), execution_time) # Update metrics for valid results - if result is not None and result != EOS and hasattr(result, "num_rows"): + if hasattr(result, "num_rows"): self.records_out += result.num_rows self.bytes_out += result.nbytes @@ -118,6 +118,8 @@ def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: except StopIteration: # Break the loop when the generator is exhausted break + except Exception as err: + yield err def sensors(self): return { diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index ce2912d9b..c0ae8612f 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -23,7 +23,6 @@ import pyarrow -from opteryx import 
EOS from opteryx.exceptions import SqlError from opteryx.managers.expression import NodeType from opteryx.models import QueryProperties diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 69d5b0d21..6e660a4a2 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -27,7 +27,6 @@ from orso.schema import RelationSchema from orso.schema import convert_orso_schema_to_arrow_schema -from opteryx import EOS from opteryx.models import QueryProperties from . import BasePlanNode diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index 87e471ceb..f26c8bb1f 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -52,10 +52,12 @@ def name(self): # pragma: no cover def execute(self, morsel: Table) -> Table: if morsel != EOS: + print("APPENDED") self.morsels.append(morsel) yield None return + print("JOIN", len(self.morsels)) table = concat_tables(self.morsels, promote_options="permissive") mapped_order = [] diff --git a/opteryx/virtual_datasets/statistics.py b/opteryx/virtual_datasets/statistics.py index 935cc2680..4122734b0 100644 --- a/opteryx/virtual_datasets/statistics.py +++ b/opteryx/virtual_datasets/statistics.py @@ -30,9 +30,6 @@ def read(end_date=None, variables={}): from opteryx.shared.buffer_pool import BufferPool bufferpool = BufferPool() - - lru_hits, lru_misses, lru_evictions, lru_inserts = bufferpool.stats - pool = bufferpool._memory_pool # fmt:off @@ -47,16 +44,13 @@ def read(end_date=None, variables={}): {"key": "bufferpool_capacity", "value": str(pool.size)}, {"key": "bufferpool_free", "value": str(pool.available_space())}, {"key": "bufferpool_items", "value": str(len(pool.used_segments))}, - {"key": "lru_hits", "value": str(lru_hits)}, - {"key": "lru_misses", "value": str(lru_misses)}, - {"key": "lru_evictions", "value": str(lru_evictions)}, - {"key": "lru_inserts", "value": str(lru_inserts)}, {"key": "queries_executed", "value": str(system_statistics.queries_executed)}, {"key": "uptime_seconds","value": str((time.time_ns() - system_statistics.start_time) / 1e9)}, {"key": "io_wait_seconds", "value": str(system_statistics.io_wait_seconds)}, {"key": "cpu_wait_seconds", "value": str(system_statistics.cpu_wait_seconds)}, {"key": "origin_reads", "value": str(system_statistics.origin_reads)}, {"key": "remote_cache_reads", "value": str(system_statistics.remote_cache_reads)}, + {"key": "remote_cache_commits", "value": str(system_statistics.remote_cache_commits)}, ] # fmt:on From ab9d5a07e10771fd81ca885d27296d311794ac68 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 29 Nov 2024 00:03:15 +0000 Subject: [PATCH 048/157] Opteryx Version 0.19.0-alpha.875 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 6f71a88ce..6bf725772 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 874 +__build__ = 875 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
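The engine built up over the last few patches only terminates when three conditions hold at once: every node has signalled exhaustion, both queues are drained, and no task is in flight. A standalone sketch of that coordination pattern, reduced to a plain thread pool (the names and the trivial task body are illustrative, not the Opteryx implementation):

    from queue import Queue
    from threading import Lock, Thread

    work_queue: Queue = Queue()
    counter_lock = Lock()
    in_flight = 0  # incremented when work is queued, decremented when finished

    def adjust(delta: int) -> None:
        # analogue of active_tasks_increment: all updates go through one lock
        global in_flight
        with counter_lock:
            in_flight += delta

    def worker() -> None:
        while True:
            task = work_queue.get()
            if task is None:  # sentinel value shuts the worker down
                work_queue.task_done()
                break
            try:
                _ = task * 2  # stand-in for calling an operator on a morsel
            finally:
                adjust(-1)
                work_queue.task_done()

    workers = [Thread(target=worker, daemon=True) for _ in range(2)]
    for w in workers:
        w.start()

    for task in range(10):
        adjust(+1)          # count the task before it becomes visible
        work_queue.put(task)

    work_queue.join()       # blocks until every queued task is marked done
    assert in_flight == 0   # safe to stop: nothing queued, nothing running

    for w in workers:
        work_queue.put(None)  # one sentinel per worker
    for w in workers:
        w.join()

Counting a task before it is enqueued matters: if the counter were incremented after the put, a fast worker could finish the task first and drive the counter negative, which is exactly the invalid state the update_morsel_accounting guard above raises on.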
From df0985c33077a6960129da01b4d3e74fd5f85860 Mon Sep 17 00:00:00 2001
From: joocer
Date: Sun, 1 Dec 2024 14:51:09 +0000
Subject: [PATCH 049/157] #2100

---
 opteryx/models/physical_plan.py | 17 +++++++++--------
 opteryx/operators/sort_node.py  |  2 --
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py
index d7a10486f..aceaff750 100644
--- a/opteryx/models/physical_plan.py
+++ b/opteryx/models/physical_plan.py
@@ -136,7 +136,7 @@ def mark_node_exhausted(node_id):
             return  # Node is already marked as exhausted
 
         node_exhaustion[node_id] = True
-        print("+", node_id, self[node_id].name)
+        print("EXHAUST", node_id, self[node_id].name)
 
         # Notify downstream nodes
         for _, downstream_node, _ in self.outgoing_edges(node_id):
@@ -162,7 +162,7 @@ def update_morsel_accounting(node_id, morsel_count_change: int):
         with morsel_lock:
             morsel_accounting[node_id] += morsel_count_change
             print(
-                ">",
+                "ACCOUNT",
                 node_id,
                 morsel_accounting[node_id],
                 morsel_count_change,
@@ -178,10 +178,6 @@ def update_morsel_accounting(node_id, morsel_count_change: int):
                 all_parents_exhausted = all(
                     node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(node_id)
                 )
-                print(
-                    self.ingoing_edges(node_id)[0][0],
-                    node_exhaustion[self.ingoing_edges(node_id)[0][0]],
-                )
                 if all_parents_exhausted:
                     mark_node_exhausted(node_id)
@@ -244,6 +240,8 @@ def inner_execute(plan):
             # Identify pump nodes
             global active_tasks
 
+            # Get all the nodes which push data into the plan. We use DFS to order the
+            # nodes to ensure left branch is always before the right branch
             pump_nodes = [
                 (nid, node)
                 for nid, node in self.depth_first_search_flat()
                 if isinstance(node, ReaderNode)
             ]
@@ -253,10 +251,13 @@ def inner_execute(plan):
             # Main engine loop processes pump nodes and coordinates work
             for pump_nid, pump_instance in pump_nodes:
                 for morsel in pump_instance(None):
+                    print("MORSEL")
-                    # Initial morsels pushed to the work queue
-                    # Determine downstream operators
+                    # Initial morsels pushed to the work queue determine downstream operators
                     next_nodes = [target for _, target, _ in self.outgoing_edges(pump_nid)]
                     for downstream_node in next_nodes:
+                        print(
+                            f"following {self[pump_nid].name} triggering {self[downstream_node].name}"
+                        )
                         # Queue tasks for downstream operators
                         work_queue.put((downstream_node, morsel))
                         active_tasks_increment(+1)

diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py
index f26c8bb1f..87e471ceb 100644
--- a/opteryx/operators/sort_node.py
+++ b/opteryx/operators/sort_node.py
@@ -52,12 +52,10 @@ def name(self):  # pragma: no cover
 
     def execute(self, morsel: Table) -> Table:
         if morsel != EOS:
-            print("APPENDED")
             self.morsels.append(morsel)
             yield None
             return
 
-        print("JOIN", len(self.morsels))
         table = concat_tables(self.morsels, promote_options="permissive")

From 251e9e02a6180a760b456fe1861d130afe0cb7bc Mon Sep 17 00:00:00 2001
From: XB500
Date: Sun, 1 Dec 2024 14:51:34 +0000
Subject: [PATCH 050/157] Opteryx Version 0.19.0-alpha.876

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 6bf725772..a2dd9094b 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 875
+__build__ = 876
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
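A note on the exhaustion propagation being debugged in the patch above: a node may only be marked exhausted once every one of its upstream producers has signalled end-of-stream and its own pending-morsel count has drained to zero. A minimal sketch of that rule over a toy two-input join (the dict-based graph here is illustrative; Opteryx stores edges in its Graph class):

    node_exhaustion = {"scan_left": True, "scan_right": False, "join": False}
    ingoing = {"join": ["scan_left", "scan_right"]}
    pending_morsels = {"join": 0}

    def can_mark_exhausted(node: str) -> bool:
        # Both parents must be done AND no queued work may remain for the node;
        # checking either condition alone would end the join too early.
        parents_done = all(node_exhaustion[p] for p in ingoing.get(node, []))
        return parents_done and pending_morsels[node] == 0

    print(can_mark_exhausted("join"))  # False: scan_right is still producing

This is also why the pump nodes are ordered with a depth-first search: the left (build) side of a join must be fully pumped and exhausted before the right (probe) side starts streaming.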
From 0192629e30c0e1d146efed2a5a481b8b5af78bfb Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 1 Dec 2024 22:00:19 +0000 Subject: [PATCH 051/157] #2100 --- README.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fcb8e072f..e30d77378 100644 --- a/README.md +++ b/README.md @@ -287,18 +287,18 @@ You can also try Opteryx right now using our [interactive labs](https://github.c ## Community [![Discord](https://img.shields.io/badge/discuss%20on-discord-5865F2.svg?logo=discord)](https://discord.gg/qpv2tr989x) -[![X Follow](https://img.shields.io/badge/follow%20on-X-1DA1F2.svg?logo=X)](https://twitter.com/OpteryxSQL) [![Medium](https://img.shields.io/badge/Read%20on-Medium-5865F2.svg?logo=medium)](https://medium.com/opteryx) **Get Involved** -* :star: Star this repo -* [Contribute](https://opteryx.dev/latest/contributing/contributing/) — join us in building Opteryx, through writing code, or inspiring others to use it. -* Let us know [your ideas](https://github.com/mabel-dev/opteryx/discussions), how you are using Opteryx, or report a [bug or feature request]((https://github.com/mabel-dev/opteryx/issues/new/choose)). -* See the [contributor documentation](https://opteryx.dev/latest/contributing/contributing/) for Opteryx. It's easy to get started, and we're really friendly if you need any help! -* If you're interested in contributing to the code now, check out [GitHub issues](https://github.com/mabel-dev/opteryx/issues). Feel free to ask questions or open a draft PR. +🌟 **Star this repo** to show your support and help others discover Opteryx. +💬 [Join the discussion](https://github.com/mabel-dev/opteryx/discussions) — share your ideas, tell us how you’re using Opteryx, or suggest [features](https://github.com/mabel-dev/opteryx/issues/new/choose). +🛠️ [Contribute to the code](https://opteryx.dev/latest/contributing/contributing/) — join us in building Opteryx! It’s easy to get started, and we’re happy to guide you. +📚 Check out the [contributor documentation](https://opteryx.dev/latest/contributing/contributing/). No matter your skill level, there are ways to contribute. +❤️ We welcome [sponsorships](https://github.com/sponsors/mabel-dev) of any size. Every contribution helps us make Opteryx even better! +We’re excited to have you join our journey. Let us know how we can help! ## Security [![Static Analysis](https://github.com/mabel-dev/opteryx/actions/workflows/static_analysis.yaml/badge.svg)](https://github.com/mabel-dev/opteryx/actions/workflows/static_analysis.yml) @@ -331,3 +331,15 @@ Opteryx is in beta. Beta means different things to different people, to us, bein - **[orso](https://github.com/mabel-dev/orso)** DataFrame library - **[mabel](https://github.com/mabel-dev/mabel)** Streaming data APIs - **[tarchia](https://github.com/mabel-dev/mabel)** Data Catalog + +## Thank You + +A huge thank you to our amazing sponsor: + +**[Konstantin Vinogradov](https://github.com/vinogradovkonst)** +❤️ Your support helps us continue building and improving Opteryx for everyone. + +--- + +Want to see your name here? +[Become a sponsor](https://github.com/sponsors/mabel-dev) to support Opteryx and join our journey! 
From c4e4b548a90955191f4df4ae0fe3d84040ed31e1 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 1 Dec 2024 22:00:46 +0000 Subject: [PATCH 052/157] Opteryx Version 0.19.0-alpha.877 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index a2dd9094b..b3c19abb9 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 876 +__build__ = 877 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 81fc95ac83a5548859fd0df9e9ff42dc2262b21d Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 1 Dec 2024 23:35:59 +0000 Subject: [PATCH 053/157] #2100 --- opteryx/functions/other_functions.py | 4 +- opteryx/models/physical_plan.py | 42 ++++++++----------- opteryx/planner/sql_rewriter.py | 9 ++-- .../test_shapes_and_errors_battery.py | 2 +- 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index dc39e2ada..eda4da41c 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -251,12 +251,12 @@ def jsonb_object_keys(arr: numpy.ndarray): if isinstance(arr[0], dict): # Process dictionaries for i, row in enumerate(arr): - result[i] = [str(key) for key in row.keys()] + result[i] = [str(key) for key in row.keys()] # noqa: SIM118 - row is not a dict; .keys() is required elif isinstance(arr[0], (str, bytes)): # SIMD-JSON parser instance for JSON string/bytes parser = simdjson.Parser() for i, row in enumerate(arr): - result[i] = [str(key) for key in parser.parse(row).keys()] + result[i] = [str(key) for key in parser.parse(row).keys()] # noqa: SIM118 - row is not a dict; .keys() is required else: raise ValueError("Unsupported dtype for array elements. 
Expected dict, str, or bytes.") diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index aceaff750..a7be78aaa 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -114,7 +114,6 @@ def _inner_explain(node, depth): plan = list(_inner_explain(head[0], 1)) table = pyarrow.Table.from_pylist(plan) - print(table) return table def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any]: @@ -136,10 +135,16 @@ def mark_node_exhausted(node_id): return # Node is already marked as exhausted node_exhaustion[node_id] = True - print("EXHAUST", node_id, self[node_id].name) + + if isinstance(self[node_id], ReaderNode): + return # Notify downstream nodes - for _, downstream_node, _ in self.outgoing_edges(node_id): + downstream_nodes = self.outgoing_edges(node_id) + if len(downstream_nodes) > 1: + raise InvalidInternalStateError("Cannot FORK execution") + elif len(downstream_nodes) == 1: + _, downstream_node, _ = downstream_nodes[0] # Check if all parents of downstream_node are exhausted if all( node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(downstream_node) @@ -161,13 +166,13 @@ def update_morsel_accounting(node_id, morsel_count_change: int): """ with morsel_lock: morsel_accounting[node_id] += morsel_count_change - print( - "ACCOUNT", - node_id, - morsel_accounting[node_id], - morsel_count_change, - self[node_id].name, - ) + # print( + # "ACCOUNT", + # node_id, + # morsel_accounting[node_id], + # morsel_count_change, + # self[node_id].name, + # ) if morsel_accounting[node_id] < 0: raise InvalidInternalStateError("Node input and output count in invalid state.") @@ -251,13 +256,10 @@ def inner_execute(plan): # Main engine loop processes pump nodes and coordinates work for pump_nid, pump_instance in pump_nodes: for morsel in pump_instance(None): - print("MORSEL") # Initial morsels pushed to the work queue determine downstream operators next_nodes = [target for _, target, _ in self.outgoing_edges(pump_nid)] for downstream_node in next_nodes: - print( - f"following {self[pump_nid].name} triggering {self[downstream_node].name}" - ) + # DEBUG: log (f"following initial {self[pump_nid].name} triggering {self[downstream_node].name}") # Queue tasks for downstream operators work_queue.put((downstream_node, morsel)) active_tasks_increment(+1) @@ -271,13 +273,6 @@ def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) queues_empty = work_queue.empty() and response_queue.empty() all_nodes_inactive = active_tasks <= 0 - print( - list(node_exhaustion.values()), - all(node_exhaustion.values()), - work_queue.empty(), - response_queue.empty(), - active_tasks, - ) return all_nodes_exhausted and queues_empty and all_nodes_inactive while not should_stop(): @@ -290,7 +285,7 @@ def should_stop(): # if a thread threw a error, we get them in the main # thread here, we just reraise the error here if isinstance(result, Exception): - raise Exception(f"{node_id} - {self[node_id]}") from result + raise result # Handle Empty responses if result is None: @@ -308,14 +303,13 @@ def should_stop(): for downstream_node in downstream_nodes: # Queue tasks for downstream operators active_tasks_increment(+1) + # DEBUG: log (f"following {self[node_id].name} triggering {self[downstream_node].name}") work_queue.put((downstream_node, result)) update_morsel_accounting(downstream_node, +1) # decrement _after_ we've done the work relation to handling the task active_tasks_increment(-1) - print("DONE!", node_exhaustion, work_queue.empty(), 
response_queue.empty())
-
         for worker in workers:
             work_queue.put(None)

diff --git a/opteryx/planner/sql_rewriter.py b/opteryx/planner/sql_rewriter.py
index e9f4c2b02..7d2b2f650 100644
--- a/opteryx/planner/sql_rewriter.py
+++ b/opteryx/planner/sql_rewriter.py
@@ -288,12 +288,9 @@ def _temporal_extration_state_machine(
                 open_count -= 1
                 if in_special_function and open_count == special_function_brackets:
                     in_special_function = False
-                    if relation == "":
-                        state = WAITING
-                    else:
-                        # function relations, like FAKE(234,234) need the items between the
-                        # brackets be be consumed
-                        state = FUNCTION_RELATION
+                    # function relations, like FAKE(234,234) need the items between the
+                    # brackets to be consumed
+                    state = WAITING if relation == "" else FUNCTION_RELATION
 
         if not in_special_function:
             if comparable_part in STOP_COLLECTING:
diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py
index 9a6d81698..6757ac024 100644
--- a/tests/sql_battery/test_shapes_and_errors_battery.py
+++ b/tests/sql_battery/test_shapes_and_errors_battery.py
@@ -83,7 +83,7 @@
     ("SELECT * FROM sqlite.planets", 9, 20, None),
     ("SELECT * FROM $variables", 42, 4, None),
    ("SELECT * FROM $missions", 4630, 8, None),
-    ("SELECT * FROM $statistics", 20, 2, None),
+    ("SELECT * FROM $statistics", 17, 2, None),
     ("SELECT * FROM $stop_words", 305, 1, None),
     (b"SELECT * FROM $satellites", 177, 8, None),
     ("SELECT * FROM testdata.missions", 4630, 8, None),

From 333b583593c7fbf7da891e0ce43c0a4c1b624f9c Mon Sep 17 00:00:00 2001
From: XB500
Date: Sun, 1 Dec 2024 23:36:21 +0000
Subject: [PATCH 054/157] Opteryx Version 0.19.0-alpha.878

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index b3c19abb9..7b4f665be 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 877
+__build__ = 878
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
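For context on the rewriter change in patch 053 above: when a closing bracket returns the depth to where a special function opened, the state machine leaves the function and resumes in WAITING, unless a function relation such as FAKE(234,234) still has bracketed arguments to consume. A stripped-down model of that single transition, keeping only the names from the patch and simplifying everything else:

~~~python
WAITING, FUNCTION_RELATION = "WAITING", "FUNCTION_RELATION"

def on_close_bracket(state, relation, open_count, special_function_brackets, in_special_function):
    """Handle one ')' token; return the updated (state, open_count, in_special_function)."""
    open_count -= 1
    if in_special_function and open_count == special_function_brackets:
        in_special_function = False
        # as in the patch: resume WAITING unless a function relation is pending
        state = WAITING if relation == "" else FUNCTION_RELATION
    return state, open_count, in_special_function

# closing the special function's bracket with no relation collected
assert on_close_bracket("COLLECTING", "", 1, 0, True) == (WAITING, 0, False)
# with a pending function relation, its arguments must still be consumed
assert on_close_bracket("COLLECTING", "FAKE", 1, 0, True) == (FUNCTION_RELATION, 0, False)
~~~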
From 5f31a870ff34eda55ff2db2b5adda6a4f4ca5727 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 5 Dec 2024 00:44:36 +0000 Subject: [PATCH 055/157] =?UTF-8?q?=E2=9C=A8=20#2100?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 12 +-- SECURITY.md | 39 ++++++-- opteryx/connectors/sql_connector.py | 16 +++- opteryx/models/physical_plan.py | 91 +++++++++++-------- opteryx/operators/aggregate_and_group_node.py | 2 +- opteryx/operators/aggregate_node.py | 2 +- opteryx/operators/async_read_node.py | 2 +- opteryx/operators/base_plan_node.py | 4 +- opteryx/operators/cross_join_node.py | 2 +- opteryx/operators/distinct_node.py | 2 +- opteryx/operators/exit_node.py | 2 +- opteryx/operators/explain_node.py | 2 +- opteryx/operators/filter_node.py | 2 +- opteryx/operators/function_dataset_node.py | 2 +- opteryx/operators/heap_sort_node.py | 2 +- opteryx/operators/inner_join_node.py | 52 +++++++---- opteryx/operators/inner_join_node_single.py | 2 +- opteryx/operators/limit_node.py | 2 +- opteryx/operators/noop_node.py | 2 +- opteryx/operators/outer_join_node.py | 2 +- opteryx/operators/projection_node.py | 2 +- opteryx/operators/pyarrow_join_node.py | 2 +- opteryx/operators/read_node.py | 2 +- opteryx/operators/set_variable_node.py | 2 +- opteryx/operators/show_columns_node.py | 2 +- opteryx/operators/show_create_node.py | 2 +- opteryx/operators/show_value_node.py | 2 +- opteryx/operators/sort_node.py | 2 +- opteryx/operators/union_node.py | 2 +- .../test_shapes_and_errors_battery.py | 2 +- 30 files changed, 163 insertions(+), 99 deletions(-) diff --git a/README.md b/README.md index e30d77378..e0108ab88 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,7 @@ Download | https://pypi.org/project/opteryx/ ## What is Opteryx? -Opteryx champions the SQL-on-everything approach, streamlining cross-platform data analytics by federating SQL queries across diverse data sources, including database systems like Postgres and datalake file formats like Parquet. The goal is to enhance your data analytics process by offering a unified way to access data from across your organization. - -Opteryx is a Python library that combines elements of in-process database engines like SQLite and DuckDB with federative features found in systems like Presto and Trino. The result is a versatile tool for querying data across multiple data sources in a seamless fashion. +Opteryx is a Python library enabling SQL queries across diverse data sources like Postgres, Parquet, and MongoDB. Opteryx champions the SQL-on-everything approach, streamlining cross-platform data analytics by federating SQL queries across diverse data sources. The goal is to enhance your data analytics process by offering a unified way to access data from across your organization. Opteryx combines the simplicity of SQLite and DuckDB with federated query capabilities found in Presto and Trino. Opteryx offers the following features: @@ -289,7 +287,6 @@ You can also try Opteryx right now using our [interactive labs](https://github.c [![Discord](https://img.shields.io/badge/discuss%20on-discord-5865F2.svg?logo=discord)](https://discord.gg/qpv2tr989x) [![Medium](https://img.shields.io/badge/Read%20on-Medium-5865F2.svg?logo=medium)](https://medium.com/opteryx) - **Get Involved** 🌟 **Star this repo** to show your support and help others discover Opteryx. 
@@ -299,14 +296,15 @@ You can also try Opteryx right now using our [interactive labs](https://github.c ❤️ We welcome [sponsorships](https://github.com/sponsors/mabel-dev) of any size. Every contribution helps us make Opteryx even better! We’re excited to have you join our journey. Let us know how we can help! + ## Security +We take security seriously. If you find any weaknesses please review our [Security Policy](https://github.com/mabel-dev/opteryx/blob/main/SECURITY.md) let us know through our [reporting process](https://github.com/mabel-dev/opteryx/security/advisories/new). + [![Static Analysis](https://github.com/mabel-dev/opteryx/actions/workflows/static_analysis.yaml/badge.svg)](https://github.com/mabel-dev/opteryx/actions/workflows/static_analysis.yml) [![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=mabel-dev_opteryx&metric=vulnerabilities)](https://sonarcloud.io/summary/new_code?id=mabel-dev_opteryx) [![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=mabel-dev_opteryx&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=mabel-dev_opteryx) -See the project [Security Policy](SECURITY.md) for information about reporting vulnerabilities. - ## License [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/mabel-dev/opteryx/blob/master/LICENSE) @@ -326,6 +324,8 @@ Opteryx is in beta. Beta means different things to different people, to us, bein - Changes are focused on feature completion, bugs, performance, reducing debt, and security - Code structure and APIs are not stable and may change +We’re actively adding features and improving performance. + ## Related Projects - **[orso](https://github.com/mabel-dev/orso)** DataFrame library diff --git a/SECURITY.md b/SECURITY.md index 7aada2e1d..d59d6c556 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,25 +1,44 @@ # Security Policy +We take security seriously and appreciate your efforts to make Opteryx more secure. + ## Supported Versions -The head of the current and previous minor version are supported for functional and security fixes. New features are only added to the latest version. Patch versions are not supported, fixes generally require the creation on a new patch version. +We support the current and previous minor versions for functional and security fixes. New features are only added to the latest version, while patch versions are created as needed for critical fixes. | Version | Supported | | ------- | ------------------ | -| 0.7 | :white_check_mark: | -| 0.8 | :white_check_mark: | -| <= 0.6 | :x: | +| 0.18 | ✅ | +| 0.17 | ✅ | +| <= 0.16 | ❌ | -All features in supported versions have support to resolve security issues regardless, however features which are due for deprecation may be removed rather than fixed. +### Key Notes -Releases may be yanked from PyPI if they contain material bugs, including security flaws. +- Features due for deprecation may be removed rather than fixed. +- Releases containing material bugs or security vulnerabilities may be yanked from PyPI. +- To stay secure, we recommend using the latest version wherever possible. ## Reporting a Vulnerability -Thank you for helping to make Opteryx more secure - Security weaknesses should be reported [via GitHub](https://github.com/mabel-dev/opteryx/security/advisories). +Thank you for helping to keep Opteryx secure! If you’ve discovered a potential vulnerability, please follow these steps: + +1. 
**Submit a Report**: Vulnerabilities should be reported through [GitHub Security Advisories](https://github.com/mabel-dev/opteryx/security/advisories). +1. **Include Details**: To help us assess the issue quickly, please include: + - A description of the vulnerability + - Steps to reproduce it + - Affected versions + - Any known mitigations +1. **Expectations**: We aim to triage and respond within 7 days. If you haven’t heard back, feel free to follow up. + +### Disclosure Timeline +- We follow a **90-day coordinated disclosure timeline** from the first contact, regardless of resolution status. +- Credit will be given to researchers unless anonymity is requested. -Please provide a description of the issue, the steps you took to create the issue, affected versions, and if known, mitigations for the issue. +## Scope of Security Issues -We will try to triage and respond to you within a week, if you do not get a response, please continue to get in touch - we appreciate your input but are a small development team who may not monitor for communications continuously. +This policy covers vulnerabilities that may compromise: +- Data confidentiality, integrity, or availability +- System functionality or integrity +- Compliance with security standards -This project follows a 90 day disclosure timeline (from first contact) regardless of response or resolution. +We appreciate your cooperation in helping us maintain a secure and reliable system for the Opteryx community. diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py index 7b807811c..96743960c 100644 --- a/opteryx/connectors/sql_connector.py +++ b/opteryx/connectors/sql_connector.py @@ -198,8 +198,6 @@ def read_dataset( # type:ignore b = time.monotonic_ns() morsel = DataFrame(schema=result_schema, rows=batch_rows).arrow() convert_time += time.monotonic_ns() - b - yield morsel - at_least_once = True # Dynamically adjust chunk size based on the data size, we start by downloading # 500 records to get an idea of the row size, assuming these 500 are @@ -213,6 +211,9 @@ def read_dataset( # type:ignore self.chunk_size = min(self.chunk_size, 1000000) # cap at 1 million # DEBUG: log (f"CHANGING CHUNK SIZE TO {self.chunk_size} was {INITIAL_CHUNK_SIZE}.") + yield morsel + at_least_once = True + if not at_least_once: yield DataFrame(schema=result_schema).arrow() @@ -236,9 +237,16 @@ def get_dataset_schema(self) -> RelationSchema: name=column.name, type=PYTHON_TO_ORSO_MAP[column.type.python_type], precision=( - column.type.precision if column.type.precision is not None else 38 + column.type.precision + if hasattr(column.type, "precision") + and column.type.precision is not None + else 38 + ), + scale=( + column.type.scale + if hasattr(column.type, "scale") and column.type.scale is not None + else 14 ), - scale=(column.type.scale if column.type.scale is not None else 14), nullable=column.nullable, ) for column in table.columns diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index a7be78aaa..58d5ccfa3 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -31,11 +31,14 @@ active_task_lock = Lock() active_tasks: int = 0 +CONCURRENT_WORKERS = 2 + def active_tasks_increment(value: int): global active_tasks with active_task_lock: active_tasks += value + print("AT", active_tasks) class PhysicalPlan(Graph): @@ -139,19 +142,11 @@ def mark_node_exhausted(node_id): if isinstance(self[node_id], ReaderNode): return - # Notify downstream nodes - downstream_nodes = 
self.outgoing_edges(node_id) - if len(downstream_nodes) > 1: - raise InvalidInternalStateError("Cannot FORK execution") - elif len(downstream_nodes) == 1: - _, downstream_node, _ = downstream_nodes[0] - # Check if all parents of downstream_node are exhausted - if all( - node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(downstream_node) - ): - work_queue.put((node_id, EOS)) # EOS signals exhaustion - active_tasks_increment(+1) - morsel_accounting[node_id] += 1 + for _, _, join_leg in self.ingoing_edges(node_id): + # Queue the task for node with the correct join_leg + work_queue.put((node_id, join_leg, EOS)) # EOS signals exhaustion + active_tasks_increment(+1) + morsel_accounting[node_id] += 1 def update_morsel_accounting(node_id, morsel_count_change: int): """ @@ -166,13 +161,13 @@ def update_morsel_accounting(node_id, morsel_count_change: int): """ with morsel_lock: morsel_accounting[node_id] += morsel_count_change - # print( - # "ACCOUNT", - # node_id, - # morsel_accounting[node_id], - # morsel_count_change, - # self[node_id].name, - # ) + print( + "ACCOUNT", + node_id, + morsel_accounting[node_id], + morsel_count_change, + self[node_id].name, + ) if morsel_accounting[node_id] < 0: raise InvalidInternalStateError("Node input and output count in invalid state.") @@ -180,11 +175,14 @@ def update_morsel_accounting(node_id, morsel_count_change: int): # Check if the node is exhausted if morsel_accounting[node_id] == 0: # No more pending morsels for this node # Ensure all parent nodes are exhausted - all_parents_exhausted = all( - node_exhaustion[parent] for parent, _, _ in self.ingoing_edges(node_id) + all_providers_exhausted = all( + node_exhaustion[provider] for provider, _, _ in self.ingoing_edges(node_id) ) - if all_parents_exhausted: + if all_providers_exhausted: + print("providers exhausted", node_exhaustion) mark_node_exhausted(node_id) + else: + print("providers not exhausted", node_exhaustion) if not self.is_acyclic(): raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") @@ -198,6 +196,18 @@ def update_morsel_accounting(node_id, morsel_count_change: int): if head_node is None: head_node = self[head_nodes[0]] + # add the left/right labels to the edges coming into the joins + joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] + for nid, join in joins: + for s, t, r in self.breadth_first_search(nid, reverse=True): + source_relations = self[s].parameters.get("all_relations", set()) + if set(join._left_relation).intersection(source_relations): + self.remove_edge(s, t, r) + self.add_edge(s, t, "left") + elif set(join._right_relation).intersection(source_relations): + self.remove_edge(s, t, r) + self.add_edge(s, t, "right") + # Special case handling for 'Explain' queries if isinstance(head_node, ExplainNode): yield self.explain(head_node.analyze), ResultType.TABULAR @@ -222,13 +232,13 @@ def worker_process(): if task is None: break - node_id, morsel = task + node_id, join_leg, morsel = task operator = self[node_id] - results = operator(morsel) + results = operator(morsel, join_leg) for result in results: # Send results back to the response queue - response_queue.put((node_id, result)) + response_queue.put((node_id, join_leg, result)) update_morsel_accounting(node_id, -1) @@ -255,31 +265,38 @@ def inner_execute(plan): # Main engine loop processes pump nodes and coordinates work for pump_nid, pump_instance in pump_nodes: - for morsel in pump_instance(None): + for morsel in pump_instance(None, None): # Initial morsels pushed 
to the work queue determine downstream operators - next_nodes = [target for _, target, _ in self.outgoing_edges(pump_nid)] - for downstream_node in next_nodes: + next_nodes = [ + (target, join_leg) + for _, target, join_leg in self.outgoing_edges(pump_nid) + ] + for downstream_node, join_leg in next_nodes: # DEBUG: log (f"following initial {self[pump_nid].name} triggering {self[downstream_node].name}") # Queue tasks for downstream operators - work_queue.put((downstream_node, morsel)) + work_queue.put((downstream_node, join_leg, morsel)) active_tasks_increment(+1) update_morsel_accounting(downstream_node, +1) # Pump is exhausted after emitting all morsels + print("pump exhausted", pump_nid) mark_node_exhausted(pump_nid) # Process results from the response queue def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) - queues_empty = work_queue.empty() and response_queue.empty() all_nodes_inactive = active_tasks <= 0 - return all_nodes_exhausted and queues_empty and all_nodes_inactive + return all_nodes_exhausted and all_nodes_inactive while not should_stop(): # Wait for results from workers + print(active_tasks) + print("*", end="", flush=True) try: - node_id, result = response_queue.get(timeout=0.1) + node_id, join_leg, result = response_queue.get(timeout=0.1) + print("-", end="") except Empty: + print(".", end="") continue # if a thread threw a error, we get them in the main @@ -293,18 +310,20 @@ def should_stop(): continue # Determine downstream operators - downstream_nodes = [target for _, target, _ in self.outgoing_edges(node_id)] + downstream_nodes = [ + (target, join_leg) for _, target, join_leg in self.outgoing_edges(node_id) + ] if len(downstream_nodes) == 0: # Exit node if result is not None: yield result # Emit the morsel immediately active_tasks_increment(-1) # Mark the task as completed continue - for downstream_node in downstream_nodes: + for downstream_node, join_leg in downstream_nodes: # Queue tasks for downstream operators active_tasks_increment(+1) # DEBUG: log (f"following {self[node_id].name} triggering {self[downstream_node].name}") - work_queue.put((downstream_node, result)) + work_queue.put((downstream_node, join_leg, result)) update_morsel_accounting(downstream_node, +1) # decrement _after_ we've done the work relation to handling the task diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index 28ad741f3..a06dbfc4b 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -104,7 +104,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Group" - def execute(self, morsel: pyarrow.Table): + def execute(self, morsel: pyarrow.Table, **kwargs): if morsel == EOS: # merge all the morsels together into one table, selecting only the columns # we're pretty sure we're going to use - this will fail for datasets diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 81142d930..422be5bdf 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -218,7 +218,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Aggregation" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: if _is_count_star(self.aggregates): yield _count_star( diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 
81dbd1b61..8dc607bb0 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -84,7 +84,7 @@ def __init__(self, properties: QueryProperties, **parameters): def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover raise NotImplementedError() - def execute(self, morsel) -> Generator: + def execute(self, morsel, **kwargs) -> Generator: from opteryx import system_statistics """Perform this step, time how long is spent doing work""" diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index cd815092e..347bc1543 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -89,14 +89,14 @@ def __str__(self) -> str: def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: no cover pass - def __call__(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: + def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Table]: if hasattr(morsel, "num_rows"): self.records_in += morsel.num_rows self.bytes_in += morsel.nbytes self.calls += 1 # set up the execution of the operator - generator = self.execute(morsel) + generator = self.execute(morsel, join_leg=join_leg) while True: try: diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index f2a4b08f8..c59752515 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -323,7 +323,7 @@ def config(self): # pragma: no cover filters = f"({self._unnest_target.name} IN ({', '.join(self._filters)}))" return f"CROSS JOIN {filters}" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if not self.continue_executing: yield None return diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 384bef653..8862fd9e1 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -48,7 +48,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Distinction" - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: from opteryx.compiled.structures import distinct # We create a HashSet outside the distinct call, this allows us to pass diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index c12889394..83308bfae 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -64,7 +64,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Exit" - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: # Exit doesn't return EOS if morsel == EOS: yield None diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index c52e7035e..c90b1444e 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -43,6 +43,6 @@ def config(self): def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: if self._query_plan: yield self._query_plan.explain(self.analyze) diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index ff8940aad..d459bd6ba 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -55,7 +55,7 @@ def config(self): # pragma: no cover def name(self): # pragma: 
no cover return "Filter" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: yield None return diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index c0ae8612f..5fc6c5a7a 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -123,7 +123,7 @@ def name(self): # pragma: no cover def can_push_selection(self): return False - def execute(self, morsel) -> Generator: + def execute(self, morsel, **kwargs) -> Generator: try: start_time = time.time_ns() data = DATASET_FUNCTIONS[self.function](**self.parameters) # type:ignore diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 24c419a71..2c43bb27b 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -82,7 +82,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Heap Sort" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: yield self.table return diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index d655ef494..04654aa8b 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -85,9 +85,10 @@ def __init__(self, properties: QueryProperties, **parameters): self._right_columns = parameters.get("right_columns") self._right_relation = parameters.get("right_relation_names") - self.stream = "left" self.left_buffer = [] + self.right_buffer = [] self.left_hash = None + self.left_complete = False @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -101,10 +102,12 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel: Table) -> Table: - if self.stream == "left": + def execute(self, morsel: Table, join_leg: str) -> Table: + print(join_leg, type(morsel)) + + if join_leg == "left": if morsel == EOS: - self.stream = "right" + self.left_complete = True self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") self.left_buffer.clear() @@ -116,21 +119,36 @@ def execute(self, morsel: Table) -> Table: ) self.left_hash = hash_join_map(self.left_relation, self._left_columns) + + for right_morsel in self.right_buffer: + yield inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=right_morsel, + join_columns=self._right_columns, + hash_table=self.left_hash, + ) + self.right_buffer.clear() + return else: self.left_buffer.append(morsel) yield None return - if morsel == EOS: - yield None - return - - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=self.left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=self.left_hash, - ) - - yield new_morsel + if join_leg == "right": + if morsel == EOS: + yield None + return + if not self.left_complete: + self.right_buffer.append(morsel) + yield None + return + + # do the join + new_morsel = inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=morsel, + join_columns=self._right_columns, + hash_table=self.left_hash, + ) + + yield new_morsel diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index b21e97aa7..16860c9b7 100644 --- 
a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -187,7 +187,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if self.stream == "left": if morsel == EOS: self.stream = "right" diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index 5db185dea..427fce179 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -47,7 +47,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return str(self.limit) + " OFFSET " + str(self.offset) - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: yield None return diff --git a/opteryx/operators/noop_node.py b/opteryx/operators/noop_node.py index 5b327d593..1a7139459 100644 --- a/opteryx/operators/noop_node.py +++ b/opteryx/operators/noop_node.py @@ -39,6 +39,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: print("NOOP was called") yield morsel diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 479dc98ef..85a06bfec 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -284,7 +284,7 @@ def config(self): # pragma: no cover return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" return f"{self._join_type.upper()}" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if self.stream == "left": if morsel == EOS: self.stream = "right" diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 4f150ea72..6efac153f 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -62,7 +62,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Projection" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: yield None return diff --git a/opteryx/operators/pyarrow_join_node.py b/opteryx/operators/pyarrow_join_node.py index 6592c25a8..a0bb29ada 100644 --- a/opteryx/operators/pyarrow_join_node.py +++ b/opteryx/operators/pyarrow_join_node.py @@ -62,7 +62,7 @@ def config(self): # pragma: no cover return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" return f"{self._join_type.upper()}" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if self.stream == "left": if morsel == EOS: self.stream = "right" diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 6e660a4a2..7b1c284a3 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -188,7 +188,7 @@ def config(self): f"{' WITH(' + ','.join(self.parameters.get('hints')) + ')' if self.parameters.get('hints') else ''})" ) - def execute(self, morsel) -> Generator: + def execute(self, morsel, **kwargs) -> Generator: """Perform this step, time how long is spent doing work""" morsel = None diff --git 
a/opteryx/operators/set_variable_node.py b/opteryx/operators/set_variable_node.py index 02676434d..bf234c468 100644 --- a/opteryx/operators/set_variable_node.py +++ b/opteryx/operators/set_variable_node.py @@ -43,6 +43,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return f"{self.variable} TO {self.value}" - def execute(self, morsel) -> NonTabularResult: + def execute(self, morsel, **kwargs) -> NonTabularResult: self.variables[self.variable] = self.value return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py index 7388babc8..12443db9e 100644 --- a/opteryx/operators/show_columns_node.py +++ b/opteryx/operators/show_columns_node.py @@ -71,7 +71,7 @@ def rename_column(self, dic: dict, renames) -> dict: dic["name"] = renames[dic["name"]] return dic - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: from orso import DataFrame if self.seen: diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index d76d95d9b..469cdf003 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -44,7 +44,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel: pyarrow.Table) -> pyarrow.Table: + def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if self.object_type == "VIEW": from opteryx.planner.views import is_view from opteryx.planner.views import view_as_sql diff --git a/opteryx/operators/show_value_node.py b/opteryx/operators/show_value_node.py index f223363bb..507c2e0ac 100644 --- a/opteryx/operators/show_value_node.py +++ b/opteryx/operators/show_value_node.py @@ -53,7 +53,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel) -> Generator: + def execute(self, morsel, **kwargs) -> Generator: buffer = [{"name": self.key, "value": str(self.value)}] table = pyarrow.Table.from_pylist(buffer) yield table diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index 87e471ceb..a3fea0672 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -50,7 +50,7 @@ def config(self): # pragma: no cover def name(self): # pragma: no cover return "Sort" - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: if morsel != EOS: self.morsels.append(morsel) yield None diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index 430c5f785..f3855858f 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -44,7 +44,7 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return "" - def execute(self, morsel: Table) -> Table: + def execute(self, morsel: Table, **kwargs) -> Table: """ Union needs to ensure the column names are the same and that coercible types are coerced. 
diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 6757ac024..b89c78789 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -93,7 +93,7 @@ # Does the error tester work ("THIS IS NOT VALID SQL", None, None, SqlError), - # Randomly generated but consistently tested queries + # Randomly generated but consistently tested queries (note we have a fuzzer in the full suite) ("SELECT * FROM $planets WHERE `name` = 'Earth'", 1, 20, None), ("SELECT * FROM $planets WHERE name = 'Mars'", 1, 20, None), ("SELECT * FROM $planets WHERE name <> 'Venus'", 8, 20, None), From 2d00e920c22e28ed4a33db44927cbf5842309971 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 5 Dec 2024 00:45:03 +0000 Subject: [PATCH 056/157] Opteryx Version 0.19.0-alpha.879 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 7b4f665be..923b4626e 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 878 +__build__ = 879 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From de42c355327d86371971f08fabe6e5bce89ac481 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 10 Dec 2024 23:21:47 +0000 Subject: [PATCH 057/157] #2100 --- .github/ISSUE_TEMPLATE/bug_report.md | 16 +- .github/ISSUE_TEMPLATE/chore_request.md | 6 - .github/ISSUE_TEMPLATE/improvement_request.md | 13 +- .github/PULL_REQUEST_TEMPLATE.md | 23 ++- README.md | 2 + opteryx/compiled/list_ops/cython_list_ops.pyx | 36 +++-- opteryx/compiled/structures/node.pyx | 65 ++++++--- opteryx/managers/expression/ops.py | 44 +++--- opteryx/models/physical_plan.py | 137 +++++++++++------- opteryx/operators/base_plan_node.py | 6 +- opteryx/operators/inner_join_node.py | 113 ++++++++------- opteryx/operators/inner_join_node_single.py | 4 +- opteryx/operators/outer_join_node.py | 4 +- opteryx/operators/pyarrow_join_node.py | 4 +- opteryx/operators/read_node.py | 1 + .../planner/cost_based_optimizer/__init__.py | 1 + .../logical_planner/logical_planner.py | 84 ++++++++--- opteryx/third_party/travers/graph.py | 3 +- testdata/flat/atquestion/data.jsonl | 6 + .../test_shapes_and_errors_battery.py | 42 ++++++ 20 files changed, 413 insertions(+), 197 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/chore_request.md create mode 100644 testdata/flat/atquestion/data.jsonl diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index df42a77b0..bc042d0fe 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,18 +7,28 @@ labels: "Bug 🪲" --- ### Thank you for taking the time to report a problem with Opteryx. -_To help us to respond to your request we ask that you try to provide the below detail about the bug._ +_To help us respond to your request, please provide the following details about the bug._ -**Describe the bug** _A clear and specific description of what the bug is. What the error, incorrect or unexpected behaviour was._ +--- + +**Describe the bug** _A clear and specific description of the bug. 
What error, incorrect, or unexpected behavior occurred?_ +--- **Expected behaviour** _A clear and concise description of what you expected to happen._ +--- **Sample Code/Statement** _If you can, please submit the SQL statement or Python code snippet, or a representative example using the sample datasets._ ~~~sql +-- Example SQL query here +~~~ +~~~python +# Example Python code here ~~~ -**Additional context** _Add any other context about the problem here, for example what you have done to try to diagnose or workaround the problem._ +--- + +**Additional context** _Add any other context about the problem here, for example what you have done to try to diagnose or workaround the problem, logs or error messages, the version of Opteryx and environment details (Python version, OS)_ diff --git a/.github/ISSUE_TEMPLATE/chore_request.md b/.github/ISSUE_TEMPLATE/chore_request.md deleted file mode 100644 index 4c1ba04af..000000000 --- a/.github/ISSUE_TEMPLATE/chore_request.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: Maintenance Request -about: Tasks to ensure the quality of the system is preserved -title: "🧹" - ---- diff --git a/.github/ISSUE_TEMPLATE/improvement_request.md b/.github/ISSUE_TEMPLATE/improvement_request.md index 67c2f529b..2d9030b90 100644 --- a/.github/ISSUE_TEMPLATE/improvement_request.md +++ b/.github/ISSUE_TEMPLATE/improvement_request.md @@ -5,12 +5,21 @@ title: "✨" --- -### Thanks for stopping by to let us know something could be better! +### Thank you for helping maintain the quality and performance of Opteryx. +_We appreciate your efforts — your contribution ensures that Opteryx remains reliable and efficient for everyone._ -**Is your feature request related to a problem? Please describe.** _A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]_ +--- + +**Is your feature request related to a problem? Please describe.** _A clear and concise description of what the problem is. (e.g., "I'm always frustrated when [...]")_ + +--- **Describe the solution you'd like** _A clear and concise description of what you want to happen._ +--- + **Describe alternatives you've considered** _A clear and concise description of any alternative solutions or features you've considered._ +--- + **Additional context** _Add any other context or screenshots about the feature request here._ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index b87f0dc20..60a84b299 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,10 +1,19 @@ -Thank you for opening a Pull Request! +## Thank you for opening a Pull Request! -Before submitting your PR, there are a few things you can do to help make sure it goes smoothly: +We appreciate your contribution to Opteryx. Your time and effort make a difference, and we’re excited to review your changes. To help ensure a smooth review process, please check the following: -- [ ] Raise a [bug/feature request](https://github.com/mabel-dev/opteryx/issues/new/choose) or start a [discussion](https://github.com/mabel-dev/opteryx/discussions/landing), ideally before writing your code! That way we can discuss the change and the approach. 
-- [ ] Ensure the tests pass -- [ ] Ensure code coverage does not decrease (if any source code was changed) -- [ ] Appropriate docs and tests were updated (if necessary) +### **Checklist for a Successful PR** -Fixes # \ No newline at end of file +- [ ] **Start the conversation:** If you haven’t already, raise a [bug/feature request](https://github.com/mabel-dev/opteryx/issues/new/choose) or start a [discussion](https://github.com/mabel-dev/opteryx/discussions/landing). This ensures alignment on the change and approach. +- [ ] **Run the tests:** Confirm that all tests pass without errors. +- [ ] **Maintain code coverage:** If you’ve added or modified source code ensure new tests are added to the test suite. +- [ ] **Update documentation and tests (if applicable):** If your changes impact functionality, make sure the relevant docs and test cases are updated. + +--- + +### **Fixes: ``** +Please replace `` with the corresponding issue number. + +--- + +Thank you for contributing to Opteryx! 🎉 \ No newline at end of file diff --git a/README.md b/README.md index e0108ab88..eaac8afc8 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ _A unified SQL interface to unlock insights across your diverse data sources, from blobs stores to databases - **effortless cross-platform data analytics**._ +![Opteryx](https://raw.githubusercontent.com/mabel-dev/opteryx.dev/main/assets/terminal.gif) + Resource | Location ------------- | ------------- Source Code | https://github.com/mabel-dev/opteryx diff --git a/opteryx/compiled/list_ops/cython_list_ops.pyx b/opteryx/compiled/list_ops/cython_list_ops.pyx index bc552b41e..697c21648 100644 --- a/opteryx/compiled/list_ops/cython_list_ops.pyx +++ b/opteryx/compiled/list_ops/cython_list_ops.pyx @@ -59,19 +59,35 @@ cpdef cnp.ndarray[cnp.npy_bool, ndim=1] cython_allop_neq(object literal, cnp.nda cpdef cnp.ndarray[cnp.npy_bool, ndim=1] cython_anyop_eq(object literal, cnp.ndarray arr): - cdef: - cdef Py_ssize_t i, j - cdef Py_ssize_t num_rows = arr.shape[0] - cnp.ndarray[cnp.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=bool) - cnp.ndarray row + """ + Check each row in arr for the presence of `literal`. If found, mark the corresponding + position in the result as True, otherwise False. + + Parameters: + literal: object + The value to search for in each row. + arr: cnp.ndarray + A two-dimensional array-like structure where each element is a sub-array (row). + + Returns: + cnp.ndarray[cnp.npy_bool, ndim=1] + A boolean array indicating for each row whether `literal` was found. + """ + cdef Py_ssize_t i, j, num_rows, row_length + num_rows = arr.shape[0] + + cdef cnp.ndarray[cnp.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=bool) + cdef cnp.ndarray row for i in range(num_rows): row = arr[i] - if row is not None and row.shape[0] > 0: - for j in range(row.shape[0]): - if row[j] == literal: - result[i] = True - break + if row is not None: + row_length = row.shape[0] + if row_length > 0: + for j in range(row_length): + if row[j] == literal: + result[i] = True + break return result diff --git a/opteryx/compiled/structures/node.pyx b/opteryx/compiled/structures/node.pyx index 80963275a..b6ee5f7e1 100644 --- a/opteryx/compiled/structures/node.pyx +++ b/opteryx/compiled/structures/node.pyx @@ -32,28 +32,51 @@ during execution for about 0.2 seconds, Cython runs this class approx 33% faster raw Python version. 
""" + from cpython cimport dict +from uuid import uuid4 cdef class Node: cdef: dict _properties object node_type + str uuid def __cinit__(self, node_type, **attributes): + """ + Initialize a new Node with a given node_type and optional attributes. + A UUID is automatically generated for the node. + """ self.node_type = node_type + self.uuid = str(uuid4()) self._properties = dict(attributes) def __getattr__(self, str name): + """ + Get an attribute: + - If name is 'node_type' or 'uuid', return the stored value. + - Otherwise, return the corresponding entry in _properties or None if not found. + """ if name == 'node_type': return self.node_type + if name == 'uuid': + return self.uuid try: return self._properties[name] except KeyError: return None def __setattr__(self, str name, object value): + """ + Set an attribute: + - If name is 'node_type' or 'uuid', store directly on the object. + - If value is None, remove it from _properties. + - Otherwise, store in _properties. + """ if name == 'node_type': self.node_type = value + elif name == 'uuid': + self.uuid = value elif value is None: self._properties.pop(name, None) else: @@ -61,40 +84,46 @@ cdef class Node: @property def properties(self): - # Merge _properties with node_type and return as a new dict - return {'node_type': self.node_type, **self._properties} + """ + Return a dictionary of all node properties, including node_type and uuid. + Dynamic attributes stored in _properties are merged. + """ + return { + 'node_type': self.node_type, + 'uuid': self.uuid, + **self._properties + } def get(self, str name, object default=None): + """ + Get an attribute from _properties with an optional default. + """ return self._properties.get(name, default) def __str__(self): + """ + Return a JSON representation of the node's properties, including node_type and uuid. + """ import orjson - # Serialize the full properties including node_type return orjson.dumps(self.properties, default=str).decode('utf-8') def __repr__(self): - cdef str node_type = str(self.node_type) - cdef str node_type_str = node_type[20:] if node_type.startswith("LogicalPlanStepType.") else node_type + """ + Return a string representation of the node, including its type. + """ + cdef str node_type_str = str(self.node_type) + if node_type_str.startswith("LogicalPlanStepType."): + node_type_str = node_type_str[20:] return f"" def copy(self) -> "Node": """ Create an independent deep copy of the node. - - Returns: - Node: The new, independent deep copy. + This generates a new uuid for the copied node, making it a distinct entity. """ - def _inner_copy(obj): """ - Create an independent inner copy of the given object. - - Parameters: - obj: Any - The object to be deep copied. - - Returns: - Any: The new, independent deep copy. + Create a deep copy of the given object, handling collections and objects with custom copy methods. 
""" obj_type = type(obj) if obj_type in (list, tuple, set): @@ -108,5 +137,7 @@ cdef class Node: return copy.deepcopy(obj) return obj + # Create a new Node with the same node_type and a deep copy of _properties new_node = Node(self.node_type, **{key: _inner_copy(value) for key, value in self._properties.items()}) + new_node.uuid = self.uuid return new_node diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index 58615fc16..51140abaf 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -232,34 +232,42 @@ def _inner_filter_operations(arr, operator, value): if operator == "AtQuestion": element = value[0] - if len(arr) > 0 and isinstance(arr[0], dict): - return [element in d for d in arr] - import simdjson parser = simdjson.Parser() if not element.startswith("$."): - # Don't warn on rule SIM118, the object isn't actually a dictionary + # Not a JSONPath, treat as a simple key existence check return pyarrow.array( [element in parser.parse(doc).keys() for doc in arr], - type=pyarrow.bool_(), # type:ignore + type=pyarrow.bool_(), # type: ignore ) - _keys = element[2:].split(".") - - def json_path_extract(current_value, keys): - for key in keys: - if key not in current_value: - return False # Key doesn't exist - - # Proceed to the next level of the JSON object - current_value = current_value[key] - return True # Key exists if traversal succeeds - + # Convert "$.key1.list[0]" to JSON Pointer "/key1/list/0" + def jsonpath_to_pointer(jsonpath: str) -> str: + # Remove "$." prefix + json_pointer = jsonpath[1:] + # Replace "." with "/" for dict navigation + json_pointer = json_pointer.replace(".", "/") + # Replace "[index]" with "/index" for list access + json_pointer = json_pointer.replace("[", "/").replace("]", "") + return json_pointer + + # Convert "$.key1.key2" to JSON Pointer "/key1/key2" + json_pointer = jsonpath_to_pointer(element) + + def check_json_pointer(doc, pointer): + try: + # Try accessing the path via JSON Pointer + parser.parse(doc).at_pointer(pointer) + return True # If successful, the path exists + except Exception: + return False # If an error occurs, the path does not exist + + # Apply the JSON Pointer check return pyarrow.array( - [json_path_extract(parser.parse(doc), _keys) for doc in arr], - type=pyarrow.bool_(), # type:ignore + [check_json_pointer(doc, json_pointer) for doc in arr], + type=pyarrow.bool_(), ) if operator == "AtArrow": diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 58d5ccfa3..ce0fd2788 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -29,24 +29,26 @@ morsel_lock = Lock() active_task_lock = Lock() -active_tasks: int = 0 CONCURRENT_WORKERS = 2 -def active_tasks_increment(value: int): - global active_tasks - with active_task_lock: - active_tasks += value - print("AT", active_tasks) - - class PhysicalPlan(Graph): """ The execution tree is defined separately from the planner to simplify the complex code that is the planner from the tree that describes the plan. 
""" + def __init__(self): + super().__init__() + self.active_tasks: int = 0 + self.recheck_exhausted: list = [] + + def active_tasks_increment(self, value: int): + with active_task_lock: + self.active_tasks += value + print("AT", self.active_tasks) + def depth_first_search_flat( self, node: Optional[str] = None, visited: Optional[set] = None ) -> list: @@ -127,8 +129,12 @@ def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any] from opteryx.operators import ShowCreateNode from opteryx.operators import ShowValueNode - morsel_accounting = {nid: 0 for nid in self.nodes()} # Total morsels received by each node - node_exhaustion = {nid: False for nid in self.nodes()} # Exhaustion state of each node + morsel_accounting = {nid: 0 for nid in self.nodes()} + node_exhaustion = {nid: False for nid in self.nodes()} + # Work queue for worker tasks + work_queue = Queue() + # Response queue for results sent back to the engine + response_queue = Queue() def mark_node_exhausted(node_id): """ @@ -144,11 +150,12 @@ def mark_node_exhausted(node_id): for _, _, join_leg in self.ingoing_edges(node_id): # Queue the task for node with the correct join_leg + print("EOS PUT", node_id, join_leg, EOS) work_queue.put((node_id, join_leg, EOS)) # EOS signals exhaustion - active_tasks_increment(+1) + self.active_tasks_increment(+1) morsel_accounting[node_id] += 1 - def update_morsel_accounting(node_id, morsel_count_change: int): + def update_morsel_accounting(node_id, morsel_count_change: int, join_leg: str): """ Updates the morsel accounting for a node and checks for exhaustion. @@ -159,6 +166,10 @@ def update_morsel_accounting(node_id, morsel_count_change: int): Returns: None """ + + nodes_to_check = self.recheck_exhausted.copy() + [node_id] + self.recheck_exhausted.clear() + with morsel_lock: morsel_accounting[node_id] += morsel_count_change print( @@ -169,20 +180,24 @@ def update_morsel_accounting(node_id, morsel_count_change: int): self[node_id].name, ) - if morsel_accounting[node_id] < 0: - raise InvalidInternalStateError("Node input and output count in invalid state.") - - # Check if the node is exhausted - if morsel_accounting[node_id] == 0: # No more pending morsels for this node - # Ensure all parent nodes are exhausted - all_providers_exhausted = all( - node_exhaustion[provider] for provider, _, _ in self.ingoing_edges(node_id) - ) - if all_providers_exhausted: - print("providers exhausted", node_exhaustion) - mark_node_exhausted(node_id) - else: - print("providers not exhausted", node_exhaustion) + for node in nodes_to_check: + if morsel_accounting[node_id] < 0: + raise InvalidInternalStateError( + "Node input and output count in invalid state." 
+ ) + + # Check if the node is exhausted + if morsel_accounting[node] == 0: # No more pending morsels for this node + # Ensure all parent nodes are exhausted + all_providers_exhausted = all( + node_exhaustion[provider] for provider, _, _ in self.ingoing_edges(node) + ) + if all_providers_exhausted: + print("providers exhausted", node_exhaustion) + mark_node_exhausted(node) + else: + print("providers not exhausted", node_exhaustion) + self.recheck_exhausted.append(node) if not self.is_acyclic(): raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") @@ -197,16 +212,31 @@ def update_morsel_accounting(node_id, morsel_count_change: int): head_node = self[head_nodes[0]] # add the left/right labels to the edges coming into the joins - joins = [(nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)] + joins = ((nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)) for nid, join in joins: - for s, t, r in self.breadth_first_search(nid, reverse=True): - source_relations = self[s].parameters.get("all_relations", set()) - if set(join._left_relation).intersection(source_relations): - self.remove_edge(s, t, r) - self.add_edge(s, t, "left") - elif set(join._right_relation).intersection(source_relations): - self.remove_edge(s, t, r) - self.add_edge(s, t, "right") + for provider, provider_target, provider_relation in self.ingoing_edges(nid): + reader_edges = { + (source, target, relation) + for source, target, relation in self.breadth_first_search( + provider, reverse=True + ) + } # if hasattr(self[target], "uuid")} + if hasattr(self[provider], "uuid"): + reader_edges.add((provider, provider_target, provider_relation)) + + for s, t, r in reader_edges: + if self[s].uuid in join.left_readers: + self.remove_edge(provider, nid, r) + self.add_edge(provider, nid, "left") + elif self[s].uuid in join.right_readers: + self.remove_edge(provider, nid, r) + self.add_edge(provider, nid, "right") + + tester = self.breadth_first_search(nid, reverse=True) + if not any(r == "left" for s, t, r in tester): + raise InvalidInternalStateError("Join has no LEFT leg") + if not any(r == "right" for s, t, r in tester): + raise InvalidInternalStateError("Join has no RIGHT leg") # Special case handling for 'Explain' queries if isinstance(head_node, ExplainNode): @@ -216,10 +246,6 @@ def update_morsel_accounting(node_id, morsel_count_change: int): yield head_node(None), ResultType.TABULAR else: - # Work queue for worker tasks - work_queue = Queue() - # Response queue for results sent back to the engine - response_queue = Queue() num_workers = CONCURRENT_WORKERS workers = [] @@ -240,7 +266,7 @@ def worker_process(): # Send results back to the response queue response_queue.put((node_id, join_leg, result)) - update_morsel_accounting(node_id, -1) + update_morsel_accounting(node_id, -1, join_leg) work_queue.task_done() @@ -267,30 +293,32 @@ def inner_execute(plan): for pump_nid, pump_instance in pump_nodes: for morsel in pump_instance(None, None): # Initial morsels pushed to the work queue determine downstream operators - next_nodes = [ + consumer_nodes = [ (target, join_leg) for _, target, join_leg in self.outgoing_edges(pump_nid) ] - for downstream_node, join_leg in next_nodes: - # DEBUG: log (f"following initial {self[pump_nid].name} triggering {self[downstream_node].name}") - # Queue tasks for downstream operators - work_queue.put((downstream_node, join_leg, morsel)) - active_tasks_increment(+1) - update_morsel_accounting(downstream_node, +1) + for consumer_node, join_leg in 
consumer_nodes: + # DEBUG: log (f"following initial {self[pump_nid].name} triggering {self[consumer_node].name}") + # Queue tasks for consumer operators + print("WORK PUT", consumer_node, join_leg) + work_queue.put((consumer_node, join_leg, morsel)) + self.active_tasks_increment(+1) + update_morsel_accounting(consumer_node, +1, join_leg) # Pump is exhausted after emitting all morsels print("pump exhausted", pump_nid) mark_node_exhausted(pump_nid) + update_morsel_accounting(pump_nid, 0) # Process results from the response queue def should_stop(): all_nodes_exhausted = all(node_exhaustion.values()) - all_nodes_inactive = active_tasks <= 0 + all_nodes_inactive = self.active_tasks <= 0 return all_nodes_exhausted and all_nodes_inactive while not should_stop(): # Wait for results from workers - print(active_tasks) + print(self.active_tasks) print("*", end="", flush=True) try: node_id, join_leg, result = response_queue.get(timeout=0.1) @@ -306,7 +334,7 @@ def should_stop(): # Handle Empty responses if result is None: - active_tasks_increment(-1) + self.active_tasks_increment(-1) continue # Determine downstream operators @@ -316,18 +344,19 @@ def should_stop(): if len(downstream_nodes) == 0: # Exit node if result is not None: yield result # Emit the morsel immediately - active_tasks_increment(-1) # Mark the task as completed + self.active_tasks_increment(-1) # Mark the task as completed continue for downstream_node, join_leg in downstream_nodes: # Queue tasks for downstream operators - active_tasks_increment(+1) + self.active_tasks_increment(+1) # DEBUG: log (f"following {self[node_id].name} triggering {self[downstream_node].name}") + print("WORK PUT", downstream_node, join_leg) work_queue.put((downstream_node, join_leg, result)) - update_morsel_accounting(downstream_node, +1) + update_morsel_accounting(downstream_node, +1, join_leg) # decrement _after_ we've done the work relation to handling the task - active_tasks_increment(-1) + self.active_tasks_increment(-1) for worker in workers: work_queue.put(None) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 347bc1543..aab154511 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -133,4 +133,8 @@ def sensors(self): class JoinNode(BasePlanNode): - pass + def __init__(self, *, properties, **parameters): + super().__init__(properties=properties, **parameters) + + self.left_readers = parameters.get("left_readers") + self.right_readers = parameters.get("right_readers") diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 04654aa8b..04eca47a7 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -31,6 +31,9 @@ pyarrow_ops implementation which was a variation of a sort-merge join. 
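
A hash join in miniature (a plain-Python sketch, not the vectorized
implementation below): build a hash map over the left relation's key column,
then probe it with each right-hand row.

    left = [(1, "a"), (2, "b")]            # (key, payload) rows
    right = [(2, "x"), (3, "y"), (2, "z")]

    hash_map = {}
    for i, (key, _) in enumerate(left):    # build phase, smaller side
        hash_map.setdefault(key, []).append(i)

    joined = [
        (left[i], row)                     # probe phase, streamed side
        for row in right
        for i in hash_map.get(row[0], ())
    ]
    # joined == [((2, "b"), (2, "x")), ((2, "b"), (2, "z"))]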
""" +import time +from threading import Lock + import pyarrow from pyarrow import Table @@ -75,20 +78,17 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinNode(JoinNode): def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") + self.left_columns = parameters.get("left_columns") + self.left_relation = None - self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") + self.right_columns = parameters.get("right_columns") self.left_buffer = [] self.right_buffer = [] self.left_hash = None - self.left_complete = False + + self.lock = Lock() @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -105,50 +105,61 @@ def config(self): # pragma: no cover def execute(self, morsel: Table, join_leg: str) -> Table: print(join_leg, type(morsel)) - if join_leg == "left": - if morsel == EOS: - self.left_complete = True - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - - # in place until #1295 resolved - if self._left_columns[0] not in self.left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - self.left_hash = hash_join_map(self.left_relation, self._left_columns) - - for right_morsel in self.right_buffer: - yield inner_join_with_preprocessed_left_side( - left_relation=self.left_relation, - right_relation=right_morsel, - join_columns=self._right_columns, - hash_table=self.left_hash, + with self.lock: + if join_leg == "left": + if morsel == EOS: + start = time.monotonic_ns() + self.left_relation = pyarrow.concat_tables( + self.left_buffer, promote_options="none" ) - self.right_buffer.clear() - return - else: - self.left_buffer.append(morsel) - yield None - return - - if join_leg == "right": - if morsel == EOS: + self.left_buffer.clear() + + # in place until #1295 resolved + if self.left_columns[0] not in self.left_relation.column_names: + self.right_columns, self.left_columns = ( + self.left_columns, + self.right_columns, + ) + + start = time.monotonic_ns() + self.left_hash = hash_join_map(self.left_relation, self.left_columns) + print("BUILD HASH MAP", time.monotonic_ns() - start) + self.statistics.time_build_hash_map += time.monotonic_ns() - start + + for right_morsel in self.right_buffer: + print("CLEAR") + yield inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=right_morsel, + join_columns=self.right_columns, + hash_table=self.left_hash, + ) + self.right_buffer.clear() + + return + else: + self.left_buffer.append(morsel) yield None return - if not self.left_complete: - self.right_buffer.append(morsel) - yield None - return - - # do the join - new_morsel = inner_join_with_preprocessed_left_side( - left_relation=self.left_relation, - right_relation=morsel, - join_columns=self._right_columns, - hash_table=self.left_hash, - ) - yield new_morsel + if join_leg == "right": + if morsel == EOS: + print("DONE") + yield None + return + + if self.left_hash is None: + # if we've not built the hash map, cache this morsel + self.right_buffer.append(morsel) + yield None + return + + # do the join + new_morsel = 
inner_join_with_preprocessed_left_side( + left_relation=self.left_relation, + right_relation=morsel, + join_columns=self.right_columns, + hash_table=self.left_hash, + ) + + yield new_morsel diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index 16860c9b7..aca5d154c 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -166,10 +166,10 @@ def __init__(self, properties: QueryProperties, **parameters): self._using = parameters.get("using") self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") + self.left_readers = parameters.get("left_readers") self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") + self.right_readers = parameters.get("right_readers") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 85a06bfec..6b7d6d59b 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -256,10 +256,10 @@ def __init__(self, properties: QueryProperties, **parameters): self._using = parameters.get("using") self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") + self.left_readers = parameters.get("left_readers") self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") + self.right_readers = parameters.get("right_readers") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operators/pyarrow_join_node.py b/opteryx/operators/pyarrow_join_node.py index a0bb29ada..6a4afdfe1 100644 --- a/opteryx/operators/pyarrow_join_node.py +++ b/opteryx/operators/pyarrow_join_node.py @@ -34,10 +34,10 @@ def __init__(self, properties: QueryProperties, **parameters): self._using = parameters.get("using") self._left_columns = parameters.get("left_columns") - self._left_relation = parameters.get("left_relation_names") + self.left_readers = parameters.get("left_readers") self._right_columns = parameters.get("right_columns") - self._right_relation = parameters.get("right_relation_names") + self.right_readers = parameters.get("right_readers") self.stream = "left" self.left_buffer = [] diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 7b1c284a3..3c6219eed 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -135,6 +135,7 @@ class ReaderNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) + self.uuid = parameters.get("uuid") self.start_date = parameters.get("start_date") self.end_date = parameters.get("end_date") self.hints = parameters.get("hints", []) diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 676cb9710..2663c1c7d 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -135,6 +135,7 @@ def optimize(self, plan: LogicalPlan) -> LogicalPlan: current_plan = plan for strategy in self.strategies: current_plan = self.traverse(current_plan, strategy) + pass # DEBUG: log ("AFTER OPTIMIZATION") # DEBUG: log (current_plan.draw()) return current_plan diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py 
index 510e34d3d..7dfc44c52 100644
--- a/opteryx/planner/logical_planner/logical_planner.py
+++ b/opteryx/planner/logical_planner/logical_planner.py
@@ -168,39 +168,79 @@ def __str__(self):

 def get_subplan_schemas(sub_plan: Graph) -> List[str]:
     """
-    Retrieves schemas related to exit and entry points within a given sub-plan.
+    Collects all schema aliases used within a given sub-plan.

-    This function iterates through functions named 'get_exit_points' and 'get_entry_points'
-    in the `sub_plan` object to collect the schemas of the exit and entry points.
+    This function traverses the sub-plan graph to collect aliases, including those from subqueries.
+    Aliases define the schemas used at exit and entry points of the sub-plan.

     Parameters:
         sub_plan: Graph
-            The sub-plan object containing the necessary information for processing.
+            The sub-plan object representing a branch of the logical plan.

     Returns:
         List[str]:
-            A list of schemas corresponding to the exit and entry points in the sub-plan.
+            A sorted list of unique schema aliases found within the sub-plan.
     """
-    aliases = set()

-    def traverse(nid: dict):
-        # get the actual node
-        node = sub_plan[nid["name"]]
-        # If this node is a subquery, capture its alias and stop traversing deeper
-        if node.node_type == LogicalPlanStepType.Subquery:
-            aliases.add(node.alias)
-            return  # Stop traversing this branch as it's a separate scope
-        if node.alias:
-            aliases.add(node.alias)
+    def collect_aliases(node: dict) -> List[str]:
+        """
+        Recursively traverse the graph to collect schema aliases.
+
+        Parameters:
+            node: dict
+                The current node in the graph.
+
+        Returns:
+            List[str]:
+                A list of unique schema aliases collected from the current node and its children.
+        """
+        current_node = sub_plan[node["name"]]
+
+        # Start with the alias of the current node, if it exists
+        aliases = [current_node.alias] if current_node.alias else []
+
+        # If this node is a subquery, stop traversal here
+        if current_node.node_type == LogicalPlanStepType.Subquery:
+            return aliases
+
+        # Recursively collect aliases from children
+        for child in node.get("children", []):
+            aliases.extend(collect_aliases(child))
+
+        return aliases
+
+    # Start the traversal from the root node
+    root_node = sub_plan.depth_first_search()
+    aliases = collect_aliases(root_node)
+
+    # Return sorted list of unique aliases
+    return sorted(set(aliases))
+
+
+def get_subplan_reads(sub_plan: Graph) -> List[str]:
+    """
+    Collects the uuids of the reader (Scan and FunctionDataset) nodes within a given sub-plan.
+    """
+    def collect_reads(node: dict) -> List[str]:
+        current_node = sub_plan[node["name"]]
+
+        # If this node is a reader, record its uuid and stop traversal here
+        if current_node.node_type in (
+            LogicalPlanStepType.Scan,
+            LogicalPlanStepType.FunctionDataset,
+        ):
+            return [current_node.uuid]
+
+        readers = []
+        # Recursively collect reader uuids from children
+        for child in node.get("children", []):
+            readers.extend(collect_reads(child))

-        # Otherwise, continue traversing the children
-        for child in nid.get("children", []):
-            traverse(child)
+        return readers

-    # Start the traversal from the provided node
-    traverse(sub_plan.depth_first_search())
+    # Start the traversal from the root node
+    root_node = sub_plan.depth_first_search()
+    readers = collect_reads(root_node)

-    return list(aliases)
+    # Return sorted list of unique reader uuids
+    return sorted(set(readers))


 """
@@ -732,7 +772,9 @@ def create_node_relation(relation):

         # add the left and right relation names - we sometimes need these later
         join_step.left_relation_names = get_subplan_schemas(sub_plan)
+        join_step.left_readers = get_subplan_reads(sub_plan)
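+        # readers are recorded by uuid; the physical planner uses them to wire
+        # the left/right legs of the join to the correct source operators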
join_step.right_relation_names = get_subplan_schemas(right_plan) + join_step.right_readers = get_subplan_reads(right_plan) # add the right side of the join sub_plan += right_plan diff --git a/opteryx/third_party/travers/graph.py b/opteryx/third_party/travers/graph.py index 4ac779a4e..9f3358a9b 100644 --- a/opteryx/third_party/travers/graph.py +++ b/opteryx/third_party/travers/graph.py @@ -191,6 +191,7 @@ def breadth_first_search( visited = {source} queue = deque([(source, 0)]) seeker = self.ingoing_edges if reverse else self.outgoing_edges + offset = 0 if reverse else 1 traversed_edges = [] @@ -199,7 +200,7 @@ def breadth_first_search( if current_depth < depth: for edge in seeker(current_node): - _, target, _ = edge + target = edge[offset] # Add the edge to the traversed edges list traversed_edges.append(edge) diff --git a/testdata/flat/atquestion/data.jsonl b/testdata/flat/atquestion/data.jsonl new file mode 100644 index 000000000..faceb685b --- /dev/null +++ b/testdata/flat/atquestion/data.jsonl @@ -0,0 +1,6 @@ +{"id": 1, "dict": {"list": [1, 2, 3], "key": "value"}, "nested": {"level1": {"key": "val"}}} +{"id": 2, "dict": {"list": [4, 5]}, "nested": {"level1": {"key": null}}} +{"id": 3, "dict": {"other_list": [6, 7, 8]}, "nested": {"level1": {}}} +{"id": 4, "dict": {"list": [], "key": "another_value"}, "nested": {}} +{"id": 5, "dict": {}, "nested": {"level1": {"key": "val"}}} +{"id": 6, "dict": {"list": [9], "nested_list": [{"key": "a"}, {"key": "b"}]}, "nested": {"level1": {"key": "val"}}} \ No newline at end of file diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index b89c78789..654a008fb 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -675,6 +675,7 @@ ("SELECT cve -> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve ->> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve -> 'CVE_data_meta' -> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), + ("SELECT dict @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT struct(dict) @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT birth_place @? 'town' FROM $astronauts", 357, 1, None), @@ -686,6 +687,47 @@ ("SELECT cve @? '$.CVE_data_meta.REASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), ("SELECT struct(dict) @? '$.list' FROM testdata.flat.struct", 6, 1, None), ("SELECT birth_place @? '$.town' FROM $astronauts", 357, 1, None), + + ("SELECT dict @? 'list' FROM testdata.flat.atquestion", 6, 1, None), # List exists in all but id=5 + ("SELECT dict @? 'key' FROM testdata.flat.atquestion", 6, 1, None), # Key exists in id=1 and id=4 + ("SELECT dict @? 'other_list' FROM testdata.flat.atquestion", 6, 1, None), # Only exists in id=3 + ("SELECT dict @? '$.list[0]' FROM testdata.flat.atquestion", 6, 1, None), # First element of list exists in id=1, id=2, id=6 + ("SELECT dict @? '$.list[2]' FROM testdata.flat.atquestion", 6, 1, None), # Third element of list exists in id=1 + ("SELECT dict @? '$.nested_list[0].key' FROM testdata.flat.atquestion", 6, 1, None), # Nested key exists in id=6 + ("SELECT dict @? '$.non_existent' FROM testdata.flat.atquestion", 6, 1, None), # Non-existent key + ("SELECT dict @? '$.list[100]' FROM testdata.flat.atquestion", 6, 1, None), # Out-of-bounds index + ("SELECT dict @? 
'$.nested_list[10]' FROM testdata.flat.atquestion", 6, 1, None), # Non-existent nested list index + ("SELECT dict @? '$.nested_list[0].non_existent' FROM testdata.flat.atquestion", 6, 1, None), # Non-existent nested key + ("SELECT nested @? '$.level1.key' FROM testdata.flat.atquestion", 6, 1, None), # Key exists but null in id=2 + ("SELECT nested @? '$.level1.non_existent' FROM testdata.flat.atquestion", 6, 1, None), # Non-existent key in level1 + ("SELECT nested @? '$.non_existent' FROM testdata.flat.atquestion", 6, 1, None), # Completely missing structure + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'list'", 4, 1, None), # Rows where 'list' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'key'", 2, 1, None), # Rows where 'key' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'other_list'", 1, 1, None), # Rows where 'other_list' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[0]'", 3, 1, None), # Rows where the first element of 'list' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[2]'", 1, 1, None), # Rows where the third element of 'list' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.nested_list[0].key'", 1, 1, None), # Rows where 'nested_list[0].key' exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.non_existent'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[100]'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.nested_list[10]'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.nested_list[0].non_existent'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE nested @? '$.level1.key'", 4, 1, None), # Rows where 'level1.key' exists (null is still considered existing) + ("SELECT id FROM testdata.flat.atquestion WHERE nested @? '$.level1.non_existent'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE nested @? '$.non_existent'", 0, 1, None), # No rows should match + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[0]' LIMIT 2", 2, 1, None), # Limit the matching rows to 2 + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[0]' LIMIT 10", 3, 1, None), # Limit exceeds matching rows + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'list'", 4, 1, None), # Check existence of 'list' + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'key'", 2, 1, None), # Check existence of 'key' + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.nested_list[0].key'", 1, 1, None), # Check nested array structure + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[2]'", 1, 1, None), # Index exists in one row + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.list[10]'", 0, 1, None), # Out-of-bounds index + ("SELECT id FROM testdata.flat.atquestion WHERE nested @? '$.level1.key'", 4, 1, None), # Null value still exists + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? '$.key'", 2, 1, None), # Key is present, but handle null + ("SELECT COUNT(*) FROM testdata.flat.atquestion WHERE dict @? 'list'", 1, 1, None), # Aggregation + ("SELECT id FROM testdata.flat.atquestion WHERE dict @? 'list' AND dict @? 'key'", 2, 1, None), # Compound condition + ("SELECT id FROM testdata.flat.atquestion WHERE NOT dict @? 
'list'", 2, 1, None), # Negation + ("SELECT id, COUNT(*) FROM testdata.flat.atquestion WHERE dict @? 'list' GROUP BY id", 4, 2, None), # Group by + ("SELECT birth_place['town'] FROM $astronauts", 357, 1, None), ("SELECT missions[0] FROM $astronauts", 357, 1, None), ("SELECT birth_place['town'] FROM $astronauts WHERE birth_place['town'] = 'Warsaw'", 1, 1, None), From 3515a25214ad54a33942955e9818eeb2e1223cbe Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 10 Dec 2024 23:22:10 +0000 Subject: [PATCH 058/157] Opteryx Version 0.19.0-alpha.886 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 923b4626e..1a86dd459 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 879 +__build__ = 886 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 750a492370027470f35c26c32eacdf0a65ddd2e4 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 12 Dec 2024 00:42:44 +0000 Subject: [PATCH 059/157] #2100 --- opteryx/models/physical_plan.py | 245 +++++++++--------- opteryx/operators/base_plan_node.py | 6 +- opteryx/third_party/travers/graph.py | 29 ++- .../third_party/travers/tests/test_graph.py | 25 +- 4 files changed, 172 insertions(+), 133 deletions(-) diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index ce0fd2788..56287a48f 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -10,6 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from queue import Empty from queue import Queue from threading import Lock @@ -30,26 +31,113 @@ morsel_lock = Lock() active_task_lock = Lock() -CONCURRENT_WORKERS = 2 +CONCURRENT_WORKERS = 1 + + +class EOSHandlerMixin: + def __init__(self, work_queue): + self.node_exhaustion = {} # Tracks which nodes are exhausted + self.morsel_accounting = {} # Tracks active morsels per node/leg + self.work_queue = work_queue + self.active_tasks = 0 + + def active_tasks_increment(self, value: int): + with active_task_lock: + self.active_tasks += value + + def initialize_eos_tracking(self, nodes): + """ + Initialize EOS tracking and morsel accounting for all nodes. + """ + from opteryx.operators import JoinNode + + self.node_exhaustion = { + (nid, None): False for nid, node in nodes if not isinstance(node, JoinNode) + } + self.morsel_accounting = { + (nid, None): 0 for nid, node in nodes if not isinstance(node, JoinNode) + } + + for join_nid in (nid for nid, node in nodes if isinstance(node, JoinNode)): + self.node_exhaustion[(join_nid, "left")] = False + self.node_exhaustion[(join_nid, "right")] = False + self.morsel_accounting[(join_nid, "left")] = 0 + self.morsel_accounting[(join_nid, "right")] = 0 + + def mark_node_exhausted(self, node_id: str, leg: Optional[str] = None): + """ + Mark a node and leg as exhausted and propagate EOS downstream. + """ + if self.node_exhaustion[(node_id, leg)]: + return # Already marked exhausted + + self.node_exhaustion[(node_id, leg)] = True + + # Propagate EOS to downstream nodes + self.queue_task(node_id, leg, EOS) + def update_morsel_accounting(self, node_id: str, leg: Optional[str], delta: int): + """ + Update morsel accounting for a node and check for exhaustion. 
+ """ + with morsel_lock: + self.morsel_accounting[(node_id, leg)] += delta + if self.morsel_accounting[(node_id, leg)] < 0: + print(self.morsel_accounting) + raise InvalidInternalStateError("Morsel accounting is invalid.") -class PhysicalPlan(Graph): + # If no more morsels, check if all providers are exhausted + if self.morsel_accounting[(node_id, leg)] == 0: + self.check_and_mark_exhaustion(node_id, leg) + + def check_and_mark_exhaustion(self, node_id: str, leg: Optional[str]): + """ + Check if all upstream providers for a node are exhausted. + """ + for provider, _, provider_leg in self.ingoing_edges(node_id): + if not self.node_exhaustion.get((provider, provider_leg), False): + return # A provider is still active + + self.mark_node_exhausted(node_id, leg) + + def queue_task(self, node_id: str, leg: Optional[str], payload: Any): + """ + Queue a task for a worker. + """ + print( + "WORK PUT", + node_id, + leg, + "Table" if isinstance(payload, pyarrow.Table) else "EOS" if payload == EOS else payload, + flush=True, + ) + self.work_queue.put((node_id, leg, payload)) + self.active_tasks_increment(+1) + self.morsel_accounting[(node_id, leg)] += 1 + + # Process results from the response queue + def work_complete(self) -> bool: + all_nodes_exhausted = all(self.node_exhaustion.values()) + no_active_tasks = self.active_tasks <= 0 + return all_nodes_exhausted and no_active_tasks + + +class PhysicalPlan(Graph, EOSHandlerMixin): """ The execution tree is defined separately from the planner to simplify the complex code that is the planner from the tree that describes the plan. """ def __init__(self): - super().__init__() - self.active_tasks: int = 0 - self.recheck_exhausted: list = [] + # Work queue for worker tasks + self.work_queue = Queue(maxsize=10) + # Response queue for results sent back to the engine + self.response_queue = Queue(maxsize=10) - def active_tasks_increment(self, value: int): - with active_task_lock: - self.active_tasks += value - print("AT", self.active_tasks) + Graph.__init__(self) + EOSHandlerMixin.__init__(self, self.work_queue) - def depth_first_search_flat( + def depth_first_traversal_with_order( self, node: Optional[str] = None, visited: Optional[set] = None ) -> list: """ @@ -69,7 +157,7 @@ def depth_first_search_flat( for neighbor, _, _ in neighbors: if neighbor not in visited: - child_list = self.depth_first_search_flat(neighbor, visited) + child_list = self.depth_first_traversal_with_order(neighbor, visited) traversal_list.extend(child_list) return traversal_list @@ -129,76 +217,6 @@ def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any] from opteryx.operators import ShowCreateNode from opteryx.operators import ShowValueNode - morsel_accounting = {nid: 0 for nid in self.nodes()} - node_exhaustion = {nid: False for nid in self.nodes()} - # Work queue for worker tasks - work_queue = Queue() - # Response queue for results sent back to the engine - response_queue = Queue() - - def mark_node_exhausted(node_id): - """ - Mark a node as exhausted and propagate exhaustion downstream. 
- """ - if node_exhaustion[node_id]: - return # Node is already marked as exhausted - - node_exhaustion[node_id] = True - - if isinstance(self[node_id], ReaderNode): - return - - for _, _, join_leg in self.ingoing_edges(node_id): - # Queue the task for node with the correct join_leg - print("EOS PUT", node_id, join_leg, EOS) - work_queue.put((node_id, join_leg, EOS)) # EOS signals exhaustion - self.active_tasks_increment(+1) - morsel_accounting[node_id] += 1 - - def update_morsel_accounting(node_id, morsel_count_change: int, join_leg: str): - """ - Updates the morsel accounting for a node and checks for exhaustion. - - Parameters: - node_id (str): The ID of the node to update. - morsel_count_change (int): The change in morsel count (+1 for increment, -1 for decrement). - - Returns: - None - """ - - nodes_to_check = self.recheck_exhausted.copy() + [node_id] - self.recheck_exhausted.clear() - - with morsel_lock: - morsel_accounting[node_id] += morsel_count_change - print( - "ACCOUNT", - node_id, - morsel_accounting[node_id], - morsel_count_change, - self[node_id].name, - ) - - for node in nodes_to_check: - if morsel_accounting[node_id] < 0: - raise InvalidInternalStateError( - "Node input and output count in invalid state." - ) - - # Check if the node is exhausted - if morsel_accounting[node] == 0: # No more pending morsels for this node - # Ensure all parent nodes are exhausted - all_providers_exhausted = all( - node_exhaustion[provider] for provider, _, _ in self.ingoing_edges(node) - ) - if all_providers_exhausted: - print("providers exhausted", node_exhaustion) - mark_node_exhausted(node) - else: - print("providers not exhausted", node_exhaustion) - self.recheck_exhausted.append(node) - if not self.is_acyclic(): raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") @@ -211,6 +229,8 @@ def update_morsel_accounting(node_id, morsel_count_change: int, join_leg: str): if head_node is None: head_node = self[head_nodes[0]] + self.initialize_eos_tracking(self.nodes(True)) + # add the left/right labels to the edges coming into the joins joins = ((nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)) for nid, join in joins: @@ -226,10 +246,8 @@ def update_morsel_accounting(node_id, morsel_count_change: int, join_leg: str): for s, t, r in reader_edges: if self[s].uuid in join.left_readers: - self.remove_edge(provider, nid, r) self.add_edge(provider, nid, "left") elif self[s].uuid in join.right_readers: - self.remove_edge(provider, nid, r) self.add_edge(provider, nid, "right") tester = self.breadth_first_search(nid, reverse=True) @@ -254,21 +272,34 @@ def worker_process(): Worker thread: Processes tasks from the work queue and sends results to the response queue. 
""" while True: - task = work_queue.get() + task = self.work_queue.get() if task is None: + print("WORK GET", task) break + print( + "WORK GET", + task[0], + task[1], + "Table" + if isinstance(task[2], pyarrow.Table) + else "EOS" + if task[2] == EOS + else task[2], + self[task[0]].name, + flush=True, + ) node_id, join_leg, morsel = task operator = self[node_id] results = operator(morsel, join_leg) for result in results: # Send results back to the response queue - response_queue.put((node_id, join_leg, result)) + self.response_queue.put((node_id, join_leg, result)) - update_morsel_accounting(node_id, -1, join_leg) + self.update_morsel_accounting(node_id, join_leg, -1) - work_queue.task_done() + self.work_queue.task_done() # Launch worker threads for _ in range(num_workers): @@ -278,14 +309,11 @@ def worker_process(): workers.append(worker) def inner_execute(plan): - # Identify pump nodes - global active_tasks - # Get all the nodes which push data into the plan We use DFS to order the # nodes to ensure left branch is always before the right branch pump_nodes = [ (nid, node) - for nid, node in self.depth_first_search_flat() + for nid, node in self.depth_first_traversal_with_order() if isinstance(node, ReaderNode) ] @@ -298,33 +326,21 @@ def inner_execute(plan): for _, target, join_leg in self.outgoing_edges(pump_nid) ] for consumer_node, join_leg in consumer_nodes: - # DEBUG: log (f"following initial {self[pump_nid].name} triggering {self[consumer_node].name}") + # DEBUG: log (f"following initial {self[pump_nid].name} ({pump_nid}) triggering {self[consumer_node].name} ({consumer_node})") # Queue tasks for consumer operators - print("WORK PUT", consumer_node, join_leg) - work_queue.put((consumer_node, join_leg, morsel)) - self.active_tasks_increment(+1) - update_morsel_accounting(consumer_node, +1, join_leg) + self.queue_task(consumer_node, join_leg, morsel) # Pump is exhausted after emitting all morsels print("pump exhausted", pump_nid) - mark_node_exhausted(pump_nid) - update_morsel_accounting(pump_nid, 0) - - # Process results from the response queue - def should_stop(): - all_nodes_exhausted = all(node_exhaustion.values()) - all_nodes_inactive = self.active_tasks <= 0 - return all_nodes_exhausted and all_nodes_inactive + self.update_morsel_accounting(pump_nid, None, 0) - while not should_stop(): + while not self.work_complete(): # Wait for results from workers - print(self.active_tasks) - print("*", end="", flush=True) + print(list(self.node_exhaustion.values()), self.active_tasks) try: - node_id, join_leg, result = response_queue.get(timeout=0.1) - print("-", end="") + node_id, join_leg, result = self.response_queue.get(timeout=0.1) except Empty: - print(".", end="") + print(".") continue # if a thread threw a error, we get them in the main @@ -349,17 +365,14 @@ def should_stop(): for downstream_node, join_leg in downstream_nodes: # Queue tasks for downstream operators - self.active_tasks_increment(+1) - # DEBUG: log (f"following {self[node_id].name} triggering {self[downstream_node].name}") - print("WORK PUT", downstream_node, join_leg) - work_queue.put((downstream_node, join_leg, result)) - update_morsel_accounting(downstream_node, +1, join_leg) + # DEBUG: log (f"following {self[node_id].name} ({node_id}) triggering {self[downstream_node].name} ({downstream_node})", flush=True) + self.queue_task(downstream_node, join_leg, result) # decrement _after_ we've done the work relation to handling the task self.active_tasks_increment(-1) for worker in workers: - work_queue.put(None) + 
self.work_queue.put(None) # Wait for all workers to complete for worker in workers: diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index aab154511..69afba193 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -112,8 +112,10 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab self.records_out += result.num_rows self.bytes_out += result.nbytes - # Yield the result to the consumer - yield result + if result == EOS: + yield None + else: + yield result except StopIteration: # Break the loop when the generator is exhausted diff --git a/opteryx/third_party/travers/graph.py b/opteryx/third_party/travers/graph.py index 9f3358a9b..f617086c1 100644 --- a/opteryx/third_party/travers/graph.py +++ b/opteryx/third_party/travers/graph.py @@ -101,11 +101,7 @@ def save(self, graph_path): # pragma: nocover def add_edge(self, source: str, target: str, relationship: Optional[str] = None): """ - Add edge to the graph - - Note: - This does not create an edge if either node does not already exist. - This does not create an edge if either node is None. + Add or update the relationship of an existing edge in the graph Parameters: source: string @@ -113,22 +109,29 @@ def add_edge(self, source: str, target: str, relationship: Optional[str] = None) target: string The target node relationship: string - The relationship between the source and target nodes + The relationship to be added or updated """ if source is None or target is None: - print("Trying to create edge with undefined nodes") + print("Trying to update edge with undefined nodes") return False - # Check for existing edges and add the new one + # Check if the edge exists existing_edges = list(self._edges.get(source, ())) + edge_found = False + + for i, (existing_target, existing_relationship) in enumerate(existing_edges): + if existing_target == target: + existing_edges[i] = (target, relationship) + edge_found = True + break - # Avoid adding duplicate edges - edge_to_add = (target, relationship) - if edge_to_add not in existing_edges: - existing_edges.append(edge_to_add) + if not edge_found: + # Add the new edge + existing_edges.append((target, relationship)) self._edges[source] = tuple(existing_edges) self._cached_edges = None + return True def add_node(self, nid: str, node): """ @@ -390,7 +393,7 @@ def remove_node(self, nid, heal: bool = False): # wire up the old incoming and outgoing nodes, cartesian style for out_nid in out_going: for in_nid in in_coming: - self.add_edge(in_nid[0], out_nid[1], in_nid[1]) # type:ignore + self.add_edge(in_nid[0], out_nid[1], in_nid[2]) # type:ignore self._cached_edges = None diff --git a/opteryx/third_party/travers/tests/test_graph.py b/opteryx/third_party/travers/tests/test_graph.py index 4000629eb..3e92836a0 100644 --- a/opteryx/third_party/travers/tests/test_graph.py +++ b/opteryx/third_party/travers/tests/test_graph.py @@ -92,9 +92,9 @@ def test_epitomize(): graph = build_graph() summ = graph.epitomize() - # are the node and edge counts right? 
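+    # add_edge now updates an existing (source, target) edge in place rather
+    # than appending a duplicate, so the epitome has 4 edges instead of 6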
+ assert len(summ.nodes()) == 3 - assert len(list(summ.edges())) == 6 + assert len(list(summ.edges())) == 4 assert sorted(summ.nodes()) == ["Locality", "Person", "Restaurant"] @@ -148,6 +148,26 @@ def test_node_deletion(): assert ("Sharlene", "Bindoon", "Lives In") not in graph.edges() +def test_update_existing_edge(): + graph = build_graph() + + # Add an edge + graph.add_edge("Sharlene", "Bindoon", "friend") + + # Update the edge + graph.add_edge("Sharlene", "Bindoon", "best friend") + + # Verify the edge was updated + edges = graph.outgoing_edges("Sharlene") + updated = False + for source, target, relationship in edges: + if target == "Bindoon" and relationship == "best friend": + updated = True + break + + assert updated, "The edge relationship was not updated correctly" + + if __name__ == "__main__": # pragma: no cover test_graph() test_outgoing_edges() @@ -157,4 +177,5 @@ def test_node_deletion(): test_node_attributes() test_edge_deletion() test_node_deletion() + test_update_existing_edge() print("okay") From 6e4a1e75cce86da38856514e75ed3c3995e29867 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 12 Dec 2024 00:43:11 +0000 Subject: [PATCH 060/157] Opteryx Version 0.19.0-alpha.887 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 1a86dd459..5c27ddfde 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 886 +__build__ = 887 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 384c1e4d0f5ddd02a116c1391f89cd7f437ffd86 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 13 Dec 2024 00:12:52 +0000 Subject: [PATCH 061/157] #2100 --- opteryx/cursor.py | 2 +- opteryx/managers/expression/ops.py | 109 ++++++++++++++----------- opteryx/models/physical_plan.py | 49 ++++++++---- opteryx/operators/async_read_node.py | 5 ++ opteryx/operators/read_node.py | 4 + opteryx/utils/sql.py | 33 ++++++++ tests/misc/test_utils_sql.py | 114 +++++++++++++++++++++++++++ 7 files changed, 255 insertions(+), 61 deletions(-) create mode 100644 tests/misc/test_utils_sql.py diff --git a/opteryx/cursor.py b/opteryx/cursor.py index 37ff5016f..dadb8cd89 100644 --- a/opteryx/cursor.py +++ b/opteryx/cursor.py @@ -204,7 +204,7 @@ def _inner_execute( start = time.time_ns() for plan in plans: self._statistics.time_planning += time.time_ns() - start - results = plan.execute() + results = plan.execute(statistics=self._statistics) start = time.time_ns() system_statistics.queries_executed += 1 diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index 51140abaf..70ffaa27a 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -2,12 +2,15 @@ Original code modified for Opteryx. 
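
A quick sense of the LIKE handling introduced below, assuming the
sql_like_to_regex helper this patch adds in opteryx.utils.sql: the patterns
are folded into a single alternation so each array element is scanned once.

    import re

    from opteryx.utils.sql import sql_like_to_regex

    patterns = ["a%", "_b"]
    combined = re.compile("|".join(sql_like_to_regex(p) for p in patterns), re.IGNORECASE)
    assert combined.search("Apple") is not None   # 'a%', case-insensitively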
""" +import re + import numpy import pyarrow from orso.types import OrsoTypes from pyarrow import compute from opteryx.compiled import list_ops +from opteryx.utils.sql import sql_like_to_regex def filter_operations(arr, left_type, operator, value, right_type): @@ -176,58 +179,72 @@ def _inner_filter_operations(arr, operator, value): if operator == "AllOpNotEq": return list_ops.cython_allop_neq(arr[0], value) + if operator == "AnyOpILike": + patterns = value[0] + + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) + + out = numpy.zeros(arr.size, dtype=bool) + for i, row in enumerate(arr): + if row is None: + out[i] = None + continue + if row.size == 0: + continue + out[i] = any(combined_regex.search(elem) for elem in row) + + return out + if operator == "AnyOpLike": patterns = value[0] - return numpy.array( - [ - None - if row is None - else any(compute.match_like(row, pattern).true_count > 0 for pattern in patterns) - for row in arr - ], - dtype=bool, - ) + + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex = re.compile(combined_regex_pattern) + + out = numpy.zeros(arr.size, dtype=bool) + for i, row in enumerate(arr): + if row is None: + out[i] = None + continue + if row.size == 0: + continue + out[i] = any(combined_regex.search(elem) for elem in row) + + return out if operator == "AnyOpNotLike": patterns = value[0] - matches = numpy.array( - [ - None - if row is None - else any(compute.match_like(row, pattern).true_count > 0 for pattern in patterns) - for row in arr - ], - dtype=bool, - ) - return numpy.invert(matches) - if operator == "AnyOpILike": - patterns = value[0] - return numpy.array( - [ - None - if row is None - else any( - compute.match_like(row, pattern, ignore_case=True).true_count > 0 - for pattern in patterns - ) - for row in arr - ], - dtype=bool, - ) + + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex = re.compile(combined_regex_pattern) + + out = numpy.zeros(arr.size, dtype=bool) + for i, row in enumerate(arr): + if row is None: + out[i] = None + continue + if row.size == 0: + continue + out[i] = any(combined_regex.search(elem) for elem in row) + + return numpy.invert(out) + if operator == "AnyOpNotILike": patterns = value[0] - matches = numpy.array( - [ - None - if row is None - else any( - compute.match_like(row, pattern, ignore_case=True).true_count > 0 - for pattern in patterns - ) - for row in arr - ], - dtype=bool, - ) - return numpy.invert(matches) + + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) + + out = numpy.zeros(arr.size, dtype=bool) + for i, row in enumerate(arr): + if row is None: + out[i] = None + continue + if row.size == 0: + continue + out[i] = any(combined_regex.search(elem) for elem in row) + + return numpy.invert(out) if operator == "AtQuestion": element = value[0] diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 56287a48f..245674288 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -31,7 +31,9 @@ morsel_lock = Lock() active_task_lock = Lock() -CONCURRENT_WORKERS = 1 +CYCLE_WAIT_TIME: int = 0.05 + +CONCURRENT_WORKERS = 2 class EOSHandlerMixin: @@ -83,7 +85,7 @@ def update_morsel_accounting(self, node_id: str, leg: Optional[str], delta: int) with morsel_lock: self.morsel_accounting[(node_id, leg)] += 
delta if self.morsel_accounting[(node_id, leg)] < 0: - print(self.morsel_accounting) + # print(self.morsel_accounting) raise InvalidInternalStateError("Morsel accounting is invalid.") # If no more morsels, check if all providers are exhausted @@ -109,6 +111,7 @@ def queue_task(self, node_id: str, leg: Optional[str], payload: Any): node_id, leg, "Table" if isinstance(payload, pyarrow.Table) else "EOS" if payload == EOS else payload, + self.work_queue.qsize() + 1, flush=True, ) self.work_queue.put((node_id, leg, payload)) @@ -130,9 +133,9 @@ class PhysicalPlan(Graph, EOSHandlerMixin): def __init__(self): # Work queue for worker tasks - self.work_queue = Queue(maxsize=10) + self.work_queue = Queue() # Response queue for results sent back to the engine - self.response_queue = Queue(maxsize=10) + self.response_queue = Queue() Graph.__init__(self) EOSHandlerMixin.__init__(self, self.work_queue) @@ -162,7 +165,7 @@ def depth_first_traversal_with_order( return traversal_list - def explain(self, analyze: bool) -> Generator[pyarrow.Table, None, None]: + def explain(self, analyze: bool, statistics) -> Generator[pyarrow.Table, None, None]: from opteryx import operators def _inner_explain(node, depth): @@ -197,7 +200,7 @@ def _inner_explain(node, depth): temp = None head_node = self.get_exit_points()[0] query_head, _, _ = self.ingoing_edges(head_node)[0] - results = self.execute(query_head) + results = self.execute(head_node=query_head) if results is not None: results_generator, _ = next(results, ([], None)) for temp in results_generator: @@ -209,7 +212,18 @@ def _inner_explain(node, depth): table = pyarrow.Table.from_pylist(plan) return table - def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any]: + def execute( + self, head_node=None, statistics=None + ) -> Generator[Tuple[Any, ResultType], Any, Any]: + """ + Execute the physical plan starting from the head node. + + Args: + head_node (Optional[str]): The starting node for execution. If None, the default head node is used. + + Returns: + Generator[Tuple[Any, ResultType], Any, Any]: A generator yielding results and their types. + """ from opteryx.operators import ExplainNode from opteryx.operators import JoinNode from opteryx.operators import ReaderNode @@ -258,7 +272,7 @@ def execute(self, head_node=None) -> Generator[Tuple[Any, ResultType], Any, Any] # Special case handling for 'Explain' queries if isinstance(head_node, ExplainNode): - yield self.explain(head_node.analyze), ResultType.TABULAR + yield self.explain(head_node.analyze, statistics), ResultType.TABULAR elif isinstance(head_node, (SetVariableNode, ShowValueNode, ShowCreateNode)): yield head_node(None), ResultType.TABULAR @@ -272,9 +286,14 @@ def worker_process(): Worker thread: Processes tasks from the work queue and sends results to the response queue. 
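
            Workers now poll with a timeout instead of blocking indefinitely,
            so stalls can be surfaced in the statistics. The pattern, roughly:

                from queue import Empty, Queue

                queue: Queue = Queue()
                waited = 0.0
                try:
                    task = queue.get(timeout=0.05)   # CYCLE_WAIT_TIME
                except Empty:
                    waited += 0.05                   # reported as worker_wait_time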
""" while True: - task = self.work_queue.get() + try: + task = self.work_queue.get(timeout=CYCLE_WAIT_TIME) + except Empty: + statistics.worker_wait_time += CYCLE_WAIT_TIME + continue + if task is None: - print("WORK GET", task) + # print("WORK GET", task) break print( @@ -287,6 +306,7 @@ def worker_process(): if task[2] == EOS else task[2], self[task[0]].name, + self.work_queue.qsize(), flush=True, ) node_id, join_leg, morsel = task @@ -331,16 +351,17 @@ def inner_execute(plan): self.queue_task(consumer_node, join_leg, morsel) # Pump is exhausted after emitting all morsels - print("pump exhausted", pump_nid) + # print("pump exhausted", pump_nid) self.update_morsel_accounting(pump_nid, None, 0) while not self.work_complete(): # Wait for results from workers - print(list(self.node_exhaustion.values()), self.active_tasks) + # print(list(self.node_exhaustion.values()), self.active_tasks) try: - node_id, join_leg, result = self.response_queue.get(timeout=0.1) + node_id, join_leg, result = self.response_queue.get(timeout=CYCLE_WAIT_TIME) except Empty: - print(".") + # print(".") + statistics.cpu_wait_time += CYCLE_WAIT_TIME continue # if a thread threw a error, we get them in the main diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 8dc607bb0..528d630a8 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -31,6 +31,7 @@ import pyarrow.parquet from orso.schema import convert_orso_schema_to_arrow_schema +from opteryx import EOS from opteryx import config from opteryx.exceptions import DataError from opteryx.models import QueryProperties @@ -85,6 +86,10 @@ def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover raise NotImplementedError() def execute(self, morsel, **kwargs) -> Generator: + if morsel == EOS: + yield None + return + from opteryx import system_statistics """Perform this step, time how long is spent doing work""" diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 3c6219eed..67e70d7ab 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -27,6 +27,7 @@ from orso.schema import RelationSchema from orso.schema import convert_orso_schema_to_arrow_schema +from opteryx import EOS from opteryx.models import QueryProperties from . import BasePlanNode @@ -191,6 +192,9 @@ def config(self): def execute(self, morsel, **kwargs) -> Generator: """Perform this step, time how long is spent doing work""" + if morsel == EOS: + yield None + return morsel = None orso_schema = self.schema diff --git a/opteryx/utils/sql.py b/opteryx/utils/sql.py index 09aba5033..b5bc459ca 100644 --- a/opteryx/utils/sql.py +++ b/opteryx/utils/sql.py @@ -1,6 +1,39 @@ import re from typing import List +ESCAPE_SPECIAL_CHARS = re.compile(r"([.^$*+?{}[\]|()\\])") + + +def sql_like_to_regex(pattern: str) -> str: + """ + Converts an SQL `LIKE` pattern into a regular expression. + + SQL `LIKE` syntax: + - `%` matches zero or more characters (similar to `.*` in regex). + - `_` matches exactly one character (similar to `.` in regex). + - Special regex characters are escaped to ensure literal matching. + + Args: + pattern (str): The SQL LIKE pattern. + + Returns: + str: The equivalent regex pattern, anchored with `^` and `$`. 
+ + Examples: + sql_like_to_regex("a%") -> "^a.*?$" + sql_like_to_regex("_b") -> "^.b$" + sql_like_to_regex("%[test]%") -> "^.*?\[test\].*?$" + """ + if pattern is None: + raise ValueError("Pattern cannot be None") + + # Escape special regex characters in the pattern + escaped_pattern = ESCAPE_SPECIAL_CHARS.sub(r"\\\1", pattern) + + # Replace SQL wildcards with regex equivalents + regex_pattern = "^" + escaped_pattern.replace("%", ".*?").replace("_", ".") + "$" + return regex_pattern + def remove_comments(string: str) -> str: """ diff --git a/tests/misc/test_utils_sql.py b/tests/misc/test_utils_sql.py new file mode 100644 index 000000000..95909c8e0 --- /dev/null +++ b/tests/misc/test_utils_sql.py @@ -0,0 +1,114 @@ +import os +import sys + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) + +import pytest + +from opteryx.utils import sql + + +# fmt:off +TEST_CASES = [ + # Basic patterns + ("a%", r"^a.*?$"), + ("%a", r"^.*?a$"), + ("%a%", r"^.*?a.*?$"), + ("a_b", r"^a.b$"), + ("a__", r"^a..$"), + ("_", r"^.$"), + ("__", r"^..$"), + + # Escaping special regex characters + ("a.b", r"^a\.b$"), + ("[abc]", r"^\[abc\]$"), + ("(test)", r"^\(test\)$"), + ("a+b", r"^a\+b$"), + ("a*b", r"^a\*b$"), + ("a^b", r"^a\^b$"), + ("a$b", r"^a\$b$"), + ("a|b", r"^a\|b$"), + ("a\\b", r"^a\\b$"), + ("{test}", r"^\{test\}$"), + + # Mixed wildcards and special characters + ("%a.b%", r"^.*?a\.b.*?$"), + ("a_b%", r"^a.b.*?$"), + ("%a_b%", r"^.*?a.b.*?$"), + ("a%[test]%b", r"^a.*?\[test\].*?b$"), + ("_%[abc]", r"^..*?\[abc\]$"), + ("%a+b%", r"^.*?a\+b.*?$"), + ("a%b_", r"^a.*?b.$"), + + # Patterns with only wildcards + ("%", r"^.*?$"), + ("__", r"^..$"), + ("_%", r"^..*?$"), + ("_%_", r"^..*?.$"), + ("%_%", r"^.*?..*?$"), + + # Patterns with no special characters + ("test", r"^test$"), + ("abc", r"^abc$"), + ("hello world", r"^hello world$"), + ("123", r"^123$"), + + # Empty pattern + ("", r"^$"), + + # Edge cases with spaces + (" a%", r"^ a.*?$"), + ("% a", r"^.*? 
a$"), + (" a b ", r"^ a b $"), + + # Multiple wildcards + ("a%%b", r"^a.*?.*?b$"), + ("%a_b%", r"^.*?a.b.*?$"), + ("a%_%b", r"^a.*?..*?b$"), + ("%_%_%", r"^.*?..*?..*?$"), + + # Patterns with underscores + ("_", r"^.$"), + ("__abc", r"^..abc$"), + ("%__%", r"^.*?...*?$"), + ("abc_", r"^abc.$"), + ("_%_", r"^..*?.$"), + + # Patterns with numeric characters + ("123%", r"^123.*?$"), + ("12_3", r"^12.3$"), + ("%1_2%", r"^.*?1.2.*?$"), + + # Patterns with mixed characters + ("a1_b2%", r"^a1.b2.*?$"), + ("a_b%c_d%", r"^a.b.*?c.d.*?$"), + ("%a%1%", r"^.*?a.*?1.*?$"), + ("_a%_b", r"^.a.*?.b$"), + + # Patterns with special regex characters and wildcards + ("%a(b)%", r"^.*?a\(b\).*?$"), + ("[test]_%", r"^\[test\]..*?$"), + ("a{1,3}_b", r"^a\{1,3\}.b$"), + ("(a|b)%", r"^\(a\|b\).*?$") +] +# fmt:on + + +@pytest.mark.parametrize("like_pattern, re_pattern", TEST_CASES) +def test_like_to_regex(like_pattern, re_pattern): + converted = sql.sql_like_to_regex(like_pattern) + assert converted == re_pattern, f"{like_pattern} -> {converted} != {re_pattern}" + + +if __name__ == "__main__": # pragma: no cover + print(f"RUNNING BATTERY OF {len(TEST_CASES)} LIKE -> REGEX TESTS") + import time + + t = time.monotonic_ns() + for i in range(57): + for like_pattern, re_pattern in TEST_CASES: + print(".", end="") + test_like_to_regex(like_pattern, re_pattern) + print() + print("✅ okay") + print(time.monotonic_ns() - t) From 14e1514cc06846f79281327b7da95bc4a20445fa Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 13 Dec 2024 00:13:18 +0000 Subject: [PATCH 062/157] Opteryx Version 0.19.0-alpha.888 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 5c27ddfde..0ef31a452 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 887 +__build__ = 888 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 45f6ce7c621d42b65b97de8ccdf4fe735e786b62 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 09:22:04 +0000 Subject: [PATCH 063/157] Bump duckdb-engine from 0.13.6 to 0.14.0 Bumps [duckdb-engine](https://github.com/Mause/duckdb_engine) from 0.13.6 to 0.14.0. - [Release notes](https://github.com/Mause/duckdb_engine/releases) - [Changelog](https://github.com/Mause/duckdb_engine/blob/main/CHANGELOG.md) - [Commits](https://github.com/Mause/duckdb_engine/compare/v0.13.6...v0.14.0) --- updated-dependencies: - dependency-name: duckdb-engine dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- tests/requirements_arm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements_arm.txt b/tests/requirements_arm.txt index 7887e997b..a7885951e 100644 --- a/tests/requirements_arm.txt +++ b/tests/requirements_arm.txt @@ -19,6 +19,6 @@ sqlalchemy pymysql psycopg2-binary duckdb==1.1.3 # 1040 -duckdb-engine==0.13.6 # 1040 +duckdb-engine==0.14.0 # 1040 setuptools_rust \ No newline at end of file From c99596cb2576a2ca2e61fa8a8d80cf5582986395 Mon Sep 17 00:00:00 2001 From: joocer Date: Wed, 18 Dec 2024 16:48:50 +0000 Subject: [PATCH 064/157] #2128 --- opteryx/connectors/gcp_cloudstorage_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/connectors/gcp_cloudstorage_connector.py b/opteryx/connectors/gcp_cloudstorage_connector.py index 639cc0a28..6ff38f10a 100644 --- a/opteryx/connectors/gcp_cloudstorage_connector.py +++ b/opteryx/connectors/gcp_cloudstorage_connector.py @@ -198,7 +198,7 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]: object_path = urllib.parse.quote(object_path, safe="") bucket = urllib.parse.quote(bucket, safe="") # Ensure bucket name is URL-safe - url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o?prefix={object_path}&fields=items(name)" + url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o?prefix={object_path}&fields=items(name),nextPageToken" # Ensure the credentials are valid, refreshing them if necessary if not self.client_credentials.valid: # pragma: no cover From 3cc59efbaf7d6e9fa375a02f3bf1679703d860ab Mon Sep 17 00:00:00 2001 From: XB500 Date: Wed, 18 Dec 2024 16:49:17 +0000 Subject: [PATCH 065/157] Opteryx Version 0.19.0-alpha.890 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 0ef31a452..d99500f91 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 888 +__build__ = 890 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
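The `fields=items(name),nextPageToken` change in #2128 above matters because the Cloud Storage JSON API caps each list response (typically at 1,000 objects per page) and signals continuation via `nextPageToken`. A minimal sketch of the paging loop a caller would run against that endpoint, assuming a `requests`-style HTTP client and a pre-fetched bearer token (illustrative only, not the connector's actual code):

    import requests

    def list_blob_names(bucket: str, prefix: str, token: str) -> list:
        # Illustrative paging loop: keep requesting pages until the API
        # stops returning a nextPageToken.
        url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o"
        params = {"prefix": prefix, "fields": "items(name),nextPageToken"}
        names = []
        while True:
            response = requests.get(
                url,
                params=params,
                headers={"Authorization": f"Bearer {token}"},
                timeout=30,
            )
            response.raise_for_status()
            payload = response.json()
            names.extend(item["name"] for item in payload.get("items", []))
            if "nextPageToken" not in payload:
                return names
            params["pageToken"] = payload["nextPageToken"]
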
From 5034ccec3538b2f0e23efe1e0cad0b1bc875dc25 Mon Sep 17 00:00:00 2001 From: joocer Date: Wed, 18 Dec 2024 23:09:36 +0000 Subject: [PATCH 066/157] #2132 --- opteryx/connectors/sql_connector.py | 13 +- opteryx/cursor.py | 11 +- opteryx/exceptions.py | 2 +- opteryx/managers/execution/__init__.py | 14 + opteryx/managers/execution/serial_engine.py | 133 +++++++ opteryx/models/physical_plan.py | 371 ++---------------- opteryx/operators/__init__.py | 3 - opteryx/operators/aggregate_and_group_node.py | 1 + opteryx/operators/aggregate_node.py | 2 +- opteryx/operators/base_plan_node.py | 10 +- opteryx/operators/cross_join_node.py | 1 + opteryx/operators/distinct_node.py | 2 +- opteryx/operators/exit_node.py | 2 +- opteryx/operators/filter_node.py | 2 +- opteryx/operators/heap_sort_node.py | 1 + opteryx/operators/inner_join_node.py | 7 +- opteryx/operators/limit_node.py | 2 +- opteryx/operators/noop_node.py | 44 --- opteryx/operators/projection_node.py | 2 +- opteryx/operators/read_node.py | 2 + opteryx/operators/sort_node.py | 1 + opteryx/operators/union_node.py | 2 +- .../logical_planner/logical_planner.py | 4 + opteryx/shared/variables.py | 4 +- .../test_shapes_and_errors_battery.py | 1 + tests/storage/test_blob_gcs.py | 6 + 26 files changed, 233 insertions(+), 410 deletions(-) create mode 100644 opteryx/managers/execution/__init__.py create mode 100644 opteryx/managers/execution/serial_engine.py delete mode 100644 opteryx/operators/noop_node.py diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py index 96743960c..3b9e88ac4 100644 --- a/opteryx/connectors/sql_connector.py +++ b/opteryx/connectors/sql_connector.py @@ -22,6 +22,7 @@ from typing import Optional from typing import Tuple +import orjson import pyarrow from orso import DataFrame from orso.schema import ConstantColumn @@ -29,6 +30,7 @@ from orso.schema import RelationSchema from orso.tools import random_string from orso.types import PYTHON_TO_ORSO_MAP +from orso.types import OrsoTypes from opteryx.config import OPTERYX_DEBUG from opteryx.connectors.base.base_connector import DEFAULT_MORSEL_SIZE @@ -194,6 +196,15 @@ def read_dataset( # type:ignore if not batch_rows: break + # If we have a struct column, we need to convert the data to bytes + if any(col.type == OrsoTypes.STRUCT for col in self.schema.columns): + batch_rows = list(batch_rows) + for i, row in enumerate(batch_rows): + batch_rows[i] = tuple( + orjson.dumps(field) if isinstance(field, dict) else field + for field in row + ) + # convert the SqlAlchemy Results to Arrow using Orso b = time.monotonic_ns() morsel = DataFrame(schema=result_schema, rows=batch_rows).arrow() @@ -209,7 +220,7 @@ def read_dataset( # type:ignore self.chunk_size = (self.chunk_size // MIN_CHUNK_SIZE) * MIN_CHUNK_SIZE self.chunk_size = max(self.chunk_size, MIN_CHUNK_SIZE) self.chunk_size = min(self.chunk_size, 1000000) # cap at 1 million - # DEBUG: log (f"CHANGING CHUNK SIZE TO {self.chunk_size} was {INITIAL_CHUNK_SIZE}.") + # DEBUG: log (f"CHANGING CHUNK SIZE TO {self.chunk_size} was {INITIAL_CHUNK_SIZE} ({morsel.nbytes} bytes).") yield morsel at_least_once = True diff --git a/opteryx/cursor.py b/opteryx/cursor.py index dadb8cd89..72955a986 100644 --- a/opteryx/cursor.py +++ b/opteryx/cursor.py @@ -171,6 +171,7 @@ def _inner_execute( """ from opteryx import system_statistics + from opteryx.managers.execution import execute from opteryx.planner import query_planner if not operation: # pragma: no cover @@ -190,22 +191,16 @@ def _inner_execute( try: start = time.time_ns() - 
first_item = next(plans)
+            plan = next(plans)
             self._statistics.time_planning += time.time_ns() - start
         except RuntimeError as err:  # pragma: no cover
             raise SqlError(f"Error Executing SQL Statement ({err})") from err
 
-        plans = chain([first_item], plans)
-
         if ROLLING_LOG:
             ROLLING_LOG.append(operation)
 
-        results = None
+        results = execute(plan, statistics=self._statistics)
 
         start = time.time_ns()
-        for plan in plans:
-            self._statistics.time_planning += time.time_ns() - start
-            results = plan.execute(statistics=self._statistics)
-            start = time.time_ns()
 
         system_statistics.queries_executed += 1
 
diff --git a/opteryx/exceptions.py b/opteryx/exceptions.py
index 4f4696cb8..ca4d9db36 100644
--- a/opteryx/exceptions.py
+++ b/opteryx/exceptions.py
@@ -250,7 +250,7 @@ class UnexpectedDatasetReferenceError(SqlError):
     def __init__(self, dataset: str):
         self.dataset = dataset
 
-        message = f"Dataset '{dataset}' referenced in query without being referenced in a FROM or JOIN clause."
+        message = f"Dataset '{dataset}' is referenced in the query but does not appear in a FROM or JOIN clause."
         super().__init__(message)
 
diff --git a/opteryx/managers/execution/__init__.py b/opteryx/managers/execution/__init__.py
new file mode 100644
index 000000000..c165e8630
--- /dev/null
+++ b/opteryx/managers/execution/__init__.py
@@ -0,0 +1,14 @@
+from opteryx.exceptions import InvalidInternalStateError
+
+from .serial_engine import execute as serial_execute
+
+
+def execute(plan, statistics):
+    # Validate query plan to ensure it's acyclic
+    if not plan.is_acyclic():
+        raise InvalidInternalStateError("Query plan is cyclic, cannot execute.")
+
+    # Label the join legs to ensure left/right ordering
+    plan.label_join_legs()
+
+    yield from serial_execute(plan, statistics=statistics)
diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py
new file mode 100644
index 000000000..3c2a0d9ca
--- /dev/null
+++ b/opteryx/managers/execution/serial_engine.py
@@ -0,0 +1,133 @@
+"""
+This module provides the execution engine for processing physical plans in a serial manner.
+"""
+
+from typing import Any
+from typing import Generator
+from typing import Tuple
+
+import pyarrow
+
+from opteryx import EOS
+from opteryx.constants import ResultType
+from opteryx.exceptions import InvalidInternalStateError
+from opteryx.models import PhysicalPlan
+from opteryx.models import QueryStatistics
+
+
+def execute(
+    plan: PhysicalPlan, head_node: str = None, statistics: QueryStatistics = None
+) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]:
+    from opteryx.operators import ExplainNode
+    from opteryx.operators import SetVariableNode
+    from opteryx.operators import ShowCreateNode
+    from opteryx.operators import ShowValueNode
+
+    # Retrieve the tail of the query plan, which should ideally be a single head node
+    head_nodes = list(set(plan.get_exit_points()))
+
+    if len(head_nodes) != 1:
+        raise InvalidInternalStateError(
+            f"Query plan has {len(head_nodes)} heads, expected exactly 1."
+        )
+
+    if head_node is None:
+        head_node = plan[head_nodes[0]]
+
+    # Special case handling for 'Explain' queries
+    if isinstance(head_node, ExplainNode):
+        yield explain(plan, head_node.analyze), ResultType.TABULAR
+
+    # Special case handling for 'Set' queries
+    elif isinstance(head_node, SetVariableNode):
+        yield head_node(None, None), ResultType.NON_TABULAR
+
+    elif isinstance(head_node, (ShowValueNode, ShowCreateNode)):
+        yield head_node(None, None), ResultType.TABULAR
+
+    else:
+
+        def inner_execute(plan):
+            # Get the pump nodes from the plan and execute them in order
+            pump_nodes = [
+                (nid, node) for nid, node in plan.depth_first_search_flat() if node.is_scan
+            ]
+            for pump_nid, pump_instance in pump_nodes:
+                for morsel in pump_instance(None, None):
+                    yield from process_node(plan, pump_nid, morsel, None)
+                yield from process_node(plan, pump_nid, EOS, None)
+
+        yield inner_execute(plan), ResultType.TABULAR
+
+
+def explain(plan: PhysicalPlan, analyze: bool) -> Generator[pyarrow.Table, None, None]:
+    from opteryx import operators
+
+    def _inner_explain(node, depth):
+        incoming_operators = plan.ingoing_edges(node)
+        for operator_name in incoming_operators:
+            operator = plan[operator_name[0]]
+            if isinstance(operator, (operators.ExitNode, operators.ExplainNode)):  # Skip ExitNode
+                yield from _inner_explain(operator_name[0], depth)
+                continue
+            elif isinstance(operator, operators.BasePlanNode):
+                record = {
+                    "tree": depth,
+                    "operator": operator.name,
+                    "config": operator.config,
+                }
+                if analyze:
+                    record["time_ms"] = operator.execution_time / 1e6
+                    record["records_in"] = operator.records_in
+                    record["records_out"] = operator.records_out
+                yield record
+                yield from _inner_explain(operator_name[0], depth + 1)
+
+    head = list(dict.fromkeys(plan.get_exit_points()))
+    if len(head) != 1:  # pragma: no cover
+        raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.")
+
+    # for EXPLAIN ANALYZE, we execute the query and report statistics
+    if analyze:
+        # we don't want the results, just the details from the plan
+        temp = None
+        head_node = plan.get_exit_points()[0]
+        query_head, _, _ = plan.ingoing_edges(head_node)[0]
+        # run the plan via the engine's execute (the plan object no longer executes itself)
+        results = execute(plan, head_node=query_head)
+        if results is not None:
+            results_generator, _ = next(results, ([], None))
+            for temp in results_generator:
+                pass
+        del temp
+
+    plan = list(_inner_explain(head[0], 1))
+
+    table = pyarrow.Table.from_pylist(plan)
+
+    yield table
+
+
+def process_node(plan: PhysicalPlan, nid: str, morsel, join_leg: str):
+    node = plan[nid]
+
+    if node.is_scan:
+        children = ((t, r) for s, t, r in plan.outgoing_edges(nid))
+        for child, leg in children:
+            results = process_node(plan, child, morsel, leg)
+            for result in results:
+                if result is not None:
+                    yield result
+    else:
+        results = node(morsel, join_leg)
+        if results is None:
+            yield None
+            return
+        if isinstance(results, Exception):
+            raise results
+        for result in results:
+            if result is not None:
+                children = [(t, r) for s, t, r in plan.outgoing_edges(nid)]
+                for child, leg in children:
+                    yield from process_node(plan, child, result, leg)
+                if len(children) == 0 and result != EOS:
+                    yield result
diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py
index 245674288..9890f5056 100644
--- a/opteryx/models/physical_plan.py
+++ b/opteryx/models/physical_plan.py
@@ -10,141 +10,31 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+""" +The Physical Plan is a tree of nodes that represent the execution plan for a query. +""" -from queue import Empty -from queue import Queue -from threading import Lock -from threading import Thread -from typing import Any -from typing import Generator from typing import Optional -from typing import Tuple -import pyarrow - -from opteryx import EOS -from opteryx.config import CONCURRENT_WORKERS -from opteryx.constants import ResultType from opteryx.exceptions import InvalidInternalStateError from opteryx.third_party.travers import Graph -morsel_lock = Lock() -active_task_lock = Lock() - -CYCLE_WAIT_TIME: int = 0.05 - -CONCURRENT_WORKERS = 2 - - -class EOSHandlerMixin: - def __init__(self, work_queue): - self.node_exhaustion = {} # Tracks which nodes are exhausted - self.morsel_accounting = {} # Tracks active morsels per node/leg - self.work_queue = work_queue - self.active_tasks = 0 - - def active_tasks_increment(self, value: int): - with active_task_lock: - self.active_tasks += value - - def initialize_eos_tracking(self, nodes): - """ - Initialize EOS tracking and morsel accounting for all nodes. - """ - from opteryx.operators import JoinNode - - self.node_exhaustion = { - (nid, None): False for nid, node in nodes if not isinstance(node, JoinNode) - } - self.morsel_accounting = { - (nid, None): 0 for nid, node in nodes if not isinstance(node, JoinNode) - } - - for join_nid in (nid for nid, node in nodes if isinstance(node, JoinNode)): - self.node_exhaustion[(join_nid, "left")] = False - self.node_exhaustion[(join_nid, "right")] = False - self.morsel_accounting[(join_nid, "left")] = 0 - self.morsel_accounting[(join_nid, "right")] = 0 - - def mark_node_exhausted(self, node_id: str, leg: Optional[str] = None): - """ - Mark a node and leg as exhausted and propagate EOS downstream. - """ - if self.node_exhaustion[(node_id, leg)]: - return # Already marked exhausted - - self.node_exhaustion[(node_id, leg)] = True - - # Propagate EOS to downstream nodes - self.queue_task(node_id, leg, EOS) - - def update_morsel_accounting(self, node_id: str, leg: Optional[str], delta: int): - """ - Update morsel accounting for a node and check for exhaustion. - """ - with morsel_lock: - self.morsel_accounting[(node_id, leg)] += delta - if self.morsel_accounting[(node_id, leg)] < 0: - # print(self.morsel_accounting) - raise InvalidInternalStateError("Morsel accounting is invalid.") - - # If no more morsels, check if all providers are exhausted - if self.morsel_accounting[(node_id, leg)] == 0: - self.check_and_mark_exhaustion(node_id, leg) - - def check_and_mark_exhaustion(self, node_id: str, leg: Optional[str]): - """ - Check if all upstream providers for a node are exhausted. - """ - for provider, _, provider_leg in self.ingoing_edges(node_id): - if not self.node_exhaustion.get((provider, provider_leg), False): - return # A provider is still active - - self.mark_node_exhausted(node_id, leg) - - def queue_task(self, node_id: str, leg: Optional[str], payload: Any): - """ - Queue a task for a worker. 
-        """
-        print(
-            "WORK PUT",
-            node_id,
-            leg,
-            "Table" if isinstance(payload, pyarrow.Table) else "EOS" if payload == EOS else payload,
-            self.work_queue.qsize() + 1,
-            flush=True,
-        )
-        self.work_queue.put((node_id, leg, payload))
-        self.active_tasks_increment(+1)
-        self.morsel_accounting[(node_id, leg)] += 1
-
-    # Process results from the response queue
-    def work_complete(self) -> bool:
-        all_nodes_exhausted = all(self.node_exhaustion.values())
-        no_active_tasks = self.active_tasks <= 0
-        return all_nodes_exhausted and no_active_tasks
-
-class PhysicalPlan(Graph, EOSHandlerMixin):
+class PhysicalPlan(Graph):
     """
-    The execution tree is defined separately from the planner to simplify the
-    complex code that is the planner from the tree that describes the plan.
+    The execution tree is defined separately from the planner to keep the
+    planner's complex code apart from the tree that describes the plan.
     """
 
-    def __init__(self):
-        # Work queue for worker tasks
-        self.work_queue = Queue()
-        # Response queue for results sent back to the engine
-        self.response_queue = Queue()
-
-        Graph.__init__(self)
-        EOSHandlerMixin.__init__(self, self.work_queue)
-
-    def depth_first_traversal_with_order(
+    def depth_first_search_flat(
         self, node: Optional[str] = None, visited: Optional[set] = None
     ) -> list:
         """
        Returns a flat list representing the depth-first traversal of the graph
        with left/right ordering.
+
+        The ordering matters: it guarantees that the left leg of a join is
+        always evaluated before the right leg. Flattening the whole plan is
+        more than is strictly needed for that, but it is a simple way to get
+        the guarantee.
         """
         if node is None:
             node = self.get_exit_points()[0]
@@ -153,100 +43,29 @@
             visited = set()
 
         visited.add(node)
-        traversal_list = [(node, self[node])]
+
+        # Collect this node's information in a flat list format
+        traversal_list = [
+            (
+                node,
+                self[node],
+            )
+        ]
 
         # Sort neighbors based on relationship to ensure left, right, then unlabelled order
         neighbors = sorted(self.ingoing_edges(node), key=lambda x: (x[2] == "right", x[2] == ""))
 
+        # Traverse each child, prioritizing left, then right, then unlabelled
        for neighbor, _, _ in neighbors:
            if neighbor not in visited:
-                child_list = self.depth_first_traversal_with_order(neighbor, visited)
+                child_list = self.depth_first_search_flat(neighbor, visited)
                traversal_list.extend(child_list)
 
        return traversal_list
 
-    def explain(self, analyze: bool, statistics) -> Generator[pyarrow.Table, None, None]:
-        from opteryx import operators
-
-        def _inner_explain(node, depth):
-            incoming_operators = self.ingoing_edges(node)
-            for operator_name in incoming_operators:
-                operator = self[operator_name[0]]
-                if isinstance(
-                    operator, (operators.ExitNode, operators.ExplainNode)
-                ):  # Skip ExitNode
-                    yield from _inner_explain(operator_name[0], depth)
-                    continue
-                elif isinstance(operator, operators.BasePlanNode):
-                    record = {
-                        "tree": depth,
-                        "operator": operator.name,
-                        "config": operator.config,
-                    }
-                    if analyze:
-                        record["time_ms"] = operator.execution_time / 1e6
-                        record["records_in"] = operator.records_in
-                        record["records_out"] = operator.records_out
-                    yield record
-                    yield from _inner_explain(operator_name[0], depth + 1)
-
-        head = list(dict.fromkeys(self.get_exit_points()))
-        if len(head) != 1:  # pragma: no cover
-            raise InvalidInternalStateError(f"Problem with the plan - it has {len(head)} heads.")
-
-        # for EXPLAIN ANALYZE, we execute the query and report statistics
-        if analyze:
-            # we don't want the
results, just the details from the plan - temp = None - head_node = self.get_exit_points()[0] - query_head, _, _ = self.ingoing_edges(head_node)[0] - results = self.execute(head_node=query_head) - if results is not None: - results_generator, _ = next(results, ([], None)) - for temp in results_generator: - pass - del temp - - plan = list(_inner_explain(head[0], 1)) - - table = pyarrow.Table.from_pylist(plan) - return table - - def execute( - self, head_node=None, statistics=None - ) -> Generator[Tuple[Any, ResultType], Any, Any]: - """ - Execute the physical plan starting from the head node. - - Args: - head_node (Optional[str]): The starting node for execution. If None, the default head node is used. - - Returns: - Generator[Tuple[Any, ResultType], Any, Any]: A generator yielding results and their types. - """ - from opteryx.operators import ExplainNode - from opteryx.operators import JoinNode - from opteryx.operators import ReaderNode - from opteryx.operators import SetVariableNode - from opteryx.operators import ShowCreateNode - from opteryx.operators import ShowValueNode - - if not self.is_acyclic(): - raise InvalidInternalStateError("Query plan is cyclic, cannot execute.") - - head_nodes = list(set(self.get_exit_points())) - if len(head_nodes) != 1: - raise InvalidInternalStateError( - f"Query plan has {len(head_nodes)} heads, expected exactly 1." - ) - - if head_node is None: - head_node = self[head_nodes[0]] - - self.initialize_eos_tracking(self.nodes(True)) - + def label_join_legs(self): # add the left/right labels to the edges coming into the joins - joins = ((nid, node) for nid, node in self.nodes(True) if isinstance(node, JoinNode)) + joins = ((nid, node) for nid, node in self.nodes(True) if node.is_join) for nid, join in joins: for provider, provider_target, provider_relation in self.ingoing_edges(nid): reader_edges = { @@ -259,9 +78,12 @@ def execute( reader_edges.add((provider, provider_target, provider_relation)) for s, t, r in reader_edges: - if self[s].uuid in join.left_readers: + node = self[s] + if not hasattr(node, "uuid"): + continue + if node.uuid in join.left_readers: self.add_edge(provider, nid, "left") - elif self[s].uuid in join.right_readers: + elif node.uuid in join.right_readers: self.add_edge(provider, nid, "right") tester = self.breadth_first_search(nid, reverse=True) @@ -270,133 +92,12 @@ def execute( if not any(r == "right" for s, t, r in tester): raise InvalidInternalStateError("Join has no RIGHT leg") - # Special case handling for 'Explain' queries - if isinstance(head_node, ExplainNode): - yield self.explain(head_node.analyze, statistics), ResultType.TABULAR - - elif isinstance(head_node, (SetVariableNode, ShowValueNode, ShowCreateNode)): - yield head_node(None), ResultType.TABULAR - - else: - num_workers = CONCURRENT_WORKERS - workers = [] - - def worker_process(): - """ - Worker thread: Processes tasks from the work queue and sends results to the response queue. 
- """ - while True: - try: - task = self.work_queue.get(timeout=CYCLE_WAIT_TIME) - except Empty: - statistics.worker_wait_time += CYCLE_WAIT_TIME - continue - - if task is None: - # print("WORK GET", task) - break - - print( - "WORK GET", - task[0], - task[1], - "Table" - if isinstance(task[2], pyarrow.Table) - else "EOS" - if task[2] == EOS - else task[2], - self[task[0]].name, - self.work_queue.qsize(), - flush=True, - ) - node_id, join_leg, morsel = task - operator = self[node_id] - results = operator(morsel, join_leg) - - for result in results: - # Send results back to the response queue - self.response_queue.put((node_id, join_leg, result)) - - self.update_morsel_accounting(node_id, join_leg, -1) - - self.work_queue.task_done() - - # Launch worker threads - for _ in range(num_workers): - worker = Thread(target=worker_process) - worker.daemon = True - worker.start() - workers.append(worker) - - def inner_execute(plan): - # Get all the nodes which push data into the plan We use DFS to order the - # nodes to ensure left branch is always before the right branch - pump_nodes = [ - (nid, node) - for nid, node in self.depth_first_traversal_with_order() - if isinstance(node, ReaderNode) - ] - - # Main engine loop processes pump nodes and coordinates work - for pump_nid, pump_instance in pump_nodes: - for morsel in pump_instance(None, None): - # Initial morsels pushed to the work queue determine downstream operators - consumer_nodes = [ - (target, join_leg) - for _, target, join_leg in self.outgoing_edges(pump_nid) - ] - for consumer_node, join_leg in consumer_nodes: - # DEBUG: log (f"following initial {self[pump_nid].name} ({pump_nid}) triggering {self[consumer_node].name} ({consumer_node})") - # Queue tasks for consumer operators - self.queue_task(consumer_node, join_leg, morsel) - - # Pump is exhausted after emitting all morsels - # print("pump exhausted", pump_nid) - self.update_morsel_accounting(pump_nid, None, 0) - - while not self.work_complete(): - # Wait for results from workers - # print(list(self.node_exhaustion.values()), self.active_tasks) - try: - node_id, join_leg, result = self.response_queue.get(timeout=CYCLE_WAIT_TIME) - except Empty: - # print(".") - statistics.cpu_wait_time += CYCLE_WAIT_TIME - continue - - # if a thread threw a error, we get them in the main - # thread here, we just reraise the error here - if isinstance(result, Exception): - raise result - - # Handle Empty responses - if result is None: - self.active_tasks_increment(-1) - continue - - # Determine downstream operators - downstream_nodes = [ - (target, join_leg) for _, target, join_leg in self.outgoing_edges(node_id) - ] - if len(downstream_nodes) == 0: # Exit node - if result is not None: - yield result # Emit the morsel immediately - self.active_tasks_increment(-1) # Mark the task as completed - continue - - for downstream_node, join_leg in downstream_nodes: - # Queue tasks for downstream operators - # DEBUG: log (f"following {self[node_id].name} ({node_id}) triggering {self[downstream_node].name} ({downstream_node})", flush=True) - self.queue_task(downstream_node, join_leg, result) - - # decrement _after_ we've done the work relation to handling the task - self.active_tasks_increment(-1) - - for worker in workers: - self.work_queue.put(None) - - # Wait for all workers to complete - for worker in workers: - worker.join() + def sensors(self): + readings = {} + for nid in self.nodes(): + node = self[nid] + readings[node.identity] = node.sensors() + return readings - yield inner_execute(self), 
ResultType.TABULAR + def __del__(self): + pass diff --git a/opteryx/operators/__init__.py b/opteryx/operators/__init__.py index 5ffb5a7cf..740f3f6f8 100644 --- a/opteryx/operators/__init__.py +++ b/opteryx/operators/__init__.py @@ -36,9 +36,6 @@ from .limit_node import LimitNode # select the first N records from .pyarrow_join_node import PyArrowJoinNode -# from .metadata_writer_node import MetadataWriterNode -# from .morsel_defragment_node import MorselDefragmentNode # consolidate small morsels -from .noop_node import NoOpNode # No Operation from .outer_join_node import OuterJoinNode from .projection_node import ProjectionNode # remove unwanted columns including renames from .read_node import ReaderNode diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index a06dbfc4b..403c07f91 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -139,6 +139,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs): groups = groups.rename_columns(list(self.column_map.keys()) + self.group_by_columns) yield groups + yield EOS return morsel = project(morsel, self.all_identifiers) diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 422be5bdf..bd669a6ea 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -250,7 +250,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: aggregates = aggregates.select(list(self.column_map.keys())) yield aggregates - + yield EOS return self.buffer.append(project(morsel, self.all_identifiers)) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 69afba193..8e35fff94 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -36,6 +36,9 @@ def __post_init__(self): class BasePlanNode: + is_join: bool = False + is_scan: bool = False + def __init__(self, *, properties, **parameters): """ This is the base class for nodes in the execution plan. 
@@ -112,10 +115,7 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab self.records_out += result.num_rows self.bytes_out += result.nbytes - if result == EOS: - yield None - else: - yield result + yield result except StopIteration: # Break the loop when the generator is exhausted @@ -135,6 +135,8 @@ def sensors(self): class JoinNode(BasePlanNode): + is_join = True + def __init__(self, *, properties, **parameters): super().__init__(properties=properties, **parameters) diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index c59752515..abcf2bd4b 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -360,6 +360,7 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: else: self.left_buffer.append(morsel) yield None + return if self.stream == "right": if morsel == EOS: diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 8862fd9e1..bc54ba16c 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -59,7 +59,7 @@ def execute(self, morsel: Table, **kwargs) -> Table: # limit processing if morsel == EOS: - yield None + yield EOS return unique_indexes, self.hash_set = distinct( diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 83308bfae..08f225526 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -67,7 +67,7 @@ def name(self): # pragma: no cover def execute(self, morsel: Table, **kwargs) -> Table: # Exit doesn't return EOS if morsel == EOS: - yield None + yield EOS return final_columns = [] diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index d459bd6ba..81501b3c1 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -57,7 +57,7 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: - yield None + yield EOS return if morsel.num_rows == 0: diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 2c43bb27b..872b7aba2 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -85,6 +85,7 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: yield self.table + yield EOS return if self.table: diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 04eca47a7..30fab3b27 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -103,8 +103,6 @@ def config(self): # pragma: no cover return "" def execute(self, morsel: Table, join_leg: str) -> Table: - print(join_leg, type(morsel)) - with self.lock: if join_leg == "left": if morsel == EOS: @@ -123,11 +121,9 @@ def execute(self, morsel: Table, join_leg: str) -> Table: start = time.monotonic_ns() self.left_hash = hash_join_map(self.left_relation, self.left_columns) - print("BUILD HASH MAP", time.monotonic_ns() - start) self.statistics.time_build_hash_map += time.monotonic_ns() - start for right_morsel in self.right_buffer: - print("CLEAR") yield inner_join_with_preprocessed_left_side( left_relation=self.left_relation, right_relation=right_morsel, @@ -144,8 +140,7 @@ def execute(self, morsel: Table, join_leg: str) -> Table: if join_leg == "right": if morsel == EOS: - print("DONE") - yield None + yield EOS return if self.left_hash is None: diff 
--git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index 427fce179..5d48994bc 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -49,7 +49,7 @@ def config(self): # pragma: no cover def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: - yield None + yield EOS return if self.rows_left_to_skip > 0: diff --git a/opteryx/operators/noop_node.py b/opteryx/operators/noop_node.py deleted file mode 100644 index 1a7139459..000000000 --- a/opteryx/operators/noop_node.py +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -No Operation - -This is a SQL Query Execution Plan Node. -""" - -from pyarrow import Table - -from opteryx.models import QueryProperties - -from . import BasePlanNode - - -class NoOpNode(BasePlanNode): - def __init__(self, properties: QueryProperties, **parameters): - BasePlanNode.__init__(self, properties=properties, **parameters) - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return "NoOp" - - @property - def config(self): # pragma: no cover - return "" - - def execute(self, morsel: Table, **kwargs) -> Table: - print("NOOP was called") - yield morsel diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 6efac153f..83eb40887 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -64,7 +64,7 @@ def name(self): # pragma: no cover def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: if morsel == EOS: - yield None + yield EOS return # If any of the columns need evaluating, we need to do that here diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 67e70d7ab..0f1fbbbc3 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -133,6 +133,8 @@ def merge_schemas( class ReaderNode(BasePlanNode): + is_scan = True + def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index a3fea0672..57bad2499 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -100,3 +100,4 @@ def execute(self, morsel: Table, **kwargs) -> Table: ) yield table.sort_by(mapped_order) + yield EOS diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index f3855858f..0eca16b66 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -50,7 +50,7 @@ def execute(self, morsel: Table, **kwargs) -> Table: coercible types are coerced. 
""" if morsel == EOS and self.seen_first_eos: - yield None + yield EOS return elif morsel == EOS: self.seen_first_eos = True diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 7dfc44c52..6d844366c 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -326,6 +326,10 @@ def inner_query_planner(ast_branch): join_step.right_relation_names = [_table_name(_relations[0])] join_step.left_relation_names = [_table_name(_relations[1])] + reader_nodes = list(inner_plan._nodes.values()) + join_step.left_readers = [reader_nodes[0].uuid] + join_step.right_readers = [reader_nodes[1].uuid] + step_id = random_string() inner_plan.add_node(step_id, join_step) for relation in _relations: diff --git a/opteryx/shared/variables.py b/opteryx/shared/variables.py index 750d05125..815c6c325 100644 --- a/opteryx/shared/variables.py +++ b/opteryx/shared/variables.py @@ -151,7 +151,9 @@ def as_column(self, key: str): from orso.schema import ConstantColumn # system variables aren't stored with the @@ - variable = self._variables[key[2:]] if key.startswith("@@") else self._variables[key] + variable = self._variables[key[2:]] if key.startswith("@@") else self._variables.get(key) + if not variable: + raise VariableNotFoundError(key) return ConstantColumn(name=key, type=variable[0], value=variable[1]) diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 654a008fb..3745882a0 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -675,6 +675,7 @@ ("SELECT cve -> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve ->> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve -> 'CVE_data_meta' -> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), + ("SELECT details, details->'int_field' FROM duckdb.struct_tests", 10, 2, None), ("SELECT dict @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT struct(dict) @? 'list' FROM testdata.flat.struct", 6, 1, None), diff --git a/tests/storage/test_blob_gcs.py b/tests/storage/test_blob_gcs.py index 48f99f6bb..3d237b2ee 100644 --- a/tests/storage/test_blob_gcs.py +++ b/tests/storage/test_blob_gcs.py @@ -58,6 +58,12 @@ expected_columncount=2, stats={"columns_read": 4}, ), + TestCase( + query=f"SELECT COUNT(*) FROM {BUCKET_NAME}.many", + expected_rowcount=1, + expected_columncount=1, + stats={"blobs_read": 1018, "rows_read": 9162} + ) ] From fc11cbc3033da220b1492f8b89723ea0cf6d00d7 Mon Sep 17 00:00:00 2001 From: XB500 Date: Wed, 18 Dec 2024 23:10:01 +0000 Subject: [PATCH 067/157] Opteryx Version 0.19.0-alpha.891 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index d99500f91..97c8e23c7 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 890 +__build__ = 891 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 99fa8917bb244f7b0499271a6e40ae8fa48cccb4 Mon Sep 17 00:00:00 2001
From: joocer
Date: Thu, 19 Dec 2024 20:26:12 +0000
Subject: [PATCH 068/157] #2132

---
 opteryx/cursor.py                            |  27 ++---
 opteryx/managers/execution/serial_engine.py  |  39 +++----
 opteryx/models/connection_context.py         |   2 +
 opteryx/operators/base_plan_node.py          |   2 -
 opteryx/operators/outer_join_node.py         |  39 +++----
 opteryx/operators/set_variable_node.py       |   2 +-
 opteryx/planner/__init__.py                  |  74 ++++++------
 opteryx/planner/ast_rewriter.py              |   8 +-
 .../logical_planner/logical_planner.py       |  21 ++--
 opteryx/shared/variables.py                  | 108 +++++++++---------
 opteryx/virtual_datasets/variables_data.py   |   6 +-
 .../test_shapes_and_errors_battery.py        |  18 +--
 12 files changed, 167 insertions(+), 179 deletions(-)

diff --git a/opteryx/cursor.py b/opteryx/cursor.py
index 72955a986..188e0ab49 100644
--- a/opteryx/cursor.py
+++ b/opteryx/cursor.py
@@ -15,7 +15,6 @@
 from enum import Enum
 from enum import auto
 from functools import wraps
-from itertools import chain
 from typing import Any
 from typing import Dict
 from typing import Iterable
@@ -169,7 +168,6 @@
         Returns:
             Results of the query execution.
         """
-        from opteryx import system_statistics
         from opteryx.managers.execution import execute
         from opteryx.planner import query_planner
 
@@ -179,19 +177,16 @@
 
         self._connection.context.history.append((operation, True, datetime.datetime.utcnow()))
 
-        start = time.time_ns()
-        plans = query_planner(
-            operation=operation,
-            parameters=params,
-            visibility_filters=visibility_filters,
-            connection=self._connection,
-            qid=self.id,
-            statistics=self._statistics,
-        )
-
         try:
             start = time.time_ns()
-            plan = next(plans)
+            plan = query_planner(
+                operation=operation,
+                parameters=params,
+                visibility_filters=visibility_filters,
+                connection=self._connection,
+                qid=self.id,
+                statistics=self._statistics,
+            )
             self._statistics.time_planning += time.time_ns() - start
         except RuntimeError as err:  # pragma: no cover
             raise SqlError(f"Error Executing SQL Statement ({err})") from err
@@ -214,7 +209,7 @@
 
     def _execute_statements(
         self,
-        operation,
+        operation: str,
         params: Optional[Iterable] = None,
         visibility_filters: Optional[Dict[str, Any]] = None,
     ):
@@ -275,7 +270,7 @@
             operation = operation.decode()
         results = self._execute_statements(operation, params, visibility_filters)
         if results is not None:
-            result_data, self._result_type = next(results, (ResultType._UNDEFINED, None))
+            result_data, self._result_type = results
 
             if self._result_type == ResultType.NON_TABULAR:
                 import orso
@@ -341,7 +336,7 @@
             operation = operation.decode()
         results = self._execute_statements(operation, params, visibility_filters)
         if results is not None:
-            result_data, self._result_type = next(results, (ResultType._UNDEFINED, None))
+            result_data, self._result_type = results
             if limit is not None:
                 result_data = utils.arrow.limit_records(result_data, limit)  # type: ignore
         if isinstance(result_data, pyarrow.Table):
diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py
index 3c2a0d9ca..9f5e03857 100644
--- a/opteryx/managers/execution/serial_engine.py
+++ b/opteryx/managers/execution/serial_engine.py
@@ -36,28 +36,25 @@ def execute(
 
     # Special case handling for 'Explain' queries
     if isinstance(head_node, ExplainNode):
-        yield plan.explain(head_node.analyze), ResultType.TABULAR
+        return explain(plan, head_node.analyze), ResultType.TABULAR
 
-    # Special case handling for 'Set' queries
-
elif isinstance(head_node, SetVariableNode): - yield head_node(None, None), ResultType.NON_TABULAR + # Special case handling + if isinstance(head_node, SetVariableNode): + # Set the variables and return a non-tabular result + return head_node(None), ResultType.NON_TABULAR + if isinstance(head_node, (ShowValueNode, ShowCreateNode)): + # There's no execution plan to execute, just return the result + return head_node(None, None), ResultType.TABULAR - elif isinstance(head_node, (ShowValueNode, ShowCreateNode)): - yield head_node(None, None), ResultType.TABULAR + def inner_execute(plan: PhysicalPlan) -> Generator: + # Get the pump nodes from the plan and execute them in order + pump_nodes = [(nid, node) for nid, node in plan.depth_first_search_flat() if node.is_scan] + for pump_nid, pump_instance in pump_nodes: + for morsel in pump_instance(None, None): + yield from process_node(plan, pump_nid, morsel, None) + yield from process_node(plan, pump_nid, EOS, None) - else: - - def inner_execute(plan): - # Get the pump nodes from the plan and execute them in order - pump_nodes = [ - (nid, node) for nid, node in plan.depth_first_search_flat() if node.is_scan - ] - for pump_nid, pump_instance in pump_nodes: - for morsel in pump_instance(None, None): - yield from process_node(plan, pump_nid, morsel, None) - yield from process_node(plan, pump_nid, EOS, None) - - yield inner_execute(plan), ResultType.TABULAR + return inner_execute(plan), ResultType.TABULAR def explain(plan: PhysicalPlan, analyze: bool) -> Generator[pyarrow.Table, None, None]: @@ -107,7 +104,7 @@ def _inner_explain(node, depth): yield table -def process_node(plan: PhysicalPlan, nid: str, morsel, join_leg: str): +def process_node(plan: PhysicalPlan, nid: str, morsel: pyarrow.Table, join_leg: str) -> Generator: node = plan[nid] if node.is_scan: @@ -122,8 +119,6 @@ def process_node(plan: PhysicalPlan, nid: str, morsel, join_leg: str): if results is None: yield None return - if isinstance(results, Exception): - raise results for result in results: if result is not None: children = [(t, r) for s, t, r in plan.outgoing_edges(nid)] diff --git a/opteryx/models/connection_context.py b/opteryx/models/connection_context.py index 046b6b983..0a72a341a 100644 --- a/opteryx/models/connection_context.py +++ b/opteryx/models/connection_context.py @@ -23,6 +23,7 @@ from opteryx.shared.variables import SystemVariables from opteryx.shared.variables import SystemVariablesContainer from opteryx.shared.variables import VariableOwner +from opteryx.shared.variables import Visibility # History Item = [statement, success, execution start] HistoryItem = Tuple[str, bool, datetime.datetime] @@ -66,4 +67,5 @@ def __post_init__(self): OrsoTypes.ARRAY, self.memberships or [], VariableOwner.SERVER, + Visibility.UNRESTRICTED, ) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 8e35fff94..ee8cee09e 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -120,8 +120,6 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab except StopIteration: # Break the loop when the generator is exhausted break - except Exception as err: - yield err def sensors(self): return { diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 6b7d6d59b..a633bd514 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -251,17 +251,16 @@ def left_semi_join( class OuterJoinNode(JoinNode): def __init__(self, 
properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") + self.join_type = parameters["type"] + self.on = parameters.get("on") + self.using = parameters.get("using") - self._left_columns = parameters.get("left_columns") + self.left_columns = parameters.get("left_columns") self.left_readers = parameters.get("left_readers") - self._right_columns = parameters.get("right_columns") + self.right_columns = parameters.get("right_columns") self.right_readers = parameters.get("right_readers") - self.stream = "left" self.left_buffer = [] self.right_buffer = [] self.left_relation = None @@ -272,46 +271,42 @@ def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @property def name(self): # pragma: no cover - return self._join_type + return self.join_type @property - def config(self): # pragma: no cover + def config(self) -> str: # pragma: no cover from opteryx.managers.expression import format_expression - if self._on: - return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" - if self._using: - return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" - return f"{self._join_type.upper()}" + if self.on: + return f"{self.join_type.upper()} JOIN ({format_expression(self.on, True)})" + if self.using: + return f"{self.join_type.upper()} JOIN (USING {','.join(map(format_expression, self.using))})" + return f"{self.join_type.upper()}" def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: - if self.stream == "left": + if join_leg == "left": if morsel == EOS: - self.stream = "right" self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") self.left_buffer.clear() else: self.left_buffer.append(morsel) - yield None - return - if self.stream == "right": + if join_leg == "right": if morsel == EOS: right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") self.right_buffer.clear() - join_provider = providers.get(self._join_type) + join_provider = providers.get(self.join_type) yield from join_provider( left_relation=self.left_relation, right_relation=right_relation, - left_columns=self._left_columns, - right_columns=self._right_columns, + left_columns=self.left_columns, + right_columns=self.right_columns, ) else: self.right_buffer.append(morsel) - yield None providers = { diff --git a/opteryx/operators/set_variable_node.py b/opteryx/operators/set_variable_node.py index bf234c468..ccf2a49ec 100644 --- a/opteryx/operators/set_variable_node.py +++ b/opteryx/operators/set_variable_node.py @@ -43,6 +43,6 @@ def name(self): # pragma: no cover def config(self): # pragma: no cover return f"{self.variable} TO {self.value}" - def execute(self, morsel, **kwargs) -> NonTabularResult: + def __call__(self, morsel, **kwargs) -> NonTabularResult: self.variables[self.variable] = self.value return NonTabularResult(record_count=1, status=QueryStatus.SQL_SUCCESS) # type: ignore diff --git a/opteryx/planner/__init__.py b/opteryx/planner/__init__.py index ff1cec9f2..f8e931efc 100644 --- a/opteryx/planner/__init__.py +++ b/opteryx/planner/__init__.py @@ -131,7 +131,6 @@ def query_planner( from opteryx.planner.ast_rewriter import do_ast_rewriter from opteryx.planner.binder import do_bind_phase from opteryx.planner.cost_based_optimizer import do_cost_based_optimizer - from opteryx.planner.logical_planner import LogicalPlan from 
opteryx.planner.logical_planner import apply_visibility_filters from opteryx.planner.logical_planner import do_logical_planning_phase from opteryx.planner.physical_planner import create_physical_plan @@ -158,48 +157,45 @@ def query_planner( raise SqlError(parser_error) from parser_error # AST Rewriter adds temporal filters and parameters to the AST start = time.monotonic_ns() - parsed_statements = do_ast_rewriter( + parsed_statement = do_ast_rewriter( parsed_statements, temporal_filters=temporal_filters, parameters=params, connection=connection, - ) + )[0] statistics.time_planning_ast_rewriter += time.monotonic_ns() - start - logical_plan: LogicalPlan = None - ast: dict = {} - # Logical Planner converts ASTs to logical plans - for logical_plan, ast, ctes in do_logical_planning_phase(parsed_statements): # type: ignore - # check user has permission for this query type - query_type = next(iter(ast)) - if query_type not in connection.permissions: - from opteryx.exceptions import PermissionsError - - raise PermissionsError( - f"User does not have permission to execute '{query_type}' queries." - ) - - if visibility_filters: - logical_plan = apply_visibility_filters(logical_plan, visibility_filters) - - # The Binder adds schema information to the logical plan - start = time.monotonic_ns() - bound_plan = do_bind_phase( - logical_plan, - connection=connection.context, - qid=qid, - # common_table_expressions=ctes, - ) - statistics.time_planning_binder += time.monotonic_ns() - start - - start = time.monotonic_ns() - optimized_plan = do_cost_based_optimizer(bound_plan, statistics) - statistics.time_planning_optimizer += time.monotonic_ns() - start - - # before we write the new optimizer and execution engine, convert to a V1 plan - start = time.monotonic_ns() - query_properties = QueryProperties(qid=qid, variables=connection.context.variables) - physical_plan = create_physical_plan(optimized_plan, query_properties) - statistics.time_planning_physical_planner += time.monotonic_ns() - start - yield physical_plan + + logical_plan, ast, ctes = do_logical_planning_phase(parsed_statement) # type: ignore + # check user has permission for this query type + query_type = next(iter(ast)) + if query_type not in connection.permissions: + from opteryx.exceptions import PermissionsError + + raise PermissionsError(f"User does not have permission to execute '{query_type}' queries.") + + if visibility_filters: + logical_plan = apply_visibility_filters(logical_plan, visibility_filters) + + # The Binder adds schema information to the logical plan + start = time.monotonic_ns() + bound_plan = do_bind_phase( + logical_plan, + connection=connection.context, + qid=qid, + # common_table_expressions=ctes, + ) + statistics.time_planning_binder += time.monotonic_ns() - start + + start = time.monotonic_ns() + optimized_plan = do_cost_based_optimizer(bound_plan, statistics) + statistics.time_planning_optimizer += time.monotonic_ns() - start + + # before we write the new optimizer and execution engine, convert to a V1 plan + start = time.monotonic_ns() + query_properties = QueryProperties(qid=qid, variables=connection.context.variables) + physical_plan = create_physical_plan(optimized_plan, query_properties) + statistics.time_planning_physical_planner += time.monotonic_ns() - start + + return physical_plan diff --git a/opteryx/planner/ast_rewriter.py b/opteryx/planner/ast_rewriter.py index 5b0becd9e..23b689834 100644 --- a/opteryx/planner/ast_rewriter.py +++ b/opteryx/planner/ast_rewriter.py @@ -280,11 +280,13 @@ def 
rewrite_json_accessors(node: Dict[str, Any]) -> Dict[str, Any]: return node -def do_ast_rewriter(ast: list, temporal_filters: list, parameters: Union[list, dict], connection): +def do_ast_rewriter( + asts: List[dict], temporal_filters: list, parameters: Union[list, dict], connection +): # get the query type - query_type = next(iter(ast)) + query_type = next(iter(asts)) # bind the temporal ranges, we do that here because the order in the AST matters - with_temporal_ranges = temporal_range_binder(ast, temporal_filters) + with_temporal_ranges = temporal_range_binder(asts, temporal_filters) # bind the user provided parameters, we this that here because we want it after the # AST has been created (to avoid injection flaws) but also because the order # matters diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 6d844366c..64457b456 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -18,7 +18,6 @@ from enum import Enum from enum import auto -from typing import Generator from typing import List from typing import Optional from typing import Tuple @@ -1200,16 +1199,14 @@ def build_expression_tree(relation, dnf_list): return logical_plan -def do_logical_planning_phase(parsed_statements) -> Generator: +def do_logical_planning_phase(parsed_statement: dict) -> tuple: # The sqlparser ast is an array of asts - for parsed_statement in parsed_statements: - statement_type = next(iter(parsed_statement)) - if statement_type not in QUERY_BUILDERS: - from opteryx.exceptions import UnsupportedSyntaxError - raise UnsupportedSyntaxError( - f"Version 2 Planner does not support '{statement_type}' type queries yet." - ) - # CTEs are Common Table Expressions, they're variations of subqueries - ctes = extract_ctes(parsed_statement, inner_query_planner) - yield QUERY_BUILDERS[statement_type](parsed_statement), parsed_statement, ctes + statement_type = next(iter(parsed_statement)) + if statement_type not in QUERY_BUILDERS: + from opteryx.exceptions import UnsupportedSyntaxError + + raise UnsupportedSyntaxError(f"VPlanner does not support '{statement_type}' type queries.") + # CTEs are Common Table Expressions, they're variations of subqueries + ctes = extract_ctes(parsed_statement, inner_query_planner) + return QUERY_BUILDERS[statement_type](parsed_statement), parsed_statement, ctes diff --git a/opteryx/shared/variables.py b/opteryx/shared/variables.py index 815c6c325..401cb7b9f 100644 --- a/opteryx/shared/variables.py +++ b/opteryx/shared/variables.py @@ -40,62 +40,65 @@ class VariableOwner(int, Enum): # Manually assign numbers because USER < INTERNAL < SERVER - SERVER = 30 - INTERNAL = 20 - USER = 10 + SERVER = 30 # set on the server, fixed per instantiation + INTERNAL = 20 # set by the system, can be updated by the system + USER = 10 # set by the user, can be updated by the user -VariableSchema = Tuple[Type, Any, VariableOwner] +class Visibility(str, Enum): + RESTRICTED = "restricted" + UNRESTRICTED = "unrestricted" + + +VariableSchema = Tuple[Type, Any, VariableOwner, Visibility] # fmt: off SYSTEM_VARIABLES_DEFAULTS: Dict[str, VariableSchema] = { - # name: (type, default, owner, description) - # These are the MySQL set of variables - we don't use all of them but have them for compatibility - "auto_increment_increment": (OrsoTypes.INTEGER, 1, VariableOwner.INTERNAL), - "autocommit": (OrsoTypes.BOOLEAN, True, VariableOwner.SERVER), - "character_set_client": (OrsoTypes.VARCHAR, 
CharacterSet.utf8mb4.name, VariableOwner.SERVER), - "character_set_connection": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER), - "character_set_database": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER), - "character_set_results": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER), - "character_set_server": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER), - "collation_connection": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER), - "collation_database": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER), - "collation_server": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER), - "external_user": (OrsoTypes.VARCHAR, "", VariableOwner.INTERNAL), - "init_connect": (OrsoTypes.VARCHAR, "", VariableOwner.SERVER), - "interactive_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER), - "license": (OrsoTypes.VARCHAR, "MIT", VariableOwner.SERVER), - "lower_case_table_names": (OrsoTypes.INTEGER, 0, VariableOwner.SERVER), - "max_allowed_packet": (OrsoTypes.INTEGER, 67108864, VariableOwner.SERVER), - "max_execution_time": (OrsoTypes.INTEGER, 0, VariableOwner.SERVER), - "net_buffer_length": (OrsoTypes.INTEGER, 16384, VariableOwner.SERVER), - "net_write_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER), - "performance_schema": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER), - "sql_auto_is_null": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER), - "sql_mode": (OrsoTypes.VARCHAR, "ANSI", VariableOwner.SERVER), - "sql_select_limit": (OrsoTypes.INTEGER, None, VariableOwner.SERVER), - "system_time_zone": (OrsoTypes.VARCHAR, "UTC", VariableOwner.SERVER), - "time_zone": (OrsoTypes.VARCHAR, "UTC", VariableOwner.SERVER), - "transaction_read_only": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER), - "transaction_isolation": (OrsoTypes.VARCHAR, "READ-COMMITTED", VariableOwner.SERVER), - "version": (OrsoTypes.VARCHAR, __version__, VariableOwner.SERVER), - "version_comment": (OrsoTypes.VARCHAR, "mesos", VariableOwner.SERVER), - "wait_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER), - "event_scheduler": (OrsoTypes.VARCHAR, "OFF", VariableOwner.SERVER), - "default_storage_engine": (OrsoTypes.VARCHAR, "opteryx", VariableOwner.SERVER), - "default_tmp_storage_engine": (OrsoTypes.VARCHAR, "opteryx", VariableOwner.SERVER), - - # these are Opteryx specific variables - "max_cache_evictions_per_query": (OrsoTypes.INTEGER, config.MAX_CACHE_EVICTIONS_PER_QUERY, VariableOwner.USER), - "max_cacheable_item_size": (OrsoTypes.INTEGER, config.MAX_CACHEABLE_ITEM_SIZE, VariableOwner.SERVER), - "max_local_buffer_capacity": (OrsoTypes.INTEGER, config.MAX_LOCAL_BUFFER_CAPACITY, VariableOwner.SERVER), - "max_read_buffer_capacity": (OrsoTypes.INTEGER, config.MAX_READ_BUFFER_CAPACITY, VariableOwner.SERVER), - "disable_optimizer": (OrsoTypes.BOOLEAN, config.DISABLE_OPTIMIZER, VariableOwner.USER), - "disable_high_priority": (OrsoTypes.BOOLEAN, config.DISABLE_HIGH_PRIORITY, VariableOwner.SERVER), - "concurrent_reads": (OrsoTypes.BOOLEAN, config.CONCURRENT_READS, VariableOwner.SERVER), - "user_memberships": (OrsoTypes.ARRAY, [], VariableOwner.INTERNAL), - "morsel_size": (OrsoTypes.INTEGER, config.MORSEL_SIZE, VariableOwner.SERVER), + "auto_increment_increment": (OrsoTypes.INTEGER, 1, VariableOwner.INTERNAL, Visibility.UNRESTRICTED), + "autocommit": (OrsoTypes.BOOLEAN, True, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "character_set_client": 
(OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "character_set_connection": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "character_set_database": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "character_set_results": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "character_set_server": (OrsoTypes.VARCHAR, CharacterSet.utf8mb4.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "collation_connection": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "collation_database": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "collation_server": (OrsoTypes.VARCHAR, Collation.utf8mb4_general_ci.name, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "external_user": (OrsoTypes.VARCHAR, "", VariableOwner.INTERNAL, Visibility.RESTRICTED), + "init_connect": (OrsoTypes.VARCHAR, "", VariableOwner.SERVER, Visibility.RESTRICTED), + "interactive_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "license": (OrsoTypes.VARCHAR, "MIT", VariableOwner.SERVER, Visibility.RESTRICTED), + "lower_case_table_names": (OrsoTypes.INTEGER, 0, VariableOwner.SERVER, Visibility.RESTRICTED), + "max_allowed_packet": (OrsoTypes.INTEGER, 67108864, VariableOwner.SERVER, Visibility.RESTRICTED), + "max_execution_time": (OrsoTypes.INTEGER, 0, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "net_buffer_length": (OrsoTypes.INTEGER, 16384, VariableOwner.SERVER, Visibility.RESTRICTED), + "net_write_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER, Visibility.RESTRICTED), + "performance_schema": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER, Visibility.RESTRICTED), + "sql_auto_is_null": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER, Visibility.RESTRICTED), + "sql_mode": (OrsoTypes.VARCHAR, "ANSI", VariableOwner.SERVER, Visibility.RESTRICTED), + "sql_select_limit": (OrsoTypes.INTEGER, None, VariableOwner.SERVER, Visibility.UNRESTRICTED), + "system_time_zone": (OrsoTypes.VARCHAR, "UTC", VariableOwner.SERVER, Visibility.UNRESTRICTED), + "time_zone": (OrsoTypes.VARCHAR, "UTC", VariableOwner.SERVER, Visibility.UNRESTRICTED), + "transaction_read_only": (OrsoTypes.BOOLEAN, False, VariableOwner.SERVER, Visibility.RESTRICTED), + "transaction_isolation": (OrsoTypes.VARCHAR, "READ-COMMITTED", VariableOwner.SERVER, Visibility.RESTRICTED), + "version": (OrsoTypes.VARCHAR, __version__, VariableOwner.SERVER, Visibility.RESTRICTED), + "version_comment": (OrsoTypes.VARCHAR, "mesos", VariableOwner.SERVER, Visibility.RESTRICTED), + "wait_timeout": (OrsoTypes.INTEGER, 28800, VariableOwner.SERVER, Visibility.RESTRICTED), + "event_scheduler": (OrsoTypes.VARCHAR, "OFF", VariableOwner.SERVER, Visibility.UNRESTRICTED), + "default_storage_engine": (OrsoTypes.VARCHAR, "opteryx", VariableOwner.SERVER, Visibility.UNRESTRICTED), + "default_tmp_storage_engine": (OrsoTypes.VARCHAR, "opteryx", VariableOwner.SERVER, Visibility.UNRESTRICTED), + + # These are Opteryx specific variables + "max_cache_evictions_per_query": (OrsoTypes.INTEGER, config.MAX_CACHE_EVICTIONS_PER_QUERY, VariableOwner.USER, Visibility.RESTRICTED), + "max_cacheable_item_size": (OrsoTypes.INTEGER, config.MAX_CACHEABLE_ITEM_SIZE, VariableOwner.SERVER, Visibility.RESTRICTED), + "max_local_buffer_capacity": (OrsoTypes.INTEGER, 
config.MAX_LOCAL_BUFFER_CAPACITY, VariableOwner.SERVER, Visibility.RESTRICTED), + "max_read_buffer_capacity": (OrsoTypes.INTEGER, config.MAX_READ_BUFFER_CAPACITY, VariableOwner.SERVER, Visibility.RESTRICTED), + "disable_optimizer": (OrsoTypes.BOOLEAN, config.DISABLE_OPTIMIZER, VariableOwner.USER, Visibility.RESTRICTED), + "disable_high_priority": (OrsoTypes.BOOLEAN, config.DISABLE_HIGH_PRIORITY, VariableOwner.SERVER, Visibility.RESTRICTED), + "concurrent_reads": (OrsoTypes.INTEGER, config.CONCURRENT_READS, VariableOwner.SERVER, Visibility.RESTRICTED), + "user_memberships": (OrsoTypes.ARRAY, [], VariableOwner.INTERNAL, Visibility.UNRESTRICTED), + "morsel_size": (OrsoTypes.INTEGER, config.MORSEL_SIZE, VariableOwner.SERVER, Visibility.RESTRICTED), } # fmt: on @@ -114,6 +117,7 @@ def __setitem__(self, key: str, value: Any) -> None: if key[0] == "@": variable_type = value.type owner = VariableOwner.USER + visibility = Visibility.UNRESTRICTED else: if key not in self._variables: from opteryx.utils import suggest_alternative @@ -121,13 +125,13 @@ def __setitem__(self, key: str, value: Any) -> None: suggestion = suggest_alternative(key, list(self._variables.keys())) raise VariableNotFoundError(variable=key, suggestion=suggestion) - variable_type, _, owner = self._variables[key] + variable_type, _, owner, visibility = self._variables[key] if owner > self._owner: raise PermissionsError(f"User does not have permission to set variable `{key}`") if variable_type != value.type: raise ValueError(f"Invalid type for `{key}`, {variable_type} expected.") - self._variables[key] = (variable_type, value.value, owner) + self._variables[key] = (variable_type, value.value, owner, visibility) def details(self, key: str) -> VariableSchema: if key not in self._variables: diff --git a/opteryx/virtual_datasets/variables_data.py b/opteryx/virtual_datasets/variables_data.py index 17654166c..0b96a51e7 100644 --- a/opteryx/virtual_datasets/variables_data.py +++ b/opteryx/virtual_datasets/variables_data.py @@ -28,13 +28,16 @@ def read(end_date=None, variables={}): buffer = [] for variable in variables: - variable_type, variable_value, variable_owner = variables.details(variable) + variable_type, variable_value, variable_owner, variable_visibility = variables.details( + variable + ) buffer.append( { "name": variable, "value": str(variable_value), "type": variable_type, "owner": variable_owner.name, + "visibility": variable_visibility.name, } ) @@ -50,6 +53,7 @@ def schema(): FlatColumn(name="value", type=OrsoTypes.VARCHAR), FlatColumn(name="type", type=OrsoTypes.VARCHAR), FlatColumn(name="owner", type=OrsoTypes.VARCHAR), + FlatColumn(name="visibility", type=OrsoTypes.VARCHAR), ], ) # fmt:on diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 3745882a0..539f2e83a 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -81,7 +81,7 @@ ("SELECT * FROM $astronauts", 357, 19, None), ("SELECT * FROM $no_table", 1, 1, None), ("SELECT * FROM sqlite.planets", 9, 20, None), - ("SELECT * FROM $variables", 42, 4, None), + ("SELECT * FROM $variables", 42, 5, None), ("SELECT * FROM $missions", 4630, 8, None), ("SELECT * FROM $statistics", 17, 2, None), ("SELECT * FROM $stop_words", 305, 1, None), @@ -1870,14 +1870,14 @@ ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 37, 2, 
None), ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 3, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 354, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None), # **************************************************************************************** From cd349b4246dcc5a303af82cdf269210fe0c92949 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 19 Dec 2024 20:26:38 +0000 Subject: [PATCH 069/157] Opteryx Version 0.19.0-alpha.892 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 
97c8e23c7..f89810d1a 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 891 +__build__ = 892 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d763ed5202058683f5449d6ff1e5e78b761ab316 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 19 Dec 2024 22:44:55 +0000 Subject: [PATCH 070/157] #2132 --- opteryx/operators/aggregate_node.py | 1 + opteryx/operators/base_plan_node.py | 2 + opteryx/operators/show_create_node.py | 2 +- tests/storage/test_sql_duckdb.py | 63 ++++++++++------ tests/storage/test_sql_sqlite.py | 105 ++++++++++++++++++++++---- 5 files changed, 138 insertions(+), 35 deletions(-) diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index bd669a6ea..d7b688b41 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -225,6 +225,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: morsel_promise=self.buffer, column_name=self.aggregates[0].schema_column.identity, ) + yield EOS return # merge all the morsels together into one table, selecting only the columns diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index ee8cee09e..e0c19d237 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -120,6 +120,8 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab except StopIteration: # Break the loop when the generator is exhausted break + except Exception as err: + print(f"Exception {err} in operator", self.name) def sensors(self): return { diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index 469cdf003..5bb5799b3 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -53,7 +53,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: view_sql = view_as_sql(self.object_name) buffer = [{self.object_name: view_sql}] table = pyarrow.Table.from_pylist(buffer) - return table + yield table raise DatasetNotFoundError(self.object_name) diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py index 082e504b1..7376088aa 100644 --- a/tests/storage/test_sql_duckdb.py +++ b/tests/storage/test_sql_duckdb.py @@ -1,35 +1,56 @@ """ -Test we can read from DuckDB - this is a basic exercise of the SQL Connector +This module tests the ability to read from DuckDB using the SQLConnector. + +DuckDB is used to rigorously test the SQLConnector due to its in-memory nature, +which allows for fast and efficient testing without the overhead of disk I/O. +This enables more intensive testing without the limitations of disk-based databases. + +DuckDB appears to not like being tested in GitHub actions, so we run this suite +a little differently. We first create the DuckDB database, then we run the tests as +one function call, rather than it appearing as a test per statement as we do in other +battery test suites. + +Note: DuckDB includes additional tests beyond the standard battery. However, +due to DuckDB's unstable file format, it only covers a subset of the required use +cases to save time, as loading it with numerous different tables can be time-consuming. 
""" import os import sys -os.environ["OPTERYX_DEBUG"] = "1" sys.path.insert(1, os.path.join(sys.path[0], "../..")) import opteryx from opteryx.connectors import SqlConnector from tests.tools import create_duck_db -# fmt:off -test_cases = [ - ("SELECT * FROM duckdb.planets", 9, 20), - ("SELECT * FROM duckdb.satellites", 177, 8), - ("SELECT COUNT(*) FROM duckdb.planets;", 1, 1), - ("SELECT COUNT(*) FROM duckdb.satellites;", 1, 1), - ("SELECT name FROM duckdb.planets;", 9, 1), - ("SELECT name FROM duckdb.satellites", 177, 1), - ("SELECT * FROM duckdb.planets, duckdb.satellites", 1593, 28), - ("SELECT * FROM duckdb.planets INNER JOIN $satellites ON duckdb.planets.id = $satellites.planetId;", 177, 28), - ("SELECT * FROM duckdb.planets INNER JOIN duckdb.satellites ON duckdb.planets.id = duckdb.satellites.planetId;", 177, 28), - ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id = duckdb.satellites.planetId;", 177, 28), - ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id = 5 AND duckdb.satellites.planetId = 5;", 67, 28), - ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id - duckdb.satellites.planetId = 0;", 177, 28), - ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id - duckdb.satellites.planetId != 0;", 1416, 28), +# fmt: off +STATEMENTS = [ + ("SELECT * FROM duckdb.planets", 9, 20, None), + ("SELECT * FROM duckdb.satellites", 177, 8, None), + ("SELECT COUNT(*) FROM duckdb.planets;", 1, 1, None), + ("SELECT COUNT(*) FROM duckdb.satellites;", 1, 1, None), + ("SELECT name FROM duckdb.planets;", 9, 1, None), + ("SELECT name FROM duckdb.satellites", 177, 1, None), + ("SELECT * FROM duckdb.planets, duckdb.satellites", 1593, 28, None), + ("SELECT * FROM duckdb.planets INNER JOIN $satellites ON duckdb.planets.id = $satellites.planetId;", 177, 28, None), + ("SELECT * FROM duckdb.planets INNER JOIN duckdb.satellites ON duckdb.planets.id = duckdb.satellites.planetId;", 177, 28, None), + ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id = duckdb.satellites.planetId;", 177, 28, None), + ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id = 5 AND duckdb.satellites.planetId = 5;", 67, 28, None), + ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id - duckdb.satellites.planetId = 0;", 177, 28, None), + ("SELECT * FROM duckdb.planets, duckdb.satellites WHERE duckdb.planets.id - duckdb.satellites.planetId != 0;", 1416, 28, None), + ("SELECT DISTINCT name FROM duckdb.planets;", 9, 1, None), + ("SELECT name, COUNT(*) FROM duckdb.satellites GROUP BY name;", 177, 2, None), + ("SELECT name FROM duckdb.planets WHERE id IN (1, 2, 3);", 3, 1, None), + ("SELECT name FROM duckdb.satellites WHERE planetId BETWEEN 1 AND 3;", 1, 1, None), + ("SELECT name FROM duckdb.planets WHERE name LIKE 'E%';", 1, 1, None), + ("SELECT name FROM duckdb.satellites WHERE name ILIKE '%moon%';", 1, 1, None), + ("SELECT * FROM duckdb.planets ORDER BY name;", 9, 20, None), + ("SELECT * FROM duckdb.satellites ORDER BY name DESC;", 177, 8, None), + ("SELECT * FROM duckdb.planets LIMIT 5;", 5, 20, None), + ("SELECT * FROM duckdb.satellites LIMIT 10 OFFSET 5;", 10, 8, None), ] -# fmt:on - +# fmt: on def test_duckdb_storage(): # We have some problems with creating duckdb, particularly in GitHub Actions @@ -72,8 +93,8 @@ def test_duckdb_battery(): connection="duckdb:///planets.duckdb", ) - print(f"RUNNING DUCK BATTERY OF {len(test_cases)} TESTS") - for script, rows, cols in test_cases: + 
print(f"RUNNING FLOCK OF {len(STATEMENTS)} DUCK TESTS\n") + for script, rows, cols, error in STATEMENTS: print(format_sql(script)) results = opteryx.query(script) assert results.rowcount == rows, format_sql(script) + str(results.shape) diff --git a/tests/storage/test_sql_sqlite.py b/tests/storage/test_sql_sqlite.py index b239822b4..6c6339577 100644 --- a/tests/storage/test_sql_sqlite.py +++ b/tests/storage/test_sql_sqlite.py @@ -1,14 +1,13 @@ """ -Test we can read from SQLite. +This module tests the ability to read from SQLite using the SQLConnector. -SQLite is also used to test the SQLConnector harder than the other -SQL sources. We use SQLite for this because the file is local and therefore -we're not going to cause contention with remote services. +SQLite is used to rigorously test the SQLConnector due to its local file nature, +which avoids contention with remote services. This allows for more intensive +testing without the overhead of network latency or remote service limitations. -Note: DuckDB also has additional tests to the standard battery but because -DuckDB doesn't have a stable file format, it only covers a subset of -the required use cases (to save time, loading it with a lot of different -tables is time consuming) +Note: DuckDB also includes additional tests beyond the standard battery. However, +due to DuckDB's unstable file format, it only covers a subset of the required use +cases to save time, as loading it with numerous different tables can be time-consuming. """ import os @@ -53,7 +52,7 @@ ("SELECT avg(num_moons) FROM (SELECT numberOfMoons as num_moons FROM sqlite.planets) AS subquery;", 1, 1, None), ("SELECT p.name, s.name FROM sqlite.planets p LEFT OUTER JOIN sqlite.satellites s ON p.id = s.planetId;", 179, 2, None), ("SELECT A.name, B.name FROM sqlite.planets A, sqlite.planets B WHERE A.gravity = B.gravity AND A.id != B.id;", 2, 2, None), -# ("SELECT * FROM sqlite.planets p JOIN sqlite.satellites s ON p.id = s.planetId AND p.gravity > 1;", 6, 28, None), + ("SELECT * FROM sqlite.planets p JOIN sqlite.satellites s ON p.id = s.planetId WHERE p.gravity > 1;", 172, 28, None), ("SELECT planetId, COUNT(*) AS num_satellites FROM sqlite.satellites GROUP BY planetId HAVING COUNT(*) > 1;", 6, 2, None), ("SELECT * FROM sqlite.planets ORDER BY name;", 9, 20, None), ("SELECT DISTINCT name FROM sqlite.planets;", 9, 1, None), @@ -89,11 +88,90 @@ ("SELECT name FROM sqlite.planets WHERE name LIKE '____';", 1, 1, None), ("SELECT name FROM sqlite.planets WHERE name NOT LIKE '____';", 8, 1, None), # All except Mars, Earth ("SELECT name FROM sqlite.planets WHERE name ILIKE '%o';", 1, 1, None), # Pluto - ("SELECT name FROM sqlite.planets WHERE name NOT ILIKE '%o';", 8, 1, None) # All except Pluto - + ("SELECT name FROM sqlite.planets WHERE name RLIKE '^M';", 2, 1, None), # Mars, Mercury + ("SELECT name FROM sqlite.planets WHERE name RLIKE 'e';", 4, 1, None), # Earth, Jupiter, Neptune, Mercury, Venus + ("SELECT name FROM sqlite.planets WHERE name RLIKE '^.a';", 3, 1, None), # Mars, Saturn, Uranus + ("SELECT name FROM sqlite.planets WHERE name RLIKE '^.{4}$';", 1, 1, None), # Mars + ("SELECT name FROM sqlite.planets WHERE name RLIKE 't$';", 0, 1, None), + ("SELECT name FROM sqlite.planets WHERE name RLIKE 'o$';", 1, 1, None), # Pluto + ("SELECT name FROM sqlite.planets WHERE name NOT RLIKE 'o$';", 8, 1, None), # All except Pluto + ("SELECT COUNT(DISTINCT name) FROM sqlite.planets;", 1, 1, None), + ("SELECT name FROM sqlite.planets WHERE id NOT IN (1, 2, 3);", 6, 1, None), + ("SELECT COUNT(*) 
FROM sqlite.satellites WHERE planetId = 1;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 2;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 3;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 4;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 5;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 6;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 7;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 8;", 1, 1, None), + ("SELECT COUNT(*) FROM sqlite.satellites WHERE planetId = 9;", 1, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE user_verified is true;", 711, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE user_verified is false;", 99289, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE user_verified = true;", 711, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE user_verified = false;", 99289, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%happy%';", 1174, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%sad%';", 697, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%excited%';", 280, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%angry%';", 147, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%bored%';", 102, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%tired%';", 359, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%hungry%';", 74, 1, None), + ("SELECT user_name FROM sqlite_tweets.tweets WHERE text LIKE '%thirsty%';", 10, 1, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 2 AND 5;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 2 AND 5;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE name IN ('Earth', 'Mars');", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT IN ('Earth', 'Mars');", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE name LIKE 'M%';", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT LIKE 'M%';", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE name LIKE '%e%';", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT LIKE '%e%';", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE name LIKE '_a%';", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT LIKE '_a%';", 6, 20, None), + ("SELECT * FROM sqlite.planets WHERE name LIKE '____';", 1, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT LIKE '____';", 8, 20, None), + ("SELECT * FROM sqlite.planets WHERE name ILIKE 'p%';", 1, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT ILIKE 'p%';", 8, 20, None), + ("SELECT * FROM sqlite.planets WHERE name ILIKE '%U%';", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT ILIKE '%U%';", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE '^M';", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE 'e';", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE '^.a';", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE '^.{4}$';", 1, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE 't$';", 0, 20, None), + ("SELECT * FROM sqlite.planets WHERE name RLIKE 'o$';", 1, 20, None), + ("SELECT * FROM sqlite.planets WHERE name NOT RLIKE 'o$';", 8, 20, None), + ("SELECT * FROM sqlite.planets WHERE id 
> 3 AND id < 7;", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE id <= 3 OR id >= 7;", 6, 20, None), + ("SELECT * FROM sqlite.planets WHERE id = 1 OR id = 9;", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE id != 1 AND id != 9;", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE id > 1 AND id < 9;", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE id <= 1 OR id >= 9;", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE id IN (1, 3, 5, 7, 9);", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT IN (1, 3, 5, 7, 9);", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 1 AND 3;", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 1 AND 3;", 6, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 7 AND 9;", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 7 AND 9;", 6, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 4 AND 6;", 3, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 4 AND 6;", 6, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 2 AND 8;", 7, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 2 AND 8;", 2, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 1 AND 9;", 9, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 1 AND 9;", 0, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 3 AND 7;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 3 AND 7;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 5 AND 9;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 5 AND 9;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 1 AND 5;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 1 AND 5;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 2 AND 6;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 2 AND 6;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 4 AND 8;", 5, 20, None), + ("SELECT * FROM sqlite.planets WHERE id NOT BETWEEN 4 AND 8;", 4, 20, None), + ("SELECT * FROM sqlite.planets WHERE id BETWEEN 3 AND 9;", 7, 20, None), + ("SELECT user_name, name FROM sqlite_tweets.tweets JOIN sqlite.planets ON sqlite_tweets.tweets.followers = sqlite.planets.id;", 3962, 2, None), ] -# fmt: on - @pytest.mark.parametrize("statement, rows, columns, exception", STATEMENTS) def test_sql_battery(statement, rows, columns, exception): @@ -131,6 +209,7 @@ def test_sql_battery(statement, rows, columns, exception): except AssertionError as err: # pragma: no cover raise Exception(err) from err except Exception as err: # pragma: no cover + print(err) if type(err) != exception: raise Exception( f"{format_sql(statement)}\nQuery failed with error {type(err)} but error {exception} was expected" From 2fbcf1493045e8f9bbf9f5225b8baffc4ab2cc35 Mon Sep 17 00:00:00 2001 From: XB500 Date: Thu, 19 Dec 2024 22:45:19 +0000 Subject: [PATCH 071/157] Opteryx Version 0.19.0-alpha.893 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index f89810d1a..f07a1a32e 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 892 +__build__ = 893 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
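The next patch tightens the quarter branch of date_trunc in opteryx/utils/dates.py. As a plain-Python sketch of the arithmetic it lands on (standalone code for illustration; quarter_start is a name invented here, not a function in the Opteryx codebase):

import datetime


def quarter_start(date_value: datetime.datetime) -> datetime.datetime:
    # (month - 1) // 3 + 1 maps months 1..12 onto quarters 1..4
    quarter = (date_value.month - 1) // 3 + 1
    # 3 * (quarter - 1) + 1 maps quarters 1..4 onto months 1, 4, 7, 10;
    # 3 * quarter - 2 is algebraically identical, the rewrite just reads
    # more clearly as "first month of the quarter"
    return datetime.datetime(date_value.year, 3 * (quarter - 1) + 1, 1, tzinfo=date_value.tzinfo)


# 5 November 2012 falls in Q4, which starts on 1 October
assert quarter_start(datetime.datetime(2012, 11, 5)) == datetime.datetime(2012, 10, 1)

Either form of the expression passes the new quarter tests below; the patch only changes how the intent is spelled.
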
From 4936ad7e90186e7a7dd747635bbfd5c1441db5e1 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 00:05:12 +0000 Subject: [PATCH 072/157] #2132 --- opteryx/utils/dates.py | 2 +- tests/functions/test_date_trunc.py | 34 ++++++++++++++ tests/functions/test_levenshtien.py | 16 +++++++ tests/{misc => functions}/test_mbleven.py | 0 tests/functions/test_soundex.py | 45 +++++++++++++++++-- .../test_string_functions.py | 0 tests/misc/test_cast.py | 35 ++++++++++++++- tests/misc/test_hash_table.py | 32 +++++++++++++ tests/misc/test_node.py | 17 ++++--- 9 files changed, 170 insertions(+), 11 deletions(-) rename tests/{misc => functions}/test_mbleven.py (100%) rename tests/{misc => functions}/test_string_functions.py (100%) diff --git a/opteryx/utils/dates.py b/opteryx/utils/dates.py index 0117edf9a..f59dcff6b 100644 --- a/opteryx/utils/dates.py +++ b/opteryx/utils/dates.py @@ -212,7 +212,7 @@ def date_trunc(truncate_to, date_value): return datetime.datetime(date_value.year, 1, 1, tzinfo=date_value.tzinfo) elif truncate_to == "quarter": quarter = (date_value.month - 1) // 3 + 1 - return datetime.datetime(date_value.year, 3 * quarter - 2, 1, tzinfo=date_value.tzinfo) + return datetime.datetime(date_value.year, 3 * (quarter - 1) + 1, 1, tzinfo=date_value.tzinfo) elif truncate_to == "month": return datetime.datetime(date_value.year, date_value.month, 1, tzinfo=date_value.tzinfo) elif truncate_to == "week": diff --git a/tests/functions/test_date_trunc.py b/tests/functions/test_date_trunc.py index 61ae09399..4af16bea8 100644 --- a/tests/functions/test_date_trunc.py +++ b/tests/functions/test_date_trunc.py @@ -60,6 +60,40 @@ def test_truncate_to_week(): dt = datetime(2012, 7, 9, 12, 14, 14, 342, timezone.utc) expected = dt.replace(hour=0, minute=0, second=0, microsecond=0) + actual = date_trunc("week", dt) + assert actual == expected + + +def test_truncate_to_quarter(): + dt = datetime(2012, 7, 12, 12, 14, 14, 342, timezone.utc) + expected = datetime(2012, 7, 1, 0, 0, 0, 0, timezone.utc) + actual = date_trunc("quarter", dt) + assert actual == expected, f"{actual}, {expected}" + + dt = datetime(2012, 1, 15, 10, 30, 45, 123, timezone.utc) + expected = datetime(2012, 1, 1, 0, 0, 0, 0, timezone.utc) + actual = date_trunc("quarter", dt) + assert actual == expected, f"{actual}, {expected}" + + dt = datetime(2012, 6, 25, 5, 20, 30, 456, timezone.utc) + expected = datetime(2012, 4, 1, 0, 0, 0, 0, timezone.utc) + actual = date_trunc("quarter", dt) + assert actual == expected, f"{actual}, {expected}" + + dt = datetime(2012, 11, 5, 23, 59, 59, 999, timezone.utc) + expected = datetime(2012, 10, 1, 0, 0, 0, 0, timezone.utc) + actual = date_trunc("quarter", dt) + assert actual == expected, f"{actual}, {expected}" + + +def test_truncate_to_decade(): + dt = datetime(2012, 7, 12, 12, 14, 14, 342, timezone.utc) + try: + date_trunc("decade", dt) + except ValueError: + pass + except Exception as e: + assert False, f"Unexpected exception: {e}" if __name__ == "__main__": # pragma: no cover diff --git a/tests/functions/test_levenshtien.py b/tests/functions/test_levenshtien.py index 6119d4e0d..4f646a94a 100644 --- a/tests/functions/test_levenshtien.py +++ b/tests/functions/test_levenshtien.py @@ -66,6 +66,22 @@ ("hello world", "helloworld", 1), # space removed ("hello world", "hello world", 1), # extra space ("hello world", " hello world", 1), # space added at the beginning + + # strings with special characters + ("hello@world", "hello#world", 1), # at replaced with hash + ("hello@world", "hello world", 1), # at replaced 
with space + ("hello@world", "hello@wor1d", 1), # l replaced with 1 + + # strings with mixed case + ("HelloWorld", "helloworld", 2), # H replaced with h + ("HelloWorld", "Helloworld", 1), # W replaced with w + ("HelloWorld", "helloworld!", 3), # H replaced with h and exclamation mark added + + # strings with unicode characters + ("hello", "héllo", 1), # e replaced with é + ("hello", "hellö", 1), # o replaced with ö + ("你好", "你号", 1), # 好 replaced with 号 + ("こんにちは", "こんばんは", 2), # こん replaced with こん and は replaced with ば ("hello world", "hello world ", 1), # space added at the end ("hello world", "h e l l o world", 4), # spaces added in between characters ] diff --git a/tests/misc/test_mbleven.py b/tests/functions/test_mbleven.py similarity index 100% rename from tests/misc/test_mbleven.py rename to tests/functions/test_mbleven.py diff --git a/tests/functions/test_soundex.py b/tests/functions/test_soundex.py index 50a8ae41f..12aa28d25 100644 --- a/tests/functions/test_soundex.py +++ b/tests/functions/test_soundex.py @@ -53,7 +53,43 @@ ('Cy', 'C000'), ('Du', 'D000'), ('Ek', 'E200'), - ('', '') + ('', ''), + ('Washington', 'W252'), + ('Jefferson', 'J162'), + ('Lincoln', 'L524'), + ('Roosevelt', 'R214'), + ('Kennedy', 'K530'), + ('Reagan', 'R250'), + ('Bush', 'B200'), + ('Clinton', 'C453'), + ('Obama', 'O150'), + ('Trump', 'T651'), + ('Biden', 'B350'), + ('Harrison', 'H625'), + ('Cleveland', 'C414'), + ('McKinley', 'M254'), + ('Coolidge', 'C432'), + ('Hoover', 'H160'), + ('Truman', 'T650'), + ('Eisenhower', 'E256'), + ('Nixon', 'N250'), + ('Ford', 'F630'), + ('Carter', 'C636'), + ('Adams', 'A352'), + ('Madison', 'M325'), + ('Monroe', 'M560'), + ('Jackson', 'J250'), + ('Polk', 'P420'), + ('Taylor', 'T460'), + ('Fillmore', 'F456'), + ('Pierce', 'P620'), + ('Buchanan', 'B250'), + ('Grant', 'G653'), + ('Hayes', 'H200'), + ('Garfield', 'G614'), + ('Arthur', 'A636'), + ('Taft', 'T130'), + ('Harding', 'H635'), ] # fmt:on @@ -67,8 +103,11 @@ def test_soundex_battery(input, result): if __name__ == "__main__": # pragma: no cover print(f"RUNNING BATTERY OF {len(TESTS)} TESTS") for str1, str2 in TESTS: - test_soundex_battery(str1, str2) - print("\033[38;2;26;185;67m.\033[0m", end="") + try: + test_soundex_battery(str1, str2) + print("\033[38;2;26;185;67m.\033[0m", end="") + except Exception as e: + print(f"Test failed for {str1} and {str2} with error: {e}") print() print("✅ okay") diff --git a/tests/misc/test_string_functions.py b/tests/functions/test_string_functions.py similarity index 100% rename from tests/misc/test_string_functions.py rename to tests/functions/test_string_functions.py diff --git a/tests/misc/test_cast.py b/tests/misc/test_cast.py index 67cc4a834..3fa142d62 100644 --- a/tests/misc/test_cast.py +++ b/tests/misc/test_cast.py @@ -137,6 +137,27 @@ ("INTEGER", " 456 ", 456), # Leading and trailing spaces ("INTEGER", "-0", 0), # Negative zero handling + # Additional test cases for DECIMAL + ("DECIMAL", "3.14 ", decimal.Decimal("3.14")), # Trailing spaces + ("DECIMAL", "0.0000000000000000001", decimal.Decimal("0.0000000000000000001")), # Very small decimal + ("DECIMAL", "-0.0", decimal.Decimal("0.0")), # Negative zero as decimal + # Additional test cases for BOOLEAN + ("BOOLEAN", "tRuE", True), # Case insensitivity + ("BOOLEAN", "FaLsE", False), # Case insensitivity + ("BOOLEAN", 2, None), # Invalid integer value + ("BOOLEAN", -1, None), # Invalid negative value + + # Additional test cases for DOUBLE + ("DOUBLE", "3,14", None), # Comma instead of a dot + ("DOUBLE", " 123.45 ", 123.45), # 
Leading and trailing spaces + ("DOUBLE", "+Infinity", float('inf')), # Explicit positive infinity + #("DOUBLE", "NaN", float("NaN")), # Not-a-Number case handling + + # Additional test cases for INTEGER + ("INTEGER", "+123", 123), # Positive sign + ("INTEGER", " 456 ", 456), # Leading and trailing spaces + ("INTEGER", "-0", 0), # Negative zero handling + # Additional test cases for DECIMAL ("DECIMAL", "3.14 ", decimal.Decimal("3.14")), # Trailing spaces ("DECIMAL", "0.0000000000000000001", decimal.Decimal("0.0000000000000000001")), # Very small decimal @@ -147,6 +168,9 @@ ("VARCHAR", " leading and trailing spaces ", " leading and trailing spaces "), # Spaces retained ("VARCHAR", "\nnewline", "\nnewline"), # Newline character in string ("VARCHAR", "\ttabbed", "\ttabbed"), # Tab character in string + ("VARCHAR", "special characters !@#$%^&*()", "special characters !@#$%^&*()"), + ("VARCHAR", b'binary string', "b'binary string'"), # Binary string to string + ("VARCHAR", b'\x00\x01\x02', "b'\\x00\\x01\\x02'"), # Binary data to string # Additional test cases for TIMESTAMP ("TIMESTAMP", "2021-02-21T12:00:00Z", datetime.datetime(2021, 2, 21, 12, 0, 0)), # UTC suffix ignored @@ -154,12 +178,22 @@ ("TIMESTAMP", "2021-02-21T12:00:00+01:00", datetime.datetime(2021, 2, 21, 12, 0, 0)), # Timezone ignored ("TIMESTAMP", "2021-02-21T12:00", datetime.datetime(2021, 2, 21, 12, 0, 0)), ("TIMESTAMP", "2021-02-21T12", None), + ("TIMESTAMP", "2021-02-21T24:00:00", None), # Invalid hour + ("TIMESTAMP", "2021-02-21T12:60:00", None), # Invalid minute + ("TIMESTAMP", "2021-02-21T12:00:60", None), # Invalid second # Additional test cases for DATE ("DATE", "2021-02-21 ", None), # Trailing space ("DATE", "0001-01-01", datetime.date(1, 1, 1)), # Very early date ("DATE", "9999-12-31", datetime.date(9999, 12, 31)), # Very late date ("DATE", "2021.02.21", None), # Dots instead of hyphens + ("DATE", "2021/02/21", None), # Invalid format + ("DATE", "21-02-2021", None), # Invalid format + ("DATE", "2021-13-01", None), # Invalid month + ("DATE", "2021-00-01", None), # Invalid month + ("DATE", "2021-02-30", None), # Invalid date + ("DATE", "2021-02-29", None), # Non-leap year date + ("DATE", "2020-02-29", datetime.date(2020, 2, 29)), # Leap year date ] @pytest.mark.parametrize("type_name, input, expected", CAST_TESTS) @@ -168,7 +202,6 @@ def test_cast(type_name, input, expected): assert result == expected, f"{type_name} cast of `{input}` failed: {result} != {expected}" - if __name__ == "__main__": passed = 0 failed = 0 diff --git a/tests/misc/test_hash_table.py b/tests/misc/test_hash_table.py index ce20f61bb..31278632a 100644 --- a/tests/misc/test_hash_table.py +++ b/tests/misc/test_hash_table.py @@ -50,6 +50,7 @@ def test_hash_join_map_multicolumn(): assert hash_table.get(hash(1) * 31 + hash('x')) == [0] assert hash_table.get(hash(2) * 31 + hash('y')) == [1] assert hash_table.get(hash(3) * 31 + hash('z')) == [2] + assert hash_table.get(hash(4) * 31 + hash('w')) == [3] def test_hash_join_map_large_dataset(): # Create a large dataset to test performance and availability @@ -65,6 +66,37 @@ def test_hash_join_map_large_dataset(): # Verify it doesn’t crash and handles the large data set assert hash_table.get(hash(99999) * 31 + hash('x')) == [99999] +def test_hash_join_map_duplicate_keys(): + # Create a pyarrow Table with duplicate keys + data = { + 'a': [1, 2, 2, 4], + 'b': ['x', 'y', 'y', 'z'] + } + table = pyarrow.table(data) + + # Run the hash join map function + hash_table = hash_join_map(table, ['a', 'b']) + + # Check for 
correct hash mappings with duplicates + assert hash_table.get(hash(1) * 31 + hash('x')) == [0] + assert hash_table.get(hash(2) * 31 + hash('y')) == [1, 2] + assert hash_table.get(hash(4) * 31 + hash('z')) == [3] + + +def test_hash_join_map_large_null_values(): + # Create a large dataset with null values + data = { + 'a': [None] * 50000 + list(range(50000)), + 'b': ['x'] * 100000 + } + table = pyarrow.table(data) + + # Run the hash join map function + hash_table = hash_join_map(table, ['a', 'b']) + + # Verify it handles the large data set with null values + assert hash_table.get(hash(49999) * 31 + hash('x')) == [99999] + assert hash_table.get(hash(None)) == [] if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests diff --git a/tests/misc/test_node.py b/tests/misc/test_node.py index 66279ccd2..77d33c583 100644 --- a/tests/misc/test_node.py +++ b/tests/misc/test_node.py @@ -16,13 +16,16 @@ def test_node_simple_usage(): def test_node_str_representation(): n = Node("", a=1, c=3) - assert str(n) == '{"node_type":"","a":1,"c":3}' - + stringified = str(n) + assert '"node_type":""' in stringified + assert '"a":1' in stringified + assert '"c":3' in stringified def test_node_properties(): n = Node("", a=1, c=3) - n_properties = n.properties - assert n_properties == {"node_type": "", "a": 1, "c": 3} + p = n.properties + p.pop("uuid", None) + assert p == {"node_type": "", "a": 1, "c": 3} def test_node_copying(): @@ -69,14 +72,16 @@ def test_node_reassign_value(): n = Node("", a=1) n.a = 2 assert n.a == 2 - assert n.properties == {"node_type": "", "a": 2} + p = n.properties + p.pop("uuid", None) + assert p == {"node_type": "", "a": 2} def test_node_node_type(): """Test setting and retrieving the node_type attribute.""" n = Node(node_type="Fruit") assert n.node_type == "Fruit" - assert n.properties == {"node_type": "Fruit"} + assert n.properties.get("node_type") == "Fruit" def test_node_copy_with_node_type(): From 3c0f8987618060e7c92b3b65c28b016659d65c5b Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 00:05:37 +0000 Subject: [PATCH 073/157] Opteryx Version 0.19.0-alpha.894 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index f07a1a32e..3d0ede4b1 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 893 +__build__ = 894 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
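The hash-table tests in the patch above all look rows up by a combined key of the form hash(a) * 31 + hash(b). A minimal pure-Python sketch of that multi-column hashing scheme (illustrative only; the real hash_join_map is a compiled Opteryx routine that operates on pyarrow Tables):

from collections import defaultdict


def hash_join_map_sketch(rows, key_columns):
    # Fold the key columns left to right as h = h * 31 + hash(value);
    # for two columns this reduces to hash(a) * 31 + hash(b), which is
    # exactly the lookup key the tests above assert against.
    mapping = defaultdict(list)
    for index, row in enumerate(rows):
        combined = 0
        for column in key_columns:
            combined = combined * 31 + hash(row[column])
        mapping[combined].append(index)
    return mapping


rows = [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 2, "b": "y"}]
lookup = hash_join_map_sketch(rows, ["a", "b"])
assert lookup[hash(1) * 31 + hash("x")] == [0]
assert lookup[hash(2) * 31 + hash("y")] == [1, 2]  # duplicate keys share a bucket

The 31 multiplier is the classic polynomial-hash constant; rows with equal key tuples always land in the same bucket, which is what makes the duplicate-key join test above work.
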
From 956969543f372a0029adae61ea5c27c6c5ddb4f1 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 00:22:47 +0000 Subject: [PATCH 074/157] #2132 --- opteryx/managers/execution/serial_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py index 9f5e03857..71c3a7cf6 100644 --- a/opteryx/managers/execution/serial_engine.py +++ b/opteryx/managers/execution/serial_engine.py @@ -36,7 +36,7 @@ def execute( # Special case handling for 'Explain' queries if isinstance(head_node, ExplainNode): - return plan.explain(head_node.analyze), ResultType.TABULAR + return explain(plan, analyze=head_node.analyze), ResultType.TABULAR # Special case handling if isinstance(head_node, SetVariableNode): From d55463ed41a7d49ea5bd1937195b4bdfe544b181 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 00:23:12 +0000 Subject: [PATCH 075/157] Opteryx Version 0.19.0-alpha.895 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 3d0ede4b1..e836f075b 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 894 +__build__ = 895 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 9cc9a8bdb7b4993d24d4c103915b8169a4e6a413 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 10:20:22 +0000 Subject: [PATCH 076/157] #2132 --- opteryx/managers/execution/serial_engine.py | 2 +- opteryx/operators/base_plan_node.py | 3 ++- opteryx/operators/outer_join_node.py | 1 + opteryx/planner/logical_planner/logical_planner.py | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py index 71c3a7cf6..0af686c54 100644 --- a/opteryx/managers/execution/serial_engine.py +++ b/opteryx/managers/execution/serial_engine.py @@ -90,7 +90,7 @@ def _inner_explain(node, depth): temp = None head_node = plan.get_exit_points()[0] query_head, _, _ = plan.ingoing_edges(head_node)[0] - results = plan.execute(query_head) + results, result_type = execute(plan, query_head) if results is not None: results_generator, _ = next(results, ([], None)) for temp in results_generator: diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index e0c19d237..4a69aea17 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -121,7 +121,8 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab # Break the loop when the generator is exhausted break except Exception as err: - print(f"Exception {err} in operator", self.name) + # print(f"Exception {err} in operator", self.name) + raise err def sensors(self): return { diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index a633bd514..a8c70527c 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -304,6 +304,7 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: left_columns=self.left_columns, right_columns=self.right_columns, ) + yield EOS else: self.right_buffer.append(morsel) diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 64457b456..a379f4cca 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ 
b/opteryx/planner/logical_planner/logical_planner.py
@@ -844,7 +844,7 @@ def build_parm(node):
             "EXECUTE does not support USING syntax, please provide parameters in parenthesis."
         )
 
-    statement_name = statement["Execute"]["name"]["value"].upper()
+    statement_name = statement["Execute"]["name"][0]["value"].upper()
     parameters = dict(build_parm(p) for p in statement["Execute"]["parameters"])
     try:
         with open("prepared_statements.json", "r") as ps:
@@ -883,7 +883,7 @@ def build_parm(node):
         parameters=parameters,
         connection=None,
     )
-    return list(do_logical_planning_phase(parsed_statements))[0][0]
+    return do_logical_planning_phase(parsed_statements[0])[0]

From e8a2ad286e42c2087e7750c8b3c8f3ab768e9844 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 20 Dec 2024 10:20:49 +0000
Subject: [PATCH 077/157] Opteryx Version 0.19.0-alpha.896

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index e836f075b..7e0139854 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 895
+__build__ = 896
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 1f9004740c145b00dc100235154f221d5b021ba5 Mon Sep 17 00:00:00 2001
From: joocer
Date: Fri, 20 Dec 2024 11:05:20 +0000
Subject: [PATCH 078/157] #2133

---
 .github/workflows/regression_suite.yaml |   9 +-
 opteryx/managers/cache/__init__.py      |   3 +-
 opteryx/managers/cache/memcached.py     |   2 +
 opteryx/managers/cache/redis.py         |   2 +
 opteryx/managers/cache/valkey.py        | 107 ++++++++++++++++++++++++
 tests/requirements.txt                  |   1 +
 tests/storage/test_cache_valkey.py      |  52 ++++++++++++
 7 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 opteryx/managers/cache/valkey.py
 create mode 100644 tests/storage/test_cache_valkey.py

diff --git a/.github/workflows/regression_suite.yaml b/.github/workflows/regression_suite.yaml
index 99e07b5b9..dba91a53d 100644
--- a/.github/workflows/regression_suite.yaml
+++ b/.github/workflows/regression_suite.yaml
@@ -1,3 +1,10 @@
+# This is the main regression test script; it exercises the entire application.
+# Other GitHub Actions workflows test smaller subsets, mainly for OS compatibility.
+
+# This script is run on Ubuntu, the most common OS for CI/CD pipelines.
+# The script is run on Python 3.10, 3.11, and 3.12.
+# The script is run at 4:00 AM UTC every day and on every push to the repository.
+ name: Regression Suite on: @@ -74,7 +81,7 @@ jobs: DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' - EXPERIMENTAL_EXECUTION_ENGINE: 'true' + REDIS_CONNECTION: '${{ secrets.VALKEY_CONFIG }}' - name: Check Coverage run: python -m coverage report --include=opteryx/** --fail-under=90 -m diff --git a/opteryx/managers/cache/__init__.py b/opteryx/managers/cache/__init__.py index f34ca0703..5a736e2a8 100644 --- a/opteryx/managers/cache/__init__.py +++ b/opteryx/managers/cache/__init__.py @@ -2,5 +2,6 @@ from .memcached import MemcachedCache from .null_cache import NullCache from .redis import RedisCache +from .valkey import ValkeyCache -__all__ = ("CacheManager", "MemcachedCache", "NullCache", "RedisCache") +__all__ = ("CacheManager", "MemcachedCache", "NullCache", "RedisCache", "ValkeyCache") diff --git a/opteryx/managers/cache/memcached.py b/opteryx/managers/cache/memcached.py index 98fa341ad..f9980c98e 100644 --- a/opteryx/managers/cache/memcached.py +++ b/opteryx/managers/cache/memcached.py @@ -83,6 +83,8 @@ def __init__(self, **kwargs): """ self._server = _memcached_server(**kwargs) if self._server is None: + import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up memcached cache.") self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES else: self._consecutive_failures = 0 diff --git a/opteryx/managers/cache/redis.py b/opteryx/managers/cache/redis.py index dafa66540..9c828533c 100644 --- a/opteryx/managers/cache/redis.py +++ b/opteryx/managers/cache/redis.py @@ -58,6 +58,8 @@ def __init__(self, **kwargs): """ self._server = _redis_server(**kwargs) if self._server is None: + import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up redis cache.") self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES else: self._consecutive_failures = 0 diff --git a/opteryx/managers/cache/valkey.py b/opteryx/managers/cache/valkey.py new file mode 100644 index 000000000..41d5629cb --- /dev/null +++ b/opteryx/managers/cache/valkey.py @@ -0,0 +1,107 @@ +""" +This implements an interface to Valkey + +If we have 10 failures in a row, stop trying to use the cache. +""" + +import os +from typing import Union + +from orso.tools import single_item_cache + +from opteryx.exceptions import MissingDependencyError +from opteryx.managers.kvstores import BaseKeyValueStore + +MAXIMUM_CONSECUTIVE_FAILURES: int = 10 + + +@single_item_cache +def _valkey_server(**kwargs): + """ + Handling connecting to Valkey + """ + # the server must be set in the environment + valkey_config = kwargs.get("server", os.environ.get("REDIS_CONNECTION")) + + + if valkey_config is None: + return None + + + try: + import valkey # Assuming `valkey` is the client library's name + except ImportError as err: + raise MissingDependencyError(err.name) from err + + return valkey.from_url(valkey_config) # Example instantiation of the client + + +class ValkeyCache(BaseKeyValueStore): + """ + Cache object + """ + + def __init__(self, **kwargs): + """ + Parameters: + server: string (optional) + Sets the Valkey server and port (server:port). If not provided + the value will be obtained from the OS environment. 
+ """ + self._server = _valkey_server(**kwargs) + if self._server is None: + import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up valkey cache.") + self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES + else: + self._consecutive_failures = 0 + self.hits: int = 0 + self.misses: int = 0 + self.skips: int = 0 + self.errors: int = 0 + self.sets: int = 0 + + def get(self, key: bytes) -> Union[bytes, None]: + if self._consecutive_failures >= MAXIMUM_CONSECUTIVE_FAILURES: + self.skips += 1 + return None + try: + response = self._server.get(key) # Adjust based on Valkey's API + self._consecutive_failures = 0 + if response: + self.hits += 1 + return bytes(response) + except Exception as err: + self._consecutive_failures += 1 + if self._consecutive_failures >= MAXIMUM_CONSECUTIVE_FAILURES: + import datetime + + print( + f"{datetime.datetime.now()} [CACHE] Disabling remote Valkey cache due to persistent errors ({err})." + ) + self.errors += 1 + return None + + self.misses += 1 + return None + + def set(self, key: bytes, value: bytes) -> None: + if self._consecutive_failures < MAXIMUM_CONSECUTIVE_FAILURES: + try: + self._server.set(key, value) # Adjust based on Valkey's API + self.sets += 1 + except Exception as err: + # if we fail to set, stop trying + self._consecutive_failures = MAXIMUM_CONSECUTIVE_FAILURES + self.errors += 1 + import datetime + + print( + f"{datetime.datetime.now()} [CACHE] Disabling remote Valkey cache due to persistent errors ({err}) [SET]." + ) + else: + self.skips += 1 + + def __del__(self): + pass + # DEBUG: log(f"Valkey ") \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 0857b8413..0816e452e 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -13,6 +13,7 @@ pycln pandas pymemcache redis +valkey requests rich zstandard diff --git a/tests/storage/test_cache_valkey.py b/tests/storage/test_cache_valkey.py new file mode 100644 index 000000000..fe7c6f0da --- /dev/null +++ b/tests/storage/test_cache_valkey.py @@ -0,0 +1,52 @@ +""" +Test the valkey cache by executing the same query twice. The first time we ensure +the files are in the cache (they may or may not be) for the second time to definitely +'hit' the cache. 
+""" + +import os +import sys + +sys.path.insert(1, os.path.join(sys.path[0], "../..")) + +from tests.tools import is_arm, is_mac, is_windows, skip_if + + +@skip_if(is_arm() or is_windows() or is_mac()) +def test_valkey_cache(): + os.environ["OPTERYX_DEBUG"] = "1" + os.environ["MAX_LOCAL_BUFFER_CAPACITY"] = "10" + os.environ["MAX_CACHE_EVICTIONS_PER_QUERY"] = "4" + + import opteryx + from opteryx import CacheManager + from opteryx.managers.cache import ValkeyCache + from opteryx.shared import BufferPool + + cache = ValkeyCache() + opteryx.set_cache_manager(CacheManager(cache_backend=cache)) + + # read the data once, this should populate the cache if it hasn't already + cur = opteryx.query("SELECT * FROM testdata.flat.ten_files;") + stats = cur.stats + + buffer = BufferPool() + buffer.reset() + + # read the data a second time, this should hit the cache + cur = opteryx.query("SELECT * FROM testdata.flat.ten_files;") + + assert cache.hits > 0, cache.hits + assert cache.misses < 12 + assert cache.skips == 0 + assert cache.errors == 0 + + stats = cur.stats + assert stats.get("remote_cache_hits", 0) >= stats["blobs_read"], stats + assert stats.get("cache_misses", 0) == 0, stats + + +if __name__ == "__main__": # pragma: no cover + from tests.tools import run_tests + + run_tests() From 0aea0ef92d578113b1e529cfa94bbee7985c6786 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 11:05:49 +0000 Subject: [PATCH 079/157] Opteryx Version 0.19.0-alpha.897 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 7e0139854..ad88ea5ea 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 896 +__build__ = 897 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 53e49a545fbec5daf70eb16e889c7c68257716a5 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 13:15:14 +0000 Subject: [PATCH 080/157] #2132 --- opteryx/managers/cache/memcached.py | 1 + opteryx/managers/cache/redis.py | 1 + opteryx/managers/cache/valkey.py | 5 ++--- opteryx/operators/union_node.py | 1 + opteryx/planner/views/__init__.py | 14 +++++++++++--- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/opteryx/managers/cache/memcached.py b/opteryx/managers/cache/memcached.py index f9980c98e..01d4db1e9 100644 --- a/opteryx/managers/cache/memcached.py +++ b/opteryx/managers/cache/memcached.py @@ -84,6 +84,7 @@ def __init__(self, **kwargs): self._server = _memcached_server(**kwargs) if self._server is None: import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up memcached cache.") self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES else: diff --git a/opteryx/managers/cache/redis.py b/opteryx/managers/cache/redis.py index 9c828533c..b1eeec358 100644 --- a/opteryx/managers/cache/redis.py +++ b/opteryx/managers/cache/redis.py @@ -59,6 +59,7 @@ def __init__(self, **kwargs): self._server = _redis_server(**kwargs) if self._server is None: import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up redis cache.") self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES else: diff --git a/opteryx/managers/cache/valkey.py b/opteryx/managers/cache/valkey.py index 41d5629cb..b82e6c4b2 100644 --- a/opteryx/managers/cache/valkey.py +++ b/opteryx/managers/cache/valkey.py @@ -23,11 +23,9 @@ def _valkey_server(**kwargs): # the server must be set in the environment valkey_config = kwargs.get("server", os.environ.get("REDIS_CONNECTION")) - if valkey_config is None: return None - try: import valkey # Assuming `valkey` is the client library's name except ImportError as err: @@ -51,6 +49,7 @@ def __init__(self, **kwargs): self._server = _valkey_server(**kwargs) if self._server is None: import datetime + print(f"{datetime.datetime.now()} [CACHE] Unable to set up valkey cache.") self._consecutive_failures: int = MAXIMUM_CONSECUTIVE_FAILURES else: @@ -104,4 +103,4 @@ def set(self, key: bytes, value: bytes) -> None: def __del__(self): pass - # DEBUG: log(f"Valkey ") \ No newline at end of file + # DEBUG: log(f"Valkey ") diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index 0eca16b66..cd5f5c374 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -55,6 +55,7 @@ def execute(self, morsel: Table, **kwargs) -> Table: elif morsel == EOS: self.seen_first_eos = True yield None + return elif self.schema is None: self.schema = morsel.schema diff --git a/opteryx/planner/views/__init__.py b/opteryx/planner/views/__init__.py index 9bc17844e..ffe66abab 100644 --- a/opteryx/planner/views/__init__.py +++ b/opteryx/planner/views/__init__.py @@ -12,6 +12,8 @@ import orjson +from opteryx.exceptions import DatasetNotFoundError + def _load_views(): try: @@ -28,23 +30,29 @@ def _load_views(): def is_view(view_name: str) -> bool: + """Check if a view exists.""" return view_name in VIEWS -def view_as_plan(view_name: str): +def view_as_plan(view_name: str) -> dict: + """Return the logical plan for a view.""" from opteryx.planner.logical_planner import do_logical_planning_phase from opteryx.third_party import sqloxide from opteryx.utils.sql import clean_statement from opteryx.utils.sql import remove_comments - operation = VIEWS.get(view_name)["statement"] + if not is_view(view_name): + raise 
DatasetNotFoundError(view_name) + + operation = view_as_sql(view_name) clean_sql = clean_statement(remove_comments(operation)) parsed_statements = sqloxide.parse_sql(clean_sql, dialect="mysql") - logical_plan, _, _ = next(do_logical_planning_phase(parsed_statements)) + logical_plan, _, _ = do_logical_planning_phase(parsed_statements[0]) return logical_plan def view_as_sql(view_name: str): + """Return the SQL statement for a view.""" return VIEWS.get(view_name)["statement"] From 84feded9f6b8ca7d7602f4bb21f36cadb97a5173 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 13:15:39 +0000 Subject: [PATCH 081/157] Opteryx Version 0.19.0-alpha.898 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index ad88ea5ea..1bf0b8b21 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 897 +__build__ = 898 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 58aa6213d1f46e3079cd1a426ae84fbe822ab8fd Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 15:06:47 +0000 Subject: [PATCH 082/157] #2132 --- opteryx/managers/execution/serial_engine.py | 3 ++- opteryx/operators/base_plan_node.py | 1 + opteryx/operators/cross_join_node.py | 9 ++++--- opteryx/operators/filter_node.py | 2 +- opteryx/operators/inner_join_node_single.py | 26 ++++++--------------- opteryx/operators/outer_join_node.py | 3 +++ 6 files changed, 18 insertions(+), 26 deletions(-) diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py index 0af686c54..d88bf458c 100644 --- a/opteryx/managers/execution/serial_engine.py +++ b/opteryx/managers/execution/serial_engine.py @@ -51,7 +51,8 @@ def inner_execute(plan: PhysicalPlan) -> Generator: pump_nodes = [(nid, node) for nid, node in plan.depth_first_search_flat() if node.is_scan] for pump_nid, pump_instance in pump_nodes: for morsel in pump_instance(None, None): - yield from process_node(plan, pump_nid, morsel, None) + if morsel is not None: + yield from process_node(plan, pump_nid, morsel, None) yield from process_node(plan, pump_nid, EOS, None) return inner_execute(plan), ResultType.TABULAR diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 4a69aea17..1a6145619 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -93,6 +93,7 @@ def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: pass def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Table]: + if hasattr(morsel, "num_rows"): self.records_in += morsel.num_rows self.bytes_in += morsel.nbytes diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index abcf2bd4b..a3c302fc4 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -299,7 +299,6 @@ def __init__(self, properties: QueryProperties, **parameters): self._unnest_target.identity, } - self.stream = "left" self.left_buffer = [] self.right_buffer = [] self.left_relation = None @@ -331,7 +330,7 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if self._unnest_column is not None: if morsel == EOS: self.continue_executing = False - yield None + yield EOS return if isinstance(self._unnest_column.value, tuple): yield from _cross_join_unnest_literal( @@ -352,9 +351,8 @@ def execute(self, morsel: 
pyarrow.Table, join_leg: str) -> pyarrow.Table: ) return - if self.stream == "left": + if join_leg == "left": if morsel == EOS: - self.stream = "right" self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") self.left_buffer.clear() else: @@ -362,11 +360,12 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: yield None return - if self.stream == "right": + if join_leg == "right": if morsel == EOS: right_table = pyarrow.concat_tables(self.right_buffer, promote_options="none") # type:ignore self.right_buffer = None yield from _cross_join(self.left_relation, right_table) + yield EOS else: self.right_buffer.append(morsel) yield None diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index 81501b3c1..6228496d9 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -77,7 +77,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: ) mask = numpy.nonzero(mask)[0] - # if there's no matching rows, just drop the morsel + # if there's no matching rows, return empty morsel if mask.size > 0 and not numpy.all(mask is None): yield morsel.take(pyarrow.array(mask)) else: diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index aca5d154c..7ebda2981 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -161,19 +161,16 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinSingleNode(JoinNode): def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - self._left_columns = parameters.get("left_columns") + self.left_columns = parameters.get("left_columns") self.left_readers = parameters.get("left_readers") - self._right_columns = parameters.get("right_columns") + self.right_columns = parameters.get("right_columns") self.right_readers = parameters.get("right_readers") - self.stream = "left" self.left_buffer = [] self.left_hash = None + self.left_relation = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -188,34 +185,25 @@ def config(self): # pragma: no cover return "" def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: - if self.stream == "left": + if join_leg == "left": if morsel == EOS: - self.stream = "right" self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") self.left_buffer.clear() - - # in place until #1295 resolved - if self._left_columns[0] not in self.left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - self.left_hash = preprocess_left(self.left_relation, self._left_columns) + self.left_hash = preprocess_left(self.left_relation, self.left_columns) else: self.left_buffer.append(morsel) yield None return if morsel == EOS: - yield None + yield EOS return # do the join new_morsel = inner_join_with_preprocessed_left_side( left_relation=self.left_relation, right_relation=morsel, - join_columns=self._right_columns, + join_columns=self.right_columns, hash_table=self.left_hash, ) diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index a8c70527c..dd95bfe6d 100644 --- a/opteryx/operators/outer_join_node.py +++ 
b/opteryx/operators/outer_join_node.py @@ -290,6 +290,8 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: self.left_buffer.clear() else: self.left_buffer.append(morsel) + yield None + return if join_leg == "right": if morsel == EOS: @@ -308,6 +310,7 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: else: self.right_buffer.append(morsel) + yield None providers = { From 31d595182b7e8457783392fc9ce5484507acc3b3 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 15:07:10 +0000 Subject: [PATCH 083/157] Opteryx Version 0.19.0-alpha.899 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 1bf0b8b21..1a2f498e6 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 898 +__build__ = 899 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 5782d448aa056f7b7324c3247f91232fdd7650c4 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 16:17:23 +0000 Subject: [PATCH 084/157] #2132 --- opteryx/functions/__init__.py | 1 + opteryx/managers/expression/ops.py | 4 +-- opteryx/operators/base_plan_node.py | 1 - opteryx/planner/binder/binder.py | 6 ++++- opteryx/utils/file_decoders.py | 26 +++++++++++++------ .../test_shapes_and_errors_battery.py | 2 -- tests/storage/test_sql_duckdb.py | 1 + 7 files changed, 27 insertions(+), 14 deletions(-) diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index 1762dbd26..de0938905 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -334,6 +334,7 @@ def sleep(x): "TRUNCATE": "TRUNC", # deprecated, remove 0.19.0 "LIST_CONTAINS_ANY": "ARRAY_CONTAINS_ANY", # deprecated, remove 0.20.0 "LIST_CONTAINS_ALL": "ARRAY_CONTAINS_ALL", # deprecated, remove 0.20.0 + "STRUCT": None, # deprecated, remove 0.21.0 } # fmt:off diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index 70ffaa27a..2801462ef 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -247,10 +247,10 @@ def _inner_filter_operations(arr, operator, value): return numpy.invert(out) if operator == "AtQuestion": - element = value[0] - import simdjson + element = value[0] + parser = simdjson.Parser() if not element.startswith("$."): diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 1a6145619..4a69aea17 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -93,7 +93,6 @@ def execute(self, morsel: pyarrow.Table) -> Optional[pyarrow.Table]: # pragma: pass def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Table]: - if hasattr(morsel, "num_rows"): self.records_in += morsel.num_rows self.bytes_in += morsel.nbytes diff --git a/opteryx/planner/binder/binder.py b/opteryx/planner/binder/binder.py index a78c8b2a3..fa079815b 100644 --- a/opteryx/planner/binder/binder.py +++ b/opteryx/planner/binder/binder.py @@ -283,7 +283,11 @@ def inner_binder(node: Node, context: BindingContext) -> Tuple[Node, Any]: if node.value in DEPRECATED_FUNCTIONS: import warnings - message = f"Function '{node.value}' is deprecated and will be removed in a future version. Use '{DEPRECATED_FUNCTIONS[node.value]}' instead." 
+ replacement = DEPRECATED_FUNCTIONS[node.value] + if replacement is not None: + message = f"Function '{node.value}' is deprecated and will be removed in a future version. Use '{DEPRECATED_FUNCTIONS[node.value]}' instead." + else: + message = f"Function '{node.value}' is deprecated and will be removed in a future version." context.statistics.add_message(message) warnings.warn(message, category=DeprecationWarning, stacklevel=2) diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index 5845b7a53..432490145 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -317,17 +317,27 @@ def jsonl_decoder( just_schema: bool = False, **kwargs, ) -> Tuple[int, int, pyarrow.Table]: + import orjson import pyarrow.json + import simdjson - stream: BinaryIO = None - if isinstance(buffer, memoryview): - stream = MemoryViewStream(buffer) - elif isinstance(buffer, bytes): - stream = io.BytesIO(buffer) - else: - stream = buffer + rows = [] + + if not isinstance(buffer, bytes): + buffer = buffer.read() + + parser = simdjson.Parser() + + for line in buffer.split(b"\n"): + if not line: + continue + dict_line = parser.parse(line) + rows.append( + {k: orjson.dumps(v) if isinstance(v, dict) else v for k, v in dict_line.items()} + ) + + table = pyarrow.Table.from_pylist(rows) - table = pyarrow.json.read_json(stream) schema = table.schema if just_schema: return convert_arrow_schema_to_orso_schema(schema) diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 539f2e83a..b5df832ac 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -675,10 +675,8 @@ ("SELECT cve -> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve ->> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT cve -> 'CVE_data_meta' -> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), - ("SELECT details, details->'int_field' FROM duckdb.struct_tests", 10, 2, None), ("SELECT dict @? 'list' FROM testdata.flat.struct", 6, 1, None), - ("SELECT struct(dict) @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT birth_place @? 'town' FROM $astronauts", 357, 1, None), ("SELECT dict @? '$.list' FROM testdata.flat.struct", 6, 1, None), ("SELECT cve @? 
'$.CVE_data_meta.ASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), diff --git a/tests/storage/test_sql_duckdb.py b/tests/storage/test_sql_duckdb.py index 7376088aa..396ee0260 100644 --- a/tests/storage/test_sql_duckdb.py +++ b/tests/storage/test_sql_duckdb.py @@ -49,6 +49,7 @@ ("SELECT * FROM duckdb.satellites ORDER BY name DESC;", 177, 8, None), ("SELECT * FROM duckdb.planets LIMIT 5;", 5, 20, None), ("SELECT * FROM duckdb.satellites LIMIT 10 OFFSET 5;", 10, 8, None), + ("SELECT details, details->'int_field' FROM duckdb.struct_tests", 10, 2, None), ] # fmt: on From b76310658685e129ec83263b51c66342601360f0 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 16:17:49 +0000 Subject: [PATCH 085/157] Opteryx Version 0.19.0-alpha.900 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 1a2f498e6..c20a2eb29 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 899 +__build__ = 900 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From cad26dc60237ddaf509cd40ff4a8ca516532bbdd Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 20 Dec 2024 16:35:43 +0000 Subject: [PATCH 086/157] #2132 --- opteryx/utils/file_decoders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index 432490145..58656bdaa 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -326,12 +326,10 @@ def jsonl_decoder( if not isinstance(buffer, bytes): buffer = buffer.read() - parser = simdjson.Parser() - for line in buffer.split(b"\n"): if not line: continue - dict_line = parser.parse(line) + dict_line = simdjson.Parser().parse(line) rows.append( {k: orjson.dumps(v) if isinstance(v, dict) else v for k, v in dict_line.items()} ) From 3676832d8a16655c24430080c9829ccbe9abf03e Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 20 Dec 2024 16:36:08 +0000 Subject: [PATCH 087/157] Opteryx Version 0.19.0-alpha.901 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index c20a2eb29..6aa59aef6 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 900 +__build__ = 901 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
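The one-line change in PATCH 086 above is easy to read as a de-optimization, but re-using a single simdjson Parser is only safe when each parsed document is fully materialized before the next parse: the parser recycles the buffer behind the lazy objects it returns, and recent pysimdjson releases refuse to re-parse while earlier results are still referenced. A sketch of the constraint (pysimdjson re-use semantics assumed):

    # Assumption: pysimdjson invalidates (or refuses to overwrite) a prior parse
    # result while references to it are still alive.
    import simdjson

    parser = simdjson.Parser()
    doc = parser.parse(b'{"a": 1}')
    keys = [str(k) for k in doc.keys()]  # materialize what is needed first
    del doc                              # drop the lazy proxy before re-using the parser
    doc = parser.parse(b'{"b": 2}')      # safe; a fresh Parser per line avoids this dance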
From e673bba5c2298fdd8a91aaf041bcee6be8b8c3e2 Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 21 Dec 2024 13:17:32 +0000 Subject: [PATCH 088/157] #2132 --- opteryx/managers/expression/ops.py | 8 ++++---- opteryx/operators/show_create_node.py | 1 + tests/sql_battery/test_shapes_and_errors_battery.py | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index 2801462ef..bec0a7401 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -182,7 +182,7 @@ def _inner_filter_operations(arr, operator, value): if operator == "AnyOpILike": patterns = value[0] - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) out = numpy.zeros(arr.size, dtype=bool) @@ -199,7 +199,7 @@ def _inner_filter_operations(arr, operator, value): if operator == "AnyOpLike": patterns = value[0] - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) combined_regex = re.compile(combined_regex_pattern) out = numpy.zeros(arr.size, dtype=bool) @@ -215,7 +215,7 @@ def _inner_filter_operations(arr, operator, value): if operator == "AnyOpNotLike": patterns = value[0] - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) combined_regex = re.compile(combined_regex_pattern) out = numpy.zeros(arr.size, dtype=bool) @@ -232,7 +232,7 @@ def _inner_filter_operations(arr, operator, value): if operator == "AnyOpNotILike": patterns = value[0] - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns) + combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) out = numpy.zeros(arr.size, dtype=bool) diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index 5bb5799b3..1e0a30004 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -54,6 +54,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: buffer = [{self.object_name: view_sql}] table = pyarrow.Table.from_pylist(buffer) yield table + return raise DatasetNotFoundError(self.object_name) diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index b5df832ac..a20bb3c17 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1638,7 +1638,7 @@ ("EXECUTE PLANETS_BY_ID (1)", None, None, ParameterError), # simple case) ("EXECUTE PLANETS_BY_ID (name=1)", None, None, ParameterError), # simple case) ("EXECUTE VERSION", 1, 1, None), # no paramters - ("EXECUTE VERSION()", 1, 1, SqlError), # no paramters + ("EXECUTE VERSION()", 1, 1, None), ("EXECUTE get_satellites_by_planet_name(name='Jupiter')", 67, 1, None), # string param ("EXECUTE GET_SATELLITES_BY_PLANET_NAME(name='Jupiter')", 67, 1, None), # string param ("EXECUTE multiply_two_numbers (one=1.0, two=9.9)", 1, 1, None), # multiple params @@ -1864,10 +1864,10 @@ ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 323, 2, None), ("SELECT name, missions FROM $astronauts 
WHERE missions LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 37, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 320, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ()", 0, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 37, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ()", 0, 2, SqlError), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, SqlError), + ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 34, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 323, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 0, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 3, 2, None), From 70a4c0ba5f5dd504a36d92c36fff389fa8c9a051 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 13:17:58 +0000 Subject: [PATCH 089/157] Opteryx Version 0.19.0-alpha.902 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 6aa59aef6..fe5d1764e 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 901 +__build__ = 902 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 0f097c5ab8810fbebc5c1ff8e101ae5dc749fd48 Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 21 Dec 2024 18:30:06 +0000 Subject: [PATCH 090/157] #2132 --- .github/workflows/regression_suite.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression_suite.yaml b/.github/workflows/regression_suite.yaml index dba91a53d..6a91420ed 100644 --- a/.github/workflows/regression_suite.yaml +++ b/.github/workflows/regression_suite.yaml @@ -81,7 +81,7 @@ jobs: DATA_CATALOG_PROVIDER: 'TARCHIA' DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}' TARCHIA_KEY: '${{ secrets.TARCHIA_KEY }}' - REDIS_CONNECTION: '${{ secrets.VALKEY_CONFIG }}' + VALKEY_CONNECTION: '${{ secrets.VALKEY_CONFIG }}' - name: Check Coverage run: python -m coverage report --include=opteryx/** --fail-under=90 -m From ae035aaf8a5bd7e17093759af1c29e10f818d9ba Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 18:30:29 +0000 Subject: [PATCH 091/157] Opteryx Version 0.19.0-alpha.903 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index fe5d1764e..9dce888e4 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 902 +__build__ = 903 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
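PATCH 088 above tightens `LIKE ANY` in two ways: an empty pattern list is now rejected as a SqlError, and NULL patterns are dropped before the remaining patterns are fused into one regular expression. A standalone sketch of that fusion step, with a simplified stand-in for Opteryx's `sql_like_to_regex`:

    import re

    def sql_like_to_regex(pattern: str) -> str:
        # Simplified stand-in; the real helper handles escaping and edge cases.
        return "^" + re.escape(pattern).replace("%", ".*").replace("_", ".") + "$"

    patterns = ["%Apoll%", None, "Gemini%"]  # includes a NULL pattern, as in the new tests
    combined = "|".join(sql_like_to_regex(p) for p in patterns if p)  # NULLs dropped
    regex = re.compile(combined, re.IGNORECASE)

    print([bool(regex.search(m)) for m in ("Apollo 11", "apollo 8", "Mercury 7")])
    # [True, True, False]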
From a00f1f394d90c73149aaac6563ce6df5b2f14478 Mon Sep 17 00:00:00 2001
From: joocer
Date: Sat, 21 Dec 2024 19:32:57 +0000
Subject: [PATCH 092/157] #2132

---
 opteryx/managers/permissions/__init__.py      | 21 +++++++-
 tests/misc/test_utils_paths.py                | 13 ++++-
 tests/security/test_execute_permissions.py    | 54 +++++++++++--------
 tests/security/test_row_visibility_filters.py | 15 ++++--
 tests/security/test_table_permissions.py      | 48 ++++++++++++++++-
 tests/sql_battery/test_null_semantics.py      | 25 +++++++--
 6 files changed, 144 insertions(+), 32 deletions(-)

diff --git a/opteryx/managers/permissions/__init__.py b/opteryx/managers/permissions/__init__.py
index bd8b0ec0a..28c71ddb7 100644
--- a/opteryx/managers/permissions/__init__.py
+++ b/opteryx/managers/permissions/__init__.py
@@ -37,19 +37,33 @@ def load_permissions() -> List[Dict]:
 def can_read_table(roles: Iterable[str], table: str, action: str = "READ") -> bool:
     """
-    Check if any of the provided roles have READ permission for the specified table.
+    Check if any of the provided roles have READ access to the specified table.
+
+    Callers pass the current user's roles and the table name; the function checks
+    whether any loaded permission entry names one of those roles and grants
+    access to the table.
+
+    Permission table patterns may contain wildcards, so fnmatch is used for matching.
+
+    The default role 'opteryx' has READ access to all tables.
 
     Parameters:
         roles (List[str]): A list of roles to check against permissions.
         table (str): The name of the table to check access for.
 
     Returns:
-        bool: True if any role has READ permission for the table, False otherwise.
+        bool: True if any role has READ access to the table, False otherwise.
     """
+
+    def escape_special_chars(pattern: str) -> str:
+        return pattern.replace(r"\*", "*").replace(r"\?", "?")
+
     # If no permissions are loaded, default to allowing all reads.
     if not PERMISSIONS:
         return True
 
+    table = escape_special_chars(table)
+
     for entry in PERMISSIONS:
         # Check if the permission, the role is in the provided roles,
         # and the table matches the pattern defined in the permission.
@@ -58,6 +72,9 @@ def can_read_table(roles: Iterable[str], table: str, action: str = "READ") -> bo
             and entry["role"] in roles
             and fnmatch.fnmatch(table, entry["table"])
         ):
+            # Additional check for leading dots
+            if table.startswith(".") and not entry["table"].startswith("."):
+                continue
             return True
 
     # If no matching permission is found, deny access.
diff --git a/tests/misc/test_utils_paths.py b/tests/misc/test_utils_paths.py index 19e6215a0..19c1df185 100644 --- a/tests/misc/test_utils_paths.py +++ b/tests/misc/test_utils_paths.py @@ -18,11 +18,22 @@ ("bucket/path/path/path/path/path/file.ext", ("bucket", "path/path/path/path/path", "file", ".ext"), None), ("bucket/path/file.ext", ("bucket", "path", "file", ".ext"), None), ("bucket.ext/path.ext/file.ext", ("bucket.ext", "path.ext", "file", ".ext"), None), - # can't traverse up the folder structure ("../../path/file.ext", None, ValueError), ("path/../../path/file.ext", None, ValueError), ("~/path/file.ext", None, ValueError), ("~/file.ext", None, ValueError), + ("/absolute/path/to/file.ext", ("", "absolute/path/to", "file", ".ext"), None), + ("relative/path/to/file.ext", ("relative", "path/to", "file", ".ext"), None), + ("./relative/path/to/file.ext", (".", "relative/path/to", "file", ".ext"), None), + ("../relative/path/to/file.ext", None, ValueError), + ("C:\\users\\opteryx\\file.ext", ("", "", "C:\\users\\opteryx\\file", ".ext"), None), + ("bucket/path.with.dots/file.ext", ("bucket", "path.with.dots", "file", ".ext"), None), + ("bucket/path with spaces/file.ext", ("bucket", "path with spaces", "file", ".ext"), None), + ("bucket/path_with_underscores/file.ext", ("bucket", "path_with_underscores", "file", ".ext"), None), + ("bucket/path-with-hyphens/file.ext", ("bucket", "path-with-hyphens", "file", ".ext"), None), + ("bucket/path123/file.ext", ("bucket", "path123", "file", ".ext"), None), + ("bucket/123path/file.ext", ("bucket", "123path", "file", ".ext"), None), + ] # fmt:on diff --git a/tests/security/test_execute_permissions.py b/tests/security/test_execute_permissions.py index 23a5fc3fc..432a73a58 100644 --- a/tests/security/test_execute_permissions.py +++ b/tests/security/test_execute_permissions.py @@ -37,6 +37,37 @@ def test_security_permissions_cursor(): curr.execute("EXPLAIN SELECT * FROM $planets") curr.arrow() +def test_security_permissions_invalid(): + """test edge cases for permissions""" + # empty permissions set + with pytest.raises(opteryx.exceptions.PermissionsError): + opteryx.query("SELECT * FROM $planets", permissions=set()).arrow() + + # permissions with invalid type + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions={"InvalidPermission"}).arrow() + + # permissions with mixed valid and invalid types + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions={"Query", "InvalidPermission"}).arrow() + + # permissions with empty string + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions="").arrow() + + # permissions with numeric values + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions={1, 2, 3}).arrow() + + # permissions with boolean values + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions={True, False}).arrow() + + # permissions with mixed valid and invalid types in a list + with pytest.raises(opteryx.exceptions.ProgrammingError): + opteryx.query("SELECT * FROM $planets", permissions=["Query", 123, None]).arrow() + + def test_security_permissions_query(): """test we can stop users performing some query types""" @@ -44,6 +75,8 @@ def test_security_permissions_query(): opteryx.query("EXPLAIN SELECT * FROM $planets").arrow() # shouldn't have any issues 
opteryx.query("SELECT * FROM $planets").arrow() + # None is equivalent to all permissions + opteryx.query("SELECT * FROM $planets", permissions=None).arrow() # shouldn't have any issues opteryx.query("SELECT * FROM $planets", permissions={"Query"}).arrow() @@ -59,28 +92,7 @@ def test_security_permissions_validation(): opteryx.query("SELECT * FROM $planets", permissions={"Analyze", "Execute", "Query"}).arrow() opteryx.query("SELECT * FROM $planets", permissions=["Analyze", "Execute", "Query"]).arrow() opteryx.query("SELECT * FROM $planets", permissions=("Analyze", "Execute", "Query")).arrow() - # should fail - with pytest.raises(opteryx.exceptions.ProgrammingError): - # invalid permission - opteryx.query("SELECT * FROM $planets", permissions={"Select"}).arrow() - with pytest.raises(opteryx.exceptions.ProgrammingError): - # no permissions - opteryx.query("SELECT * FROM $planets", permissions={}).arrow() - with pytest.raises(opteryx.exceptions.ProgrammingError): - # invalid permission - opteryx.query("SELECT * FROM $planets", permissions={"Query", "Select"}).arrow() - -def test_security_permissions_invalid_values(): - with pytest.raises(opteryx.exceptions.ProgrammingError): - # invalid permission - opteryx.query("SELECT * FROM $planets", permissions=[1]).arrow() - with pytest.raises(opteryx.exceptions.ProgrammingError): - # invalid permission - opteryx.query("SELECT * FROM $planets", memberships=[1]).arrow() - with pytest.raises(opteryx.exceptions.ProgrammingError): - # invalid permission - opteryx.query("SELECT * FROM $planets", user=1).arrow() if __name__ == "__main__": # pragma: no cover diff --git a/tests/security/test_row_visibility_filters.py b/tests/security/test_row_visibility_filters.py index 76e85aadd..4b016ff46 100644 --- a/tests/security/test_row_visibility_filters.py +++ b/tests/security/test_row_visibility_filters.py @@ -43,6 +43,18 @@ ("SELECT * FROM $planets p LEFT JOIN $satellites s ON p.id = s.planetId", {"$satellites": [("id", "Lt", 4)]}, (10, 28)), ("SELECT * FROM $planets p1 JOIN $planets p2 ON p1.id = p2.id", {"$planets": [("id", "Gt", 3)], "p2": [("name", "NotEq", "X")]}, (6, 40)), + + ("SELECT * FROM $planets WHERE id = 4", {"$planets": [("id", "Eq", 4)]}, (1, 20)), + ("SELECT * FROM $planets WHERE name = 'Mars'", {"$planets": [("name", "Eq", "Mars")]}, (1, 20)), + ("SELECT * FROM $planets WHERE name LIKE 'M%'", {"$planets": [("name", "Like", "M%")]}, (2, 20)), + ("SELECT * FROM $planets WHERE id > 3 AND name LIKE 'M%'", {"$planets": [("id", "Gt", 3), ("name", "Like", "M%")]}, (1, 20)), + ("SELECT * FROM $planets WHERE id < 4 OR name LIKE 'M%'", {"$planets": [("id", "Lt", 4), ("name", "Like", "M%")]}, (1, 20)), + ("SELECT * FROM $planets WHERE id = 4 AND name = 'Mars'", {"$planets": [("id", "Eq", 4), ("name", "Eq", "Mars")]}, (1, 20)), + ("SELECT * FROM $planets WHERE id = 4 OR name = 'Mars'", {"$planets": [("id", "Eq", 4), ("name", "Eq", "Mars")]}, (1, 20)), + ("SELECT * FROM $planets WHERE id = 4 AND name LIKE 'M%'", {"$planets": [("id", "Eq", 4), ("name", "Like", "M%")]}, (1, 20)), + ("SELECT * FROM $planets WHERE name LIKE 'M%'", {"$planets": [("id", "Eq", 4), ("name", "Like", "M%")]}, (1, 20)), + ("SELECT * FROM $planets WHERE id = 4", {"$planets": [("id", "Eq", 4), ("name", "NotLike", "M%")]}, (0, 20)), + ("SELECT * FROM $planets", {"$planets": [("id", "Eq", 4), ("name", "NotLike", "M%")]}, (0, 20)), ] @@ -102,6 +114,3 @@ def test_visibility_filters(sql, filters, shape): f" \033[38;2;26;185;67m{passed} passed ({(passed * 100) // (passed + failed)}%)\033[0m\n" f" 
\033[38;2;255;121;198m{failed} failed\033[0m" ) - - - diff --git a/tests/security/test_table_permissions.py b/tests/security/test_table_permissions.py index f1576c341..fe2b328d5 100644 --- a/tests/security/test_table_permissions.py +++ b/tests/security/test_table_permissions.py @@ -49,6 +49,52 @@ (["restricted", "opteryx"], "db.schema.table", True), (["restricted", "opteryx"], "opteryx.schema.deeply.nested.table", True), (["restricted", "opteryx"], "other.schema.table", True), + (["opteryx"], "", True), # Empty table name + ([], "", False), # Empty roles and table name + ([""], "", False), # Empty role and table name + (["opteryx"], " ", True), # Table name with space + ([" "], "opteryx.table1", False), # Role with space + (["opteryx"], "opteryx..table", True), # Table name with double dots + (["opteryx"], ".opteryx.table", False), # Table name starting with dot + (["opteryx"], "opteryx.table.", True), # Table name ending with dot + (["opteryx"], "opteryx..schema.table", True), # Table name with double dots in schema + (["opteryx"], "opteryx.schema..table", True), # Table name with double dots in table + (["opteryx"], "opteryx.schema.table..", True), # Table name ending with double dots + (["opteryx"], "opteryx.table_with_special_chars!@#$%^&*()", True), # Special characters in table name + (["opteryx"], "Opteryx.Table", True), # Mixed case table name + (["opteryx"], "opteryx." + "a" * 255, True), # Very long table name + (["table_with_special_chars!@#$%^&*()"], "opteryx.table1", False), # Role with special characters + (["table_with_special_chars!@#$%^&*()"], "opteryx.table1", False), # Role with special characters + (["table_with_special_chars!@#$%^&*()"], "opteryx.table_with_special_chars!@#$%^&*()", False), # Role and table with special characters + (["table_with_special_chars!@#$%^&*()"], "opteryx.table_with_underscore", False), # Role with special characters and table with underscore + (["table_with_special_chars!@#$%^&*()"], "opteryx.table-with-dash", False), # Role with special characters and table with dash + (["table_with_special_chars!@#$%^&*()"], "opteryx.table/with/slash", False), # Role with special characters and table with slash + (["table_with_special_chars!@#$%^&*()"], "opteryx.table\\with\\backslash", False), # Role with special characters and table with backslash + (["table_with_special_chars!@#$%^&*()"], "opteryx.table:with:colon", False), # Role with special characters and table with colon + (["table_with_special_chars!@#$%^&*()"], "opteryx.table;with:semicolon", False), # Role with special characters and table with semicolon + (["table_with_special_chars!@#$%^&*()"], "opteryx.table,with,comma", False), # Role with special characters and table with comma + (["table_with_special_chars!@#$%^&*()"], "opteryx.tablewith>greater>than", False), # Role with special characters and table with greater than + (["table_with_special_chars!@#$%^&*()"], "opteryx.table|with|pipe", False), # Role with special characters and table with pipe + (["table_with_special_chars!@#$%^&*()"], "opteryx.table?with?question?mark", False), # Role with special characters and table with question mark + (["table_with_special_chars!@#$%^&*()"], "opteryx.table*with*asterisk", False), # Role with special characters and table with asterisk + (["table_with_special_chars!@#$%^&*()"], "opteryx.table\"with\"double\"quote", False), # Role with special characters and table with double quote + (["table_with_special_chars!@#$%^&*()"], "opteryx.table'with'single'quote", False), # Role with special characters and 
table with single quote + (["opteryx"], "opteryx.table_with_underscore", True), # Table name with underscore + (["opteryx"], "opteryx.table-with-dash", True), # Table name with dash + (["opteryx"], "opteryx.table/with/slash", True), # Table name with slash + (["opteryx"], "opteryx.table\\with\\backslash", True), # Table name with backslash + (["opteryx"], "opteryx.table:with:colon", True), # Table name with colon + (["opteryx"], "opteryx.table;with:semicolon", True), # Table name with semicolon + (["opteryx"], "opteryx.table,with,comma", True), # Table name with comma + (["opteryx"], "opteryx.tablewith>greater>than", True), # Table name with greater than + (["opteryx"], "opteryx.table|with|pipe", True), # Table name with pipe + (["opteryx"], "opteryx.table?with?question?mark", True), # Table name with question mark + (["opteryx"], "opteryx.table*with*asterisk", True), # Table name with asterisk + (["opteryx"], "opteryx.table\"with\"double\"quote", True), # Table name with double quote + (["opteryx"], "opteryx.table'with'single'quote", True), # Table name with single quote + ] @pytest.mark.parametrize("roles, table, expected", test_cases) @@ -69,7 +115,7 @@ def test_can_read_table(roles, table, expected): for index, (roles, table, expected) in enumerate(test_cases): print( f"\033[38;2;255;184;108m{(index + 1):04}\033[0m" - f" .", + f" {', '.join(roles).ljust(35)} {table.ljust(25)}", end="", flush=True, ) diff --git a/tests/sql_battery/test_null_semantics.py b/tests/sql_battery/test_null_semantics.py index 063678e15..ad0f36be2 100644 --- a/tests/sql_battery/test_null_semantics.py +++ b/tests/sql_battery/test_null_semantics.py @@ -124,10 +124,27 @@ """ -- Query 24: Expected rows: 3 (1, -1, NULL) SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE NOT bool IS NULL; -""", {1, -1}) - - - +""", {1, -1}),( +""" +-- Query 25: SELECT * FROM tristatebooleans WHERE bool IS NULL; +-- Expected rows: 1 (NULL) +SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NULL; +""", {None}),( +""" +-- Query 26: SELECT * FROM tristatebooleans WHERE bool IS NOT TRUE; +-- Expected rows: 2 (False, NULL) +SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT TRUE; +""", {False, None}),( +""" +-- Query 27: SELECT * FROM tristatebooleans WHERE bool IS NOT FALSE; +-- Expected rows: 2 (True, NULL) +SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT FALSE; +""", {True, None}),( +""" +-- Query 28: SELECT * FROM tristatebooleans WHERE (bool IS NULL AND bool IS NOT NULL) OR (bool IS NOT NULL AND bool IS NULL) OR (bool <> bool); +-- Expected rows: 1 (NULL) +SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE (bool IS NULL AND bool IS NOT NULL) OR (bool IS NOT NULL AND bool IS NULL) OR (bool <> bool); +""", {None}) ] # fmt:on From d31ecdde27c3f652c478eed6cd91174658a5e5e9 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 19:33:24 +0000 Subject: [PATCH 093/157] Opteryx Version 0.19.0-alpha.904 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 9dce888e4..a3dc7079b 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 903 +__build__ = 904 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
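PATCH 092 above reworks the wildcard permission check and adds a large battery of edge-case tests. A self-contained re-creation of the core matching rule for illustration; the permission entries below are invented examples, while the real module loads PERMISSIONS from configuration:

    import fnmatch

    PERMISSIONS = [  # invented example entries
        {"role": "opteryx", "permission": "READ", "table": "*"},
        {"role": "restricted", "permission": "READ", "table": "opteryx.*"},
    ]

    def can_read_table(roles, table) -> bool:
        for entry in PERMISSIONS:
            if (
                entry["permission"] == "READ"
                and entry["role"] in roles
                and fnmatch.fnmatch(table, entry["table"])
            ):
                # mirror the patch: a leading-dot table only matches a leading-dot pattern
                if table.startswith(".") and not entry["table"].startswith("."):
                    continue
                return True
        return False

    print(can_read_table({"restricted"}, "opteryx.table1"))      # True
    print(can_read_table({"restricted"}, "other.schema.table"))  # False
    print(can_read_table({"opteryx"}, ".opteryx.table"))         # False (leading dot)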
From 57ad38cb9537681f9fc5734538b798e81899dd2f Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 21 Dec 2024 19:42:06 +0000 Subject: [PATCH 094/157] #2132 --- tests/sql_battery/test_null_semantics.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/sql_battery/test_null_semantics.py b/tests/sql_battery/test_null_semantics.py index ad0f36be2..6780c376d 100644 --- a/tests/sql_battery/test_null_semantics.py +++ b/tests/sql_battery/test_null_semantics.py @@ -139,12 +139,7 @@ -- Query 27: SELECT * FROM tristatebooleans WHERE bool IS NOT FALSE; -- Expected rows: 2 (True, NULL) SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT FALSE; -""", {True, None}),( -""" --- Query 28: SELECT * FROM tristatebooleans WHERE (bool IS NULL AND bool IS NOT NULL) OR (bool IS NOT NULL AND bool IS NULL) OR (bool <> bool); --- Expected rows: 1 (NULL) -SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE (bool IS NULL AND bool IS NOT NULL) OR (bool IS NOT NULL AND bool IS NULL) OR (bool <> bool); -""", {None}) +""", {True, None}), ] # fmt:on From 7d6ac99c631353a3f7aa49e51c786c52010d625f Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 19:42:29 +0000 Subject: [PATCH 095/157] Opteryx Version 0.19.0-alpha.905 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index a3dc7079b..48d2443df 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 904 +__build__ = 905 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d0bc16d464c0ee93a31d3427fa80c3af6f6afa4e Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 21 Dec 2024 21:47:50 +0000 Subject: [PATCH 096/157] #2129 --- opteryx/exceptions.py | 5 +++-- opteryx/operators/outer_join_node.py | 13 ++++++------- opteryx/planner/binder/binder.py | 8 +++++++- tests/sql_battery/test_shapes_and_errors_battery.py | 4 ++-- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/opteryx/exceptions.py b/opteryx/exceptions.py index ca4d9db36..19d518342 100644 --- a/opteryx/exceptions.py +++ b/opteryx/exceptions.py @@ -248,9 +248,10 @@ def __init__(self, dataset: str): class UnexpectedDatasetReferenceError(SqlError): """Exception raised for unexpected dataset references.""" - def __init__(self, dataset: str): + def __init__(self, dataset: str, message: Optional[str] = None): self.dataset = dataset - message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause." + if not message: + message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause." super().__init__(message) diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index dd95bfe6d..78c1665a6 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -23,11 +23,13 @@ popular SEMI and ANTI joins we leave to PyArrow for now. 
""" +from collections import deque from typing import List import pyarrow from opteryx import EOS +from opteryx.compiled.structures import HashSet from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties from opteryx.utils.arrow import align_tables @@ -52,8 +54,6 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu Returns: A pyarrow.Table containing the result of the LEFT JOIN operation. """ - from collections import deque - from opteryx.compiled.structures.hash_table import hash_join_map left_indexes: deque = deque() @@ -190,16 +190,15 @@ def left_anti_join( Returns: A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. """ - hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) + right_hash_set = set(zip(*non_null_right_values)) left_indexes = [] left_values = left_relation.select(left_columns).itercolumns() for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if not rows: # Only include left rows that have no match in the right table + if ( + value_tuple not in right_hash_set + ): # Only include left rows that have no match in the right table left_indexes.append(i) # Filter the left_chunk based on the anti join condition diff --git a/opteryx/planner/binder/binder.py b/opteryx/planner/binder/binder.py index fa079815b..d403f13ad 100644 --- a/opteryx/planner/binder/binder.py +++ b/opteryx/planner/binder/binder.py @@ -138,7 +138,13 @@ def create_variable_node(node: Node, context: BindingContext) -> Node: # if there are no candidates, we probably don't know the relation if not candidate_schemas: - raise UnexpectedDatasetReferenceError(dataset=node.source) + if node.source in context.relations: + raise UnexpectedDatasetReferenceError( + dataset=node.source, + message=f"Dataset `{node.source}` is not available after being used on the right side of a ANTI or SEMI JOIN", + ) + else: + raise UnexpectedDatasetReferenceError(dataset=node.source) # look up the column in the candidate schemas column, found_source_relation = locate_identifier_in_loaded_schemas( diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index a20bb3c17..2edfbf0d7 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1535,8 +1535,8 @@ ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE P.id > 5;", 0, 20, None), ("SELECT * FROM $planets AS P LEFT ANTI JOIN (SELECT id FROM $satellites WHERE name LIKE 'Moon%') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS C LEFT ANTI JOIN $satellites AS S ON S.id = C;", 0, 1, None), -# ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE S.size > 1000;", 0, 20, None), -# ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'Moon%';", 0, 20, None), + ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name LIKE 'E%';", 1, 20, None), + ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'E%';", 1, 20, UnexpectedDatasetReferenceError), ("SELECT * FROM $planets AS P LEFT SEMI JOIN (SELECT id FROM $satellites WHERE name != 'Moon') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * 
FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name != 'Earth';", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS G LEFT SEMI JOIN $satellites AS S ON S.id = G;", 10, 1, None), From 1fa522e2ef23e15c15e6f8fa4ab18e07be6903d9 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 21:48:13 +0000 Subject: [PATCH 097/157] Opteryx Version 0.19.0-alpha.906 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 48d2443df..ea2130aa5 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 905 +__build__ = 906 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 98226586a562066751d7cec72aca9e0b697ab964 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 22 Dec 2024 18:46:12 +0000 Subject: [PATCH 098/157] #2112 --- opteryx/operators/aggregate_and_group_node.py | 2 +- opteryx/operators/async_read_node.py | 2 -- opteryx/operators/function_dataset_node.py | 2 -- opteryx/operators/read_node.py | 2 -- .../strategies/constant_folding.py | 11 ----------- tests/sql_battery/test_shapes_and_errors_battery.py | 2 ++ 6 files changed, 3 insertions(+), 18 deletions(-) diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index 403c07f91..b6153f7bd 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -102,7 +102,7 @@ def config(self): # pragma: no cover @property def name(self): # pragma: no cover - return "Group" + return "Group By" def execute(self, morsel: pyarrow.Table, **kwargs): if morsel == EOS: diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 528d630a8..8ecb027dd 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -192,9 +192,7 @@ def execute(self, morsel, **kwargs) -> Generator: arrow_schema = morsel.schema self.statistics.blobs_read += 1 - self.records_out += morsel.num_rows self.statistics.rows_read += morsel.num_rows - self.bytes_out += morsel.nbytes yield morsel except Exception as err: diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index 5fc6c5a7a..d1b2f1130 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -142,8 +142,6 @@ def execute(self, morsel, **kwargs) -> Generator: else: table = data - self.records_out += table.num_rows - self.bytes_out += table.nbytes self.statistics.columns_read += len(table.column_names) yield table diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index 0f1fbbbc3..e9c44a871 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -221,9 +221,7 @@ def execute(self, morsel, **kwargs) -> Generator: self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock self.statistics.blobs_read += 1 - self.records_out += morsel.num_rows self.statistics.rows_read += morsel.num_rows - self.bytes_out += morsel.nbytes yield morsel start_clock = time.monotonic_ns() if morsel: diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 473d76837..242048d44 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ 
b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -125,17 +125,6 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: root.left.schema_column = root.schema_column statistics.optimization_constant_fold_reduce += 1 return root.left # anything - if ( - root.value == "Divide" - and root.right.node_type == NodeType.IDENTIFIER - and root.left.node_type == NodeType.IDENTIFIER - and root.right.schema_column.identity == root.left.schema_column.identity - ): - # anything / itself = 1 (0 is an exception) - node = build_literal_node(1, root, OrsoTypes.INTEGER) - statistics.optimization_constant_fold_reduce += 1 - node.schema_column = root.schema_column - return node if root.node_type == NodeType.COMPARISON_OPERATOR: if ( diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 2edfbf0d7..d7c04d3ca 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2180,6 +2180,8 @@ # 2059 ("SELECT g FROM generate_series(10) as g CROSS JOIN UNNEST (g) as g1", 0, 0, TypeError), ("SELECT DISTINCT l FROM (SELECT split('a b c d e f g h i j', ' ') as letters) as plet CROSS JOIN UNNEST (letters) as l", 10, 1, None), + # 2112 + ("SELECT id FROM $planets WHERE surface_pressure / surface_pressure is null", 5, 1, None), ] # fmt:on From 3b2efd9b40643b00c92956cdd3c7094886ed6f49 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 22 Dec 2024 18:46:42 +0000 Subject: [PATCH 099/157] Opteryx Version 0.19.0-alpha.909 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index ea2130aa5..616afef5f 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 906 +__build__ = 909 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
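PATCH 098 above removes the `x / x => 1` constant fold, and the new regression test (`surface_pressure / surface_pressure is null` returning 5 rows) shows why: the identity does not hold under SQL semantics, since NULL / NULL is NULL and 0 / 0 is not 1 either. A quick numeric illustration, using float NaN as a stand-in for SQL NULL:

    import numpy

    x = numpy.array([2.0, 0.0, numpy.nan])
    with numpy.errstate(invalid="ignore"):
        print(x / x)  # [ 1. nan nan] -- folding x / x to a constant 1 changes results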
From a4322cc8b59aa53586508cc3893b103252a50548 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 22 Dec 2024 19:13:12 +0000 Subject: [PATCH 100/157] #2104 --- opteryx/operators/async_read_node.py | 2 ++ opteryx/operators/function_dataset_node.py | 2 ++ opteryx/operators/read_node.py | 1 + 3 files changed, 5 insertions(+) diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 8ecb027dd..4d1db3548 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -168,6 +168,7 @@ def execute(self, morsel, **kwargs) -> Generator: # due to a read-after-free type error start = time.monotonic_ns() blob_bytes = self.pool.read_and_release(reference, zero_copy=False) + self.statistics.bytes_read += len(blob_bytes) decoded = decoder( blob_bytes, projection=self.columns, selection=self.predicates ) @@ -193,6 +194,7 @@ def execute(self, morsel, **kwargs) -> Generator: self.statistics.blobs_read += 1 self.statistics.rows_read += morsel.num_rows + self.statistics.bytes_processed += morsel.nbytes yield morsel except Exception as err: diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index d1b2f1130..d3043b8c2 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -143,5 +143,7 @@ def execute(self, morsel, **kwargs) -> Generator: table = data self.statistics.columns_read += len(table.column_names) + self.statistics.rows_read += table.num_rows + self.statistics.bytes_processed += table.nbytes yield table diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index e9c44a871..c614c1b7b 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -222,6 +222,7 @@ def execute(self, morsel, **kwargs) -> Generator: self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock self.statistics.blobs_read += 1 self.statistics.rows_read += morsel.num_rows + self.statistics.bytes_processed += morsel.nbytes yield morsel start_clock = time.monotonic_ns() if morsel: From cb275e43ac91a5451deeed98654946a154f1c8f8 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 22 Dec 2024 19:13:34 +0000 Subject: [PATCH 101/157] Opteryx Version 0.19.0-alpha.910 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 616afef5f..a17ea2163 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 909 +__build__ = 910 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 48add128f04019983d4946c072060a8d18ef85fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 09:11:39 +0000 Subject: [PATCH 102/157] Update sqlparser requirement from 0.52.0 to 0.53.0 Updates the requirements on [sqlparser](https://github.com/apache/datafusion-sqlparser-rs) to permit the latest version. - [Changelog](https://github.com/apache/datafusion-sqlparser-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/apache/datafusion-sqlparser-rs/compare/v0.52.0...v0.53.0) --- updated-dependencies: - dependency-name: sqlparser dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 86a558dfb..379a5444e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,5 +17,5 @@ version = "0.23" features = ["extension-module"] [dependencies.sqlparser] -version = "0.52.0" +version = "0.53.0" features = ["serde", "visitor"] \ No newline at end of file From 744d160e968f2c8662c9137f844547e7fbf37055 Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 23 Dec 2024 12:26:55 +0000 Subject: [PATCH 103/157] HOUSEKEEPING --- opteryx/config.py | 3 - opteryx/connectors/aws_s3_connector.py | 10 --- opteryx/connectors/base/base_connector.py | 2 +- opteryx/connectors/capabilities/cacheable.py | 8 +- opteryx/cursor.py | 24 +++--- opteryx/managers/cache/memcached.py | 2 +- opteryx/managers/cache/redis.py | 6 +- opteryx/managers/cache/valkey.py | 6 +- opteryx/managers/catalog/tarchia_provider.py | 2 +- opteryx/models/logical_column.py | 13 ---- opteryx/operators/base_plan_node.py | 7 -- .../planner/cost_based_optimizer/__init__.py | 2 +- .../strategies/constant_folding.py | 2 +- .../logical_planner/logical_planner.py | 32 +------- opteryx/planner/physical_planner.py | 10 --- opteryx/planner/views/__init__.py | 2 - opteryx/shared/__init__.py | 9 +-- opteryx/shared/rolling_log.py | 73 ------------------ opteryx/utils/__init__.py | 16 +--- opteryx/utils/arrow.py | 2 +- .../test_sql_fuzzer_single_table_select.py | 5 +- tests/misc/test_connection.py | 1 + tests/misc/test_cursor.py | 9 +++ .../test_optimizations_invoked.py | 4 +- .../test_shapes_and_errors_battery.py | 3 + tests/sql_battery/tests/system.run_tests | 1 + tests/sql_battery/tests/v2_planner.run_tests | 77 ++++++++++++++++++- tests/storage/test_cache_memcached.py | 10 +++ tests/storage/test_cache_redis.py | 18 +++++ tests/storage/test_cache_valkey.py | 17 ++++ 30 files changed, 175 insertions(+), 201 deletions(-) delete mode 100644 opteryx/shared/rolling_log.py diff --git a/opteryx/config.py b/opteryx/config.py index aeb1a7878..c2ad0d5bf 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -183,9 +183,6 @@ def get(key: str, default: Optional[typing.Any] = None) -> Optional[typing.Any]: ENABLE_RESOURCE_LOGGING: bool = bool(get("ENABLE_RESOURCE_LOGGING", False)) # size of morsels to push between steps MORSEL_SIZE: int = int(get("MORSEL_SIZE", 64 * 1024 * 1024)) -# query log -QUERY_LOG_LOCATION:str = get("QUERY_LOG_LOCATION", False) -QUERY_LOG_SIZE:int = int(get("QUERY_LOG_SIZE", 100)) # not GA PROFILE_LOCATION:str = get("PROFILE_LOCATION") # fmt:on diff --git a/opteryx/connectors/aws_s3_connector.py b/opteryx/connectors/aws_s3_connector.py index c185c534d..e8405dfc3 100644 --- a/opteryx/connectors/aws_s3_connector.py +++ b/opteryx/connectors/aws_s3_connector.py @@ -68,10 +68,6 @@ def __init__(self, credentials=None, **kwargs): self.minio = Minio(end_point, access_key, secret_key, secure=secure) self.dataset = self.dataset.replace(".", OS_SEP) - # we're going to cache the first blob as the schema and dataset reader - # sometimes both start here - self.cached_first_blob = None - @single_item_cache def get_list_of_blob_names(self, *, prefix: str) -> List[str]: bucket, object_path, _, _ = paths.get_parts(prefix) @@ -94,12 +90,6 @@ def read_dataset( prefix=self.dataset, ) - # Check if the first blob was cached earlier - if self.cached_first_blob is not None: - yield self.cached_first_blob # Use cached blob - blob_names = blob_names[1:] # Skip first blob - self.cached_first_blob = None - for blob_name in blob_names: 
try: decoder = get_decoder(blob_name) diff --git a/opteryx/connectors/base/base_connector.py b/opteryx/connectors/base/base_connector.py index 851505abc..af3e31751 100644 --- a/opteryx/connectors/base/base_connector.py +++ b/opteryx/connectors/base/base_connector.py @@ -148,7 +148,7 @@ def __next__(self) -> pyarrow.Table: # pragma: no cover """ raise NotImplementedError("Subclasses must implement __next__ method.") - def close(self) -> None: + def close(self) -> None: # pragma: no cover """ Close the reader and release any resources. """ diff --git a/opteryx/connectors/capabilities/cacheable.py b/opteryx/connectors/capabilities/cacheable.py index 30e170232..9535d64ec 100644 --- a/opteryx/connectors/capabilities/cacheable.py +++ b/opteryx/connectors/capabilities/cacheable.py @@ -88,7 +88,7 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): remote_cache.touch(key) # help the remote cache track LRU statistics.bufferpool_hits += 1 read_buffer_ref = await pool.commit(payload) # type: ignore - while read_buffer_ref is None: + while read_buffer_ref is None: # pragma: no cover await asyncio.sleep(0.1) statistics.stalls_writing_to_read_buffer += 1 read_buffer_ref = await pool.commit(payload) # type: ignore @@ -103,7 +103,7 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): statistics.remote_cache_hits += 1 system_statistics.remote_cache_reads += 1 read_buffer_ref = await pool.commit(payload) # type: ignore - while read_buffer_ref is None: + while read_buffer_ref is None: # pragma: no cover await asyncio.sleep(0.1) statistics.stalls_writing_to_read_buffer += 1 read_buffer_ref = await pool.commit(payload) # type: ignore @@ -119,7 +119,7 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): statistics.cache_misses += 1 system_statistics.origin_reads += 1 return read_buffer_ref - except Exception as e: + except Exception as e: # pragma: no cover print(f"Error in {func.__name__}: {e}") raise # Optionally re-raise the error after logging it @@ -136,7 +136,7 @@ async def wrapper(blob_name: str, statistics, pool: MemoryPool, **kwargs): ): # if we didn't get it from the buffer pool (origin or remote cache) we add it evicted = buffer_pool.set(key, payload) - if evicted: + if evicted: # pragma: no cover # if we're evicting items we just put in the cache, stop if evicted in my_keys: evictions_remaining = 0 diff --git a/opteryx/cursor.py b/opteryx/cursor.py index 188e0ab49..1216015dc 100644 --- a/opteryx/cursor.py +++ b/opteryx/cursor.py @@ -40,17 +40,9 @@ from opteryx.exceptions import SqlError from opteryx.exceptions import UnsupportedSyntaxError from opteryx.models import QueryStatistics -from opteryx.shared.rolling_log import RollingLog from opteryx.utils import sql PROFILE_LOCATION = config.PROFILE_LOCATION -QUERY_LOG_LOCATION = config.QUERY_LOG_LOCATION -QUERY_LOG_SIZE = config.QUERY_LOG_SIZE - - -ROLLING_LOG = None -if QUERY_LOG_LOCATION: - ROLLING_LOG = RollingLog(QUERY_LOG_LOCATION, max_entries=QUERY_LOG_SIZE) class CursorState(Enum): @@ -191,9 +183,6 @@ def _inner_execute( except RuntimeError as err: # pragma: no cover raise SqlError(f"Error Executing SQL Statement ({err})") from err - if ROLLING_LOG: - ROLLING_LOG.append(operation) - results = execute(plan, statistics=self._statistics) start = time.time_ns() @@ -337,6 +326,19 @@ def execute_to_arrow( results = self._execute_statements(operation, params, visibility_filters) if results is not None: result_data, self._result_type = results + + if self._result_type == 
ResultType.NON_TABULAR: + import orso + + meta_dataframe = orso.DataFrame( + rows=[(result_data.record_count,)], # type: ignore + schema=RelationSchema( + name="table", + columns=[FlatColumn(name="rows_affected", type=OrsoTypes.INTEGER)], + ), + ) # type: ignore + return meta_dataframe.arrow() + if limit is not None: result_data = utils.arrow.limit_records(result_data, limit) # type: ignore if isinstance(result_data, pyarrow.Table): diff --git a/opteryx/managers/cache/memcached.py b/opteryx/managers/cache/memcached.py index 01d4db1e9..e5761678b 100644 --- a/opteryx/managers/cache/memcached.py +++ b/opteryx/managers/cache/memcached.py @@ -49,7 +49,7 @@ def _memcached_server(**kwargs): try: from pymemcache.client import base - except ImportError as err: + except ImportError as err: # pragma: no cover raise MissingDependencyError(err.name) from err try: diff --git a/opteryx/managers/cache/redis.py b/opteryx/managers/cache/redis.py index b1eeec358..e80b707ea 100644 --- a/opteryx/managers/cache/redis.py +++ b/opteryx/managers/cache/redis.py @@ -38,7 +38,7 @@ def _redis_server(**kwargs): try: import redis - except ImportError as err: + except ImportError as err: # pragma: no cover raise MissingDependencyError(err.name) from err return redis.from_url(redis_config) @@ -80,7 +80,7 @@ def get(self, key: bytes) -> Union[bytes, None]: if response: self.hits += 1 return bytes(response) - except Exception as err: + except Exception as err: # pragma: no cover self._consecutive_failures += 1 if self._consecutive_failures >= MAXIMUM_CONSECUTIVE_FAILURES: import datetime @@ -99,7 +99,7 @@ def set(self, key: bytes, value: bytes) -> None: try: self._server.set(key, value) self.sets += 1 - except Exception as err: + except Exception as err: # pragma: no cover # if we fail to set, stop trying self._consecutive_failures = MAXIMUM_CONSECUTIVE_FAILURES self.errors += 1 diff --git a/opteryx/managers/cache/valkey.py b/opteryx/managers/cache/valkey.py index b82e6c4b2..36f1330fc 100644 --- a/opteryx/managers/cache/valkey.py +++ b/opteryx/managers/cache/valkey.py @@ -28,7 +28,7 @@ def _valkey_server(**kwargs): try: import valkey # Assuming `valkey` is the client library's name - except ImportError as err: + except ImportError as err: # pragma: no cover raise MissingDependencyError(err.name) from err return valkey.from_url(valkey_config) # Example instantiation of the client @@ -70,7 +70,7 @@ def get(self, key: bytes) -> Union[bytes, None]: if response: self.hits += 1 return bytes(response) - except Exception as err: + except Exception as err: # pragma: no cover self._consecutive_failures += 1 if self._consecutive_failures >= MAXIMUM_CONSECUTIVE_FAILURES: import datetime @@ -89,7 +89,7 @@ def set(self, key: bytes, value: bytes) -> None: try: self._server.set(key, value) # Adjust based on Valkey's API self.sets += 1 - except Exception as err: + except Exception as err: # pragma: no cover # if we fail to set, stop trying self._consecutive_failures = MAXIMUM_CONSECUTIVE_FAILURES self.errors += 1 diff --git a/opteryx/managers/catalog/tarchia_provider.py b/opteryx/managers/catalog/tarchia_provider.py index 03f6e05f8..f2e25ff2e 100644 --- a/opteryx/managers/catalog/tarchia_provider.py +++ b/opteryx/managers/catalog/tarchia_provider.py @@ -44,7 +44,7 @@ def is_valid_url(url: str) -> bool: try: result = urlparse(url) return all([result.scheme, result.netloc]) - except ValueError: + except ValueError: # pragma: no cover return False diff --git a/opteryx/models/logical_column.py b/opteryx/models/logical_column.py index 
fd2ba1aa4..36fde92d7 100644 --- a/opteryx/models/logical_column.py +++ b/opteryx/models/logical_column.py @@ -85,16 +85,3 @@ def copy(self): def __repr__(self) -> str: return f"" - - def to_dict(self) -> dict: - from opteryx.utils import dataclass_to_dict - - return { - "class": "LogicalColumn", - "node_type": self.node_type.name, - "source_column": self.source_column, - "source_connector": self.source_connector, - "source": self.source, - "alias": self.alias, - "schema_column": dataclass_to_dict(self.schema_column), - } diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 4a69aea17..ea786bfd4 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -61,13 +61,6 @@ def __init__(self, *, properties, **parameters): self.records_out = 0 self.bytes_out = 0 - def to_json(self) -> bytes: # pragma: no cover - import orjson - - from opteryx.utils import dataclass_to_dict - - return orjson.dumps(dataclass_to_dict(self.do)) - @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 2663c1c7d..35f0980b5 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -152,7 +152,7 @@ def do_cost_based_optimizer(plan: LogicalPlan, statistics: QueryStatistics) -> L Returns: LogicalPlan: The optimized logical plan. """ - if DISABLE_OPTIMIZER: + if DISABLE_OPTIMIZER: # pragma: no cover message = "[OPTERYX] The optimizer has been disabled, 'DISABLE_OPTIMIZER' variable is TRUE." print(message) statistics.add_message(message) diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 242048d44..7ce5e04a9 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -128,7 +128,7 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: if root.node_type == NodeType.COMPARISON_OPERATOR: if ( - root.value in ("Like", "Ilike") + root.value in ("Like", "ILike") and root.left.node_type == NodeType.IDENTIFIER and root.right.node_type == NodeType.LITERAL and root.right.value == "%" diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index a379f4cca..507c45615 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -73,7 +73,7 @@ class LogicalPlanNode(Node): def copy(self) -> "Node": return LogicalPlanNode(**super().copy().properties) - def __str__(self): + def __str__(self): # pragma: no cover try: # fmt:off node_type = self.node_type @@ -502,12 +502,6 @@ def inner_query_planner(ast_branch): if previous_step_id is not None: inner_plan.add_edge(previous_step_id, step_id) - if distinct_step.on and _projection[0].source_column == "FROM": - cols = ", ".join([format_expression(c) for c in distinct_step.on]) - raise UnsupportedSyntaxError( - f"Did you mean 'SELECT DISTINCT ON ({cols}) {cols} FROM {_projection[0].alias};'?" 
- ) - # order if _order_by: order_step = LogicalPlanNode(node_type=LogicalPlanStepType.Order) @@ -794,28 +788,6 @@ def create_node_relation(relation): return root_node, sub_plan -def analyze_query(statement) -> LogicalPlan: - root_node = "Analyze" - plan = LogicalPlan() - - from_step = LogicalPlanNode(node_type=LogicalPlanStepType.Scan) - table = statement[root_node]["table_name"] - from_step.relation = ".".join(part["value"] for part in table) - from_step.alias = from_step.relation - from_step.start_date = table[0].get("start_date") - from_step.end_date = table[0].get("end_date") - step_id = random_string() - plan.add_node(step_id, from_step) - - metadata_step = LogicalPlanNode(node_type=LogicalPlanStepType.MetadataWriter) - previous_step_id, step_id = step_id, random_string() - plan.add_node(step_id, metadata_step) - plan.add_edge(previous_step_id, step_id) - - return plan - # write manifest - - def plan_execute_query(statement) -> LogicalPlan: import orjson @@ -1112,7 +1084,7 @@ def plan_show_variables(statement): QUERY_BUILDERS = { - "Analyze": analyze_query, + # "Analyze": analyze_query, "Execute": plan_execute_query, "Explain": plan_explain, "Query": plan_query, diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 3233a6e7c..414cced93 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -95,16 +95,6 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: raise Exception(f"something unexpected happed - {node_type.name}") # fmt: on - # DEBUG: from opteryx.exceptions import InvalidInternalStateError - # DEBUG: - # DEBUG: try: - # DEBUG: config = node.to_json() - ## DEBUG: print(config) - # DEBUG: except Exception as err: - # DEBUG: message = f"Internal Error - node '{node}' unable to be serialized" - # DEBUG: print(message) - ## DEBUG: raise InvalidInternalStateError(message) - plan.add_node(nid, node) for source, destination, relation in logical_plan.edges(): diff --git a/opteryx/planner/views/__init__.py b/opteryx/planner/views/__init__.py index ffe66abab..f500a70a4 100644 --- a/opteryx/planner/views/__init__.py +++ b/opteryx/planner/views/__init__.py @@ -20,8 +20,6 @@ def _load_views(): with open("views.json", "rb") as defs: return orjson.loads(defs.read()) except Exception as err: # nosec - if not err: - pass # DEBUG:: log (f"[OPTERYX] Unable to open views definition file. {err}") return {} diff --git a/opteryx/shared/__init__.py b/opteryx/shared/__init__.py index 14e0ba35b..a5877f3ce 100644 --- a/opteryx/shared/__init__.py +++ b/opteryx/shared/__init__.py @@ -14,12 +14,5 @@ from opteryx.shared.async_memory_pool import AsyncMemoryPool from opteryx.shared.buffer_pool import BufferPool from opteryx.shared.materialized_datasets import MaterializedDatasets -from opteryx.shared.rolling_log import RollingLog -__all__ = ( - "AsyncMemoryPool", - "BufferPool", - "MaterializedDatasets", - "MemoryPool", - "RollingLog", -) +__all__ = ("AsyncMemoryPool", "BufferPool", "MaterializedDatasets", "MemoryPool") diff --git a/opteryx/shared/rolling_log.py b/opteryx/shared/rolling_log.py deleted file mode 100644 index 5230c3d46..000000000 --- a/opteryx/shared/rolling_log.py +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Write to a log with a fixed number of entries, as the log fills the older entries are removed
-from the log file.
-"""
-
-import os
-
-EIGHT_MEGABYTES: int = 8 * 1024 * 1024
-
-
-class RollingLog:
-    _instance = None
-    log_file: str = None
-    max_entries: int = 100
-    block_size: int = EIGHT_MEGABYTES
-
-    def __new__(cls, log_file: str, max_entries: int = 100, block_size: int = EIGHT_MEGABYTES):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-            cls._instance.log_file = log_file
-            cls._instance.max_entries = max_entries
-            cls._instance.block_size = block_size
-            if not os.path.exists(log_file):
-                with open(log_file, "wb"):
-                    pass
-        return cls._instance
-
-    def append(self, entry):
-        # Append the new entry
-        with open(self.log_file, "a", encoding="UTF8") as log_file:
-            log_file.write(entry + "\n")
-
-        # Check if max entries exceeded and remove the first entry if needed
-        lines = None
-        with open(self.log_file, "r", encoding="UTF8") as f:
-            lines = f.readlines()
-
-        if len(lines) > self.max_entries:
-            with open(self.log_file, "w", encoding="UTF8") as f:
-                f.writelines(lines[1:])  # Write all lines except the first
-
-    def scan(self):  # pragma: no cover
-        # open the log file in binary mode
-        with open(self.log_file, "r", encoding="UTF8") as log_file:
-            # read the current position in the circular buffer
-            while True:
-                chunk = log_file.read(self.block_size)
-                if not chunk:
-                    break
-                lines = chunk.split("\n")
-                for line in lines:
-                    if line:
-                        yield line
-
-    def tail(self, count: int = 5):
-        """return the last 'count' records"""
-        return list(self.scan())[-count:]
-
-    def head(self, count: int = 5):
-        """return the first 'count' records"""
-        return list(self.scan())[:count]
diff --git a/opteryx/utils/__init__.py b/opteryx/utils/__init__.py
index 21473f4e5..17904d0f8 100644
--- a/opteryx/utils/__init__.py
+++ b/opteryx/utils/__init__.py
@@ -11,7 +11,6 @@
 # limitations under the License.
 
 
-from enum import Enum
 from itertools import permutations
 from typing import Iterable
 from typing import Optional
@@ -47,7 +46,7 @@ def suggest_alternative(value: str, candidates: Iterable[str]) -> Optional[str]
     best_match_column = None
     best_match_score = 100  # Large number indicates no match found yet.
 
-    # Function to find the best match
+    # Function to find the best match based on Levenshtein distance
     def find_best_match(name: str):
         nonlocal best_match_column, best_match_score
         for raw, candidate in ((ca, "".join(ch for ch in ca if ch.isalnum())) for ca in candidates):
@@ -74,16 +73,3 @@ def find_best_match(name: str):
             return result
 
     return best_match_column  # Return the best match found, or None if no suitable match is found.
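For reference, `suggest_alternative` (the function kept above) normalises each candidate to its alphanumeric characters and keeps the one with the lowest edit distance. A rough sketch of that idea, with a textbook Levenshtein implementation standing in for the helper Opteryx actually uses (names and data here are illustrative):

```python
# Plain dynamic-programming Levenshtein distance; Opteryx's scoring and
# thresholds differ, this only illustrates the matching idea.
def levenshtein(a: str, b: str) -> int:
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(
                min(
                    previous[j] + 1,  # deletion
                    current[j - 1] + 1,  # insertion
                    previous[j - 1] + (ca != cb),  # substitution
                )
            )
        previous = current
    return previous[-1]

def suggest(name: str, candidates: list) -> str:
    # pick the candidate closest to the (lower-cased) input
    return min(candidates, key=lambda c: levenshtein(name.lower(), c.lower()))

print(suggest("surfacepressure", ["surface_pressure", "gravity", "density"]))
# surface_pressure
```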
- - -def dataclass_to_dict(instance): - if isinstance(instance, Enum): - return instance.name - elif hasattr(instance, "to_dict"): - return instance.to_dict() - elif hasattr(instance, "__dataclass_fields__"): - return {k: dataclass_to_dict(getattr(instance, k)) for k in instance.__dataclass_fields__} - elif isinstance(instance, (list, tuple)): - return [dataclass_to_dict(k) for k in instance] - else: - return instance diff --git a/opteryx/utils/arrow.py b/opteryx/utils/arrow.py index f8765e387..8effb11ba 100644 --- a/opteryx/utils/arrow.py +++ b/opteryx/utils/arrow.py @@ -26,7 +26,7 @@ def limit_records( morsels: Iterator[pyarrow.Table], limit: Optional[int] = None, offset: int = 0 -) -> Optional[Iterator[pyarrow.Table]]: +) -> Optional[Iterator[pyarrow.Table]]: # pragma: no cover """ Cycle over an iterable of morsels, limiting the response to a given number of records with an optional offset. diff --git a/tests/fuzzing/test_sql_fuzzer_single_table_select.py b/tests/fuzzing/test_sql_fuzzer_single_table_select.py index ec586a0b3..e0603d7a0 100644 --- a/tests/fuzzing/test_sql_fuzzer_single_table_select.py +++ b/tests/fuzzing/test_sql_fuzzer_single_table_select.py @@ -85,7 +85,10 @@ def generate_random_sql_select(columns, table): else: select_clause = "SELECT *" # Add table name - select_clause = select_clause + " FROM " + table + if random.random() < 0.1: + return f"SELECT * FROM ({generate_random_sql_select(columns, table)}) as table_{random_string(4)}" + else: + select_clause = select_clause + " FROM " + table # Generate a WHERE clause with 70% chance if random.random() < 0.7: where_clause = generate_condition(columns) diff --git a/tests/misc/test_connection.py b/tests/misc/test_connection.py index 296f2341f..69422d23d 100644 --- a/tests/misc/test_connection.py +++ b/tests/misc/test_connection.py @@ -31,6 +31,7 @@ def test_connection(): cur.close() + def test_execute(): import pandas diff --git a/tests/misc/test_cursor.py b/tests/misc/test_cursor.py index ddb86db5c..2ac6ff7a3 100644 --- a/tests/misc/test_cursor.py +++ b/tests/misc/test_cursor.py @@ -185,6 +185,15 @@ def test_execute_unsupported_syntax_error(): with pytest.raises(UnsupportedSyntaxError): cursor.execute("SELECT * FROM table; SELECT * FROM table2", params=[1]) +def test_non_tabular_result(): + cursor = setup_function() + cursor.execute("SET @name = 'tim'") + cursor.fetchall() + +def test_limit(): + cursor = setup_function() + dataset = cursor.execute_to_arrow("SELECT * FROM $planets", limit=3) + assert dataset.num_rows == 3 if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests diff --git a/tests/plan_optimization/test_optimizations_invoked.py b/tests/plan_optimization/test_optimizations_invoked.py index cda3d4c8d..a8e686582 100644 --- a/tests/plan_optimization/test_optimizations_invoked.py +++ b/tests/plan_optimization/test_optimizations_invoked.py @@ -19,7 +19,9 @@ ("SELECT * FROM $planets WHERE id = 4 + 4", "optimization_constant_fold_expression"), ("SELECT * FROM $planets WHERE id * 0 = 1", "optimization_constant_fold_reduce"), ("SELECT id ^ 1 = 1 FROM $planets LIMIT 10", "optimization_limit_pushdown"), - ("SELECT name FROM $astronauts WHERE name = 'Neil A. Armstrong'", "optimization_predicate_pushdown") + ("SELECT name FROM $astronauts WHERE name = 'Neil A. 
Armstrong'", "optimization_predicate_pushdown"), + ("SELECT name FROM $planets WHERE name LIKE '%'", "optimization_constant_fold_reduce"), # rewritten to `name is not null` + ("SELECT name FROM $planets WHERE name ILIKE '%'", "optimization_constant_fold_reduce"), # rewritten to `name is not null` ] # fmt:on diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index d7c04d3ca..08f6ed5b9 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -257,6 +257,7 @@ ("SELECT * FROM $satellites WHERE name != 'Calypso'", 176, 8, None), ("SELECT * FROM $satellites WHERE name = '********'", 0, 8, None), ("SELECT * FROM $satellites WHERE name LIKE '_a_y_s_'", 1, 8, None), + ("SELECT * FROM $satellites WHERE name LIKE 'Cal%%'", 4, 8, None), ("SELECT * FROM $satellites WHERE name LIKE 'Cal%'", 4, 8, None), ("SELECT * FROM $satellites WHERE name like 'Cal%'", 4, 8, None), ("SELECT * FROM $satellites WHERE name ILIKE '_a_y_s_'", 1, 8, None), @@ -1125,6 +1126,7 @@ ("SELECT name, SEARCH(birth_place, 'Italy') FROM $astronauts", 357, 2, None), ("SELECT name, birth_place FROM $astronauts WHERE SEARCH(birth_place, 'Italy')", 1, 2, None), ("SELECT name, birth_place FROM $astronauts WHERE SEARCH(birth_place, 'Rome')", 1, 2, None), + ("SELECT SEARCH($satellites.name, 'a') FROM $planets LEFT JOIN $satellites ON $planets.id = $satellites.planetId", 179, 1, None), ("SELECT birth_date FROM $astronauts WHERE EXTRACT(year FROM birth_date) < 1930;", 14, 1, None), ("SELECT EXTRACT(month FROM birth_date) FROM $astronauts", 357, 1, None), @@ -1474,6 +1476,7 @@ ("SELECT p1.name AS planet1_name, p2.name AS planet2_name, p3.name AS planet3_name, p4.name AS planet4_name, p5.name AS planet5_name, p6.name AS planet6_name, p7.name AS planet7_name, p8.name AS planet8_name, p9.name AS planet9_name, p10.name AS planet10_name, p1.diameter AS planet1_diameter, p2.gravity AS planet2_gravity, p3.orbitalPeriod AS planet3_orbitalPeriod, p4.numberOfMoons AS planet4_numberOfMoons, p5.meanTemperature AS planet5_meanTemperature FROM $planets p1 JOIN $planets p2 ON p1.id = p2.id JOIN $planets p3 ON p1.id = p3.id JOIN $planets p4 ON p1.id = p4.id JOIN $planets p5 ON p1.id = p5.id JOIN $planets p6 ON p1.id = p6.id JOIN $planets p7 ON p1.id = p7.id JOIN $planets p8 ON p1.id = p8.id JOIN $planets p9 ON p1.id = p9.id JOIN $planets p10 ON p1.id = p10.id WHERE p1.diameter > 10000 ORDER BY p1.name, p2.name, p3.name, p4.name, p5.name;", 6, 15, None), ("SELECT mission, LIST(name) FROM $missions INNER JOIN (SELECT * FROM $astronauts CROSS JOIN UNNEST(missions) AS mission) AS astronauts ON Mission = mission GROUP BY mission", 16, 2, None), + ("SELECT alma_matered FROM (SELECT alma_mater FROM $astronauts CROSS JOIN $satellites) AS bulked CROSS JOIN UNNEST(alma_mater) AS alma_matered", 120537, 1, None), # virtual dataset doesn't exist ("SELECT * FROM $RomanGods", None, None, DatasetNotFoundError), diff --git a/tests/sql_battery/tests/system.run_tests b/tests/sql_battery/tests/system.run_tests index d255bf720..19c4c6f9b 100644 --- a/tests/sql_battery/tests/system.run_tests +++ b/tests/sql_battery/tests/system.run_tests @@ -1,6 +1,7 @@ SELECT version(); SELECT connection_id() AS pid; SHOW VARIABLES; +SET @name = 'bob'; # SELECT * FROM information_schema.tables; # SELECT * FROM information_schema.views; \ No newline at end of file diff --git a/tests/sql_battery/tests/v2_planner.run_tests 
b/tests/sql_battery/tests/v2_planner.run_tests index f28658566..1faec341e 100644 --- a/tests/sql_battery/tests/v2_planner.run_tests +++ b/tests/sql_battery/tests/v2_planner.run_tests @@ -30,4 +30,79 @@ EXPLAIN ANALYZE FORMAT JSON SELECT * FROM $planets AS a INNER JOIN (SELECT id FR SELECT DISTINCT ON (planetId) planetId, name FROM $satellites; # CONDITIONS IN AGGREGATES -SELECT SUM(DISTINCT id ORDER BY id) FROM $planets \ No newline at end of file +SELECT SUM(DISTINCT id ORDER BY id) FROM $planets + +# INNER JOIN with FULL OUTER JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id FULL OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# INNER JOIN with LEFT OUTER JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id LEFT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# INNER JOIN with RIGHT OUTER JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id RIGHT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# INNER JOIN with NATURAL JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id NATURAL JOIN $planets AS p3; + +# INNER JOIN with LEFT ANTI JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id LEFT ANTI JOIN $planets AS p3 ON p1.id = p3.id; + +# INNER JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; + +# FULL OUTER JOIN with LEFT OUTER JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# FULL OUTER JOIN with RIGHT OUTER JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id RIGHT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# FULL OUTER JOIN with NATURAL JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id NATURAL JOIN $planets AS p3; + +# FULL OUTER JOIN with LEFT ANTI JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT ANTI JOIN $planets AS p3 ON p1.id = p3.id; + +# FULL OUTER JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT OUTER JOIN with RIGHT OUTER JOIN +SELECT * FROM $planets AS p1 LEFT OUTER JOIN $planets AS p2 ON p1.id = p2.id RIGHT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT OUTER JOIN with NATURAL JOIN +SELECT * FROM $planets AS p1 LEFT OUTER JOIN $planets AS p2 ON p1.id = p2.id NATURAL JOIN $planets AS p3; + +# LEFT OUTER JOIN with LEFT ANTI JOIN +SELECT * FROM $planets AS p1 LEFT OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT ANTI JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT OUTER JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 LEFT OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; + +# RIGHT OUTER JOIN with NATURAL JOIN +SELECT * FROM $planets AS p1 RIGHT OUTER JOIN $planets AS p2 ON p1.id = p2.id NATURAL JOIN $planets AS p3; + +# RIGHT OUTER JOIN with LEFT ANTI JOIN +SELECT * FROM $planets AS p1 RIGHT OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT ANTI JOIN $planets AS p3 ON p1.id = p3.id; + +# RIGHT OUTER JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 RIGHT OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT ANTI JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 LEFT ANTI JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; 
+ +# INNER JOIN with INNER JOIN +SELECT * FROM $planets AS p1 INNER JOIN $planets AS p2 ON p1.id = p2.id INNER JOIN $planets AS p3 ON p1.id = p3.id; + +# FULL OUTER JOIN with FULL OUTER JOIN +SELECT * FROM $planets AS p1 FULL OUTER JOIN $planets AS p2 ON p1.id = p2.id FULL OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT OUTER JOIN with LEFT OUTER JOIN +SELECT * FROM $planets AS p1 LEFT OUTER JOIN $planets AS p2 ON p1.id = p2.id LEFT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# RIGHT OUTER JOIN with RIGHT OUTER JOIN +SELECT * FROM $planets AS p1 RIGHT OUTER JOIN $planets AS p2 ON p1.id = p2.id RIGHT OUTER JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT ANTI JOIN with LEFT ANTI JOIN +SELECT * FROM $planets AS p1 LEFT ANTI JOIN $planets AS p2 ON p1.id = p2.id LEFT ANTI JOIN $planets AS p3 ON p1.id = p3.id; + +# LEFT SEMI JOIN with LEFT SEMI JOIN +SELECT * FROM $planets AS p1 LEFT SEMI JOIN $planets AS p2 ON p1.id = p2.id LEFT SEMI JOIN $planets AS p3 ON p1.id = p3.id; \ No newline at end of file diff --git a/tests/storage/test_cache_memcached.py b/tests/storage/test_cache_memcached.py index f73d6a227..8f0e52404 100644 --- a/tests/storage/test_cache_memcached.py +++ b/tests/storage/test_cache_memcached.py @@ -6,6 +6,7 @@ import os import sys +import pytest os.environ["OPTERYX_DEBUG"] = "1" @@ -128,6 +129,15 @@ def test_memcache_threaded(): assert result == load, f"Post-thread check failed: {result} != {load}" +def test_skip_on_error(): + from opteryx.managers.cache import MemcachedCache + cache = MemcachedCache() + cache.set(b"key", b"value") + assert cache.get(b"key") == b"value" + cache._consecutive_failures = 10 + assert cache.get(b"key") is None + + if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests diff --git a/tests/storage/test_cache_redis.py b/tests/storage/test_cache_redis.py index 5fdd6de2d..dc3f1f1e4 100644 --- a/tests/storage/test_cache_redis.py +++ b/tests/storage/test_cache_redis.py @@ -6,6 +6,7 @@ import os import sys +import pytest sys.path.insert(1, os.path.join(sys.path[0], "../..")) @@ -46,6 +47,23 @@ def test_redis_cache(): assert stats.get("cache_misses", 0) == 0, stats +def test_invalid_config(): + from opteryx.managers.cache import RedisCache + + with pytest.raises(Exception): + RedisCache(server="") + + v = RedisCache(server=None) + assert v._consecutive_failures == 10 + +def test_skip_on_error(): + from opteryx.managers.cache import RedisCache + cache = RedisCache() + cache.set(b"key", b"value") + assert cache.get(b"key") == b"value" + cache._consecutive_failures = 10 + assert cache.get(b"key") is None + if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests diff --git a/tests/storage/test_cache_valkey.py b/tests/storage/test_cache_valkey.py index fe7c6f0da..52ba187ab 100644 --- a/tests/storage/test_cache_valkey.py +++ b/tests/storage/test_cache_valkey.py @@ -6,6 +6,7 @@ import os import sys +import pytest sys.path.insert(1, os.path.join(sys.path[0], "../..")) @@ -45,6 +46,22 @@ def test_valkey_cache(): assert stats.get("remote_cache_hits", 0) >= stats["blobs_read"], stats assert stats.get("cache_misses", 0) == 0, stats +def test_invalid_config(): + from opteryx.managers.cache import ValkeyCache + + with pytest.raises(Exception): + ValkeyCache(server="") + + v = ValkeyCache(server=None) + assert v._consecutive_failures == 10 + +def test_skip_on_error(): + from opteryx.managers.cache import ValkeyCache + cache = ValkeyCache() + cache.set(b"key", b"value") + assert cache.get(b"key") == b"value" + 
cache._consecutive_failures = 10 + assert cache.get(b"key") is None if __name__ == "__main__": # pragma: no cover from tests.tools import run_tests From 436ee2258135e14498ab1d3097d6a769fd71b551 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 23 Dec 2024 12:27:19 +0000 Subject: [PATCH 104/157] Opteryx Version 0.19.0-alpha.912 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index a17ea2163..fed3ecee2 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 910 +__build__ = 912 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 2b34e81809cecdfb88b07a0eadedac330b02968e Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 23 Dec 2024 16:19:51 +0000 Subject: [PATCH 105/157] #2144 --- opteryx/exceptions.py | 5 +++- opteryx/functions/__init__.py | 4 +-- opteryx/functions/other_functions.py | 30 ++++++++++++++++--- .../test_shapes_and_errors_battery.py | 4 +++ 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/opteryx/exceptions.py b/opteryx/exceptions.py index 19d518342..570464bc1 100644 --- a/opteryx/exceptions.py +++ b/opteryx/exceptions.py @@ -304,6 +304,7 @@ def __init__( right_column: Optional[str] = None, left_node: Optional[Any] = None, right_node: Optional[Any] = None, + message: Optional[str] = None, ): def _format_col(_type, _node, _name): if _node.node_type == 42: @@ -317,7 +318,9 @@ def _format_col(_type, _node, _name): self.column = column self.left_column = left_column self.right_column = right_column - if self.column: + if message: + super().__init__(message) + elif self.column: super().__init__( f"Incompatible types for column '{column}': {left_type} and {right_type}" ) diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index de0938905..eecd8b0d7 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -529,8 +529,8 @@ def apply_function(function: str = None, *parameters): if null_positions.all(): return numpy.array([None] * morsel_size) - if null_positions.any(): - # if we have nulls and both columns are numpy arrays, we can speed things + if null_positions.any() and all(isinstance(arr, numpy.ndarray) for arr in parameters): + # if we have nulls and the value array is a numpy arrays, we can speed things # up by removing the nulls from the calculations, we add the rows back in # later valid_positions = ~null_positions diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index eda4da41c..6822f659d 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -18,6 +18,7 @@ import simdjson from pyarrow import compute +from opteryx.exceptions import IncompatibleTypesError from opteryx.exceptions import SqlError @@ -150,13 +151,34 @@ def null_if(col1, col2): An array where elements from col1 are replaced with None if they match the corresponding elements in col2. 
""" if isinstance(col1, pyarrow.Array): - values = values.to_numpy(False) + col1 = col1.to_numpy(False) if isinstance(col1, list): - values = numpy.array(values) + col1 = col1.array(col1) if isinstance(col2, pyarrow.Array): - values = values.to_numpy(False) + col2 = col2.to_numpy(False) if isinstance(col2, list): - values = numpy.array(values) + col2 = col2.array(col2) + + from orso.types import PYTHON_TO_ORSO_MAP + from orso.types import OrsoTypes + + def get_first_non_null_type(array): + for item in array: + if item is not None: + return PYTHON_TO_ORSO_MAP.get(type(item), OrsoTypes._MISSING_TYPE) + return OrsoTypes.NULL + + col1_type = get_first_non_null_type(col1.tolist()) + col2_type = get_first_non_null_type(col2.tolist()) + + if col1_type != col2_type: + print(col1_type, col2_type) + + raise IncompatibleTypesError( + left_type=col1_type, + right_type=col2_type, + message=f"`NULLIF` called with input arrays of different types, {col1_type} and {col2_type}.", + ) # Create a mask where elements in col1 are equal to col2 mask = col1 == col2 diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 08f6ed5b9..05811599b 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2185,6 +2185,10 @@ ("SELECT DISTINCT l FROM (SELECT split('a b c d e f g h i j', ' ') as letters) as plet CROSS JOIN UNNEST (letters) as l", 10, 1, None), # 2112 ("SELECT id FROM $planets WHERE surface_pressure / surface_pressure is null", 5, 1, None), + #2144 + ("SELECT town, LENGTH(NULLIF(town, 'Inglewood')) FROM (SELECT birth_place->'town' AS town FROM $astronauts) AS T", 357, 2, None), + ("SELECT town, LENGTH(NULLIF(town, b'Inglewood')) FROM (SELECT birth_place->>'town' AS town FROM $astronauts) AS T", 357, 2, None), + ("SELECT town, LENGTH(NULLIF(town, 'Inglewood')) FROM (SELECT birth_place->>'town' AS town FROM $astronauts) AS T", None, None, IncompatibleTypesError), ] # fmt:on From 2d578586548e68155f7ded89ea872d80a6c5e32a Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 23 Dec 2024 16:21:30 +0000 Subject: [PATCH 106/157] Opteryx Version 0.19.0-alpha.913 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index fed3ecee2..c1afc9409 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 912 +__build__ = 913 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 2ab6e3f62b3a806075a9b26965412bffd0e86955 Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 23 Dec 2024 22:38:53 +0000 Subject: [PATCH 107/157] #2146 --- opteryx/managers/expression/ops.py | 63 ++------------ opteryx/utils/sql.py | 83 +++++++++++++++++++ .../test_shapes_and_errors_battery.py | 53 +++++++++--- tests/storage/test_cache_memcached.py | 5 +- 4 files changed, 136 insertions(+), 68 deletions(-) diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index bec0a7401..b07bc40a3 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -180,71 +180,24 @@ def _inner_filter_operations(arr, operator, value): return list_ops.cython_allop_neq(arr[0], value) if operator == "AnyOpILike": - patterns = value[0] + from opteryx.utils.sql import regex_match_any - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) - combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) - - out = numpy.zeros(arr.size, dtype=bool) - for i, row in enumerate(arr): - if row is None: - out[i] = None - continue - if row.size == 0: - continue - out[i] = any(combined_regex.search(elem) for elem in row) - - return out + return regex_match_any(arr, value[0], flags=re.IGNORECASE) if operator == "AnyOpLike": - patterns = value[0] + from opteryx.utils.sql import regex_match_any - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) - combined_regex = re.compile(combined_regex_pattern) + return regex_match_any(arr, value[0]) - out = numpy.zeros(arr.size, dtype=bool) - for i, row in enumerate(arr): - if row is None: - out[i] = None - continue - if row.size == 0: - continue - out[i] = any(combined_regex.search(elem) for elem in row) - - return out if operator == "AnyOpNotLike": - patterns = value[0] - - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) - combined_regex = re.compile(combined_regex_pattern) + from opteryx.utils.sql import regex_match_any - out = numpy.zeros(arr.size, dtype=bool) - for i, row in enumerate(arr): - if row is None: - out[i] = None - continue - if row.size == 0: - continue - out[i] = any(combined_regex.search(elem) for elem in row) - - return numpy.invert(out) + return regex_match_any(arr, value[0], invert=True) if operator == "AnyOpNotILike": - patterns = value[0] - - combined_regex_pattern = r"|".join(sql_like_to_regex(p) for p in patterns if p) - combined_regex = re.compile(combined_regex_pattern, re.IGNORECASE) - - out = numpy.zeros(arr.size, dtype=bool) - for i, row in enumerate(arr): - if row is None: - out[i] = None - continue - if row.size == 0: - continue - out[i] = any(combined_regex.search(elem) for elem in row) + from opteryx.utils.sql import regex_match_any - return numpy.invert(out) + return regex_match_any(arr, value[0], flags=re.IGNORECASE, invert=True) if operator == "AtQuestion": import simdjson diff --git a/opteryx/utils/sql.py b/opteryx/utils/sql.py index b5bc459ca..125a8afe3 100644 --- a/opteryx/utils/sql.py +++ b/opteryx/utils/sql.py @@ -1,6 +1,8 @@ import re from typing import List +import numpy + ESCAPE_SPECIAL_CHARS = re.compile(r"([.^$*+?{}[\]|()\\])") @@ -114,3 +116,84 @@ def split_sql_statements(sql: str) -> List[str]: statements.append("".join(buffer).strip()) return [s for s in statements if s != ""] + + +def regex_match_any( + arr: numpy.ndarray, + patterns: List[str], + flags: int = re.NOFLAG, + invert: bool = False, +) -> numpy.ndarray: + """ + Evaluates whether each row in `arr` matches ANY of the given 
LIKE patterns. + Patterns are converted to regexes, combined, and compiled once. + + Parameters: + arr: numpy.ndarray + 1D array of rows. Each element can be: + - None + - A single string/bytes + - A list/tuple/array of strings/bytes + (all non-None elements are assumed to be the same structure). + patterns: List[str] + A list of SQL LIKE patterns. These get combined into a single regex. + flags: int, optional + Flags to pass to `re.compile()`, e.g. re.IGNORECASE for ILIKE. + + Returns: + numpy.ndarray: + A 1D object array with True, False, or None, + indicating whether each row did (or did not) match the patterns. + """ + # 1) Combine the LIKE patterns into a single compiled regex + # (Empty patterns list => empty string => matches nothing) + combined_pattern_str = r"|".join(sql_like_to_regex(p) for p in patterns if p) + # If there are no valid patterns, we build a "never match" pattern + if not combined_pattern_str: + combined_pattern_str = r"(?!x)" # Negative lookahead to never match + + combined_regex = re.compile(combined_pattern_str, flags=flags) + + # 2) Create the output array (dtype=object so we can store None/bool) + out = numpy.empty(arr.size, dtype=object) + + # 3) Determine if the array consists of single strings or lists-of-strings + first_non_none = None + for x in arr: + if x is not None: + first_non_none = x + break + + # If the entire array is None, just return all None + if first_non_none is None: + out[:] = None + return out + + single_string_mode = isinstance(first_non_none, (str, bytes)) + + # 4) Main loop + if single_string_mode: + # Single-string mode + for i, row in enumerate(arr): + if row is None: + out[i] = None + else: + # Match or not? + is_match = combined_regex.search(row) is not None + out[i] = (not is_match) if invert else is_match + else: + # Lists-of-strings mode + for i, row in enumerate(arr): + if row is None: + out[i] = None + else: + # row is assumed to be an iterable of strings/bytes + if row.size == 0: + # Probably a numpy array with zero length + is_match = False + else: + # If anything in the row matches, it's True + is_match = any(combined_regex.search(elem) for elem in row) + out[i] = (not is_match) if invert else is_match + + return out diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 05811599b..468d62528 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1857,29 +1857,60 @@ ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 'mission')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%Apoll%', 'mission')", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 357, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 323, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 357, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 323, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 323, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 323, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 323, 2, None), - ("SELECT name, missions 
FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY '%apoll%'", 334, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY '%apoll%'", 300, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%apoll%')", 334, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%apoll%')", 300, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%')", 300, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%')", 300, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', 'mission')", 300, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT ILIKE ANY ('%Apoll%', 'mission')", 300, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 37, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 320, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apoll%', 'Gemini%', 'Mercury%')", 297, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ()", 0, 2, SqlError), ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ()", 0, 2, SqlError), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', null)", 34, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 323, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('%Apoll%', null)", 300, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%aPoll%')", 0, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions ILIKE ANY ('%aPoll%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo 11')", 3, 2, None), - ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 354, 2, None), + ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 331, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 34, 2, None), ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY '%armstrong%'", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY '%armstrong%'", 1, 2, None), + ("SELECT name, 
missions FROM $astronauts WHERE name LIKE ANY ('%arms%')", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%arms%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%Arms%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 'mission')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%Armstrong%', 'mission')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY '%armstrong%'", 357, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY '%armstrong%'", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%armstrong%')", 357, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%armstrong%')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%Armstrong%')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', 'mission')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT ILIKE ANY ('%Armstrong%', 'mission')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', '%Aldrin%', '%Collins%')", 4, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', '%Aldrin%', '%Collins%')", 353, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ()", 0, 2, SqlError), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ()", 0, 2, SqlError), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', null)", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('%Armstrong%', null)", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%aRmstrong%')", 0, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name ILIKE ANY ('%aRmstrong%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('Neil A. Armstrong')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('Neil A. 
Armstrong')", 356, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%__Armstrong%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Arm__rong%')", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 123)", 1, 2, None), + ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None), + # **************************************************************************************** # These are queries which have been found to return the wrong result or not run correctly diff --git a/tests/storage/test_cache_memcached.py b/tests/storage/test_cache_memcached.py index 8f0e52404..93c1a53f3 100644 --- a/tests/storage/test_cache_memcached.py +++ b/tests/storage/test_cache_memcached.py @@ -6,7 +6,6 @@ import os import sys -import pytest os.environ["OPTERYX_DEBUG"] = "1" @@ -62,6 +61,7 @@ def test_memcached_cache(): assert stats.get("remote_cache_hits", 0) >= stats["blobs_read"], str(stats) assert stats.get("cache_misses", 0) == 0, stats +@skip_if(is_arm() or is_windows() or is_mac()) def test_memcache_stand_alone(): os.environ["OPTERYX_DEBUG"] = "1" from opteryx.managers.cache import MemcachedCache @@ -110,6 +110,7 @@ def threaded_cache_operations(cache: MemcachedCache, payloads: list): for thread in threads: thread.join() +@skip_if(is_arm() or is_windows() or is_mac()) def test_memcache_threaded(): os.environ["OPTERYX_DEBUG"] = "1" @@ -128,7 +129,7 @@ def test_memcache_threaded(): if result: assert result == load, f"Post-thread check failed: {result} != {load}" - +@skip_if(is_arm() or is_windows() or is_mac()) def test_skip_on_error(): from opteryx.managers.cache import MemcachedCache cache = MemcachedCache() From 6310f225c075cc358c267de9a2e90d86c4341990 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 23 Dec 2024 22:39:28 +0000 Subject: [PATCH 108/157] Opteryx Version 0.19.0-alpha.914 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index fed3ecee2..48172f824 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 912 +__build__ = 914 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
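Patch 107 (#2146) above collapses the four `LIKE ANY` / `ILIKE ANY` branches into the shared `regex_match_any`, which translates each LIKE pattern to a regex, joins them with `|`, and compiles the union once per call; `re.IGNORECASE` gives the ILIKE variants and `invert=True` the NOT forms. A simplified sketch of that approach (`like_to_regex` below is a stand-in for the real `sql_like_to_regex` helper):

```python
import re

def like_to_regex(pattern: str) -> str:
    escaped = re.escape(pattern)          # protect regex metacharacters
    escaped = escaped.replace("%", ".*")  # SQL '%' -> any run of characters
    escaped = escaped.replace("_", ".")   # SQL '_' -> any single character
    return f"^{escaped}$"

def match_any(values, patterns, flags=0, invert=False):
    # one combined, pre-compiled regex instead of one regex per pattern
    combined = re.compile("|".join(like_to_regex(p) for p in patterns), flags)
    out = []
    for value in values:
        if value is None:
            out.append(None)  # NULL rows stay NULL
        else:
            hit = combined.search(value) is not None
            out.append(not hit if invert else hit)
    return out

names = ["Neil A. Armstrong", "Buzz Aldrin", None]
print(match_any(names, ["%armstrong%", "%aldrin%"], flags=re.IGNORECASE))
# [True, True, None]
```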
From 3d9e65ac59cb03d01bb9f0be42d9d12f4634a87a Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 23 Dec 2024 22:40:26 +0000 Subject: [PATCH 109/157] Opteryx Version 0.19.0-alpha.915 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 48172f824..0595e543b 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 914 +__build__ = 915 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 501644cf8bf7d93c1b28998f7858934574d9ae86 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 24 Dec 2024 01:10:36 +0000 Subject: [PATCH 110/157] #2149 --- opteryx/models/physical_plan.py | 4 + opteryx/operators/__init__.py | 2 +- opteryx/operators/filter_join_node.py | 154 ++++++++++++++++++ opteryx/operators/inner_join_node.py | 10 +- opteryx/operators/inner_join_node_single.py | 3 + opteryx/operators/outer_join_node.py | 85 +--------- opteryx/operators/pyarrow_join_node.py | 119 -------------- opteryx/planner/binder/binder_visitor.py | 11 +- .../logical_planner/logical_planner.py | 6 + opteryx/planner/physical_planner.py | 10 +- .../test_shapes_and_errors_battery.py | 8 +- tests/sql_battery/tests/v2_planner.run_tests | 10 -- 12 files changed, 185 insertions(+), 237 deletions(-) create mode 100644 opteryx/operators/filter_join_node.py delete mode 100644 opteryx/operators/pyarrow_join_node.py diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 9890f5056..43f1e579b 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -55,6 +55,10 @@ def depth_first_search_flat( # Sort neighbors based on relationship to ensure left, right, then unlabelled order neighbors = sorted(self.ingoing_edges(node), key=lambda x: (x[2] == "right", x[2] == "")) + # left semi and anti joins we hash the right side first, usually we want the left side first + if self[node].is_join and self[node].join_type in ("left anti", "left semi"): + neighbors.reverse() + # Traverse each child, prioritizing left, then right, then unlabelled for neighbor, _, _ in neighbors: if neighbor not in visited: diff --git a/opteryx/operators/__init__.py b/opteryx/operators/__init__.py index 740f3f6f8..7d318c9be 100644 --- a/opteryx/operators/__init__.py +++ b/opteryx/operators/__init__.py @@ -26,6 +26,7 @@ from .distinct_node import DistinctNode # remove duplicate records from .exit_node import ExitNode from .explain_node import ExplainNode # EXPLAIN queries +from .filter_join_node import FilterJoinNode # filter unwanted rows from .filter_node import FilterNode # filter unwanted rows from .function_dataset_node import FunctionDatasetNode # Dataset Constructors from .heap_sort_node import HeapSortNode # Heap @@ -34,7 +35,6 @@ from .inner_join_node import InnerJoinNode from .inner_join_node_single import InnerJoinSingleNode from .limit_node import LimitNode # select the first N records -from .pyarrow_join_node import PyArrowJoinNode from .outer_join_node import OuterJoinNode from .projection_node import ProjectionNode # remove unwanted columns including renames diff --git a/opteryx/operators/filter_join_node.py b/opteryx/operators/filter_join_node.py new file mode 100644 index 000000000..2e71ada18 --- /dev/null +++ b/opteryx/operators/filter_join_node.py @@ -0,0 +1,154 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Filter Join Node + +This is a SQL Query Execution Plan Node. + +This module contains implementations for LEFT SEMI and LEFT ANTI joins. +These joins are used to filter rows from the left table based on the +presence or absence of matching rows in the right table. +""" + +from typing import List +from typing import Set + +import pyarrow + +from opteryx import EOS +from opteryx.models import QueryProperties + +from . import JoinNode + + +def left_anti_join(left_relation, left_columns: List[str], right_hash_set: Set[str]): + """ + Perform a LEFT ANTI JOIN. + + This implementation ensures that all rows from the left table are included in the result set, + where there are no matching rows in the right table based on the join columns. + + Parameters: + left_relation (pyarrow.Table): The left pyarrow.Table to join. + left_columns (list of str): Column names from the left table to join on. + right_hash_set (set of tuple): A set of tuples representing the hashed values of the right table's join columns. + + Returns: + A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. + """ + + left_indexes = [] + left_values = left_relation.select(left_columns).drop_null().itercolumns() + for i, value_tuple in enumerate(map(hash, zip(*left_values))): + if ( + value_tuple not in right_hash_set + ): # Only include left rows that have no match in the right table + left_indexes.append(i) + + # Filter the left_chunk based on the anti join condition + if left_indexes: + return left_relation.take(left_indexes) + else: + return left_relation.slice(0, 0) + + +def left_semi_join(left_relation, left_columns: List[str], right_hash_set: Set[str]): + """ + Perform a LEFT SEMI JOIN. + + This implementation ensures that all rows from the left table that have a matching row in the right table + based on the join columns are included in the result set. + + Parameters: + left_relation (pyarrow.Table): The left pyarrow.Table to join. + left_columns (list of str): Column names from the left table to join on. + right_hash_set (set of tuple): A set of tuples representing the hashed values of the right table's join columns. + + Returns: + A pyarrow.Table containing the result of the LEFT SEMI JOIN operation.
+ """ + left_indexes = [] + left_values = left_relation.select(left_columns).drop_null().itercolumns() + for i, value_tuple in enumerate(map(hash, zip(*left_values))): + if ( + value_tuple in right_hash_set + ): # Only include left rows that have a match in the right table + left_indexes.append(i) + + # Filter the left_chunk based on the semi join condition + if left_indexes: + return left_relation.take(left_indexes) + else: + return left_relation.slice(0, 0) + + +class FilterJoinNode(JoinNode): + def __init__(self, properties: QueryProperties, **parameters): + JoinNode.__init__(self, properties=properties, **parameters) + self.join_type = parameters["type"] + self.on = parameters.get("on") + self.using = parameters.get("using") + + self.left_columns = parameters.get("left_columns") + self.left_readers = parameters.get("left_readers") + + self.right_columns = parameters.get("right_columns") + self.right_readers = parameters.get("right_readers") + + self.right_buffer = [] + self.right_hash_set = set() + + @classmethod + def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover + raise NotImplementedError() + + @property + def name(self): # pragma: no cover + return self.join_type + + @property + def config(self) -> str: # pragma: no cover + from opteryx.managers.expression import format_expression + + if self.on: + return f"{self.join_type.upper()} JOIN ({format_expression(self.on, True)})" + if self.using: + return f"{self.join_type.upper()} JOIN (USING {','.join(map(format_expression, self.using))})" + return f"{self.join_type.upper()}" + + def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: + if join_leg == "left": + if morsel == EOS: + yield EOS + else: + join_provider = providers.get(self.join_type) + yield join_provider( + left_relation=morsel, + left_columns=self.left_columns, + right_hash_set=self.right_hash_set, + ) + if join_leg == "right": + if morsel == EOS: + right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") + self.right_buffer.clear() + non_null_right_values = right_relation.select(self.right_columns).drop_null().itercolumns() + self.right_hash_set = set(map(hash, zip(*non_null_right_values))) + else: + self.right_buffer.append(morsel) + yield None + + +providers = { + "left anti": left_anti_join, + "left semi": left_semi_join, +} diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 30fab3b27..a22263566 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -76,6 +76,9 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinNode(JoinNode): + + join_type = "inner" + def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) @@ -112,13 +115,6 @@ def execute(self, morsel: Table, join_leg: str) -> Table: ) self.left_buffer.clear() - # in place until #1295 resolved - if self.left_columns[0] not in self.left_relation.column_names: - self.right_columns, self.left_columns = ( - self.left_columns, - self.right_columns, - ) - start = time.monotonic_ns() self.left_hash = hash_join_map(self.left_relation, self.left_columns) self.statistics.time_build_hash_map += time.monotonic_ns() - start diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index 7ebda2981..c028bfdc4 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -159,6 
+159,9 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinSingleNode(JoinNode): + + join_type = "inner" + def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 78c1665a6..343922486 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -29,7 +29,6 @@ import pyarrow from opteryx import EOS -from opteryx.compiled.structures import HashSet from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties from opteryx.utils.arrow import align_tables @@ -172,81 +171,6 @@ def right_join(left_relation, right_relation, left_columns: List[str], right_col yield align_tables(left_relation, right_chunk, left_indexes, right_indexes) -def left_anti_join( - left_relation, right_relation, left_columns: List[str], right_columns: List[str] -): - """ - Perform a LEFT ANTI JOIN. - - This implementation ensures that all rows from the left table are included in the result set, - where there are no matching rows in the right table based on the join columns. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - right_relation (pyarrow.Table): The right pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_columns (list of str): Column names from the right table to join on. - - Returns: - A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. - """ - non_null_right_values = right_relation.select(right_columns).itercolumns() - right_hash_set = set(zip(*non_null_right_values)) - - left_indexes = [] - left_values = left_relation.select(left_columns).itercolumns() - for i, value_tuple in enumerate(zip(*left_values)): - if ( - value_tuple not in right_hash_set - ): # Only include left rows that have no match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_relation.take(left_indexes) - else: - yield left_relation.slice(0, 0) - - -def left_semi_join( - left_relation, right_relation, left_columns: List[str], right_columns: List[str] -): - """ - Perform a LEFT SEMI JOIN. - - This implementation ensures that all rows from the left table that have a matching row in the right table - based on the join columns are included in the result set. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - right_relation (pyarrow.Table): The right pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_columns (list of str): Column names from the right table to join on. - - Returns: - A pyarrow.Table containing the result of the LEFT SEMI JOIN operation. 
- """ - - hash_table = HashTable() - non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) - - left_indexes = [] - left_values = left_relation.select(left_columns).itercolumns() - - for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if rows: # Only include left rows that have a match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - yield left_relation.take(left_indexes) - else: - yield left_relation.slice(0, 0) - - class OuterJoinNode(JoinNode): def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) @@ -283,6 +207,7 @@ def config(self) -> str: # pragma: no cover return f"{self.join_type.upper()}" def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: + print("OuterJoinNode.execute", join_leg, type(morsel)) if join_leg == "left": if morsel == EOS: self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") @@ -312,10 +237,4 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: yield None -providers = { - "left outer": left_join, - "full outer": full_join, - "right outer": right_join, - "left anti": left_anti_join, - "left semi": left_semi_join, -} +providers = {"left outer": left_join, "full outer": full_join, "right outer": right_join} diff --git a/opteryx/operators/pyarrow_join_node.py b/opteryx/operators/pyarrow_join_node.py deleted file mode 100644 index 6a4afdfe1..000000000 --- a/opteryx/operators/pyarrow_join_node.py +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Join Node - -We have our own implementations of INNER and OUTER joins, this uses PyArrow -to implement less-common joins of ANTI and SEMI joins. -""" - -import pyarrow - -from opteryx import EOS -from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import QueryProperties - -from . 
import JoinNode - - -class PyArrowJoinNode(JoinNode): - def __init__(self, properties: QueryProperties, **parameters): - JoinNode.__init__(self, properties=properties, **parameters) - self._join_type = parameters["type"] - self._on = parameters.get("on") - self._using = parameters.get("using") - - self._left_columns = parameters.get("left_columns") - self.left_readers = parameters.get("left_readers") - - self._right_columns = parameters.get("right_columns") - self.right_readers = parameters.get("right_readers") - - self.stream = "left" - self.left_buffer = [] - self.right_buffer = [] - self.left_relation = None - - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - - @property - def name(self): # pragma: no cover - return f"{self._join_type} Join" - - @property - def config(self): # pragma: no cover - from opteryx.managers.expression import format_expression - - if self._on: - return f"{self._join_type.upper()} JOIN ({format_expression(self._on, True)})" - if self._using: - return f"{self._join_type.upper()} JOIN (USING {','.join(map(format_expression, self._using))})" - return f"{self._join_type.upper()}" - - def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: - if self.stream == "left": - if morsel == EOS: - self.stream = "right" - self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") - self.left_buffer.clear() - - # in place until #1295 resolved - if self._left_columns[0] not in self.left_relation.column_names: - self._right_columns, self._left_columns = ( - self._left_columns, - self._right_columns, - ) - - else: - self.left_buffer.append(morsel) - yield None - return - - if morsel == EOS: - right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") - self.right_buffer.clear() - # do the join - try: - new_morsel = self.left_relation.join( - right_relation, - keys=self._left_columns, - right_keys=self._right_columns, - join_type=self._join_type, - coalesce_keys=self._using is not None, - ) - except pyarrow.ArrowInvalid as err: # pragma: no cover - last_token = str(err).split(" ")[-1] - column = None - for col in self.left_relation.columns: - if last_token == col.identity: - column = col.name - break - for col in right_relation.columns: - if last_token == col.identity: - column = col.name - break - if column: - raise UnsupportedSyntaxError( - f"Unable to ANTI/SEMI JOIN with unsupported column types in table, '{column}'." - ) from err - raise UnsupportedSyntaxError( - "Unable to ANTI/SEMI JOIN with unsupported column types in table." 
- ) from err - - yield new_morsel - - else: - self.right_buffer.append(morsel) - yield None diff --git a/opteryx/planner/binder/binder_visitor.py b/opteryx/planner/binder/binder_visitor.py index 151b7aa12..ec936287a 100644 --- a/opteryx/planner/binder/binder_visitor.py +++ b/opteryx/planner/binder/binder_visitor.py @@ -689,12 +689,8 @@ def visit_join(self, node: Node, context: BindingContext) -> Tuple[Node, Binding right_column = context.schemas[right_relation_name].pop_column(column_name) # we need to decide which column we're going to keep - if node.type in ("right anti", "right semi"): - right_column.origin = [left_relation_name, right_relation_name] - columns.append(right_column) - else: - left_column.origin = [left_relation_name, right_relation_name] - columns.append(left_column) + left_column.origin = [left_relation_name, right_relation_name] + columns.append(left_column) # shared columns exist in both schemas in some uses and in neither in others context.schemas[f"$shared-{random_string()}"] = RelationSchema( @@ -705,9 +701,6 @@ def visit_join(self, node: Node, context: BindingContext) -> Tuple[Node, Binding if node.type in ("left anti", "left semi"): for schema in node.right_relation_names: context.schemas.pop(schema) - if node.type in ("right anti", "right semi"): - for schema in node.left_relation_names: - context.schemas.pop(schema) # If we have an unnest_column, how how it is bound is different to other columns if node.unnest_column: diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 507c45615..4a7facab3 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -622,6 +622,12 @@ def extract_unnest_dataset(join: dict, join_type: str) -> Tuple[Optional[str], O join_step = LogicalPlanNode(node_type=LogicalPlanStepType.Join) join_step.type = extract_join_type(join) + + if join_step.type in ("right semi", "right anti"): + raise UnsupportedSyntaxError( + f"{join_step.type.upper()} JOIN not supported, use LEFT variations only." 
+ ) + join_step.on, join_step.using = extract_join_condition(join) # At this stage, CROSS JOIN UNNEST are represented in a single JOIN node join_step.unnest_column, join_step.unnest_alias = extract_unnest_dataset(join, join_step.type) diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 414cced93..1ea300e4f 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -14,8 +14,8 @@ from orso.schema import OrsoTypes from opteryx import operators as operators +from opteryx.exceptions import InvalidInternalStateError from opteryx.exceptions import UnsupportedSyntaxError -from opteryx.models import LogicalColumn from opteryx.models import PhysicalPlan from opteryx.planner.logical_planner import LogicalPlanStepType @@ -55,15 +55,17 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: node = operators.InnerJoinSingleNode(query_properties, **node_config) else: node = operators.InnerJoinNode(query_properties, **node_config) - elif node_config.get("type") in ("left outer", "full outer", "right outer", "left anti", "left semi"): + elif node_config.get("type") in ("left outer", "full outer", "right outer"): # We use out own implementation of OUTER JOINS node = operators.OuterJoinNode(query_properties, **node_config) elif node_config.get("type") == "cross join": # Pyarrow doesn't have a CROSS JOIN node = operators.CrossJoinNode(query_properties, **node_config) + elif node_config.get("type") in ("left anti", "left semi"): + # We use our own implementation of LEFT SEMI and LEFT ANTI JOIN + node = operators.FilterJoinNode(query_properties, **node_config) else: - # Use Pyarrow for all other joins (right semi, right anti) - node = operators.PyArrowJoinNode(query_properties, **node_config) + raise InvalidInternalStateError(f"Unsupported JOIN type '{node_config['type']}'") elif node_type == LogicalPlanStepType.Limit: node = operators.LimitNode(query_properties, **{k:v for k,v in node_config.items() if k in ("limit", "offset", "all_relations")}) elif node_type == LogicalPlanStepType.Order: diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 05811599b..e5232c64a 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1528,13 +1528,13 @@ # V2 New Syntax Checks # ("SELECT * FROM $planets AS P1 UNION SELECT * FROM $planets AS P2;", 9, 20, UnsupportedSyntaxError), ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id;", 0, 20, None), - ("SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S ON S.id = P.id;", 168, 8, None), + ("SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S ON S.id = P.id;", 168, 8, UnsupportedSyntaxError), ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id;", 9, 20, None), - ("SELECT * FROM $planets AS P RIGHT SEMI JOIN $satellites AS S ON S.id = P.id;", 9, 8, None), + ("SELECT * FROM $planets AS P RIGHT SEMI JOIN $satellites AS S ON S.id = P.id;", 9, 8, UnsupportedSyntaxError), ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S USING(id);", 0, 20, None), - ("SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S USING(id);", 168, 8, None), + ("SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S USING(id);", 168, 8, UnsupportedSyntaxError), ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S USING(id);", 9, 20, None), - ("SELECT * FROM $planets AS P RIGHT 
SEMI JOIN $satellites AS S USING(id);", 9, 8, None), + ("SELECT * FROM $planets AS P RIGHT SEMI JOIN $satellites AS S USING(id);", 9, 8, UnsupportedSyntaxError), ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE P.id > 5;", 0, 20, None), ("SELECT * FROM $planets AS P LEFT ANTI JOIN (SELECT id FROM $satellites WHERE name LIKE 'Moon%') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS C LEFT ANTI JOIN $satellites AS S ON S.id = C;", 0, 1, None), diff --git a/tests/sql_battery/tests/v2_planner.run_tests b/tests/sql_battery/tests/v2_planner.run_tests index 1faec341e..5b12b6d4f 100644 --- a/tests/sql_battery/tests/v2_planner.run_tests +++ b/tests/sql_battery/tests/v2_planner.run_tests @@ -4,24 +4,14 @@ # NEW JOINS SELECT * FROM $planets LEFT ANTI JOIN $satellites USING(id); SELECT * FROM $planets LEFT SEMI JOIN $satellites USING(id); -SELECT * FROM $planets RIGHT ANTI JOIN $satellites USING(id); -SELECT * FROM $planets RIGHT SEMI JOIN $satellites USING(id); SELECT * FROM $planets LEFT ANTI JOIN $satellites ON $planets.id = $satellites.id; SELECT * FROM $planets LEFT SEMI JOIN $satellites ON $planets.id = $satellites.id; -SELECT * FROM $planets RIGHT ANTI JOIN $satellites ON $planets.id = $satellites.id; -SELECT * FROM $planets RIGHT SEMI JOIN $satellites ON $planets.id = $satellites.id; SELECT * FROM $planets LEFT ANTI JOIN $satellites ON $satellites.id = $planets.id; SELECT * FROM $planets LEFT SEMI JOIN $satellites ON $satellites.id = $planets.id; -SELECT * FROM $planets RIGHT ANTI JOIN $satellites ON $satellites.id = $planets.id; -SELECT * FROM $planets RIGHT SEMI JOIN $satellites ON $satellites.id = $planets.id; SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON P.id = S.id; SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON P.id = S.id; -SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S ON P.id = S.id; -SELECT * FROM $planets AS P RIGHT SEMI JOIN $satellites AS S ON P.id = S.id; SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S USING(id); SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S USING(id); -SELECT * FROM $planets AS P RIGHT ANTI JOIN $satellites AS S USING(id); -SELECT * FROM $planets AS P RIGHT SEMI JOIN $satellites AS S USING(id); # EXPLAIN FORMAT EXPLAIN ANALYZE FORMAT JSON SELECT * FROM $planets AS a INNER JOIN (SELECT id FROM $planets) AS b USING (id); From 33a760b848cd6e6013c2fab424ed6f9c753344b6 Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 01:11:03 +0000 Subject: [PATCH 111/157] Opteryx Version 0.19.0-alpha.916 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index c1afc9409..7e4daf68d 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 913 +__build__ = 916 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
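Patch 110 replaces the PyArrow-backed ANTI/SEMI joins with the hash-set FilterJoinNode: the right leg is buffered and hashed first (the physical plan now visits it before the left leg for these join types), then each left morsel is filtered against the set. The snippet below is a self-contained sketch of that filtering idea using toy tables invented for illustration; it is not the operator's actual interface.

```python
# Sketch of hash-set LEFT SEMI / LEFT ANTI filtering, with made-up data.
import pyarrow

planets = pyarrow.table({"id": [1, 2, 3, 4], "name": ["Mercury", "Venus", "Earth", "Mars"]})
satellites = pyarrow.table({"planet_id": [3, 4, 4]})

# Hash the right relation's join column(s) once, as the node does at EOS.
right_values = satellites.select(["planet_id"]).drop_null().itercolumns()
right_hash_set = set(map(hash, zip(*right_values)))

# LEFT SEMI keeps left rows whose key is in the set; LEFT ANTI keeps the rest.
keys = list(zip(*planets.select(["id"]).itercolumns()))
semi = planets.take([i for i, t in enumerate(keys) if hash(t) in right_hash_set])
anti = planets.take([i for i, t in enumerate(keys) if hash(t) not in right_hash_set])

print(semi["name"])  # Earth, Mars
print(anti["name"])  # Mercury, Venus
```

One caveat, true of the patch as well: membership is tested on hashes alone, so a hash collision between distinct keys would be treated as a match. In exchange, neither join ever materializes right-side columns, which is what the deleted PyArrow join implementation had to do.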
From 08b39af8755d177669c703cd9d02845a53c2aee2 Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 09:47:24 +0000 Subject: [PATCH 112/157] Opteryx Version 0.19.0-alpha.917 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 7e4daf68d..6bb369981 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 916 +__build__ = 917 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 53dd797407d6dc759bf6a84a247cd32970736fa2 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 24 Dec 2024 10:27:55 +0000 Subject: [PATCH 113/157] #2149 --- opteryx/operators/cross_join_node.py | 2 ++ opteryx/operators/filter_join_node.py | 4 +++- opteryx/operators/inner_join_node.py | 1 - opteryx/operators/inner_join_node_single.py | 1 - opteryx/operators/outer_join_node.py | 1 - opteryx/utils/formatter.py | 1 + opteryx/utils/sql.py | 2 +- tests/fuzzing/test_sql_fuzzer_join.py | 10 +++++++--- 8 files changed, 14 insertions(+), 8 deletions(-) diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index a3c302fc4..6eb45b9b6 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -271,6 +271,8 @@ class CrossJoinNode(JoinNode): Implements a SQL CROSS JOIN """ + join_type = "cross" + def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) diff --git a/opteryx/operators/filter_join_node.py b/opteryx/operators/filter_join_node.py index 2e71ada18..8f2e839b4 100644 --- a/opteryx/operators/filter_join_node.py +++ b/opteryx/operators/filter_join_node.py @@ -141,7 +141,9 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: if morsel == EOS: right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") self.right_buffer.clear() - non_null_right_values = right_relation.select(self.right_columns).drop_null().itercolumns() + non_null_right_values = ( + right_relation.select(self.right_columns).drop_null().itercolumns() + ) self.right_hash_set = set(map(hash, zip(*non_null_right_values))) else: self.right_buffer.append(morsel) diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index a22263566..1d968d372 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -76,7 +76,6 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinNode(JoinNode): - join_type = "inner" def __init__(self, properties: QueryProperties, **parameters): diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index c028bfdc4..3a1d0395d 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -159,7 +159,6 @@ def inner_join_with_preprocessed_left_side(left_relation, right_relation, join_c class InnerJoinSingleNode(JoinNode): - join_type = "inner" def __init__(self, properties: QueryProperties, **parameters): diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 343922486..f0f697167 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -207,7 +207,6 @@ def config(self) -> str: # pragma: no cover return f"{self.join_type.upper()}" def execute(self, morsel: pyarrow.Table, 
join_leg: str) -> pyarrow.Table: - print("OuterJoinNode.execute", join_leg, type(morsel)) if join_leg == "left": if morsel == EOS: self.left_relation = pyarrow.concat_tables(self.left_buffer, promote_options="none") diff --git a/opteryx/utils/formatter.py b/opteryx/utils/formatter.py index 69f6a3e46..ef2110c26 100644 --- a/opteryx/utils/formatter.py +++ b/opteryx/utils/formatter.py @@ -78,6 +78,7 @@ def _replacer(match): "OUTER", "RIGHT", "SELECT", + "SEMI", "SET", "SHOW", "SINCE", diff --git a/opteryx/utils/sql.py b/opteryx/utils/sql.py index 125a8afe3..78cda3ea1 100644 --- a/opteryx/utils/sql.py +++ b/opteryx/utils/sql.py @@ -121,7 +121,7 @@ def split_sql_statements(sql: str) -> List[str]: def regex_match_any( arr: numpy.ndarray, patterns: List[str], - flags: int = re.NOFLAG, + flags: int = 0, invert: bool = False, ) -> numpy.ndarray: """ diff --git a/tests/fuzzing/test_sql_fuzzer_join.py b/tests/fuzzing/test_sql_fuzzer_join.py index 1418ccbe9..f87749d1e 100644 --- a/tests/fuzzing/test_sql_fuzzer_join.py +++ b/tests/fuzzing/test_sql_fuzzer_join.py @@ -51,7 +51,7 @@ def generate_condition(table, columns): return f"{table}.{where_column.name} {where_operator} {where_value}" def generate_random_sql_join(columns1, table1, columns2, table2) -> str: - join_type = random.choice(["JOIN", "INNER JOIN", "LEFT JOIN", "RIGHT JOIN", "FULL OUTER JOIN"]) + join_type = random.choice(["JOIN", "INNER JOIN", "LEFT JOIN", "LEFT OUTER JOIN", "RIGHT JOIN", "FULL OUTER JOIN", "LEFT ANTI JOIN", "LEFT SEMI JOIN"]) last_value = -1 this_value = random.random() @@ -70,7 +70,11 @@ def generate_random_sql_join(columns1, table1, columns2, table2) -> str: conditions.append(condition) join_condition = " AND ".join(conditions) - selected_columns = [f"{table1}.{col.name}" for col in columns1 if random.random() < 0.2] + [f"{table2}.{col.name}" for col in columns2 if random.random() < 0.2] + + if join_type in ("LEFT ANTI JOIN", "LEFT SEMI JOIN"): + selected_columns = [f"{table1}.{col.name}" for col in columns1 if random.random() < 0.2] + else: + selected_columns = [f"{table1}.{col.name}" for col in columns1 if random.random() < 0.2] + [f"{table2}.{col.name}" for col in columns2 if random.random() < 0.2] if len(selected_columns) == 0: selected_columns = ["*"] select_clause = "SELECT " + ", ".join(selected_columns) @@ -86,7 +90,7 @@ def generate_random_sql_join(columns1, table1, columns2, table2) -> str: linking_condition = random.choice(["AND", "OR", "AND NOT"]) where_clause += f" {linking_condition} {generate_condition(table1, columns1)}" - if random.random() < 0.3: + if join_type not in ("LEFT ANTI JOIN", "LEFT SEMI JOIN") and random.random() < 0.3: if where_clause == "--": where_clause = " WHERE " else: From 79d45748e3101a1e5ac10a773150b6b3d424322f Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 10:28:20 +0000 Subject: [PATCH 114/157] Opteryx Version 0.19.0-alpha.918 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 6bb369981..d40e44a05 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 917 +__build__ = 918 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
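Among patch 113's smaller fixes, changing the `regex_match_any` default from `flags: int = re.NOFLAG` to `flags: int = 0` is a portability fix: `re.NOFLAG` was only added in Python 3.11, so the old default raised `AttributeError` at import time on earlier interpreters, while `0` means "no flags" on every version. A quick illustration:

```python
import re

# 0 is the portable "no flags" value; re.NOFLAG is just an alias for it
# on Python 3.11+, and absent entirely on older versions.
print(re.findall("arm.trong", "neil armstrong", 0))  # ['armstrong']
assert getattr(re, "NOFLAG", 0) == 0
```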
From 6284c6ce4274ce572f44f41805b01c2d87e2e685 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 24 Dec 2024 10:41:14 +0000 Subject: [PATCH 115/157] #2149 --- tests/storage/test_cache_redis.py | 1 + tests/storage/test_cache_valkey.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/storage/test_cache_redis.py b/tests/storage/test_cache_redis.py index dc3f1f1e4..5e0b922c7 100644 --- a/tests/storage/test_cache_redis.py +++ b/tests/storage/test_cache_redis.py @@ -56,6 +56,7 @@ def test_invalid_config(): v = RedisCache(server=None) assert v._consecutive_failures == 10 +@skip_if(is_arm() or is_windows() or is_mac()) def test_skip_on_error(): from opteryx.managers.cache import RedisCache cache = RedisCache() diff --git a/tests/storage/test_cache_valkey.py b/tests/storage/test_cache_valkey.py index 52ba187ab..f76cb197c 100644 --- a/tests/storage/test_cache_valkey.py +++ b/tests/storage/test_cache_valkey.py @@ -55,6 +55,7 @@ def test_invalid_config(): v = ValkeyCache(server=None) assert v._consecutive_failures == 10 +@skip_if(is_arm() or is_windows() or is_mac()) def test_skip_on_error(): from opteryx.managers.cache import ValkeyCache cache = ValkeyCache() From 91fa47b66b1c17174e71acdd9f309b614ba0b3a9 Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 10:41:36 +0000 Subject: [PATCH 116/157] Opteryx Version 0.19.0-alpha.919 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index d40e44a05..79594842b 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 918 +__build__ = 919 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 89f4b43896de562c95f46fc847c06dc413f319bf Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 24 Dec 2024 12:24:35 +0000 Subject: [PATCH 117/157] HOUSEKEEPING --- opteryx/__init__.py | 11 ++--------- opteryx/__main__.py | 11 ++--------- opteryx/__version__.py | 11 ++--------- opteryx/command.py | 11 ++--------- opteryx/config.py | 11 ++--------- opteryx/connection.py | 11 ++--------- opteryx/connectors/__init__.py | 11 ++--------- opteryx/connectors/arrow_connector.py | 11 ++--------- opteryx/connectors/aws_s3_connector.py | 11 ++--------- opteryx/connectors/base/base_connector.py | 11 ++--------- opteryx/connectors/capabilities/__init__.py | 11 ++--------- .../connectors/capabilities/asynchronous.py | 11 ++--------- opteryx/connectors/capabilities/cacheable.py | 11 ++--------- .../connectors/capabilities/limit_pushable.py | 11 ++--------- .../connectors/capabilities/partitionable.py | 11 ++--------- .../capabilities/predicate_pushable.py | 11 ++--------- opteryx/connectors/cql_connector.py | 11 ++--------- opteryx/connectors/disk_connector.py | 11 ++--------- opteryx/connectors/file_connector.py | 11 ++--------- .../connectors/gcp_cloudstorage_connector.py | 11 ++--------- opteryx/connectors/gcp_firestore_connector.py | 11 ++--------- opteryx/connectors/mongodb_connector.py | 11 ++--------- opteryx/connectors/sql_connector.py | 11 ++--------- opteryx/connectors/virtual_data.py | 11 ++--------- opteryx/constants/__init__.py | 11 ++--------- opteryx/constants/character_set.py | 11 ++--------- opteryx/constants/permissions.py | 11 ++--------- opteryx/constants/query_status.py | 11 ++--------- opteryx/constants/result_type.py | 11 ++--------- opteryx/cursor.py | 11 ++--------- opteryx/custom_types/intervals.py | 11 ++--------- opteryx/debugging.py | 11 
++--------- opteryx/exceptions.py | 11 ++--------- opteryx/functions/__init__.py | 11 ++--------- opteryx/functions/catalogue.py | 11 ++--------- opteryx/functions/date_functions.py | 11 ++--------- opteryx/functions/number_functions.py | 11 ++--------- opteryx/functions/other_functions.py | 11 ++--------- opteryx/functions/string_functions.py | 11 ++--------- opteryx/managers/__init__.py | 11 ++--------- opteryx/managers/cache/cache_manager.py | 11 ++--------- opteryx/managers/cache/memcached.py | 11 ++--------- opteryx/managers/cache/null_cache.py | 11 ++--------- opteryx/managers/cache/redis.py | 11 ++--------- opteryx/managers/catalog/__init__.py | 11 ++--------- opteryx/managers/catalog/catalog_provider.py | 11 ++--------- opteryx/managers/catalog/null_provider.py | 11 ++--------- opteryx/managers/catalog/tarchia_provider.py | 11 ++--------- opteryx/managers/execution/serial_engine.py | 5 +++++ opteryx/managers/expression/__init__.py | 11 ++--------- .../managers/expression/binary_operators.py | 11 ++--------- .../managers/expression/unary_operations.py | 11 ++--------- opteryx/managers/kvstores/__init__.py | 11 ++--------- opteryx/managers/kvstores/base_kv_store.py | 11 ++--------- opteryx/managers/permissions/__init__.py | 5 +++++ opteryx/managers/schemes/__init__.py | 11 ++--------- .../managers/schemes/base_partition_scheme.py | 11 ++--------- .../managers/schemes/default_partitions.py | 11 ++--------- opteryx/managers/schemes/mabel_partitions.py | 11 ++--------- opteryx/managers/schemes/tarchia_schema.py | 11 ++--------- opteryx/managers/serde/__init__.py | 0 opteryx/managers/serde/physical_plan.py | 19 ------------------- opteryx/models/__init__.py | 11 ++--------- opteryx/models/connection_context.py | 11 ++--------- opteryx/models/connection_state.py | 5 +++++ opteryx/models/logical_column.py | 11 ++--------- opteryx/models/non_tabular_result.py | 5 +++++ opteryx/models/physical_plan.py | 11 ++--------- opteryx/models/query_properties.py | 11 ++--------- opteryx/models/query_statistics.py | 11 ++--------- opteryx/operators/__init__.py | 11 ++--------- opteryx/operators/aggregate_and_group_node.py | 11 ++--------- opteryx/operators/aggregate_node.py | 11 ++--------- opteryx/operators/async_read_node.py | 11 ++--------- opteryx/operators/base_plan_node.py | 11 ++--------- .../bench/#information_schema_node.py | 11 ++--------- .../operators/bench/#show_databases_node.py | 11 ++--------- opteryx/operators/cross_join_node.py | 11 ++--------- opteryx/operators/distinct_node.py | 11 ++--------- opteryx/operators/exit_node.py | 11 ++--------- opteryx/operators/explain_node.py | 11 ++--------- opteryx/operators/filter_join_node.py | 11 ++--------- opteryx/operators/filter_node.py | 11 ++--------- opteryx/operators/function_dataset_node.py | 11 ++--------- opteryx/operators/heap_sort_node.py | 11 ++--------- opteryx/operators/inner_join_node.py | 11 ++--------- opteryx/operators/inner_join_node_single.py | 11 ++--------- opteryx/operators/limit_node.py | 11 ++--------- opteryx/operators/outer_join_node.py | 11 ++--------- opteryx/operators/projection_node.py | 11 ++--------- opteryx/operators/read_node.py | 11 ++--------- opteryx/operators/set_variable_node.py | 11 ++--------- opteryx/operators/show_columns_node.py | 11 ++--------- opteryx/operators/show_create_node.py | 11 ++--------- opteryx/operators/show_value_node.py | 11 ++--------- opteryx/operators/sort_node.py | 11 ++--------- opteryx/operators/union_node.py | 11 ++--------- opteryx/planner/__init__.py | 11 ++--------- 
opteryx/planner/ast_rewriter.py | 11 ++--------- opteryx/planner/binder/__init__.py | 11 ++--------- opteryx/planner/binder/binder.py | 11 ++--------- opteryx/planner/binder/binder_visitor.py | 11 ++--------- opteryx/planner/binder/binding_context.py | 11 ++--------- .../planner/cost_based_optimizer/__init__.py | 11 ++--------- .../predicate_ordering_brute.py | 11 ++--------- .../predicate_ordering_genetic.py | 11 ++--------- .../bench/defragment_morsels.py | 11 ++--------- .../strategies/boolean_simplication.py | 11 ++--------- .../strategies/constant_folding.py | 11 ++--------- .../strategies/distinct_pushdown.py | 11 ++--------- .../strategies/limit_pushdown.py | 11 ++--------- .../strategies/operator_fusion.py | 11 ++--------- .../strategies/optimization_strategy.py | 11 ++--------- .../strategies/predicate_pushdown.py | 11 ++--------- .../strategies/predicate_rewriter.py | 11 ++--------- .../strategies/projection_pushdown.py | 11 ++--------- .../strategies/redundant_operators.py | 11 ++--------- .../split_conjunctive_predicates.py | 11 ++--------- opteryx/planner/executor/v2_coordinator.py | 11 ++--------- .../logical_planner/logical_planner.py | 11 ++--------- .../logical_planner_builders.py | 11 ++--------- opteryx/planner/physical_planner.py | 11 ++--------- opteryx/planner/sql_rewriter.py | 11 ++--------- opteryx/planner/views/__init__.py | 11 ++--------- opteryx/shared/__init__.py | 11 ++--------- opteryx/shared/async_memory_pool.py | 11 ++--------- opteryx/shared/buffer_pool.py | 11 ++--------- opteryx/shared/materialized_datasets.py | 11 ++--------- opteryx/shared/variables.py | 11 ++--------- opteryx/utils/__init__.py | 11 ++--------- opteryx/utils/arrow.py | 11 ++--------- opteryx/utils/dates.py | 11 ++--------- opteryx/utils/file_decoders.py | 11 ++--------- opteryx/utils/formatter.py | 5 +++++ opteryx/utils/lru_2.py | 11 ++--------- opteryx/utils/memory_view_stream.py | 11 ++--------- opteryx/utils/paths.py | 11 ++--------- opteryx/utils/resource_monitor.py | 5 +++++ opteryx/utils/series.py | 11 ++--------- opteryx/virtual_datasets/__init__.py | 11 ++--------- opteryx/virtual_datasets/astronaut_data.py | 11 ++--------- opteryx/virtual_datasets/derived_data.py | 11 ++--------- opteryx/virtual_datasets/missions.py | 11 ++--------- opteryx/virtual_datasets/no_table_data.py | 11 ++--------- opteryx/virtual_datasets/planet_data.py | 11 ++--------- opteryx/virtual_datasets/satellite_data.py | 11 ++--------- opteryx/virtual_datasets/statistics.py | 11 ++--------- opteryx/virtual_datasets/stop_words.py | 11 ++--------- opteryx/virtual_datasets/user.py | 11 ++--------- opteryx/virtual_datasets/variables_data.py | 11 ++--------- 150 files changed, 314 insertions(+), 1297 deletions(-) delete mode 100644 opteryx/managers/serde/__init__.py delete mode 100644 opteryx/managers/serde/physical_plan.py diff --git a/opteryx/__init__.py b/opteryx/__init__.py index e749087e1..6b45823aa 100644 --- a/opteryx/__init__.py +++ b/opteryx/__init__.py @@ -1,15 +1,8 @@ # isort: skip_file # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This module sets up various parts of the Engine - we do somethings in a specific order diff --git a/opteryx/__main__.py b/opteryx/__main__.py index 177b31327..a848b85e9 100644 --- a/opteryx/__main__.py +++ b/opteryx/__main__.py @@ -2,15 +2,8 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ A command line interface for Opteryx diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 79594842b..68799821b 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -2,15 +2,8 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Store the version here so: diff --git a/opteryx/command.py b/opteryx/command.py index 58a072edf..01cf87458 100644 --- a/opteryx/command.py +++ b/opteryx/command.py @@ -2,15 +2,8 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.__main__ import main diff --git a/opteryx/config.py b/opteryx/config.py index c2ad0d5bf..0a5eb8cbd 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime import typing diff --git a/opteryx/connection.py b/opteryx/connection.py index fe645b710..0d17b14e8 100644 --- a/opteryx/connection.py +++ b/opteryx/connection.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This module provides a PEP-249 familiar interface for interacting with mabel data diff --git a/opteryx/connectors/__init__.py b/opteryx/connectors/__init__.py index 306fbd047..80046033d 100644 --- a/opteryx/connectors/__init__.py +++ b/opteryx/connectors/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import os diff --git a/opteryx/connectors/arrow_connector.py b/opteryx/connectors/arrow_connector.py index 4f5cda7a8..41059072c 100644 --- a/opteryx/connectors/arrow_connector.py +++ b/opteryx/connectors/arrow_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Arrow Reader diff --git a/opteryx/connectors/aws_s3_connector.py b/opteryx/connectors/aws_s3_connector.py index e8405dfc3..0ac8f8cf0 100644 --- a/opteryx/connectors/aws_s3_connector.py +++ b/opteryx/connectors/aws_s3_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ MinIo Reader - also works with AWS diff --git a/opteryx/connectors/base/base_connector.py b/opteryx/connectors/base/base_connector.py index af3e31751..1fd2a74f5 100644 --- a/opteryx/connectors/base/base_connector.py +++ b/opteryx/connectors/base/base_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The BaseConnector provides a common interface for all storage connectors. diff --git a/opteryx/connectors/capabilities/__init__.py b/opteryx/connectors/capabilities/__init__.py index 1fed4fc37..4c004b16b 100644 --- a/opteryx/connectors/capabilities/__init__.py +++ b/opteryx/connectors/capabilities/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.connectors.capabilities.asynchronous import Asynchronous from opteryx.connectors.capabilities.cacheable import Cacheable diff --git a/opteryx/connectors/capabilities/asynchronous.py b/opteryx/connectors/capabilities/asynchronous.py index b6120c264..9c7afb0b3 100644 --- a/opteryx/connectors/capabilities/asynchronous.py +++ b/opteryx/connectors/capabilities/asynchronous.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. class Asynchronous: diff --git a/opteryx/connectors/capabilities/cacheable.py b/opteryx/connectors/capabilities/cacheable.py index 9535d64ec..5492fe4bb 100644 --- a/opteryx/connectors/capabilities/cacheable.py +++ b/opteryx/connectors/capabilities/cacheable.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import asyncio diff --git a/opteryx/connectors/capabilities/limit_pushable.py b/opteryx/connectors/capabilities/limit_pushable.py index e8d9340b0..0026aef83 100644 --- a/opteryx/connectors/capabilities/limit_pushable.py +++ b/opteryx/connectors/capabilities/limit_pushable.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. class LimitPushable: diff --git a/opteryx/connectors/capabilities/partitionable.py b/opteryx/connectors/capabilities/partitionable.py index 028d74346..240062d75 100644 --- a/opteryx/connectors/capabilities/partitionable.py +++ b/opteryx/connectors/capabilities/partitionable.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
from opteryx.exceptions import InvalidConfigurationError diff --git a/opteryx/connectors/capabilities/predicate_pushable.py b/opteryx/connectors/capabilities/predicate_pushable.py index fb0ff5784..6f0d6c107 100644 --- a/opteryx/connectors/capabilities/predicate_pushable.py +++ b/opteryx/connectors/capabilities/predicate_pushable.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is both a marker and a wrapper for key functionality to support predicate/filter diff --git a/opteryx/connectors/cql_connector.py b/opteryx/connectors/cql_connector.py index e53bb421a..9930e65e9 100644 --- a/opteryx/connectors/cql_connector.py +++ b/opteryx/connectors/cql_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The CQL Connector downloads data from remote servers and converts them diff --git a/opteryx/connectors/disk_connector.py b/opteryx/connectors/disk_connector.py index a940d246a..5be748a9d 100644 --- a/opteryx/connectors/disk_connector.py +++ b/opteryx/connectors/disk_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The 'direct disk' connector provides the reader for when a dataset is diff --git a/opteryx/connectors/file_connector.py b/opteryx/connectors/file_connector.py index dfa520517..4bf2bd739 100644 --- a/opteryx/connectors/file_connector.py +++ b/opteryx/connectors/file_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The file connector provides the reader for when a file name is provided as the diff --git a/opteryx/connectors/gcp_cloudstorage_connector.py b/opteryx/connectors/gcp_cloudstorage_connector.py index 6ff38f10a..be98c4255 100644 --- a/opteryx/connectors/gcp_cloudstorage_connector.py +++ b/opteryx/connectors/gcp_cloudstorage_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import asyncio import os diff --git a/opteryx/connectors/gcp_firestore_connector.py b/opteryx/connectors/gcp_firestore_connector.py index 5853f99f3..20b98d349 100644 --- a/opteryx/connectors/gcp_firestore_connector.py +++ b/opteryx/connectors/gcp_firestore_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Dict from typing import Generator diff --git a/opteryx/connectors/mongodb_connector.py b/opteryx/connectors/mongodb_connector.py index b7efc7f74..fdfd603fb 100644 --- a/opteryx/connectors/mongodb_connector.py +++ b/opteryx/connectors/mongodb_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ A MongoDB Reader diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py index 3b9e88ac4..a56c57c48 100644 --- a/opteryx/connectors/sql_connector.py +++ b/opteryx/connectors/sql_connector.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The SQL Connector downloads data from remote servers and converts them diff --git a/opteryx/connectors/virtual_data.py b/opteryx/connectors/virtual_data.py index 68b5a6eb5..d136425a7 100644 --- a/opteryx/connectors/virtual_data.py +++ b/opteryx/connectors/virtual_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The 'sample' connector provides readers for the internal sample datasets, diff --git a/opteryx/constants/__init__.py b/opteryx/constants/__init__.py index ee2a2e62a..300748713 100644 --- a/opteryx/constants/__init__.py +++ b/opteryx/constants/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.constants.permissions import PERMISSIONS from opteryx.constants.query_status import QueryStatus diff --git a/opteryx/constants/character_set.py b/opteryx/constants/character_set.py index 5711282e3..3eb4e675a 100644 --- a/opteryx/constants/character_set.py +++ b/opteryx/constants/character_set.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Originally from https://github.com/kelsin/mysql-mimic diff --git a/opteryx/constants/permissions.py b/opteryx/constants/permissions.py index 17df2d098..98072b78c 100644 --- a/opteryx/constants/permissions.py +++ b/opteryx/constants/permissions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ All of the query types supported by sqlparser-rs diff --git a/opteryx/constants/query_status.py b/opteryx/constants/query_status.py index f28b91fd6..1677437cd 100644 --- a/opteryx/constants/query_status.py +++ b/opteryx/constants/query_status.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from enum import Enum from enum import auto diff --git a/opteryx/constants/result_type.py b/opteryx/constants/result_type.py index f8ae425eb..d1513ee3c 100644 --- a/opteryx/constants/result_type.py +++ b/opteryx/constants/result_type.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
from enum import Enum from enum import auto diff --git a/opteryx/cursor.py b/opteryx/cursor.py index 1216015dc..098f91067 100644 --- a/opteryx/cursor.py +++ b/opteryx/cursor.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime import time diff --git a/opteryx/custom_types/intervals.py b/opteryx/custom_types/intervals.py index af6bd9147..f863d30d0 100644 --- a/opteryx/custom_types/intervals.py +++ b/opteryx/custom_types/intervals.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Callable from typing import Dict diff --git a/opteryx/debugging.py b/opteryx/debugging.py index cb071c173..ec60e9215 100644 --- a/opteryx/debugging.py +++ b/opteryx/debugging.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Opteryx and Orso Import Customization diff --git a/opteryx/exceptions.py b/opteryx/exceptions.py index 570464bc1..3f4460d67 100644 --- a/opteryx/exceptions.py +++ b/opteryx/exceptions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Bespoke error types for Opteryx and error types and structure as defined in PEP-0249. diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index eecd8b0d7..f510068e5 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ These are a set of functions that can be applied to data. diff --git a/opteryx/functions/catalogue.py b/opteryx/functions/catalogue.py index 89b8d7ded..0ee922b4b 100644 --- a/opteryx/functions/catalogue.py +++ b/opteryx/functions/catalogue.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Function Catalogue diff --git a/opteryx/functions/date_functions.py b/opteryx/functions/date_functions.py index 31965591c..4ed4decec 100644 --- a/opteryx/functions/date_functions.py +++ b/opteryx/functions/date_functions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime diff --git a/opteryx/functions/number_functions.py b/opteryx/functions/number_functions.py index cf755dd00..f86d89f98 100644 --- a/opteryx/functions/number_functions.py +++ b/opteryx/functions/number_functions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import List diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index 6822f659d..e7d119ebc 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import List from typing import Optional diff --git a/opteryx/functions/string_functions.py b/opteryx/functions/string_functions.py index 9764d3d30..80f429a3b 100644 --- a/opteryx/functions/string_functions.py +++ b/opteryx/functions/string_functions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import List from typing import Union diff --git a/opteryx/managers/__init__.py b/opteryx/managers/__init__.py index 4d9a92490..9b18f1918 100644 --- a/opteryx/managers/__init__.py +++ b/opteryx/managers/__init__.py @@ -1,11 +1,4 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
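Every hunk above applies the same substitution: the nine-line tail of the Apache header collapses into two lines. A change this mechanical is easy to script; the sketch below is hypothetical (the series does not say how these hunks were generated), and the OLD text -- including the exact whitespace around the URL line -- would need to match the repository byte for byte.

    from pathlib import Path

    # Long header tail being removed; the whitespace here is an assumption
    # and must mirror the files exactly for the match to fire.
    OLD = "\n".join([
        "# You may obtain a copy of the License at",
        "#",
        "#    http://www.apache.org/licenses/LICENSE-2.0",
        "#",
        "# Unless required by applicable law or agreed to in writing, software",
        '# distributed under the License is distributed on an "AS IS" BASIS,',
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
        "# See the License for the specific language governing permissions and",
        "# limitations under the License.",
    ])

    # Condensed form used throughout the hunks above.
    NEW = "\n".join([
        "# See the License at http://www.apache.org/licenses/LICENSE-2.0",
        '# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.',
    ])

    for path in Path("opteryx").rglob("*.py"):
        text = path.read_text()
        if OLD in text:
            path.write_text(text.replace(OLD, NEW))

A single exact-match replacement keeps the rewrite mechanical and reviewable: any file the script leaves untouched simply does not appear in the diff.
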
diff --git a/opteryx/managers/cache/cache_manager.py b/opteryx/managers/cache/cache_manager.py index 6a3cb3515..cd27377d0 100644 --- a/opteryx/managers/cache/cache_manager.py +++ b/opteryx/managers/cache/cache_manager.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Union diff --git a/opteryx/managers/cache/memcached.py b/opteryx/managers/cache/memcached.py index e5761678b..4d0cec6f7 100644 --- a/opteryx/managers/cache/memcached.py +++ b/opteryx/managers/cache/memcached.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This implements an interface to Memcached diff --git a/opteryx/managers/cache/null_cache.py b/opteryx/managers/cache/null_cache.py index e88e303af..d5a162070 100644 --- a/opteryx/managers/cache/null_cache.py +++ b/opteryx/managers/cache/null_cache.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Any diff --git a/opteryx/managers/cache/redis.py b/opteryx/managers/cache/redis.py index e80b707ea..e7d38f9a6 100644 --- a/opteryx/managers/cache/redis.py +++ b/opteryx/managers/cache/redis.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This implements an interface to REDIS diff --git a/opteryx/managers/catalog/__init__.py b/opteryx/managers/catalog/__init__.py index 0f710350c..59e242bf3 100644 --- a/opteryx/managers/catalog/__init__.py +++ b/opteryx/managers/catalog/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. def catalog_factory(): diff --git a/opteryx/managers/catalog/catalog_provider.py b/opteryx/managers/catalog/catalog_provider.py index 3ed0b76f7..6a3739af1 100644 --- a/opteryx/managers/catalog/catalog_provider.py +++ b/opteryx/managers/catalog/catalog_provider.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Optional diff --git a/opteryx/managers/catalog/null_provider.py b/opteryx/managers/catalog/null_provider.py index 2c7034c1a..b9864cd73 100644 --- a/opteryx/managers/catalog/null_provider.py +++ b/opteryx/managers/catalog/null_provider.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Used when there is no provider, we basically return none to everything diff --git a/opteryx/managers/catalog/tarchia_provider.py b/opteryx/managers/catalog/tarchia_provider.py index f2e25ff2e..37210149c 100644 --- a/opteryx/managers/catalog/tarchia_provider.py +++ b/opteryx/managers/catalog/tarchia_provider.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import os diff --git a/opteryx/managers/execution/serial_engine.py b/opteryx/managers/execution/serial_engine.py index d88bf458c..8b050e3ec 100644 --- a/opteryx/managers/execution/serial_engine.py +++ b/opteryx/managers/execution/serial_engine.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. + """ This module provides the execution engine for processing physical plans in a serial manner. """ diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index e252b9374..77834018e 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Expressions describe a calculation or evaluation of some sort. diff --git a/opteryx/managers/expression/binary_operators.py b/opteryx/managers/expression/binary_operators.py index 95098e039..f61c6583e 100644 --- a/opteryx/managers/expression/binary_operators.py +++ b/opteryx/managers/expression/binary_operators.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Any from typing import Dict diff --git a/opteryx/managers/expression/unary_operations.py b/opteryx/managers/expression/unary_operations.py index a96bbdaa6..24f1af6c0 100644 --- a/opteryx/managers/expression/unary_operations.py +++ b/opteryx/managers/expression/unary_operations.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Implement conditions which are essentially unary statements, usually IS statements. diff --git a/opteryx/managers/kvstores/__init__.py b/opteryx/managers/kvstores/__init__.py index 86091b74a..9c8d33a93 100644 --- a/opteryx/managers/kvstores/__init__.py +++ b/opteryx/managers/kvstores/__init__.py @@ -1,13 +1,6 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.managers.kvstores.base_kv_store import BaseKeyValueStore diff --git a/opteryx/managers/kvstores/base_kv_store.py b/opteryx/managers/kvstores/base_kv_store.py index ff5284964..fc3e5ed48 100644 --- a/opteryx/managers/kvstores/base_kv_store.py +++ b/opteryx/managers/kvstores/base_kv_store.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is a Base class for KV Value Storage adapter. 
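Not every hunk shortens an existing header: a few, such as serial_engine.py above and permissions/__init__.py just below, introduce the two-line header into modules that previously carried none. A hypothetical follow-up check -- not part of this patch -- could keep that invariant; a real version might want to exempt empty __init__.py files.

    import sys
    from pathlib import Path

    # First line of the condensed header used throughout this series.
    HEADER = "# See the License at http://www.apache.org/licenses/LICENSE-2.0"

    missing = [
        str(path)
        for path in Path("opteryx").rglob("*.py")
        if HEADER not in path.read_text()
    ]
    if missing:
        print("files missing the condensed license header:")
        print("\n".join("  " + name for name in missing))
        sys.exit(1)
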
diff --git a/opteryx/managers/permissions/__init__.py b/opteryx/managers/permissions/__init__.py index 28c71ddb7..8ccf31480 100644 --- a/opteryx/managers/permissions/__init__.py +++ b/opteryx/managers/permissions/__init__.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. + import fnmatch from typing import Dict from typing import Iterable diff --git a/opteryx/managers/schemes/__init__.py b/opteryx/managers/schemes/__init__.py index 806893012..68e9d8590 100644 --- a/opteryx/managers/schemes/__init__.py +++ b/opteryx/managers/schemes/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from .base_partition_scheme import BasePartitionScheme from .default_partitions import DefaultPartitionScheme diff --git a/opteryx/managers/schemes/base_partition_scheme.py b/opteryx/managers/schemes/base_partition_scheme.py index 002b724dc..e012bc818 100644 --- a/opteryx/managers/schemes/base_partition_scheme.py +++ b/opteryx/managers/schemes/base_partition_scheme.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime from typing import Callable diff --git a/opteryx/managers/schemes/default_partitions.py b/opteryx/managers/schemes/default_partitions.py index 14c1d21cf..eec00fc62 100644 --- a/opteryx/managers/schemes/default_partitions.py +++ b/opteryx/managers/schemes/default_partitions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime from typing import Callable diff --git a/opteryx/managers/schemes/mabel_partitions.py b/opteryx/managers/schemes/mabel_partitions.py index f7932e350..25a4f9690 100644 --- a/opteryx/managers/schemes/mabel_partitions.py +++ b/opteryx/managers/schemes/mabel_partitions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import concurrent.futures import datetime diff --git a/opteryx/managers/schemes/tarchia_schema.py b/opteryx/managers/schemes/tarchia_schema.py index 7a90d24dc..cc98c63d1 100644 --- a/opteryx/managers/schemes/tarchia_schema.py +++ b/opteryx/managers/schemes/tarchia_schema.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Dict diff --git a/opteryx/managers/serde/__init__.py b/opteryx/managers/serde/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/opteryx/managers/serde/physical_plan.py b/opteryx/managers/serde/physical_plan.py deleted file mode 100644 index 4e8ab6d35..000000000 --- a/opteryx/managers/serde/physical_plan.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -{ - "fields": - [ - { - "name" - "type" - } - ], - "steps": { - "id": , - "operator": "name", - "columns": [], - "config": {}, - "requires": [] - } -} - -""" diff --git a/opteryx/models/__init__.py b/opteryx/models/__init__.py index e40ff5095..14861529c 100644 --- a/opteryx/models/__init__.py +++ b/opteryx/models/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.compiled.structures.node import Node from opteryx.models.connection_context import ConnectionContext diff --git a/opteryx/models/connection_context.py b/opteryx/models/connection_context.py index 0a72a341a..5b24659b3 100644 --- a/opteryx/models/connection_context.py +++ b/opteryx/models/connection_context.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime from dataclasses import dataclass diff --git a/opteryx/models/connection_state.py b/opteryx/models/connection_state.py index e559909b7..98ba7ea58 100644 --- a/opteryx/models/connection_state.py +++ b/opteryx/models/connection_state.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. + import utils diff --git a/opteryx/models/logical_column.py b/opteryx/models/logical_column.py index 36fde92d7..d63160f8e 100644 --- a/opteryx/models/logical_column.py +++ b/opteryx/models/logical_column.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from typing import Optional diff --git a/opteryx/models/non_tabular_result.py b/opteryx/models/non_tabular_result.py index 7955e6504..9516f5935 100644 --- a/opteryx/models/non_tabular_result.py +++ b/opteryx/models/non_tabular_result.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
+ from opteryx.constants import QueryStatus diff --git a/opteryx/models/physical_plan.py b/opteryx/models/physical_plan.py index 43f1e579b..387268cf3 100644 --- a/opteryx/models/physical_plan.py +++ b/opteryx/models/physical_plan.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ The Physical Plan is a tree of nodes that represent the execution plan for a query. diff --git a/opteryx/models/query_properties.py b/opteryx/models/query_properties.py index 57336e086..ceee3291a 100644 --- a/opteryx/models/query_properties.py +++ b/opteryx/models/query_properties.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import datetime from dataclasses import dataclass diff --git a/opteryx/models/query_statistics.py b/opteryx/models/query_statistics.py index 24a771fb1..42f6111e6 100644 --- a/opteryx/models/query_statistics.py +++ b/opteryx/models/query_statistics.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from collections import defaultdict diff --git a/opteryx/operators/__init__.py b/opteryx/operators/__init__.py index 7d318c9be..78adcf7aa 100644 --- a/opteryx/operators/__init__.py +++ b/opteryx/operators/__init__.py @@ -2,15 +2,8 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from .base_plan_node import BasePlanDataObject # isort: skip diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index b6153f7bd..faca38303 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Grouping Node diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index d7b688b41..953fd8eeb 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Aggregation Node diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 4d1db3548..54d14de30 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Async Scanner Node diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index ea786bfd4..e563757ff 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import time diff --git a/opteryx/operators/bench/#information_schema_node.py b/opteryx/operators/bench/#information_schema_node.py index 97cc5f847..066710848 100644 --- a/opteryx/operators/bench/#information_schema_node.py +++ b/opteryx/operators/bench/#information_schema_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Collection Reader Node diff --git a/opteryx/operators/bench/#show_databases_node.py b/opteryx/operators/bench/#show_databases_node.py index 6dc7e3500..7148b01b7 100644 --- a/opteryx/operators/bench/#show_databases_node.py +++ b/opteryx/operators/bench/#show_databases_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Show Stores Node diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 6eb45b9b6..6390e134c 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Cross Join Node diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index bc54ba16c..7d0a755fd 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Distinct Node diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 08f225526..49c3eb93c 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Exit Node diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index c90b1444e..55540ae91 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Explain Node diff --git a/opteryx/operators/filter_join_node.py b/opteryx/operators/filter_join_node.py index 8f2e839b4..eda16ef1b 100644 --- a/opteryx/operators/filter_join_node.py +++ b/opteryx/operators/filter_join_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Filter Join Node diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index 6228496d9..4795093ed 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Selection Node diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index d3043b8c2..ed61c3959 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Blob Reader Node diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 872b7aba2..3142fdea5 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Heap Sort Node diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 1d968d372..c7c0171f8 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Inner Join Node diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py index 3a1d0395d..5a4899676 100644 --- a/opteryx/operators/inner_join_node_single.py +++ b/opteryx/operators/inner_join_node_single.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Inner Join Node (Single Condition) diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index 5d48994bc..2b6182ae8 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Limit Node diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index f0f697167..243ccddfb 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Outer Join Node diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 83eb40887..1edf09b42 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Projection Node diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py index c614c1b7b..7023e62e7 100644 --- a/opteryx/operators/read_node.py +++ b/opteryx/operators/read_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Read Node diff --git a/opteryx/operators/set_variable_node.py b/opteryx/operators/set_variable_node.py index ccf2a49ec..da3c33de2 100644 --- a/opteryx/operators/set_variable_node.py +++ b/opteryx/operators/set_variable_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Set Variables Node diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py index 12443db9e..d95295c7e 100644 --- a/opteryx/operators/show_columns_node.py +++ b/opteryx/operators/show_columns_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Show Columns Node diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index 1e0a30004..b8409e1bb 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Show Create Node diff --git a/opteryx/operators/show_value_node.py b/opteryx/operators/show_value_node.py index 507c2e0ac..265317bb6 100644 --- a/opteryx/operators/show_value_node.py +++ b/opteryx/operators/show_value_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Show Variables Node diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index 57bad2499..a786960d7 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Sort Node diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py index cd5f5c374..c40cf9559 100644 --- a/opteryx/operators/union_node.py +++ b/opteryx/operators/union_node.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Union Node diff --git a/opteryx/planner/__init__.py b/opteryx/planner/__init__.py index f8e931efc..1830a251b 100644 --- a/opteryx/planner/__init__.py +++ b/opteryx/planner/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ ~~~ diff --git a/opteryx/planner/ast_rewriter.py b/opteryx/planner/ast_rewriter.py index 23b689834..96e40b9e0 100644 --- a/opteryx/planner/ast_rewriter.py +++ b/opteryx/planner/ast_rewriter.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is the AST rewriter, it sits between the Parser and the Logical Planner. diff --git a/opteryx/planner/binder/__init__.py b/opteryx/planner/binder/__init__.py index 269803ee9..4a28264c5 100644 --- a/opteryx/planner/binder/__init__.py +++ b/opteryx/planner/binder/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is Binder, it sits between the Logical Planner and the Optimizers. diff --git a/opteryx/planner/binder/binder.py b/opteryx/planner/binder/binder.py index d403f13ad..7ffe96c47 100644 --- a/opteryx/planner/binder/binder.py +++ b/opteryx/planner/binder/binder.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import copy diff --git a/opteryx/planner/binder/binder_visitor.py b/opteryx/planner/binder/binder_visitor.py index ec936287a..4b7812387 100644 --- a/opteryx/planner/binder/binder_visitor.py +++ b/opteryx/planner/binder/binder_visitor.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import re from typing import List diff --git a/opteryx/planner/binder/binding_context.py b/opteryx/planner/binder/binding_context.py index ef8d48dc9..2eede0301 100644 --- a/opteryx/planner/binder/binding_context.py +++ b/opteryx/planner/binder/binding_context.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
from copy import deepcopy from dataclasses import dataclass diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 35f0980b5..5e386cf10 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ ~~~ diff --git a/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_brute.py b/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_brute.py index 614af04af..2386e1f41 100644 --- a/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_brute.py +++ b/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_brute.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Ordering of adjacent predicates using brute-force. diff --git a/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_genetic.py b/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_genetic.py index 04583ca27..28d04aeba 100644 --- a/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_genetic.py +++ b/opteryx/planner/cost_based_optimizer/bench/cost_based_optimizer/predicate_ordering_genetic.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Ordering of adjacent predicates using a genetic algorithm. 
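[Editor's note: the two bench strategies above are summarised only by their docstrings — ordering adjacent predicates by brute force or by a genetic algorithm. A rough sketch of the brute-force idea, with an assumed per-predicate cost/selectivity model (illustrative only, not the repository's implementation):

    from itertools import permutations

    def order_predicates(predicates, selectivity, cost):
        """Pick the predicate order with the lowest estimated total cost.

        selectivity(p) -> fraction of rows surviving p (0..1)  [assumed model]
        cost(p)        -> per-row evaluation cost of p         [assumed model]
        """
        def total_cost(order):
            rows, spent = 1.0, 0.0
            for p in order:
                spent += rows * cost(p)   # evaluate p on the rows still alive
                rows *= selectivity(p)    # later predicates see fewer rows
            return spent

        return min(permutations(predicates), key=total_cost)

The genetic variant would search the same permutation space heuristically rather than exhaustively, which matters once the factorial blow-up makes enumeration impractical.]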
diff --git a/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py b/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py index cdb7c7f73..49374be98 100644 --- a/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py +++ b/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from orso.tools import random_string diff --git a/opteryx/planner/cost_based_optimizer/strategies/boolean_simplication.py b/opteryx/planner/cost_based_optimizer/strategies/boolean_simplication.py index 29b2a5dcb..c2f99659c 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/boolean_simplication.py +++ b/opteryx/planner/cost_based_optimizer/strategies/boolean_simplication.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Demorgan's Laws diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 7ce5e04a9..28aee2bfb 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Optimization Rule - Constant Folding diff --git a/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py index c556e27a4..2d77c5dbd 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/distinct_pushdown.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Distinct Pushdown diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py index 34251ee52..93b59bf4d 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Limit Pushdown diff --git a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py index 77d8fba5c..557f0c9a0 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py +++ b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Optimization Rule - Operator Fusion diff --git a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py index ec4e3cbe1..3c9482d64 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py +++ b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from opteryx.planner.logical_planner import LogicalPlan from opteryx.planner.logical_planner import LogicalPlanNode diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py index ddcaa1884..bf7d33b4a 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Predicate Pushdown diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py index 6d64cfbf8..d9cc25379 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py +++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Optimization Rule - Predicate rewriter diff --git a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py index 5f936cbc0..24cb5a949 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py +++ b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Projection Pushdown diff --git a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py index 95c0e8f87..be15e6f14 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py +++ b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Optimization Rule - Remove Redundant Operators diff --git a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py index 3cb598411..258d22cb4 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py +++ b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Optimization Rule - Split Conjections diff --git a/opteryx/planner/executor/v2_coordinator.py b/opteryx/planner/executor/v2_coordinator.py index 8d463d188..44d318583 100644 --- a/opteryx/planner/executor/v2_coordinator.py +++ b/opteryx/planner/executor/v2_coordinator.py @@ -28,15 +28,8 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ ~~~ diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index 4a7facab3..1b5601eee 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Converts the AST to a logical query plan. diff --git a/opteryx/planner/logical_planner/logical_planner_builders.py b/opteryx/planner/logical_planner/logical_planner_builders.py index 7d853b5b0..95dc5ae08 100644 --- a/opteryx/planner/logical_planner/logical_planner_builders.py +++ b/opteryx/planner/logical_planner/logical_planner_builders.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This module contains various converters for parts of the AST, this diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index 1ea300e4f..e83bad725 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from orso.schema import OrsoTypes diff --git a/opteryx/planner/sql_rewriter.py b/opteryx/planner/sql_rewriter.py index 7d2b2f650..2899d7528 100644 --- a/opteryx/planner/sql_rewriter.py +++ b/opteryx/planner/sql_rewriter.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ ~~~ diff --git a/opteryx/planner/views/__init__.py b/opteryx/planner/views/__init__.py index f500a70a4..db1bb416d 100644 --- a/opteryx/planner/views/__init__.py +++ b/opteryx/planner/views/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import orjson diff --git a/opteryx/shared/__init__.py b/opteryx/shared/__init__.py index a5877f3ce..f19265abf 100644 --- a/opteryx/shared/__init__.py +++ b/opteryx/shared/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
from opteryx.compiled.structures import MemoryPool from opteryx.shared.async_memory_pool import AsyncMemoryPool diff --git a/opteryx/shared/async_memory_pool.py b/opteryx/shared/async_memory_pool.py index 2aefde15a..dfdaac749 100644 --- a/opteryx/shared/async_memory_pool.py +++ b/opteryx/shared/async_memory_pool.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is the async wrapper around the memory pool, it is used by the diff --git a/opteryx/shared/buffer_pool.py b/opteryx/shared/buffer_pool.py index 5895041e6..a6e55c572 100644 --- a/opteryx/shared/buffer_pool.py +++ b/opteryx/shared/buffer_pool.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Global Buffer Pool. diff --git a/opteryx/shared/materialized_datasets.py b/opteryx/shared/materialized_datasets.py index d42bfff7d..d1f994f36 100644 --- a/opteryx/shared/materialized_datasets.py +++ b/opteryx/shared/materialized_datasets.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Materialized Datasets. diff --git a/opteryx/shared/variables.py b/opteryx/shared/variables.py index 401cb7b9f..dad2497f2 100644 --- a/opteryx/shared/variables.py +++ b/opteryx/shared/variables.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ diff --git a/opteryx/utils/__init__.py b/opteryx/utils/__init__.py index 17904d0f8..13f09bf19 100644 --- a/opteryx/utils/__init__.py +++ b/opteryx/utils/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. from itertools import permutations diff --git a/opteryx/utils/arrow.py b/opteryx/utils/arrow.py index 8effb11ba..1c5a0c4f9 100644 --- a/opteryx/utils/arrow.py +++ b/opteryx/utils/arrow.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This module contains support functions for working with PyArrow diff --git a/opteryx/utils/dates.py b/opteryx/utils/dates.py index f59dcff6b..b14f9eec0 100644 --- a/opteryx/utils/dates.py +++ b/opteryx/utils/dates.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
""" Date Utilities """ diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index 58656bdaa..d553e966e 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Decode files from a raw binary format to a PyArrow Table. diff --git a/opteryx/utils/formatter.py b/opteryx/utils/formatter.py index ef2110c26..21e6cf617 100644 --- a/opteryx/utils/formatter.py +++ b/opteryx/utils/formatter.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. + import re diff --git a/opteryx/utils/lru_2.py b/opteryx/utils/lru_2.py index ad1a3b24a..18c28f3ad 100644 --- a/opteryx/utils/lru_2.py +++ b/opteryx/utils/lru_2.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ LRU-K evicts the morsel whose K-th most recent access is furthest in the past. Note, the diff --git a/opteryx/utils/memory_view_stream.py b/opteryx/utils/memory_view_stream.py index 481dd1088..1e726f615 100644 --- a/opteryx/utils/memory_view_stream.py +++ b/opteryx/utils/memory_view_stream.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Handle a memoryview like a stream without converting to bytes. 
diff --git a/opteryx/utils/paths.py b/opteryx/utils/paths.py index 4aa853bfb..3d9ea544d 100644 --- a/opteryx/utils/paths.py +++ b/opteryx/utils/paths.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Functions to help with handling file paths diff --git a/opteryx/utils/resource_monitor.py b/opteryx/utils/resource_monitor.py index a327a3abb..edfab0320 100644 --- a/opteryx/utils/resource_monitor.py +++ b/opteryx/utils/resource_monitor.py @@ -1,3 +1,8 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. + import os import threading from time import sleep diff --git a/opteryx/utils/series.py b/opteryx/utils/series.py index f1fd99b74..f8c7be983 100644 --- a/opteryx/utils/series.py +++ b/opteryx/utils/series.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. import numpy from orso.types import OrsoTypes diff --git a/opteryx/virtual_datasets/__init__.py b/opteryx/virtual_datasets/__init__.py index 4fd57c1ec..fefa55f14 100644 --- a/opteryx/virtual_datasets/__init__.py +++ b/opteryx/virtual_datasets/__init__.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
import opteryx.virtual_datasets.astronaut_data as astronauts import opteryx.virtual_datasets.derived_data as derived diff --git a/opteryx/virtual_datasets/astronaut_data.py b/opteryx/virtual_datasets/astronaut_data.py index 0a21745ed..5a296ecfc 100644 --- a/opteryx/virtual_datasets/astronaut_data.py +++ b/opteryx/virtual_datasets/astronaut_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ astronauts diff --git a/opteryx/virtual_datasets/derived_data.py b/opteryx/virtual_datasets/derived_data.py index 9f8addc10..598857bb7 100644 --- a/opteryx/virtual_datasets/derived_data.py +++ b/opteryx/virtual_datasets/derived_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ derived diff --git a/opteryx/virtual_datasets/missions.py b/opteryx/virtual_datasets/missions.py index 2b12f620c..2f5fc4fa7 100644 --- a/opteryx/virtual_datasets/missions.py +++ b/opteryx/virtual_datasets/missions.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ missions diff --git a/opteryx/virtual_datasets/no_table_data.py b/opteryx/virtual_datasets/no_table_data.py index ee54a6137..fe9396311 100644 --- a/opteryx/virtual_datasets/no_table_data.py +++ b/opteryx/virtual_datasets/no_table_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ no table diff --git a/opteryx/virtual_datasets/planet_data.py b/opteryx/virtual_datasets/planet_data.py index 366a7e2b3..8cc70f75d 100644 --- a/opteryx/virtual_datasets/planet_data.py +++ b/opteryx/virtual_datasets/planet_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ planets diff --git a/opteryx/virtual_datasets/satellite_data.py b/opteryx/virtual_datasets/satellite_data.py index b16012ae4..8f6a869a7 100644 --- a/opteryx/virtual_datasets/satellite_data.py +++ b/opteryx/virtual_datasets/satellite_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ satellites diff --git a/opteryx/virtual_datasets/statistics.py b/opteryx/virtual_datasets/statistics.py index 4122734b0..16ae084c9 100644 --- a/opteryx/virtual_datasets/statistics.py +++ b/opteryx/virtual_datasets/statistics.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is a virtual dataset which is calculated at access time. 
diff --git a/opteryx/virtual_datasets/stop_words.py b/opteryx/virtual_datasets/stop_words.py index 0cdaa49b6..a510a7ce7 100644 --- a/opteryx/virtual_datasets/stop_words.py +++ b/opteryx/virtual_datasets/stop_words.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ Stop Words - words not included in the VECTOR INDEX or in MATCH AGAINST searches diff --git a/opteryx/virtual_datasets/user.py b/opteryx/virtual_datasets/user.py index 20ee1284a..c97f34ec7 100644 --- a/opteryx/virtual_datasets/user.py +++ b/opteryx/virtual_datasets/user.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is a virtual dataset which is calculated at access time. diff --git a/opteryx/virtual_datasets/variables_data.py b/opteryx/virtual_datasets/variables_data.py index 0b96a51e7..097ea58a5 100644 --- a/opteryx/virtual_datasets/variables_data.py +++ b/opteryx/virtual_datasets/variables_data.py @@ -1,14 +1,7 @@ # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# See the License at http://www.apache.org/licenses/LICENSE-2.0 +# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. """ This is a virtual dataset which is calculated at access time. 
From ed9d4f2422143fd9cc01d786cf5b76535685677d Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 12:24:57 +0000 Subject: [PATCH 118/157] Opteryx Version 0.19.0-alpha.920 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 68799821b..7fce78c24 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 919 +__build__ = 920 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 72831ba1e7ac5643c51ad5083e76089a91dddfa9 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 24 Dec 2024 22:35:15 +0000 Subject: [PATCH 119/157] #2151 --- opteryx/operators/__init__.py | 1 - opteryx/operators/aggregate_and_group_node.py | 16 ---------------- opteryx/operators/aggregate_node.py | 17 ++++------------- opteryx/operators/async_read_node.py | 8 -------- opteryx/operators/base_plan_node.py | 18 ------------------ opteryx/operators/cross_join_node.py | 11 ----------- opteryx/operators/exit_node.py | 13 ------------- opteryx/operators/heap_sort_node.py | 10 ---------- 8 files changed, 4 insertions(+), 90 deletions(-) diff --git a/opteryx/operators/__init__.py b/opteryx/operators/__init__.py index 78adcf7aa..371228cbc 100644 --- a/opteryx/operators/__init__.py +++ b/opteryx/operators/__init__.py @@ -6,7 +6,6 @@ # Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. -from .base_plan_node import BasePlanDataObject # isort: skip from .base_plan_node import BasePlanNode, JoinNode # isort: skip from .aggregate_and_group_node import AggregateAndGroupNode # Group is always followed by aggregate diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index faca38303..1ab25ad72 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -14,8 +14,6 @@ """ -from dataclasses import dataclass - import numpy import pyarrow from orso.types import OrsoTypes @@ -28,22 +26,10 @@ from opteryx.operators.aggregate_node import build_aggregations from opteryx.operators.aggregate_node import extract_evaluations from opteryx.operators.aggregate_node import project -from opteryx.operators.base_plan_node import BasePlanDataObject from . import BasePlanNode -@dataclass -class AggregateAndGroupDataObject(BasePlanDataObject): - groups: list = None - aggregates: list = None - all_identifiers: list = None - evaluatable_nodes: list = None - group_by_columns: list = None - column_map: list = None - aggregate_functions: list = None - - class AggregateAndGroupNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) @@ -79,8 +65,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.group_by_columns = list({node.schema_column.identity for node in self.groups}) self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) - self.do = AggregateAndGroupDataObject() - self.buffer = [] @classmethod diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 953fd8eeb..c4ec29117 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -11,8 +11,6 @@ This node performs aggregates without performing groupings. 
""" -from dataclasses import dataclass - import numpy import pyarrow @@ -22,7 +20,6 @@ from opteryx.managers.expression import evaluate_and_append from opteryx.managers.expression import get_all_nodes_of_type from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject from . import BasePlanNode @@ -166,16 +163,11 @@ def extract_evaluations(aggregates): if len(aggregators) == 0: evaluatable_nodes.append(node) - return evaluatable_nodes - + literal_count = len([n for n in evaluatable_nodes if n.node_type == NodeType.LITERAL]) + if literal_count > 0 and literal_count < len(evaluatable_nodes): + evaluatable_nodes = [n for n in evaluatable_nodes if n.node_type != NodeType.LITERAL] -@dataclass -class AggregateDataObject(BasePlanDataObject): - aggregates: list = None - all_identifiers: list = None - evaluatable_nodes: list = None - column_map: list = None - aggregate_functions: list = None + return evaluatable_nodes class AggregateNode(BasePlanNode): @@ -196,7 +188,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.column_map, self.aggregate_functions = build_aggregations(self.aggregates) - self.do = AggregateDataObject() self.buffer = [] @classmethod diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 54d14de30..3a994c217 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -16,7 +16,6 @@ import queue import threading import time -from dataclasses import dataclass from typing import Generator import aiohttp @@ -28,7 +27,6 @@ from opteryx import config from opteryx.exceptions import DataError from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject from opteryx.shared import AsyncMemoryPool from opteryx.shared import MemoryPool from opteryx.utils.file_decoders import get_decoder @@ -61,17 +59,11 @@ async def fetch_and_process(blob_name): await session.close() -@dataclass -class AsyncReaderDataObject(BasePlanDataObject): - pass - - class AsyncReaderNode(ReaderNode): def __init__(self, properties: QueryProperties, **parameters): ReaderNode.__init__(self, properties=properties, **parameters) self.pool = MemoryPool(MAX_READ_BUFFER_CAPACITY, f"ReadBuffer <{self.parameters['alias']}>") - self.do = AsyncReaderDataObject() self.predicates = parameters.get("predicates") @classmethod diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index e563757ff..9124c8a78 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -5,28 +5,11 @@ import time -from dataclasses import dataclass from typing import Optional import pyarrow from orso.tools import random_string -from opteryx import EOS - - -@dataclass -class BasePlanDataObject: - operation: Optional[str] = None - query_id: str = None - identity: str = None - - def __post_init__(self): - # Perform actions after initialization - if self.identity is None: - self.identity = random_string() - if self.operation is None: - self.operation = self.__class__.__name__.replace("DataObject", "Node") - class BasePlanNode: is_join: bool = False @@ -47,7 +30,6 @@ def __init__(self, *, properties, **parameters): self.parameters = parameters self.execution_time = 0 self.identity = random_string() - self.do: Optional[BasePlanDataObject] = None self.calls = 0 self.records_in = 0 self.bytes_in = 0 diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 
6390e134c..7e9b23f19 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -12,7 +12,6 @@ here rather than calling the join() functions """ -from dataclasses import dataclass from typing import Generator from typing import Set from typing import Tuple @@ -26,7 +25,6 @@ from opteryx.managers.expression import NodeType from opteryx.models import LogicalColumn from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject from . import JoinNode @@ -250,15 +248,6 @@ def _chunker(seq_1, seq_2, size): ) -@dataclass -class CrossJoinDataObject(BasePlanDataObject): - source: str = None - _unnest_column: str = None - _unnest_target: str = None - _filters: str = None - _distinct: bool = False - - class CrossJoinNode(JoinNode): """ Implements a SQL CROSS JOIN diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 49c3eb93c..2152d2712 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -17,34 +17,21 @@ This node doesn't do any calculations, it is a pure Projection. """ -from dataclasses import dataclass -from dataclasses import field -from typing import List - from pyarrow import Table from opteryx import EOS from opteryx.exceptions import AmbiguousIdentifierError from opteryx.exceptions import InvalidInternalStateError -from opteryx.models import LogicalColumn from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject from . import BasePlanNode -@dataclass -class ExitDataObject(BasePlanDataObject): - columns: List[LogicalColumn] = field(default_factory=list) - - class ExitNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) self.columns = parameters.get("columns", []) - self.do = ExitDataObject(columns=self.columns) - @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 3142fdea5..48c42b34e 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -17,8 +17,6 @@ sorting smaller chunks over and over again. """ -from dataclasses import dataclass - import numpy import pyarrow import pyarrow.compute @@ -27,24 +25,16 @@ from opteryx import EOS from opteryx.exceptions import ColumnNotFoundError from opteryx.models import QueryProperties -from opteryx.operators.base_plan_node import BasePlanDataObject from . 
import BasePlanNode -@dataclass -class HeapSortDataObject(BasePlanDataObject): - order_by: list = None - limit: int = -1 - - class HeapSortNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) self.order_by = parameters.get("order_by", []) self.limit: int = parameters.get("limit", -1) - self.do = HeapSortDataObject(order_by=self.order_by, limit=self.limit) self.mapped_order = [] self.table = None From 9a35da154ec415a386498696fa7f3dba1e18193b Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 24 Dec 2024 22:35:37 +0000 Subject: [PATCH 120/157] Opteryx Version 0.19.0-alpha.921 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 7fce78c24..05dca2c3d 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 920 +__build__ = 921 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 5975cf714febd2ff962eb1a7e5b6187ca3223b3b Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 27 Dec 2024 00:44:13 +0000 Subject: [PATCH 121/157] #2147 --- opteryx/compiled/structures/__init__.py | 3 + opteryx/compiled/structures/hash_table.pyx | 103 +++++++++++++++++++-- opteryx/operators/filter_join_node.py | 96 +++---------------- 3 files changed, 110 insertions(+), 92 deletions(-) diff --git a/opteryx/compiled/structures/__init__.py b/opteryx/compiled/structures/__init__.py index 691eccc56..66227b516 100644 --- a/opteryx/compiled/structures/__init__.py +++ b/opteryx/compiled/structures/__init__.py @@ -1,6 +1,9 @@ from .hash_table import HashSet from .hash_table import HashTable +from .hash_table import anti_join from .hash_table import distinct +from .hash_table import filter_join_set from .hash_table import list_distinct +from .hash_table import semi_join from .memory_pool import MemoryPool from .node import Node diff --git a/opteryx/compiled/structures/hash_table.pyx b/opteryx/compiled/structures/hash_table.pyx index dcaf65f86..c06681e11 100644 --- a/opteryx/compiled/structures/hash_table.pyx +++ b/opteryx/compiled/structures/hash_table.pyx @@ -72,9 +72,7 @@ cdef inline object recast_column(column): -@cython.boundscheck(False) @cython.wraparound(False) -@cython.cdivision(True) cpdef tuple distinct(table, HashSet seen_hashes=None, list columns=None): """ Perform a distinct operation on the given table using an external HashSet. 
@@ -142,9 +140,8 @@ cpdef tuple distinct(table, HashSet seen_hashes=None, list columns=None): return keep, seen_hashes -@cython.boundscheck(False) @cython.wraparound(False) -cdef void compute_float_hashes(cnp.ndarray[cnp.float64_t] data, int64_t null_hash, cnp.ndarray[int64_t] hashes): +cdef void compute_float_hashes(cnp.ndarray[cnp.float64_t] data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef cnp.float64_t value for i in range(n): @@ -154,9 +151,8 @@ cdef void compute_float_hashes(cnp.ndarray[cnp.float64_t] data, int64_t null_has else: hashes[i] = hash(value) -@cython.boundscheck(False) @cython.wraparound(False) -cdef void compute_int_hashes(cnp.ndarray[cnp.int64_t] data, int64_t null_hash, cnp.ndarray[int64_t] hashes): +cdef void compute_int_hashes(cnp.ndarray[cnp.int64_t] data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef cnp.int64_t value for i in range(n): @@ -168,9 +164,8 @@ cdef void compute_int_hashes(cnp.ndarray[cnp.int64_t] data, int64_t null_hash, c else: hashes[i] = value # Hash of int is the int itself in Python 3 -@cython.boundscheck(False) @cython.wraparound(False) -cdef void compute_object_hashes(cnp.ndarray data, int64_t null_hash, cnp.ndarray[int64_t] hashes): +cdef void compute_object_hashes(cnp.ndarray data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef object value for i in range(n): @@ -209,7 +204,6 @@ cpdef tuple list_distinct(cnp.ndarray values, cnp.int32_t[::1] indices, HashSet -@cython.boundscheck(False) @cython.wraparound(False) cpdef HashTable hash_join_map(relation, list join_columns): """ @@ -276,3 +270,94 @@ cpdef HashTable hash_join_map(relation, list join_columns): ht.insert(hash_value, non_null_indices[i]) return ht + + +cpdef filter_join_set(relation, list join_columns, HashSet seen_hashes): + + cdef int64_t num_columns = len(join_columns) + + if seen_hashes is None: + seen_hashes = HashSet() + + # Memory view for the values array (for the join columns) + cdef object[:, ::1] values_array = numpy.array(list(relation.select(join_columns).drop_null().itercolumns()), dtype=object) + + cdef int64_t hash_value, i + + if num_columns == 1: + col = values_array[0, :] + for i in range(len(col)): + hash_value = hash(col[i]) + seen_hashes.insert(hash_value) + else: + for i in range(values_array.shape[1]): + # Combine the hashes of each value in the row + hash_value = 0 + for value in values_array[:, i]: + hash_value = (hash_value * 31 + hash(value)) + seen_hashes.insert(hash_value) + + return seen_hashes + +cpdef anti_join(relation, list join_columns, HashSet seen_hashes): + cdef int64_t num_columns = len(join_columns) + cdef int64_t num_rows = relation.shape[0] + cdef int64_t hash_value, i + cdef cnp.ndarray[int64_t, ndim=1] index_buffer = numpy.empty(num_rows, dtype=numpy.int64) + cdef int64_t idx_count = 0 + + cdef object[:, ::1] values_array = numpy.array(list(relation.select(join_columns).drop_null().itercolumns()), dtype=object) + + if num_columns == 1: + col = values_array[0, :] + for i in range(len(col)): + hash_value = hash(col[i]) + if not seen_hashes.contains(hash_value): + index_buffer[idx_count] = i + idx_count += 1 + else: + for i in range(values_array.shape[1]): + # Combine the hashes of each value in the row + hash_value = 0 + for value in values_array[:, i]: + hash_value = (hash_value * 31 + hash(value)) + if not seen_hashes.contains(hash_value): + index_buffer[idx_count] = i + idx_count += 1 + + if idx_count > 0: + return 
relation.take(index_buffer[:idx_count]) + else: + return relation.slice(0, 0) + + +cpdef semi_join(relation, list join_columns, HashSet seen_hashes): + cdef int64_t num_columns = len(join_columns) + cdef int64_t num_rows = relation.shape[0] + cdef int64_t hash_value, i + cdef cnp.ndarray[int64_t, ndim=1] index_buffer = numpy.empty(num_rows, dtype=numpy.int64) + cdef int64_t idx_count = 0 + + cdef object[:, ::1] values_array = numpy.array(list(relation.select(join_columns).drop_null().itercolumns()), dtype=object) + + if num_columns == 1: + col = values_array[0, :] + for i in range(len(col)): + hash_value = hash(col[i]) + if seen_hashes.contains(hash_value): + index_buffer[idx_count] = i + idx_count += 1 + else: + for i in range(values_array.shape[1]): + # Combine the hashes of each value in the row + hash_value = 0 + for value in values_array[:, i]: + hash_value = (hash_value * 31 + hash(value)) + if seen_hashes.contains(hash_value): + index_buffer[idx_count] = i + idx_count += 1 + + if idx_count > 0: + return relation.take(index_buffer[:idx_count]) + else: + return relation.slice(0, 0) \ No newline at end of file diff --git a/opteryx/operators/filter_join_node.py b/opteryx/operators/filter_join_node.py index eda16ef1b..00245903b 100644 --- a/opteryx/operators/filter_join_node.py +++ b/opteryx/operators/filter_join_node.py @@ -13,78 +13,17 @@ presence or absence of matching rows in the right table. """ -from typing import List -from typing import Set - import pyarrow from opteryx import EOS +from opteryx.compiled.structures import anti_join +from opteryx.compiled.structures import filter_join_set +from opteryx.compiled.structures import semi_join from opteryx.models import QueryProperties from . import JoinNode -def left_anti_join(left_relation, left_columns: List[str], right_hash_set: Set[str]): - """ - Perform a LEFT ANTI JOIN. - - This implementation ensures that all rows from the left table are included in the result set, - where there are no matching rows in the right table based on the join columns. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_hash_set (set of tuple): A set of tuples representing the hashed values of the right table's join columns. - - Returns: - A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. - """ - - left_indexes = [] - left_values = left_relation.select(left_columns).drop_null().itercolumns() - for i, value_tuple in enumerate(map(hash, zip(*left_values))): - if ( - value_tuple not in right_hash_set - ): # Only include left rows that have no match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the anti join condition - if left_indexes: - return left_relation.take(left_indexes) - else: - return left_relation.slice(0, 0) - - -def left_semi_join(left_relation, left_columns: List[str], right_hash_set: Set[str]): - """ - Perform a LEFT SEMI JOIN. - - This implementation ensures that all rows from the left table that have a matching row in the right table - based on the join columns are included in the result set. - - Parameters: - left_relation (pyarrow.Table): The left pyarrow.Table to join. - left_columns (list of str): Column names from the left table to join on. - right_hash_set (set of tuple): A set of tuples representing the hashed values of the right table's join columns. - - Returns: - A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. 
- """ - left_indexes = [] - left_values = left_relation.select(left_columns).drop_null().itercolumns() - for i, value_tuple in enumerate(map(hash, zip(*left_values))): - if ( - value_tuple in right_hash_set - ): # Only include left rows that have a match in the right table - left_indexes.append(i) - - # Filter the left_chunk based on the semi join condition - if left_indexes: - return left_relation.take(left_indexes) - else: - return left_relation.slice(0, 0) - - class FilterJoinNode(JoinNode): def __init__(self, properties: QueryProperties, **parameters): JoinNode.__init__(self, properties=properties, **parameters) @@ -98,8 +37,7 @@ def __init__(self, properties: QueryProperties, **parameters): self.right_columns = parameters.get("right_columns") self.right_readers = parameters.get("right_readers") - self.right_buffer = [] - self.right_hash_set = set() + self.right_hash_set = None @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -107,7 +45,7 @@ def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @property def name(self): # pragma: no cover - return self.join_type + return self.join_type.replace(" ", "_") @property def config(self) -> str: # pragma: no cover @@ -126,24 +64,16 @@ def execute(self, morsel: pyarrow.Table, join_leg: str) -> pyarrow.Table: else: join_provider = providers.get(self.join_type) yield join_provider( - left_relation=morsel, - left_columns=self.left_columns, - right_hash_set=self.right_hash_set, - ) - if join_leg == "right": - if morsel == EOS: - right_relation = pyarrow.concat_tables(self.right_buffer, promote_options="none") - self.right_buffer.clear() - non_null_right_values = ( - right_relation.select(self.right_columns).drop_null().itercolumns() + relation=morsel, + join_columns=self.left_columns, + seen_hashes=self.right_hash_set, ) - self.right_hash_set = set(map(hash, zip(*non_null_right_values))) - else: - self.right_buffer.append(morsel) - yield None + if join_leg == "right" and morsel != EOS: + self.right_hash_set = filter_join_set(morsel, self.right_columns, self.right_hash_set) + yield None providers = { - "left anti": left_anti_join, - "left semi": left_semi_join, + "left anti": anti_join, + "left semi": semi_join, } From feb6e53f24b1e781a3e952b7a4a2930fea469e73 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 27 Dec 2024 00:44:50 +0000 Subject: [PATCH 122/157] Opteryx Version 0.19.0-alpha.922 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 05dca2c3d..12e5eccae 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 921 +__build__ = 922 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 602e86e6ca375aedbe9f03f456fd4bba04d52e06 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 27 Dec 2024 14:48:11 +0000 Subject: [PATCH 123/157] #1866 --- .../strategies/constant_folding.py | 20 +++++++++++++++++++ .../test_shapes_and_errors_battery.py | 18 +++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 28aee2bfb..f6af988be 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -41,6 +41,10 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: # if we're already a literal (constant), we can't fold return root + if root.node_type == NodeType.EXPRESSION_LIST: + # we currently don't fold CASE expressions + return root + if root.node_type in {NodeType.COMPARISON_OPERATOR, NodeType.BINARY_OPERATOR}: # if we have a binary expression, try to fold each side root.left = fold_constants(root.left, statistics) @@ -229,6 +233,22 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: # Although they have no params, these are evaluated per row return root + # fold costants in function parameters + if root.parameters: + for i, param in enumerate(root.parameters): + root.parameters[i] = fold_constants(param, statistics) + + for agg in aggregators: + if len(agg.parameters) == 1 and agg.parameters[0].node_type == NodeType.LITERAL: + if agg.value == "COUNT": + # COUNT(1) is always the number of rows + root.parameters[0] = Node(NodeType.WILDCARD) + statistics.optimization_constant_aggregation += 1 + return root + if agg.value in ("AVG", "MIN", "MAX"): + statistics.optimization_constant_aggregation += 1 + return build_literal_node(agg.parameters[0].value, root, root.schema_column.type) + if len(identifiers) == 0 and len(aggregators) == 0: table = no_table_data.read() try: diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 7cf835d91..bcb881793 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1911,6 +1911,24 @@ ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 123)", 1, 2, None), ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None), + ("SELECT max(current_time), name FROM $satellites group by name", 177, 2, None), + ("SELECT max(1), name FROM $satellites group by name", 177, 2, None), + ("SELECT max(1) FROM $satellites", 1, 1, None), + ("SELECT max('a'), name FROM $satellites group by name", 177, 2, None), + ("SELECT max('a') FROM $satellites", 1, 1, None), + ("SELECT min(current_time), name 
FROM $satellites group by name", 177, 2, None),
+    ("SELECT min(1), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT min(1) FROM $satellites", 1, 1, None),
+    ("SELECT min('a'), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT min('a') FROM $satellites", 1, 1, None),
+    ("SELECT count(current_time), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT count(1), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT count(1) FROM $satellites", 1, 1, None),
+    ("SELECT count('a'), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT count('a') FROM $satellites", 1, 1, None),
+    ("SELECT avg(1), name FROM $satellites group by name", 177, 2, None),
+    ("SELECT avg(1) FROM $satellites", 1, 1, None),
+
     # ****************************************************************************************
     # These are queries which have been found to return the wrong result or not run correctly

From 020a8de009f0ebbdee16aac0c3a1944a233b17f0 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 27 Dec 2024 14:48:34 +0000
Subject: [PATCH 124/157] Opteryx Version 0.19.0-alpha.923

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 12e5eccae..bed120ec5 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 922
+__build__ = 923
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 8905c42c0711b2e1c16968f0d289ea4119b92930 Mon Sep 17 00:00:00 2001
From: joocer
Date: Fri, 27 Dec 2024 15:05:07 +0000
Subject: [PATCH 125/157] #1866

---
 .../cost_based_optimizer/strategies/constant_folding.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
index f6af988be..24982afff 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py
@@ -124,6 +124,7 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node:
         return root.left  # anything
 
     if root.node_type == NodeType.COMPARISON_OPERATOR:
+        # anything LIKE '%' is true for non-null values
         if (
             root.value in ("Like", "ILike")
             and root.left.node_type == NodeType.IDENTIFIER
@@ -229,15 +230,16 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node:
     functions = get_all_nodes_of_type(root, (NodeType.FUNCTION,))
     aggregators = get_all_nodes_of_type(root, (NodeType.AGGREGATOR,))
 
-    if any(func.value in {"RANDOM", "RAND", "NORMAL", "RANDOM_STRING"} for func in functions):
+    if any(func.value in ("RANDOM", "RAND", "NORMAL", "RANDOM_STRING") for func in functions):
         # Although they have no params, these are evaluated per row
         return root
 
-    # fold costants in function parameters
+    # fold constants in function parameters - this is generally aggregations we're affecting here
     if root.parameters:
         for i, param in enumerate(root.parameters):
             root.parameters[i] = fold_constants(param, statistics)
 
+    # rewrite aggregations to constants where possible
     for agg in aggregators:
         if len(agg.parameters) == 1 and agg.parameters[0].node_type == NodeType.LITERAL:
             if agg.value == "COUNT":

From d8168db7c2d50eacb1c66e1e9cb213b87e0c0497 Mon Sep 17 00:00:00 2001
From: XB500
Date: Fri, 27 Dec 2024 15:12:05 +0000
Subject: [PATCH 126/157] Opteryx Version 0.19.0-alpha.925

---
opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index bed120ec5..889edabe0 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 923 +__build__ = 925 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 9eca04207eae4499d5b576dbe824c9f0e51b9f4b Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 29 Dec 2024 16:01:41 +0000 Subject: [PATCH 127/157] #2163 --- .../compiled/cross_join/cython_cross_join.pyx | 30 +++++----- opteryx/compiled/levenshtein/clevenshtein.pyx | 55 ++++++++++++------- opteryx/compiled/structures/hash_table.pyx | 27 ++++----- .../capabilities/predicate_pushable.py | 2 + opteryx/managers/expression/__init__.py | 5 +- opteryx/operators/async_read_node.py | 8 +-- opteryx/operators/base_plan_node.py | 16 +++++- opteryx/operators/cross_join_node.py | 2 +- .../logical_planner_builders.py | 17 ++++++ opteryx/utils/file_decoders.py | 4 +- 10 files changed, 103 insertions(+), 63 deletions(-) diff --git a/opteryx/compiled/cross_join/cython_cross_join.pyx b/opteryx/compiled/cross_join/cython_cross_join.pyx index a30f35a8e..6c9569f89 100644 --- a/opteryx/compiled/cross_join/cython_cross_join.pyx +++ b/opteryx/compiled/cross_join/cython_cross_join.pyx @@ -9,15 +9,15 @@ import numpy as np cimport numpy as cnp cimport cython -from libc.stdint cimport int32_t +from libc.stdint cimport int64_t from libc.math cimport INFINITY cpdef tuple build_rows_indices_and_column(cnp.ndarray column_data): - cdef Py_ssize_t row_count = column_data.shape[0] - cdef cnp.int32_t[::1] lengths = np.empty(row_count, dtype=np.int32) - cdef cnp.int32_t[::1] offsets = np.empty(row_count + 1, dtype=np.int32) - cdef Py_ssize_t i - cdef Py_ssize_t total_size = 0 + cdef int64_t row_count = column_data.shape[0] + cdef cnp.int64_t[::1] lengths = np.empty(row_count, dtype=np.int64) + cdef cnp.int64_t[::1] offsets = np.empty(row_count + 1, dtype=np.int64) + cdef int64_t i + cdef int64_t total_size = 0 cdef cnp.dtype element_dtype = column_data[0].dtype if not isinstance(column_data[0], np.ndarray): @@ -30,13 +30,13 @@ cpdef tuple build_rows_indices_and_column(cnp.ndarray column_data): # Early exit if total_size is zero if total_size == 0: - return (np.array([], dtype=np.int32), np.array([], dtype=object)) + return (np.array([], dtype=np.int64), np.array([], dtype=object)) # Compute offsets for efficient slicing offsets[0] = 0 for i in range(row_count): offsets[i + 1] = offsets[i] + lengths[i] - cdef cnp.int32_t[::1] indices = np.empty(total_size, dtype=np.int32) + cdef cnp.int64_t[::1] indices = np.empty(total_size, dtype=np.int64) cdef cnp.ndarray flat_data = np.empty(total_size, dtype=element_dtype) # Fill indices and flat_data @@ -65,13 +65,13 @@ cpdef tuple build_filtered_rows_indices_and_column(cnp.ndarray column_data, set tuple of (ndarray, ndarray) Returns a tuple containing an array of indices and an array of flattened data for rows that match the filter. 
""" - cdef Py_ssize_t row_count = column_data.shape[0] - cdef Py_ssize_t allocated_size = row_count * 4 # Initial allocation size - cdef Py_ssize_t index = 0 - cdef Py_ssize_t i, j, len_i + cdef int64_t row_count = column_data.shape[0] + cdef int64_t allocated_size = row_count * 4 # Initial allocation size + cdef int64_t index = 0 + cdef int64_t i, j, len_i cdef object array_i cdef cnp.ndarray flat_data - cdef cnp.int32_t[::1] indices + cdef cnp.int64_t[::1] indices cdef cnp.dtype element_dtype = None cdef object value @@ -92,7 +92,7 @@ cpdef tuple build_filtered_rows_indices_and_column(cnp.ndarray column_data, set element_dtype = np.object_ # Initialize indices and flat_data arrays - indices = np.empty(allocated_size, dtype=np.int32) + indices = np.empty(allocated_size, dtype=np.int64) flat_data = np.empty(allocated_size, dtype=element_dtype) # Handle set initialization based on element dtype @@ -127,7 +127,7 @@ cpdef tuple build_filtered_rows_indices_and_column(cnp.ndarray column_data, set index += 1 if index == 0: - return (np.array([], dtype=np.int32), np.array([], dtype=element_dtype)) + return (np.array([], dtype=np.int64), np.array([], dtype=element_dtype)) # Slice arrays to the actual used size indices = indices[:index] diff --git a/opteryx/compiled/levenshtein/clevenshtein.pyx b/opteryx/compiled/levenshtein/clevenshtein.pyx index 56caa63a8..061a34c38 100644 --- a/opteryx/compiled/levenshtein/clevenshtein.pyx +++ b/opteryx/compiled/levenshtein/clevenshtein.pyx @@ -1,17 +1,26 @@ # cython: language_level=3 +# cython: nonecheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: infer_types=True +# cython: wraparound=False +# cython: boundscheck=False import numpy as np # Required for array allocation +from libc.stdint cimport int64_t, int32_t +cimport cython -cdef int min3(int x, int y, int z): +cdef inline int64_t min3(int64_t x, int64_t y, int64_t z) nogil: """Utility function to find the minimum of three integers.""" - cdef int m = x - if y < m: - m = y - if z < m: - m = z - return m - -def levenshtein(str string1, str string2): + if x <= y: + if x <= z: + return x + return z + if y <= z: + return y + return z + +cpdef int64_t levenshtein(str string1, str string2): """ Calculate the Levenshtein distance between two strings. @@ -22,26 +31,30 @@ def levenshtein(str string1, str string2): Returns: int: The Levenshtein distance between string1 and string2. 
""" + if len(string1) < len(string2): + string1, string2 = string2, string1 + cdef int len1 = len(string1) - cdef int len2 = len(string2) - cdef int i, j + cdef int len2 = len(string2) + 1 + + cdef int64_t i, j # Allocate a numpy array and create a memory view from it - cdef int[:] dp = np.zeros((len1 + 1) * (len2 + 1), dtype=np.int32) + cdef int64_t[:] dp = np.zeros((len1 + 1) * len2, dtype=np.int64) for i in range(len1 + 1): - for j in range(len2 + 1): + for j in range(len2): if i == 0: - dp[i * (len2 + 1) + j] = j # First string is empty + dp[j] = j elif j == 0: - dp[i * (len2 + 1) + j] = i # Second string is empty + dp[i * len2] = i elif string1[i - 1] == string2[j - 1]: - dp[i * (len2 + 1) + j] = dp[(i - 1) * (len2 + 1) + (j - 1)] + dp[i * len2 + j] = dp[(i - 1) * len2 + (j - 1)] else: - dp[i * (len2 + 1) + j] = 1 + min3( - dp[(i - 1) * (len2 + 1) + j], # Remove - dp[i * (len2 + 1) + (j - 1)], # Insert - dp[(i - 1) * (len2 + 1) + (j - 1)] # Replace + dp[i * len2 + j] = 1 + min3( + dp[(i - 1) * len2 + j], # Remove + dp[i * len2 + (j - 1)], # Insert + dp[(i - 1) * len2 + (j - 1)] # Replace ) - return dp[len1 * (len2 + 1) + len2] + return dp[len1 * len2 + (len2 - 1)] diff --git a/opteryx/compiled/structures/hash_table.pyx b/opteryx/compiled/structures/hash_table.pyx index c06681e11..43d8eec71 100644 --- a/opteryx/compiled/structures/hash_table.pyx +++ b/opteryx/compiled/structures/hash_table.pyx @@ -4,7 +4,7 @@ # cython: cdivision=True # cython: initializedcheck=False # cython: infer_types=True -# cython: wraparound=True +# cython: wraparound=False # cython: boundscheck=False from libcpp.unordered_map cimport unordered_map @@ -59,7 +59,7 @@ cdef class HashSet: cdef inline bint contains(self, int64_t value): return self.c_set.find(value) != self.c_set.end() -@cython.wraparound(False) + cdef inline object recast_column(column): cdef column_type = column.type @@ -70,9 +70,6 @@ cdef inline object recast_column(column): return column - - -@cython.wraparound(False) cpdef tuple distinct(table, HashSet seen_hashes=None, list columns=None): """ Perform a distinct operation on the given table using an external HashSet. 
@@ -140,7 +137,6 @@ cpdef tuple distinct(table, HashSet seen_hashes=None, list columns=None): return keep, seen_hashes -@cython.wraparound(False) cdef void compute_float_hashes(cnp.ndarray[cnp.float64_t] data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef cnp.float64_t value @@ -151,7 +147,7 @@ cdef void compute_float_hashes(cnp.ndarray[cnp.float64_t] data, int64_t null_has else: hashes[i] = hash(value) -@cython.wraparound(False) + cdef void compute_int_hashes(cnp.ndarray[cnp.int64_t] data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef cnp.int64_t value @@ -164,7 +160,6 @@ cdef void compute_int_hashes(cnp.ndarray[cnp.int64_t] data, int64_t null_hash, i else: hashes[i] = value # Hash of int is the int itself in Python 3 -@cython.wraparound(False) cdef void compute_object_hashes(cnp.ndarray data, int64_t null_hash, int64_t[:] hashes): cdef Py_ssize_t i, n = data.shape[0] cdef object value @@ -176,18 +171,13 @@ cdef void compute_object_hashes(cnp.ndarray data, int64_t null_hash, int64_t[:] hashes[i] = hash(value) -@cython.wraparound(False) -cpdef tuple list_distinct(cnp.ndarray values, cnp.int32_t[::1] indices, HashSet seen_hashes=None): +cpdef tuple list_distinct(cnp.ndarray values, cnp.int64_t[::1] indices, HashSet seen_hashes=None): cdef: Py_ssize_t i, j = 0 Py_ssize_t n = values.shape[0] - object v int64_t hash_value - int32_t[::1] new_indices = numpy.empty(n, dtype=numpy.int32) - - # Determine the dtype of the `values` array + int64_t[::1] new_indices = numpy.empty(n, dtype=numpy.int64) cnp.dtype dtype = values.dtype - cnp.ndarray new_values = numpy.empty(n, dtype=dtype) if seen_hashes is None: @@ -200,11 +190,11 @@ cpdef tuple list_distinct(cnp.ndarray values, cnp.int32_t[::1] indices, HashSet new_values[j] = v new_indices[j] = indices[i] j += 1 + return new_values[:j], new_indices[:j], seen_hashes -@cython.wraparound(False) cpdef HashTable hash_join_map(relation, list join_columns): """ Build a hash table for the join operations. @@ -272,7 +262,10 @@ cpdef HashTable hash_join_map(relation, list join_columns): return ht -cpdef filter_join_set(relation, list join_columns, HashSet seen_hashes): +cpdef HashSet filter_join_set(relation, list join_columns, HashSet seen_hashes): + """ + Build the set for the right of a filter join (ANTI/SEMI) + """ cdef int64_t num_columns = len(join_columns) diff --git a/opteryx/connectors/capabilities/predicate_pushable.py b/opteryx/connectors/capabilities/predicate_pushable.py index 6f0d6c107..73d019005 100644 --- a/opteryx/connectors/capabilities/predicate_pushable.py +++ b/opteryx/connectors/capabilities/predicate_pushable.py @@ -18,6 +18,7 @@ import datetime from typing import Dict +from orso.tools import single_item_cache from orso.types import OrsoTypes from opteryx.exceptions import NotSupportedError @@ -59,6 +60,7 @@ def __init__(self, **kwargs): pass @staticmethod + @single_item_cache def to_dnf(root): """ Convert a filter to DNF form, this is the form used by PyArrow. diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index 77834018e..a62ddfedf 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -11,6 +11,7 @@ Expressions are evaluated against an entire morsel at a time. 
""" +from collections import deque from enum import Enum from typing import Callable from typing import Dict @@ -289,7 +290,7 @@ def evaluate(expression: Node, table: Table): return result -def get_all_nodes_of_type(root, select_nodes): +def get_all_nodes_of_type(root, select_nodes: tuple) -> list: """ Walk an expression tree collecting all nodes of a specified type. """ @@ -299,7 +300,7 @@ def get_all_nodes_of_type(root, select_nodes): root = [root] identifiers = [] - stack = list(root) + stack = deque(root) while stack: node = stack.pop() diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 3a994c217..4db5ea946 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -66,9 +66,10 @@ def __init__(self, properties: QueryProperties, **parameters): self.predicates = parameters.get("predicates") - @classmethod - def from_dict(cls, dic: dict) -> "AsyncReaderNode": # pragma: no cover - raise NotImplementedError() + @property + def name(self): # pragma: no cover + """friendly name for this step""" + return "Async Read" def execute(self, morsel, **kwargs) -> Generator: if morsel == EOS: @@ -99,7 +100,6 @@ def execute(self, morsel, **kwargs) -> Generator: if len(blob_names) == 0: # if we don't have any matching blobs, create an empty dataset - # TODO: rewrite from orso import DataFrame as_arrow = DataFrame(rows=[], schema=orso_schema).arrow() diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 9124c8a78..a42c66e75 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -10,6 +10,8 @@ import pyarrow from orso.tools import random_string +from opteryx.config import MORSEL_SIZE + class BasePlanNode: is_join: bool = False @@ -68,6 +70,8 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab # set up the execution of the operator generator = self.execute(morsel, join_leg=join_leg) + empty_morsel = None + at_least_one = False while True: try: @@ -83,10 +87,20 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab self.records_out += result.num_rows self.bytes_out += result.nbytes - yield result + # if we get empty sets, don't yield them unless they're the only one + if result.num_rows > 0: + self.statistics.avoided_empty_datasets += 1 + at_least_one = True + yield result + else: + empty_morsel = result + else: + yield result except StopIteration: # Break the loop when the generator is exhausted + if not at_least_one and empty_morsel is not None: + yield empty_morsel break except Exception as err: # print(f"Exception {err} in operator", self.name) diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 7e9b23f19..44dbd5d8c 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -90,7 +90,7 @@ def _cross_join_unnest_column( if single_column and distinct and indices.size > 0: # if the unnest target is the only field in the SELECT and we're DISTINCTING - indices = numpy.array(indices, dtype=numpy.int32) + indices = numpy.array(indices, dtype=numpy.int64) new_column_data, indices, hash_set = list_distinct(new_column_data, indices, hash_set) if len(indices) > 0: diff --git a/opteryx/planner/logical_planner/logical_planner_builders.py b/opteryx/planner/logical_planner/logical_planner_builders.py index 95dc5ae08..88866f88f 100644 --- a/opteryx/planner/logical_planner/logical_planner_builders.py +++ 
b/opteryx/planner/logical_planner/logical_planner_builders.py @@ -242,6 +242,22 @@ def expression_with_alias(branch, alias: Optional[List[str]] = None, key=None): return build(branch["expr"], alias=branch["alias"]["value"]) +def exists(branch, alias: Optional[List[str]] = None, key=None): + from opteryx.planner.logical_planner.logical_planner import plan_query + + subplan = plan_query(branch["subquery"]) + not_exists = Node(NodeType.LITERAL, type=OrsoTypes.BOOLEAN, value=branch["negated"]) + + raise UnsupportedSyntaxError("EXISTS is not supported in Opteryx") + + return Node( + NodeType.UNARY_OPERATOR, + value="EXISTS", + parameters=[Node(NodeType.SUBQUERY, plan=subplan), not_exists], + alias=alias, + ) + + def expressions(branch, alias: Optional[List[str]] = None, key=None): return [build(part) for part in branch] @@ -715,6 +731,7 @@ def build(value, alias: Optional[List[str]] = None, key=None): "Ceil": ceiling, "CompoundIdentifier": compound_identifier, "DoubleQuotedString": literal_string, + "Exists": exists, "Expr": build, "Expressions": expressions, "ExprWithAlias": expression_with_alias, diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index d553e966e..b68cc2de1 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -219,9 +219,8 @@ def parquet_decoder( Returns: Tuple containing number of rows, number of columns, and the table or schema. """ - stream = pyarrow.BufferReader(buffer) - # Open the parquet file only once + stream = pyarrow.BufferReader(buffer) parquet_file = parquet.ParquetFile(stream) # Return just the schema if that's all that's needed @@ -263,6 +262,7 @@ def parquet_decoder( filters=dnf_filter, use_threads=False, use_pandas_metadata=False, + schema=parquet_file.schema_arrow, ) # Any filters we couldn't push to PyArrow to read we run here From cb0629ff627c685e3b778e3c1d02bc5d8d430e9e Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 29 Dec 2024 16:02:05 +0000 Subject: [PATCH 128/157] Opteryx Version 0.19.0-alpha.926 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 889edabe0..58d45c2d1 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 925 +__build__ = 926 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
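Among the PATCH 127 changes above, clevenshtein.pyx now swaps its operands so the shorter string drives the inner loop and stores the DP table in a single flat int64 buffer. The recurrence is the textbook edit-distance one; a rough pure-Python equivalent, using a two-row rolling buffer instead of the full flattened table the Cython kernel allocates:

def levenshtein(string1: str, string2: str) -> int:
    # Iterate with the shorter string on the inner loop, as the kernel does.
    if len(string1) < len(string2):
        string1, string2 = string2, string1
    previous = list(range(len(string2) + 1))
    for i, c1 in enumerate(string1, start=1):
        current = [i]
        for j, c2 in enumerate(string2, start=1):
            cost = 0 if c1 == c2 else 1
            current.append(min(
                previous[j] + 1,         # delete from string1
                current[j - 1] + 1,      # insert into string1
                previous[j - 1] + cost,  # substitute
            ))
        previous = current
    return previous[-1]

assert levenshtein("kitten", "sitting") == 3

Keeping only two rows bounds the working set by the shorter string's length, which is also why swapping the operands first pays off.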
From e292b438a4fb26c6a427f55b261e5066381e8732 Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 29 Dec 2024 16:38:54 +0000 Subject: [PATCH 129/157] sqlparser-0.53 --- opteryx/functions/number_functions.py | 18 ++++++++++++++---- opteryx/planner/ast_rewriter.py | 5 +++++ .../planner/logical_planner/logical_planner.py | 11 +++++++---- .../test_shapes_and_errors_battery.py | 4 ++-- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/opteryx/functions/number_functions.py b/opteryx/functions/number_functions.py index cf755dd00..ef61f1e7e 100644 --- a/opteryx/functions/number_functions.py +++ b/opteryx/functions/number_functions.py @@ -95,8 +95,13 @@ def ceiling(values, scales=None) -> List: scale = scales[0] if scale == 0: return numpy.ceil(values) - scale = 10**scale - return numpy.ceil(values * scale) / scale + + if scale > 0: + scale_factor = 10**scale + return numpy.ceil(values * scale_factor) / scale_factor + else: + scale_factor = 10 ** (-scale) + return numpy.ceil(values / scale_factor) * scale_factor def floor(values, scales=None) -> List: @@ -111,5 +116,10 @@ def floor(values, scales=None) -> List: scale = scales[0] if scale == 0: return numpy.floor(values) - scale = 10**scale - return numpy.floor(values * scale) / scale + + if scale > 0: + scale_factor = 10**scale + return numpy.floor(values * scale_factor) / scale_factor + else: + scale_factor = 10 ** (-scale) + return numpy.floor(values / scale_factor) * scale_factor diff --git a/opteryx/planner/ast_rewriter.py b/opteryx/planner/ast_rewriter.py index 23b689834..6f2d564e8 100644 --- a/opteryx/planner/ast_rewriter.py +++ b/opteryx/planner/ast_rewriter.py @@ -182,6 +182,11 @@ def temporal_range_binder(ast, filters): ast["table_name"][0]["start_date"] = temporal_range[1] ast["table_name"][0]["end_date"] = temporal_range[2] return ast + if "parent_name" in ast: + temporal_range = filters.pop(0) + ast["parent_name"][0]["start_date"] = temporal_range[1] + ast["parent_name"][0]["end_date"] = temporal_range[2] + return ast if "ShowCreate" in ast and filters: temporal_range = filters.pop(0) ast["ShowCreate"]["start_date"] = temporal_range[1] diff --git a/opteryx/planner/logical_planner/logical_planner.py b/opteryx/planner/logical_planner/logical_planner.py index a379f4cca..997b56d97 100644 --- a/opteryx/planner/logical_planner/logical_planner.py +++ b/opteryx/planner/logical_planner/logical_planner.py @@ -681,7 +681,9 @@ def create_node_relation(relation): node_type=LogicalPlanStepType.FunctionDataset, function="VALUES" ) values_step.alias = subquery["alias"]["name"]["value"] - values_step.columns = tuple(col["value"] for col in subquery["alias"]["columns"]) + values_step.columns = tuple( + col["name"]["value"] for col in subquery["alias"]["columns"] + ) values_step.values = [ tuple(logical_planner_builders.build(value) for value in row) for row in subquery["subquery"]["body"]["Values"]["rows"] @@ -738,7 +740,7 @@ def create_node_relation(relation): function_step.args = [ logical_planner_builders.build(arg) for arg in function["args"]["args"] ] - function_step.columns = tuple(col["value"] for col in function["alias"]["columns"]) + function_step.columns = tuple(col["name"]["value"] for col in function["alias"]["columns"]) step_id = random_string() sub_plan.add_node(step_id, function_step) @@ -1018,7 +1020,7 @@ def plan_show_columns(statement): plan = LogicalPlan() from_step = LogicalPlanNode(node_type=LogicalPlanStepType.Scan) - table = statement[root_node]["table_name"] + table = 
statement[root_node]["show_options"]["show_in"]["parent_name"] from_step.relation = ".".join(part["value"] for part in table) from_step.alias = from_step.relation from_step.start_date = table[0].get("start_date") @@ -1034,8 +1036,9 @@ def plan_show_columns(statement): plan.add_node(step_id, show_step) plan.add_edge(previous_step_id, step_id) - _filter = statement[root_node]["filter"] + _filter = statement[root_node]["show_options"].get("filter_position") if _filter: + _filter = _filter["Suffix"] filter_node = LogicalPlanNode(node_type=LogicalPlanStepType.Filter) filter_node.condition = extract_simple_filter(_filter, "name") previous_step_id, step_id = step_id, random_string() diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index d7c04d3ca..780dd045e 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1702,8 +1702,8 @@ ("SELECT FLOOR(3.14159, 2) as floor_value", 1, 1, None), ("SELECT CEIL(3.14159, 0) as ceil_value", 1, 1, None), ("SELECT FLOOR(3.14159, 0) as floor_value", 1, 1, None), - ("SELECT CEIL(3.14159, -1) as ceil_value", 1, 1, SqlError), - ("SELECT FLOOR(3.14159, -1) as floor_value", 1, 1, SqlError), + ("SELECT CEIL(3.14159, -1) as ceil_value", 1, 1, None), + ("SELECT FLOOR(3.14159, -1) as floor_value", 1, 1, None), ("SELECT UPPER(name) FROM $planets", 9, 1, None), ("SELECT LOWER(name) FROM $astronauts WHERE UPPER(name) LIKE 'A%'", 11, 1, None), ("SELECT REVERSE(name) FROM $planets", 9, 1, None), From ed8a88cb51eddb2511c130207981408b6b681153 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 29 Dec 2024 16:39:19 +0000 Subject: [PATCH 130/157] Opteryx Version 0.19.0-alpha.927 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index a17ea2163..5ddecd480 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 910 +__build__ = 927 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 192d78dcd639d34a390f2f632453a2cc9db4253d Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 29 Dec 2024 16:40:27 +0000 Subject: [PATCH 131/157] Opteryx Version 0.19.0-alpha.928 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 449378ef9..1cb380ea2 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 927 +__build__ = 928 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
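The reworked `ceiling` and `floor` in the sqlparser-0.53 patch above give negative scales a meaning: -1 rounds to the nearest ten, -2 to the nearest hundred, which is why the two battery cases stopped expecting `SqlError`. A standalone check of the arithmetic, with illustrative inputs:

```python
import numpy

def ceiling(values, scale: int):
    # mirrors the scale handling above: positive scales keep decimal places,
    # negative scales round up to multiples of powers of ten
    if scale == 0:
        return numpy.ceil(values)
    if scale > 0:
        factor = 10**scale
        return numpy.ceil(values * factor) / factor
    factor = 10 ** (-scale)
    return numpy.ceil(values / factor) * factor

print(ceiling(numpy.array([3.14159]), 2))   # [3.15]
print(ceiling(numpy.array([3.14159]), -1))  # [10.] - rounded up to the next ten
```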
From 0878b9069dfee25c40c461b40d9bf5d5d65f6edb Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 29 Dec 2024 18:55:38 +0000 Subject: [PATCH 132/157] #2165 --- opteryx/operators/async_read_node.py | 2 +- opteryx/operators/base_plan_node.py | 29 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index 4db5ea946..fdaf3fe0a 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -78,7 +78,7 @@ def execute(self, morsel, **kwargs) -> Generator: from opteryx import system_statistics - """Perform this step, time how long is spent doing work""" + # Perform this step, time how long is spent doing work orso_schema = self.parameters["schema"] reader = self.parameters["connector"] diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index a42c66e75..809594cb6 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -10,7 +10,9 @@ import pyarrow from orso.tools import random_string -from opteryx.config import MORSEL_SIZE +from opteryx import EOS + +END = object() class BasePlanNode: @@ -77,31 +79,34 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab try: # Time the production of the next result start_time = time.monotonic_ns() - result = next(generator) # Retrieve the next item from the generator + result = next(generator, END) # Retrieve the next item from the generator execution_time = time.monotonic_ns() - start_time self.execution_time += execution_time self.statistics.increase("time_" + self.name.lower(), execution_time) # Update metrics for valid results + if result == END: + # Break the loop when the generator is exhausted + if not at_least_one: + yield empty_morsel + break + if hasattr(result, "num_rows"): self.records_out += result.num_rows self.bytes_out += result.nbytes + if empty_morsel is None: + empty_morsel = result.slice(0, 0) + # if we get empty sets, don't yield them unless they're the only one if result.num_rows > 0: self.statistics.avoided_empty_datasets += 1 at_least_one = True yield result - else: - empty_morsel = result - else: - yield result - - except StopIteration: - # Break the loop when the generator is exhausted - if not at_least_one and empty_morsel is not None: - yield empty_morsel - break + continue + + yield result + except Exception as err: # print(f"Exception {err} in operator", self.name) raise err From 4794004bc0b28e864ea9535b9a44315a91eff248 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 29 Dec 2024 18:56:03 +0000 Subject: [PATCH 133/157] Opteryx Version 0.19.0-alpha.930 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 1cb380ea2..9a67dbbdd 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 928 +__build__ = 930 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
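The #2165 change to `base_plan_node` above swaps the `try`/`except StopIteration` drain for a sentinel passed to `next()`, keeping the hot loop exception-free and making the exhaustion branch an ordinary `if`. The pattern in isolation (the names are stand-ins, not the Opteryx classes):

```python
END = object()  # unique sentinel; no real morsel can ever be this object

def drain(generator):
    while True:
        item = next(generator, END)  # returns END instead of raising StopIteration
        if item is END:
            break
        yield item

print(list(drain(iter([1, 2, 3]))))  # [1, 2, 3]
```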
From 8703985176b22d0698e0f7eb0765fd2c8f65bd3d Mon Sep 17 00:00:00 2001 From: joocer Date: Sun, 29 Dec 2024 22:41:19 +0000 Subject: [PATCH 134/157] HOUSEKEEPING --- opteryx/config.py | 16 ++-- opteryx/managers/expression/__init__.py | 34 +++---- opteryx/planner/physical_planner.py | 2 - opteryx/utils/file_decoders.py | 26 ++---- opteryx/utils/memory_view_stream.py | 101 -------------------- tests/misc/test_mv_stream.py | 119 ------------------------ 6 files changed, 34 insertions(+), 264 deletions(-) delete mode 100644 opteryx/utils/memory_view_stream.py delete mode 100644 tests/misc/test_mv_stream.py diff --git a/opteryx/config.py b/opteryx/config.py index 0a5eb8cbd..d334f38f9 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -54,18 +54,18 @@ def parse_yaml(yaml_str: str) -> dict: def line_value(value: str) -> typing.Any: value = value.strip() if value.isdigit(): - value = int(value) - elif value.replace(".", "", 1).isdigit(): - value = float(value) - elif value.lower() == "true": + return int(value) + if value.replace(".", "", 1).isdigit(): + return float(value) + if value.lower() == "true": return True - elif value.lower() == "false": + if value.lower() == "false": return False - elif value.lower() == "none": + if value.lower() == "none": return None - elif value.startswith("["): + if value.startswith("["): return [val.strip() for val in value[1:-1].split(",")] - elif value.startswith("-"): + if value.startswith("-"): return [val.strip() for val in value.split("-") if val.strip()] return value diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index a62ddfedf..5c6cd2267 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -53,29 +53,29 @@ class NodeType(int, Enum): # fmt:off # 00000000 - UNKNOWN: int = 0 + UNKNOWN = 0 # LOGICAL OPERATORS # 0001 nnnn - AND: int = 17 # 0001 0001 - OR: int = 18 # 0001 0010 - XOR: int = 19 # 0001 0011 - NOT: int = 20 # 0001 0100 + AND = 17 # 0001 0001 + OR = 18 # 0001 0010 + XOR = 19 # 0001 0011 + NOT = 20 # 0001 0100 # INTERAL IDENTIFIERS # 0010 nnnn - WILDCARD: int = 33 # 0010 0001 - COMPARISON_OPERATOR: int = 34 # 0010 0010 - BINARY_OPERATOR: int = 35 # 0010 0011 - UNARY_OPERATOR: int = 36 # 0010 0100 - FUNCTION: int = 37 # 0010 0101 - IDENTIFIER: int = 38 # 0010 0110 - SUBQUERY: int = 39 # 0010 0111 - NESTED: int = 40 # 0010 1000 - AGGREGATOR:int = 41 # 0010 1001 - LITERAL:int = 42 # 0010 1010 - EXPRESSION_LIST: int = 43 # 0010 1011 (CASE WHEN) - EVALUATED: int = 44 # 0010 1100 - memoize results + WILDCARD = 33 # 0010 0001 + COMPARISON_OPERATOR = 34 # 0010 0010 + BINARY_OPERATOR = 35 # 0010 0011 + UNARY_OPERATOR = 36 # 0010 0100 + FUNCTION = 37 # 0010 0101 + IDENTIFIER = 38 # 0010 0110 + SUBQUERY = 39 # 0010 0111 + NESTED = 40 # 0010 1000 + AGGREGATOR = 41 # 0010 1001 + LITERAL = 42 # 0010 1010 + EXPRESSION_LIST = 43 # 0010 1011 (CASE WHEN) + EVALUATED = 44 # 0010 1100 - memoize results ORSO_TO_NUMPY_MAP = { diff --git a/opteryx/planner/physical_planner.py b/opteryx/planner/physical_planner.py index e83bad725..11866be36 100644 --- a/opteryx/planner/physical_planner.py +++ b/opteryx/planner/physical_planner.py @@ -82,8 +82,6 @@ def create_physical_plan(logical_plan, query_properties) -> PhysicalPlan: raise UnsupportedSyntaxError(f"Unsupported SHOW type '{node_config['object_type']}'") elif node_type == LogicalPlanStepType.ShowColumns: node = operators.ShowColumnsNode(query_properties, **node_config) - elif node_type == LogicalPlanStepType.Subquery: - node 
= operators.NoOpNode(query_properties, **node_config) elif node_type == LogicalPlanStepType.Union: node = operators.UnionNode(query_properties, **node_config) else: # pragma: no cover diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index b68cc2de1..6f2cb346c 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -26,7 +26,6 @@ from opteryx.managers.expression import NodeType from opteryx.managers.expression import get_all_nodes_of_type from opteryx.utils.arrow import post_read_projector -from opteryx.utils.memory_view_stream import MemoryViewStream class ExtentionType(str, Enum): @@ -163,8 +162,7 @@ def zstd_decoder( """ import zstandard - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) with zstandard.open(stream, "rb") as file: return jsonl_decoder( @@ -185,8 +183,7 @@ def lzma_decoder( """ import lzma - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) with lzma.open(stream, "rb") as file: return jsonl_decoder( @@ -285,8 +282,7 @@ def orc_decoder( """ import pyarrow.orc as orc - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) orc_file = orc.ORCFile(stream) if just_schema: @@ -303,7 +299,7 @@ def orc_decoder( def jsonl_decoder( - buffer: Union[memoryview, bytes], + buffer: Union[memoryview, bytes, BinaryIO], *, projection: Optional[list] = None, selection: Optional[list] = None, @@ -317,7 +313,7 @@ def jsonl_decoder( rows = [] if not isinstance(buffer, bytes): - buffer = buffer.read() + buffer = buffer.read() # type: ignore for line in buffer.split(b"\n"): if not line: @@ -354,8 +350,7 @@ def csv_decoder( import pyarrow.csv from pyarrow.csv import ParseOptions - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) parse_options = ParseOptions(delimiter=delimiter, newlines_in_values=True) table = pyarrow.csv.read_csv(stream, parse_options=parse_options) schema = table.schema @@ -415,8 +410,7 @@ def arrow_decoder( ) -> Tuple[int, int, pyarrow.Table]: import pyarrow.feather as pf - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) table = pf.read_table(stream) schema = table.schema if just_schema: @@ -454,8 +448,7 @@ def avro_decoder( raise MissingDependencyError("fastavro") - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) reader = fastavro.reader(stream) if just_schema: @@ -496,8 +489,7 @@ def ipc_decoder( from pyarrow import ipc - stream: BinaryIO = None - stream = MemoryViewStream(buffer) if isinstance(buffer, memoryview) else io.BytesIO(buffer) + stream: BinaryIO = io.BytesIO(buffer) reader = ipc.open_stream(stream) batch_one = next(reader, None) diff --git a/opteryx/utils/memory_view_stream.py b/opteryx/utils/memory_view_stream.py deleted file mode 100644 index 1e726f615..000000000 --- a/opteryx/utils/memory_view_stream.py +++ /dev/null @@ -1,101 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# See the License at http://www.apache.org/licenses/LICENSE-2.0 -# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. - -""" -Handle a memoryview like a stream without converting to bytes. - -This has the minimal implementation to pass the Opteryx CI, unit and regression -tests - it's not intended to be correct for usage outside Opteryx. -""" - -import io -from typing import BinaryIO -from typing import Iterable -from typing import Iterator - - -class MemoryViewStream(BinaryIO): - def __init__(self, mv: memoryview): - self.mv = mv - self.offset = 0 - self._closed = False - - def read(self, n=-1) -> bytes: - if n < 0 or n + self.offset > len(self.mv): - n = len(self.mv) - self.offset - result = self.mv[self.offset : self.offset + n] - self.offset += n - return result.tobytes() - - def seek(self, offset: int, whence: int = 0, expected_offset: int = 0) -> int: - if whence == 0: # Absolute file positioning - self.offset = min(max(offset, 0), len(self.mv)) - elif whence == 1: # Seek relative to the current position - self.offset = min(max(self.offset + offset, 0), len(self.mv)) - elif whence == 2: # Seek relative to the file's end - self.offset = min(max(len(self.mv) + offset, 0), len(self.mv)) - return self.offset - - def tell(self) -> int: - return self.offset - - def readable(self) -> bool: - return True - - def writable(self) -> bool: - return False - - def seekable(self) -> bool: - return True - - def close(self): - self._closed = True - - @property - def closed(self) -> bool: - return self._closed - - @property - def mode(self) -> str: # pragma: no cover - return "rb" - - def __enter__(self) -> BinaryIO: - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - def __iter__(self) -> Iterator: - return iter(self.mv) - - def __next__(self) -> bytes: # pragma: no cover - if self.offset >= len(self.mv): - raise StopIteration() - self.offset += 1 - return bytes([self.mv[self.offset]]) - - def fileno(self) -> int: # pragma: no cover - return -1 - - def flush(self) -> None: # pragma: no cover - raise io.UnsupportedOperation() - - def isatty(self) -> bool: # pragma: no cover - return False - - def readline(self, limit: int = -1): # pragma: no cover - raise io.UnsupportedOperation() - - def readlines(self, hint: int = -1) -> list: # pragma: no cover - raise io.UnsupportedOperation() - - def truncate(self, pos: int = None): # pragma: no cover - raise io.UnsupportedOperation() - - def write(self, buffer: bytes, /) -> int: # pragma: no cover - raise io.UnsupportedOperation() - - def writelines(self, buffer: Iterable[bytes], /) -> None: # pragma: no cover - raise io.UnsupportedOperation() diff --git a/tests/misc/test_mv_stream.py b/tests/misc/test_mv_stream.py deleted file mode 100644 index 4ade4ef54..000000000 --- a/tests/misc/test_mv_stream.py +++ /dev/null @@ -1,119 +0,0 @@ -import io -import os -import sys - -import pytest - -sys.path.insert(1, os.path.join(sys.path[0], "../..")) - -from opteryx.utils.memory_view_stream import MemoryViewStream - - -def test_read_full(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - assert stream.read() == data - - -def test_read_in_chunks(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - assert stream.read(5) == b"Hello" - assert stream.read(2) == b", " - assert stream.read(6) == b"World!" - - -def test_seek_and_tell(): - data = b"Hello, World!" 
- mv = memoryview(data) - stream = MemoryViewStream(mv) - stream.seek(7) - assert stream.tell() == 7 - assert stream.read(5) == b"World" - - -def test_read_past_end(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - stream.seek(12) - assert stream.read(1) == b"!" - assert stream.read(1) == b"" # Reading past the end returns an empty bytes object - - -def test_close(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - assert not stream.closed - stream.close() - assert stream.closed - - -def test_context_manager(): - data = b"Hello, World!" - mv = memoryview(data) - with MemoryViewStream(mv) as stream: - assert not stream.closed - assert stream.closed - - -def test_seek(): - def inner(offset, whence, expected_offset): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - stream.seek(offset, whence) - assert stream.tell() == expected_offset - - params = [ - (5, 0, 5), # Absolute positioning - (2, 1, 2), # Relative to current position - (-3, 2, 10), # Relative to file's end - ] - - for param in params: - inner(*param) - -def test_unsupported_operations(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - with pytest.raises(io.UnsupportedOperation): - stream.readline() - with pytest.raises(io.UnsupportedOperation): - stream.readlines() - with pytest.raises(io.UnsupportedOperation): - stream.truncate() - with pytest.raises(TypeError): - stream.write() - with pytest.raises(TypeError): - stream.writelines() - with pytest.raises(io.UnsupportedOperation): - stream.flush() - - -def test_other_attributes(): - data = b"Hello, World!" - mv = memoryview(data) - stream = MemoryViewStream(mv) - - from typing import Iterator - - assert stream.readable() - assert not stream.writable() - assert stream.seekable() - assert isinstance(iter(stream), Iterator) - - for i in stream: - pass - - assert stream.fileno() == -1 - assert not stream.isatty() - -if __name__ == "__main__": # pragma: no cover - from tests.tools import run_tests - - run_tests() From 6466b850393faf90df8320430c98598c7764d6d1 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sun, 29 Dec 2024 22:41:41 +0000 Subject: [PATCH 135/157] Opteryx Version 0.19.0-alpha.931 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 9a67dbbdd..590db7591 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 930 +__build__ = 931 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 57eb470651eb50d3358be54d06f911b122c041d2 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 30 Dec 2024 12:07:18 +0000 Subject: [PATCH 136/157] Opteryx Version 0.19.0-alpha.932 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 590db7591..b6775dd47 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 931 +__build__ = 932 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
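A detail of the HOUSEKEEPING patch above that is easy to miss: the `NodeType` values are deliberate bit patterns, the high nibble encoding the node family (logical operators are `0001 nnnn`, internal identifiers `0010 nnnn`), so dropping the redundant `: int` annotations changes nothing about the layout. A small illustration; the `family` helper is invented here for demonstration:

```python
from enum import Enum

class NodeType(int, Enum):
    # values copied from the enum above
    AND = 17         # 0001 0001 -> logical operator family
    IDENTIFIER = 38  # 0010 0110 -> internal identifier family

def family(node_type: int) -> int:
    # hypothetical helper: read the family from the high nibble
    return (node_type & 0xF0) >> 4

print(family(NodeType.AND), family(NodeType.IDENTIFIER))  # 1 2
```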
From fd2f5f4b17283ed8c8b9f0393c30009e1ddaee7e Mon Sep 17 00:00:00 2001
From: joocer
Date: Mon, 30 Dec 2024 12:08:39 +0000
Subject: [PATCH 137/157] #2173

---
 opteryx/functions/__init__.py | 3 +-
 opteryx/functions/other_functions.py | 41 ++++++++-----------
 opteryx/operators/async_read_node.py | 7 +---
 opteryx/operators/base_plan_node.py | 1 +
 opteryx/operators/exit_node.py | 15 ++++++-
 opteryx/operators/heap_sort_node.py | 4 ++
 opteryx/operators/sort_node.py | 7 +++-
 .../bench/defragment_morsels.py | 32 ---------------
 opteryx/utils/file_decoders.py | 8 ----
 .../test_shapes_and_errors_battery.py | 6 +++
 tests/storage/test_blob_gcs.py | 8 +++-
 tests/storage/test_cache_valkey.py | 1 +
 12 files changed, 61 insertions(+), 72 deletions(-)
 delete mode 100644 opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py

diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py
index f510068e5..9649e28d5 100644
--- a/opteryx/functions/__init__.py
+++ b/opteryx/functions/__init__.py
@@ -428,10 +428,11 @@ def sleep(x):
     "SEARCH": other_functions.search,
     "COALESCE": _coalesce,
     "IFNULL": other_functions.if_null,
+    "IFNOTNULL": other_functions.if_not_null,
    "SORT": _sort(numpy.sort),
     "GREATEST": _iterate_single_parameter(numpy.nanmax),
     "LEAST": _iterate_single_parameter(numpy.nanmin),
-    "IIF": other_functions.iif,
+    "IIF": numpy.where,
    # "GENERATE_SERIES": series.generate_series,
     "NULLIF": other_functions.null_if,
     "CASE": select_values, #other_functions.case_when,
diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py
index e7d119ebc..cab33168d 100644
--- a/opteryx/functions/other_functions.py
+++ b/opteryx/functions/other_functions.py
@@ -85,23 +85,6 @@ def search(array, item, ignore_case: Optional[List[bool]] = None):
     return results_mask
 
 
-def iif(mask, true_values, false_values):
-    # we have three columns, the first is TRUE offsets
-    # the second is TRUE response
-    # the third is FAST response
-
-    if isinstance(mask, pyarrow.lib.BooleanArray) or (
-        isinstance(mask, numpy.ndarray) and mask.dtype == numpy.bool_
-    ):
-        mask = numpy.nonzero(mask)[0]
-
-    response = false_values
-
-    for index in mask:
-        response[index] = true_values[index]
-    return response
-
-
 def if_null(values, replacement):
     """
     Replace null values in the input array with corresponding values from the replacement array.
@@ -118,12 +101,6 @@ def if_null(values, replacement):
     """
     from opteryx.managers.expression.unary_operations import _is_null
 
-    # Check if the values array is a pyarrow array and convert it to a numpy array if necessary
-    if isinstance(values, pyarrow.Array):
-        values = values.to_numpy(False)
-    if isinstance(values, list):
-        values = numpy.array(values)
-
     # Create a mask for null values
     is_null_array = _is_null(values)
 
@@ -131,6 +108,24 @@ def if_null(values, replacement):
     return numpy.where(is_null_array, replacement, values)
 
 
+def if_not_null(values: numpy.ndarray, replacements: numpy.ndarray) -> numpy.ndarray:
+    """
+    Replace the non-null values in `values` with the corresponding `replacements`, retaining nulls.
+
+    Parameters:
+        values: A NumPy array containing the original values.
+        replacements: A NumPy array of replacement values, applied wherever
+            `values` is not null; null entries are passed through unchanged.
+
+    Returns:
+        A NumPy array with non-null entries replaced and null entries retained. 
+ """ + from opteryx.managers.expression.unary_operations import _is_not_null + + not_null_mask = _is_not_null(values) + return numpy.where(not_null_mask, replacements, values) + + def null_if(col1, col2): """ Parameters: diff --git a/opteryx/operators/async_read_node.py b/opteryx/operators/async_read_node.py index fdaf3fe0a..4d0e31fa9 100644 --- a/opteryx/operators/async_read_node.py +++ b/opteryx/operators/async_read_node.py @@ -125,7 +125,7 @@ def execute(self, morsel, **kwargs) -> Generator: read_thread.start() morsel = None - arrow_schema = None + arrow_schema = convert_orso_schema_to_arrow_schema(orso_schema, use_identities=True) while True: try: @@ -171,11 +171,8 @@ def execute(self, morsel, **kwargs) -> Generator: morsel = struct_to_jsonb(morsel) morsel = normalize_morsel(orso_schema, morsel) - - if arrow_schema: + if morsel.column_names != ["*"]: morsel = morsel.cast(arrow_schema) - else: - arrow_schema = morsel.schema self.statistics.blobs_read += 1 self.statistics.rows_read += morsel.num_rows diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 809594cb6..75bc8979d 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -39,6 +39,7 @@ def __init__(self, *, properties, **parameters): self.bytes_in = 0 self.records_out = 0 self.bytes_out = 0 + self.columns = parameters.get("columns", []) @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 2152d2712..746fb0f65 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -30,7 +30,7 @@ class ExitNode(BasePlanNode): def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) - self.columns = parameters.get("columns", []) + self.at_least_one = False @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover @@ -47,9 +47,22 @@ def name(self): # pragma: no cover def execute(self, morsel: Table, **kwargs) -> Table: # Exit doesn't return EOS if morsel == EOS: + if not self.at_least_one: + import pyarrow + + yield pyarrow.Table.from_arrays( + [pyarrow.array([]) for _ in self.columns], + names=[column.current_name for column in self.columns], + ) yield EOS return + if morsel.num_columns == 0: + yield None + return + + self.at_least_one = True + final_columns = [] final_names = [] for column in self.columns: diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index 48c42b34e..c3bed4d02 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -71,6 +71,10 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: yield EOS return + if morsel.num_rows == 0: + yield None + return + if self.table: # Concatenate the accumulated table with the new morsel self.table = concat_tables([self.table, morsel], promote_options="permissive") diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index a786960d7..813f3b9ec 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -45,10 +45,15 @@ def name(self): # pragma: no cover def execute(self, morsel: Table, **kwargs) -> Table: if morsel != EOS: - self.morsels.append(morsel) + if morsel.num_rows > 0: + self.morsels.append(morsel) yield None return + if len(self.morsels) == 0: + yield EOS + return + table = concat_tables(self.morsels, promote_options="permissive") 
mapped_order = [] diff --git a/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py b/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py deleted file mode 100644 index 49374be98..000000000 --- a/opteryx/planner/cost_based_optimizer/bench/defragment_morsels.py +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# See the License at http://www.apache.org/licenses/LICENSE-2.0 -# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. - - -from orso.tools import random_string - -from opteryx.planner.logical_planner import LogicalPlan -from opteryx.planner.logical_planner import LogicalPlanNode -from opteryx.planner.logical_planner import LogicalPlanStepType - -from .optimization_strategy import HeuristicOptimizerContext -from .optimization_strategy import OptimizationStrategy - - -class DefragmentMorselsStrategy(OptimizationStrategy): # pragma: no cover - def visit( - self, node: LogicalPlanNode, context: HeuristicOptimizerContext - ) -> HeuristicOptimizerContext: - if not context.optimized_plan: - context.optimized_plan = context.pre_optimized_tree.copy() # type: ignore - - if node.node_type in (LogicalPlanStepType.Join,): - for node, _, _ in context.optimized_plan.ingoing_edges(context.node_id): - defrag = LogicalPlanNode(node_type=LogicalPlanStepType.Defragment) - context.optimized_plan.insert_node_after(random_string(), defrag, node) - return context - - def complete(self, plan: LogicalPlan, context: HeuristicOptimizerContext) -> LogicalPlan: - # No finalization needed for this strategy - return plan diff --git a/opteryx/utils/file_decoders.py b/opteryx/utils/file_decoders.py index 6f2cb346c..33690c281 100644 --- a/opteryx/utils/file_decoders.py +++ b/opteryx/utils/file_decoders.py @@ -230,14 +230,6 @@ def parquet_decoder( PredicatePushable.to_dnf(selection) if selection else (None, None) ) - if projection == [] and selection is None: - # Create a boolean array with True values, one for each column in the Parquet file - bool_array = pyarrow.array([True] * parquet_file.metadata.num_rows, type=pyarrow.bool_()) - # Create a PyArrow Table with the column name '*' - table = pyarrow.Table.from_arrays([bool_array], ["*"]) - - return (parquet_file.metadata.num_rows, 0, table) - # Determine the columns needed for projection and filtering projection_set = set(p.source_column for p in projection or []) filter_columns = { diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 35e2f5def..3b6e18c64 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1240,6 +1240,7 @@ ("SELECT CONCAT_WS('-', ('1', '2', '3'))", 1, 1, None), ("SELECT CONCAT_WS('-', ('1', '2', '3')) FROM $planets", 9, 1, None), ("SELECT IFNULL(death_date, '1970-01-01') FROM $astronauts", 357, 1, None), + ("SELECT IFNOTNULL(death_date, '1970-01-01') FROM $astronauts", 357, 1, None), ("SELECT RANDOM_STRING(88) FROM $planets", 9, 1, None), ("SELECT * FROM $planets WHERE STARTS_WITH(name, 'M')", 2, 20, None), ("SELECT * FROM $astronauts WHERE STARTS_WITH(name, 'Jo')", 23, 19, None), @@ -1690,6 +1691,9 @@ ("SELECT IFNULL(NULL, 'default') as result", 1, 1, None), ("SELECT IFNULL('value', 'default') as result", 1, 1, None), ("SELECT IFNULL(NULL, NULL) as result", 1, 1, None), + ("SELECT IFNOTNULL(NULL, 'default') as result", 1, 1, None), + 
("SELECT IFNOTNULL('value', 'default') as result", 1, 1, None), + ("SELECT IFNOTNULL(NULL, NULL) as result", 1, 1, None), ("SELECT COALESCE(NULL, 'default') as coalesced_value", 1, 1, None), ("SELECT COALESCE(NULL, 'default', 'fallback') as coalesced_value", 1, 1, None), ("SELECT COALESCE('first', NULL, 'fallback') as coalesced_value", 1, 1, None), @@ -1928,6 +1932,8 @@ ("SELECT count('a') FROM $satellites", 1, 1, None), ("SELECT avg(1), name FROM $satellites group by name", 177, 2, None), ("SELECT avg(1) FROM $satellites", 1, 1, None), + ("SELECT surface_pressure FROM $planets WHERE IFNOTNULL(surface_pressure, 0.0) == 0.0", 5, 1, None), + # **************************************************************************************** diff --git a/tests/storage/test_blob_gcs.py b/tests/storage/test_blob_gcs.py index 3d237b2ee..98b899695 100644 --- a/tests/storage/test_blob_gcs.py +++ b/tests/storage/test_blob_gcs.py @@ -63,7 +63,13 @@ expected_rowcount=1, expected_columncount=1, stats={"blobs_read": 1018, "rows_read": 9162} - ) + ), + TestCase( + query=f"SELECT kepler_name FROM {BUCKET_NAME}.exoplanets AS exoplanets WHERE kepler_name = 'non-existant' ORDER BY kepler_name LIMIT 5", + expected_rowcount=0, + expected_columncount=1, + stats={"columns_read": 1}, + ), ] diff --git a/tests/storage/test_cache_valkey.py b/tests/storage/test_cache_valkey.py index f76cb197c..d6030d918 100644 --- a/tests/storage/test_cache_valkey.py +++ b/tests/storage/test_cache_valkey.py @@ -61,6 +61,7 @@ def test_skip_on_error(): cache = ValkeyCache() cache.set(b"key", b"value") assert cache.get(b"key") == b"value" + assert cache.hits > 0 cache._consecutive_failures = 10 assert cache.get(b"key") is None From 088ec50774f3c3acc8e4e288480eafdd6405aa2c Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 30 Dec 2024 12:09:02 +0000 Subject: [PATCH 138/157] Opteryx Version 0.19.0-alpha.933 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index b6775dd47..733e39f78 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 932 +__build__ = 933 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 8b5fb0e7211f22042689e560b04f13389c7c5c29 Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 30 Dec 2024 14:32:53 +0000 Subject: [PATCH 139/157] #2173 --- opteryx/operators/base_plan_node.py | 2 +- opteryx/operators/exit_node.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 75bc8979d..60e469e03 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -101,7 +101,7 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab # if we get empty sets, don't yield them unless they're the only one if result.num_rows > 0: - self.statistics.avoided_empty_datasets += 1 + self.statistics.avoided_empty_morsels += 1 at_least_one = True yield result continue diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 746fb0f65..ec24da6eb 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -49,10 +49,17 @@ def execute(self, morsel: Table, **kwargs) -> Table: if morsel == EOS: if not self.at_least_one: import pyarrow + from orso.schema import RelationSchema + from orso.schema import convert_orso_schema_to_arrow_schema + + orso_schema = RelationSchema( + name="Relation", columns=[c.schema_column for c in self.columns] + ) + arrow_shema = convert_orso_schema_to_arrow_schema(orso_schema) yield pyarrow.Table.from_arrays( [pyarrow.array([]) for _ in self.columns], - names=[column.current_name for column in self.columns], + schema=arrow_shema, ) yield EOS return From 120b4e80be97f193b2d49b71952d279469c79113 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 30 Dec 2024 14:33:19 +0000 Subject: [PATCH 140/157] Opteryx Version 0.19.0-alpha.934 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 733e39f78..daf7cd85d 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 933 +__build__ = 934 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From acce1375f43b333ea1dd02a8b6d3913260d2495a Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 30 Dec 2024 14:56:38 +0000 Subject: [PATCH 141/157] #2173 --- opteryx/operators/exit_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index ec24da6eb..472708c44 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -64,7 +64,7 @@ def execute(self, morsel: Table, **kwargs) -> Table: yield EOS return - if morsel.num_columns == 0: + if morsel.num_rows == 0: yield None return From 4ff5cab2f3c8ae1e6ed9eb97b36b23b20def5795 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 30 Dec 2024 14:57:02 +0000 Subject: [PATCH 142/157] Opteryx Version 0.19.0-alpha.935 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index daf7cd85d..55a5f6228 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 934 +__build__ = 935 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
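The `ExitNode` changes in the #2173 patches above ensure a query whose filters remove every row still returns a zero-row table with the right column names and types, instead of a bare empty array. The essential construction, sketched with an invented two-column schema:

```python
import pyarrow

# illustrative schema; Opteryx derives the real one from the bound orso schema
schema = pyarrow.schema([("name", pyarrow.string()), ("mass", pyarrow.float64())])

empty = pyarrow.Table.from_arrays(
    [pyarrow.array([], type=field.type) for field in schema],
    schema=schema,
)
print(empty.num_rows, empty.schema.names)  # 0 ['name', 'mass']
```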
From 36b748a38ab5a523848ba44197a6718a8f7aa906 Mon Sep 17 00:00:00 2001 From: joocer Date: Mon, 30 Dec 2024 16:12:53 +0000 Subject: [PATCH 143/157] #2173 --- opteryx/operators/base_plan_node.py | 4 +- opteryx/operators/exit_node.py | 71 +++++++++++-------- .../flat/ten_files/tweets-0000 copy 10.jsonl | 2 +- .../flat/ten_files/tweets-0000 copy 4.jsonl | 2 +- .../test_shapes_and_errors_battery.py | 2 + 5 files changed, 48 insertions(+), 33 deletions(-) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 60e469e03..717fceb2d 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -83,7 +83,9 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab result = next(generator, END) # Retrieve the next item from the generator execution_time = time.monotonic_ns() - start_time self.execution_time += execution_time - self.statistics.increase("time_" + self.name.lower(), execution_time) + self.statistics.increase( + "time_" + self.name.lower().replace(" ", "_"), execution_time + ) # Update metrics for valid results if result == END: diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 472708c44..15eb4a72c 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -32,6 +32,32 @@ def __init__(self, properties: QueryProperties, **parameters): BasePlanNode.__init__(self, properties=properties, **parameters) self.at_least_one = False + final_columns = [] + final_names = [] + for column in self.columns: + final_columns.append(column.schema_column.identity) + final_names.append(column.current_name) + + if len(final_columns) != len(set(final_columns)): # pragma: no cover + from collections import Counter + + duplicates = [column for column, count in Counter(final_columns).items() if count > 1] + matches = {a for a, b in zip(final_names, final_columns) if b in duplicates} + raise AmbiguousIdentifierError( + message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`" + ) + + if len(set(final_names)) != len(final_names): # we have duplicate names + final_names = [] + for column in self.columns: + # if column.schema_column.origin: + # final_names.append(f"{column.schema_column.origin[0]}.{column.current_name}") + # else: + final_names.append(column.qualified_name) + + self.final_columns = final_columns + self.final_names = final_names + @classmethod def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover raise NotImplementedError() @@ -55,12 +81,16 @@ def execute(self, morsel: Table, **kwargs) -> Table: orso_schema = RelationSchema( name="Relation", columns=[c.schema_column for c in self.columns] ) - arrow_shema = convert_orso_schema_to_arrow_schema(orso_schema) + arrow_shema = convert_orso_schema_to_arrow_schema(orso_schema, use_identities=True) - yield pyarrow.Table.from_arrays( + morsel = pyarrow.Table.from_arrays( [pyarrow.array([]) for _ in self.columns], schema=arrow_shema, ) + morsel = morsel.select(self.final_columns) + morsel = morsel.rename_columns(self.final_names) + yield morsel + yield EOS return @@ -70,40 +100,21 @@ def execute(self, morsel: Table, **kwargs) -> Table: self.at_least_one = True - final_columns = [] - final_names = [] - for column in self.columns: - final_columns.append(column.schema_column.identity) - final_names.append(column.current_name) - - if len(final_columns) != len(set(final_columns)): # pragma: no cover - from collections import Counter - - duplicates = [column 
for column, count in Counter(final_columns).items() if count > 1] - matches = {a for a, b in zip(final_names, final_columns) if b in duplicates} - raise AmbiguousIdentifierError( - message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`" - ) - - if len(set(final_names)) != len(final_names): # we have duplicate names - final_names = [] - for column in self.columns: - # if column.schema_column.origin: - # final_names.append(f"{column.schema_column.origin[0]}.{column.current_name}") - # else: - final_names.append(column.qualified_name) - - if not set(final_columns).issubset(morsel.column_names): # pragma: no cover - mapping = {name: int_name for name, int_name in zip(final_columns, final_names)} + if not set(self.final_columns).issubset(morsel.column_names): # pragma: no cover + mapping = { + name: int_name for name, int_name in zip(self.final_columns, self.final_names) + } missing_references = { - mapping.get(ref): ref for ref in final_columns if ref not in morsel.column_names + mapping.get(ref): ref + for ref in self.final_columns + if ref not in morsel.column_names } raise InvalidInternalStateError( f"The following fields were not in the resultset - {', '.join(missing_references.keys())}" ) - morsel = morsel.select(final_columns) - morsel = morsel.rename_columns(final_names) + morsel = morsel.select(self.final_columns) + morsel = morsel.rename_columns(self.final_names) yield morsel diff --git a/testdata/flat/ten_files/tweets-0000 copy 10.jsonl b/testdata/flat/ten_files/tweets-0000 copy 10.jsonl index 15352fffd..1d7fa0f78 100644 --- a/testdata/flat/ten_files/tweets-0000 copy 10.jsonl +++ b/testdata/flat/ten_files/tweets-0000 copy 10.jsonl @@ -22,4 +22,4 @@ {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7049432, "tweet": "NFL rookie Josh Jacobs, whose family was once homeless, surprises his dad with a house. https://t.co/cbINjU35LI", "location": "New York, NY", "sentiment": -0.125, "timestamp": "2020-01-15T21:53:06"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7050377, "tweet": "A Christian high school student in Kentucky was expelled after school administrators saw a photograph from her 15th birthday party in which she was wearing a rainbow sweater and smiling next to a rainbow birthday cake. https://t.co/rjw1LZU25Y", "location": "New York, NY", "sentiment": 0, "timestamp": "2020-01-16T07:20:03"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7050442, "tweet": "U.K. veteran says Prince Harry defended him from anti-gay soldiers. 
https://t.co/ilqkgkvvui - @NBCOUT", "location": "New York, NY", "sentiment": 0, "timestamp": "2020-01-16T08:41:07"} -{"userid": 612473, "username": "BBCNews", "user_verified": true, "followers": 10415949, "tweet": "Flybe to switch Newquay-Heathrow flights to Gatwick, in a change that could anger firms in the South West https://t.co/dgpnIVr917", "location": "London", "sentiment": -0.15789473684210525, "timestamp": "2020-01-16T16:10:25"} +{"userid": 612473, "username": "BBCNews", "user_verified": true, "followers": 10000, "tweet": "Flybe to switch Newquay-Heathrow flights to Gatwick, in a change that could anger firms in the South West https://t.co/dgpnIVr917", "location": "London", "sentiment": -0.15789473684210525, "timestamp": "2020-01-16T16:10:25"} diff --git a/testdata/flat/ten_files/tweets-0000 copy 4.jsonl b/testdata/flat/ten_files/tweets-0000 copy 4.jsonl index 15352fffd..e27b4a598 100644 --- a/testdata/flat/ten_files/tweets-0000 copy 4.jsonl +++ b/testdata/flat/ten_files/tweets-0000 copy 4.jsonl @@ -8,7 +8,7 @@ {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7028438, "tweet": "Man convicted of shooting a U.S. Border Patrol agent 9 years ago in a case that exposed \u201cFast and Furious\u201d federal gun operation is sentenced to life in prison. https://t.co/Q5hmD5TuAl", "location": "New York, NY", "sentiment": -0.36666666666666664, "timestamp": "2020-01-09T11:47:01"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7028252, "tweet": "Opinion | Kaitlin Menza: How Prince Harry's and Meghan's bombshell pushes the monarchy in a new direction https://t.co/L1YCpT3CCy - @NBCNewsTHINK", "location": "New York, NY", "sentiment": 0, "timestamp": "2020-01-09T10:45:03"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7029987, "tweet": "The surveillance video taken from outside Jeffrey Epstein's jail cell on the day of his first apparent suicide attempt has been permanently deleted because video from the wrong floor was saved due to a clerical error, federal prosecutors claim. https://t.co/EmYM6VmhTn", "location": "New York, NY", "sentiment": -0.1, "timestamp": "2020-01-09T19:23:03"} -{"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7031324, "tweet": "The FDA announces more recalls of heartburn medications that have been found to contain trace amounts of a substance that may be linked to cancer. https://t.co/gbH4ybuzVz", "location": "New York, NY", "sentiment": -0.038461538461538464, "timestamp": "2020-01-10T06:01:06"} +{"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 100, "tweet": "The FDA announces more recalls of heartburn medications that have been found to contain trace amounts of a substance that may be linked to cancer. https://t.co/gbH4ybuzVz", "location": "New York, NY", "sentiment": -0.038461538461538464, "timestamp": "2020-01-10T06:01:06"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7031963, "tweet": "Australian authorities urged nearly a quarter of a million people to evacuate their homes and prepared military backup as soaring temperatures and erratic winds were expected to fan deadly wildfires across the east coast. https://t.co/3XgoAKPBFv", "location": "New York, NY", "sentiment": 0, "timestamp": "2020-01-10T11:46:02"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7035751, "tweet": "NEW: Magnitude 6.0 earthquake hit Puerto Rico just before 8 a.m. 
ET Saturday morning, causing further damage as residents are reeling from series of strong quakes. https://t.co/H6tY26NwOo", "location": "New York, NY", "sentiment": -0.037037037037037035, "timestamp": "2020-01-11T14:42:25"} {"userid": 14173315, "username": "NBCNews", "user_verified": true, "followers": 7038057, "tweet": "A Seattle officer was suspended after an investigation found that a lie the officer told a hit-and-run suspect contributed to the man's suicide. https://t.co/cAjWPHEz4y", "location": "New York, NY", "sentiment": -0.16666666666666666, "timestamp": "2020-01-12T07:11:04"} diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 3b6e18c64..fcd7e8e89 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1933,6 +1933,8 @@ ("SELECT avg(1), name FROM $satellites group by name", 177, 2, None), ("SELECT avg(1) FROM $satellites", 1, 1, None), ("SELECT surface_pressure FROM $planets WHERE IFNOTNULL(surface_pressure, 0.0) == 0.0", 5, 1, None), + ("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 10 ORDER BY followers DESC LIMIT 10", 1, 1, None), + ("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 15 ORDER BY followers DESC LIMIT 10", 0, 1, None), # **************************************************************************************** From 6adf003bc63c67d8acb52ef72295013e41ed2773 Mon Sep 17 00:00:00 2001 From: XB500 Date: Mon, 30 Dec 2024 16:13:18 +0000 Subject: [PATCH 144/157] Opteryx Version 0.19.0-alpha.936 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 55a5f6228..fabc8ac22 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 935 +__build__ = 936 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From e927c9d7d672ddd7e341d13e7f9f79bfc8d9c90a Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 31 Dec 2024 15:55:23 +0000 Subject: [PATCH 145/157] #2177 --- opteryx/planner/binder/operator_map.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opteryx/planner/binder/operator_map.py b/opteryx/planner/binder/operator_map.py index 0c40853d8..daf2db6f8 100644 --- a/opteryx/planner/binder/operator_map.py +++ b/opteryx/planner/binder/operator_map.py @@ -279,6 +279,9 @@ def determine_type(node) -> OrsoTypes: if node.value in ("NotInSubQuery", "InSubQuery"): return OrsoTypes.BOOLEAN + if node.schema_column: + return node.schema_column.type + if node.left.node_type == NodeType.LITERAL: left_type = node.left.type elif node.left.schema_column: From e2f089c2a66bbc8d80e13669e924b2eeefc6bd00 Mon Sep 17 00:00:00 2001 From: XB500 Date: Tue, 31 Dec 2024 15:55:45 +0000 Subject: [PATCH 146/157] Opteryx Version 0.19.0-alpha.937 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index fabc8ac22..ff03ef467 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 936 +__build__ = 937 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
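The #2177 fix above short-circuits binary-operator type resolution as soon as the binder has attached a schema column, so the operand-based inference only runs for unbound expressions. Reduced to its skeleton, with stand-in classes in place of the Opteryx node types:

```python
from orso.types import OrsoTypes

class SchemaColumn:
    def __init__(self, type_):
        self.type = type_

class Node:
    def __init__(self, value, schema_column=None):
        self.value = value
        self.schema_column = schema_column

def determine_type(node) -> OrsoTypes:
    if node.value in ("NotInSubQuery", "InSubQuery"):
        return OrsoTypes.BOOLEAN
    if node.schema_column:
        return node.schema_column.type  # authoritative once bound
    raise NotImplementedError("fall back to inferring from the operands")

assert determine_type(Node("Plus", SchemaColumn(OrsoTypes.DOUBLE))) == OrsoTypes.DOUBLE
```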
From d63a250ed56c88fe4a50c8021cfebf23cc8408c6 Mon Sep 17 00:00:00 2001 From: joocer Date: Wed, 1 Jan 2025 21:23:29 +0000 Subject: [PATCH 147/157] #2181 --- README.md | 2 ++ opteryx/managers/expression/__init__.py | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eaac8afc8..69ee950ee 100644 --- a/README.md +++ b/README.md @@ -334,6 +334,7 @@ We’re actively adding features and improving performance. - **[mabel](https://github.com/mabel-dev/mabel)** Streaming data APIs - **[tarchia](https://github.com/mabel-dev/mabel)** Data Catalog + \ No newline at end of file diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index 5c6cd2267..735fa8630 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -21,6 +21,7 @@ from orso.tools import random_string from orso.types import OrsoTypes from pyarrow import Table +from pyarrow import compute from opteryx.exceptions import ColumnReferencedBeforeEvaluationError from opteryx.exceptions import UnsupportedSyntaxError @@ -107,7 +108,8 @@ def short_cut_and(root, table): # Evaluate left expression left_result = numpy.array(evaluate(root.left, table)) - left_result = numpy.asarray(left_result, dtype=bool) + null_indices = compute.is_null(left_result, nan_is_null=True).to_numpy(False) + left_result = numpy.asarray(left_result, dtype=numpy.bool_) # If all values in left_result are False, no need to evaluate the right expression if not left_result.any(): @@ -123,9 +125,14 @@ def short_cut_and(root, table): right_result = numpy.array(evaluate(root.right, subset_table)) # Combine results - # Iterate over subset_indices and update left_result at those positions left_result[subset_indices] = right_result + # handle nulls + if null_indices.any(): + left_result = left_result.astype(object) + numpy.place(left_result, null_indices, [None]) + return left_result + return left_result @@ -134,7 +141,9 @@ def short_cut_or(root, table): false_indices = numpy.arange(table.num_rows) # Evaluate left expression - left_result = numpy.array(evaluate(root.left, table), dtype=numpy.bool_) + left_result = numpy.array(evaluate(root.left, table)) + null_indices = compute.is_null(left_result, nan_is_null=True).to_numpy(False) + left_result = numpy.asarray(left_result, dtype=numpy.bool_) # Filter out indices where left_result is TRUE subset_indices = false_indices[~left_result] @@ -152,6 +161,12 @@ def short_cut_or(root, table): # Update left_result with the right_result where left_result was False left_result[subset_indices] = left_result[subset_indices] | right_result + # handle nulls + if null_indices.any(): + left_result = left_result.astype(object) + numpy.place(left_result, null_indices, [None]) + return left_result + return left_result From 94a450b6f9d979ab4e1069d587b71244d3f99547 Mon Sep 17 00:00:00 2001 From: XB500 Date: Wed, 1 Jan 2025 21:23:51 +0000 Subject: [PATCH 148/157] Opteryx Version 0.19.0-alpha.938 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index fabc8ac22..36e85fab1 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 936 +__build__ = 938 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
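The #2181 change above makes the short-circuit `AND`/`OR` evaluators null-aware: the left side's null positions are recorded before the boolean combine and re-injected afterwards, so `NULL AND TRUE` stays `NULL` rather than collapsing to `False`. The mechanism in isolation, using a plain `is None` test where the production code uses `pyarrow.compute.is_null`:

```python
import numpy

left = numpy.array([True, False, None], dtype=object)
right = numpy.array([True, True, True])

null_mask = numpy.array([v is None for v in left])       # remember the NULL positions
combined = numpy.array([bool(v) for v in left]) & right  # NULL treated as False here

result = combined.astype(object)
numpy.place(result, null_mask, [None])                   # restore SQL three-valued logic
print(result)  # [True False None]
```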
From 08102a25e3971c1b534abb03cd38b06108fba9b7 Mon Sep 17 00:00:00 2001 From: XB500 Date: Wed, 1 Jan 2025 21:24:29 +0000 Subject: [PATCH 149/157] Opteryx Version 0.19.0-alpha.939 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 36e85fab1..6944d2673 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 938 +__build__ = 939 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 0eff0bf8829dae59f07ccf63a7cade60748f17f5 Mon Sep 17 00:00:00 2001 From: joocer Date: Wed, 1 Jan 2025 22:04:21 +0000 Subject: [PATCH 150/157] #2159 --- opteryx/functions/__init__.py | 1 + opteryx/functions/other_functions.py | 10 +- opteryx/functions/string_functions.py | 1 - opteryx/operators/cross_join_node.py | 8 +- .../strategies/constant_folding.py | 128 ++++++++++++------ .../logical_planner_builders.py | 1 + opteryx/utils/formatter.py | 60 +++++++- tests/misc/test_sql_formatter.py | 2 +- .../test_shapes_and_errors_battery.py | 43 ++++++ tests/sql_battery/tests/optimizer.run_tests | 55 ++++++++ tests/sql_battery/tests/variations.run_tests | 4 + 11 files changed, 252 insertions(+), 61 deletions(-) create mode 100644 tests/sql_battery/tests/optimizer.run_tests create mode 100644 tests/sql_battery/tests/variations.run_tests diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index 9649e28d5..9e0316f25 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -354,6 +354,7 @@ def sleep(x): "STR": cast_varchar, "STRUCT": _iterate_single_parameter(lambda x: orjson.loads(str(x)) if x is not None else None), "DATE": lambda x: compute.cast(x, pyarrow.date32()), + "PASSTHRU": lambda x: x, "BLOB": cast_blob, "TRY_TIMESTAMP": try_cast("TIMESTAMP"), "TRY_BOOLEAN": try_cast("BOOLEAN"), diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index cab33168d..8f26b8895 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -85,7 +85,7 @@ def search(array, item, ignore_case: Optional[List[bool]] = None): return results_mask -def if_null(values, replacement): +def if_null(values, replacements): """ Replace null values in the input array with corresponding values from the replacement array. 
@@ -102,10 +102,10 @@ def if_null(values, replacement): from opteryx.managers.expression.unary_operations import _is_null # Create a mask for null values - is_null_array = _is_null(values) + is_null_mask = _is_null(values) # Use NumPy's where function to vectorize the operation - return numpy.where(is_null_array, replacement, values) + return numpy.where(is_null_mask, replacements, values).astype(replacements.dtype) def if_not_null(values: numpy.ndarray, replacements: numpy.ndarray) -> numpy.ndarray: @@ -122,8 +122,8 @@ def if_not_null(values: numpy.ndarray, replacements: numpy.ndarray) -> numpy.nda """ from opteryx.managers.expression.unary_operations import _is_not_null - not_null_mask = _is_not_null(values) - return numpy.where(not_null_mask, replacements, values) + is_not_null_mask = _is_not_null(values) + return numpy.where(is_not_null_mask, replacements, values).astype(replacements.dtype) def null_if(col1, col2): diff --git a/opteryx/functions/string_functions.py b/opteryx/functions/string_functions.py index 80f429a3b..e16a040cd 100644 --- a/opteryx/functions/string_functions.py +++ b/opteryx/functions/string_functions.py @@ -266,7 +266,6 @@ def _inner(val, _from, _for): _from -= 1 _for = int(_for) if _for and _for == _for else None # nosec if _for is None: - print(val, _from) return val[_from:] return val[_from : _for + _from] diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 44dbd5d8c..ab014a50c 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -28,7 +28,7 @@ from . import JoinNode -INTERNAL_BATCH_SIZE: int = 7500 # config +INTERNAL_BATCH_SIZE: int = 10000 # config MAX_JOIN_SIZE: int = 1000 # config MORSEL_SIZE_BYTES: int = 16 * 1024 * 1024 CROSS_JOIN_UNNEST_BATCH_SIZE = 10000 @@ -115,9 +115,9 @@ def _cross_join_unnest_column( else: # Rebuild the block with the new column data if we have any rows to build for - total_rows = len(indices) # Both arrays have the same length + total_rows = indices.size # Both arrays have the same length block_size = MORSEL_SIZE_BYTES / (left_block.nbytes / left_block.num_rows) - block_size = int(block_size // 1000) * 1000 + block_size = int(block_size / 1000) * 1000 for start_block in range(0, total_rows, block_size): # Compute the end index for the current chunk @@ -128,7 +128,7 @@ def _cross_join_unnest_column( new_column_data_chunk = new_column_data[start_block:end_block] # Create a new block using the chunk of indices - indices_chunk = numpy.array(indices_chunk, dtype=numpy.int32) + indices_chunk = numpy.array(indices_chunk, dtype=numpy.int64) new_block = left_block.take(indices_chunk) new_block = pyarrow.Table.from_batches([new_block], schema=morsel.schema) diff --git a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py index 24982afff..81ae02970 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py +++ b/opteryx/planner/cost_based_optimizer/strategies/constant_folding.py @@ -36,6 +36,27 @@ from .optimization_strategy import OptimizerContext +def _build_if_not_null_node(root, value, value_if_not_null) -> Node: + node = Node(node_type=NodeType.FUNCTION) + node.value = "IFNOTNULL" + node.parameters = [value, value_if_not_null] + node.schema_column = root.schema_column + node.query_column = root.query_column + return node + + +def _build_passthru_node(root, value) -> Node: + if root.node_type == NodeType.COMPARISON_OPERATOR: + return root + + node = 
Node(node_type=NodeType.FUNCTION) + node.value = "PASSTHRU" + node.parameters = [value] + node.schema_column = root.schema_column + node.query_column = root.query_column + return node + + def fold_constants(root: Node, statistics: QueryStatistics) -> Node: if root.node_type == NodeType.LITERAL: # if we're already a literal (constant), we can't fold @@ -58,70 +79,70 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: and root.right.node_type == NodeType.IDENTIFIER and root.left.value == 0 ): - # 0 * anything = 0 - root.left.schema_column = root.schema_column + # 0 * anything = 0 (except NULL) + node = _build_if_not_null_node(root, root.right, build_literal_node(0)) statistics.optimization_constant_fold_reduce += 1 - return root.left # 0 + return node if ( root.value == "Multiply" and root.right.node_type == NodeType.LITERAL and root.left.node_type == NodeType.IDENTIFIER and root.right.value == 0 ): - # anything * 0 = 0 - root.right.schema_column = root.schema_column + # anything * 0 = 0 (except NULL) + node = _build_if_not_null_node(root, root.left, build_literal_node(0)) statistics.optimization_constant_fold_reduce += 1 - return root.right # 0 + return node if ( root.value == "Multiply" and root.left.node_type == NodeType.LITERAL and root.right.node_type == NodeType.IDENTIFIER and root.left.value == 1 ): - # 1 * anything = anything - root.right.query_column = root.query_column + # 1 * anything = anything (except NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_reduce += 1 - return root.right # anything + return node if ( root.value == "Multiply" and root.right.node_type == NodeType.LITERAL and root.left.node_type == NodeType.IDENTIFIER and root.right.value == 1 ): - # anything * 1 = anything - root.left.query_column = root.query_column + # anything * 1 = anything (except NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_reduce += 1 - return root.left # anything + return node if ( root.value in "Plus" and root.left.node_type == NodeType.LITERAL and root.right.node_type == NodeType.IDENTIFIER and root.left.value == 0 ): - # 0 + anything = anything - root.right.query_column = root.query_column + # 0 + anything = anything (except NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_reduce += 1 - return root.right # anything + return node if ( root.value in ("Plus", "Minus") and root.right.node_type == NodeType.LITERAL and root.left.node_type == NodeType.IDENTIFIER and root.right.value == 0 ): - # anything +/- 0 = anything - root.left.query_column = root.query_column + # anything +/- 0 = anything (except NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_reduce += 1 - return root.left # anything + return node if ( root.value == "Divide" and root.right.node_type == NodeType.LITERAL and root.left.node_type == NodeType.IDENTIFIER and root.right.value == 1 ): - # anything / 1 = anything - root.left.schema_column = root.schema_column + # anything / 1 = anything (except NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_reduce += 1 - return root.left # anything + return node if root.node_type == NodeType.COMPARISON_OPERATOR: # anything LIKE '%' is true for non null values @@ -138,6 +159,7 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: node.schema_column = root.schema_column node.centre = root.left node.query_column = root.query_column + node.alias = 
root.alias statistics.optimization_constant_fold_reduce += 1 return node @@ -154,37 +176,37 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: and root.left.type == OrsoTypes.BOOLEAN and root.left.value ): - # True OR anything is True - root.left.schema_column = root.schema_column + # True OR anything is True (including NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.left + return node if ( root.right.node_type == NodeType.LITERAL and root.right.type == OrsoTypes.BOOLEAN and root.right.value ): - # anything OR True is True - root.right.schema_column = root.schema_column + # anything OR True is True (including NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.right + return node if ( root.left.node_type == NodeType.LITERAL and root.left.type == OrsoTypes.BOOLEAN and not root.left.value ): - # False OR anything is anything - root.right.schema_column = root.schema_column + # False OR anything is anything (except NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.right + return node if ( root.right.node_type == NodeType.LITERAL and root.right.type == OrsoTypes.BOOLEAN and not root.right.value ): - # anything OR False is anything - root.left.schema_column = root.schema_column + # anything OR False is anything (except NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.left + return node elif root.node_type == NodeType.AND: if ( @@ -192,37 +214,38 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: and root.left.type == OrsoTypes.BOOLEAN and not root.left.value ): - # False AND anything is False - root.left.schema_column = root.schema_column + # False AND anything is False (including NULL) + node = _build_passthru_node(root, root.left) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.left + return node if ( root.right.node_type == NodeType.LITERAL and root.right.type == OrsoTypes.BOOLEAN and not root.right.value ): - # anything AND False is False - root.right.schema_column = root.schema_column + # anything AND False is False (including NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.right + return node if ( root.left.node_type == NodeType.LITERAL and root.left.type == OrsoTypes.BOOLEAN and root.left.value ): - # True AND anything is anything - root.right.schema_column = root.schema_column + # True AND anything is anything (except NULL) + node = _build_passthru_node(root, root.right) statistics.optimization_constant_fold_boolean_reduce += 1 - return root.right + return node if ( root.right.node_type == NodeType.LITERAL and root.right.type == OrsoTypes.BOOLEAN and root.right.value ): - # anything AND True is anything - root.left.schema_column = root.schema_column + # anything AND True is anything (except NULL) + node = _build_passthru_node(root, root.left) + node.type = OrsoTypes.BOOLEAN statistics.optimization_constant_fold_boolean_reduce += 1 - return root.left + return node return root @@ -248,6 +271,7 @@ def fold_constants(root: Node, statistics: QueryStatistics) -> Node: statistics.optimization_constant_aggregation += 1 return root if agg.value in ("AVG", "MIN", "MAX"): + # AVG, MIN, MAX of a constant is the constant 
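+            # (e.g. AVG(7) is 7 for any non-empty group, and MIN/MAX likewise,
+            # so the aggregation can be replaced by the literal itself)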
statistics.optimization_constant_aggregation += 1 return build_literal_node(agg.parameters[0].value, root, root.schema_column.type) @@ -286,6 +310,20 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo node.columns = [fold_constants(c, self.statistics) for c in node.columns] context.optimized_plan[context.node_id] = node + # remove nesting in order by and group by clauses + if node.node_type == LogicalPlanStepType.Order: + new_order_by = [] + for field, order in node.order_by: + while field.node_type == NodeType.NESTED: + field = field.centre + new_order_by.append((field, order)) + node.order_by = new_order_by + context.optimized_plan[context.node_id] = node + + if node.node_type == LogicalPlanStepType.AggregateAndGroup: + node.groups = [g.centre if g.node_type == NodeType.NESTED else g for g in node.groups] + context.optimized_plan[context.node_id] = node + return context def complete(self, plan: LogicalPlan, context: OptimizerContext) -> LogicalPlan: diff --git a/opteryx/planner/logical_planner/logical_planner_builders.py b/opteryx/planner/logical_planner/logical_planner_builders.py index 88866f88f..0648aaaf9 100644 --- a/opteryx/planner/logical_planner/logical_planner_builders.py +++ b/opteryx/planner/logical_planner/logical_planner_builders.py @@ -575,6 +575,7 @@ def pattern_match(branch, alias: Optional[List[str]] = None, key=None): value=key, left=left, right=right, + alias=alias, ) diff --git a/opteryx/utils/formatter.py b/opteryx/utils/formatter.py index 21e6cf617..8b2c49cf2 100644 --- a/opteryx/utils/formatter.py +++ b/opteryx/utils/formatter.py @@ -22,6 +22,8 @@ def tokenize_string(string): def format_sql(sql): # pragma: no cover """ Adds colorization to SQL statements to make it easier to find keywords and literals + + It's not intended to be perfect, it's just to assist reading test outputs """ def color_comments(string): # pragma: no cover @@ -56,16 +58,24 @@ def _replacer(match): if word.endswith("'"): in_string_literal = False formatted_sql += "\033[0m " - elif word in ("(", ")", ",", ";"): + elif word in ("(", ")", ",", ";", "[", "]"): formatted_sql += "\033[38;5;102m" + word + "\033[0m " elif word.upper() in { "ANALYZE", "ANTI", "AS", + "ASC", "BY", + "CASE", + "CREATE", "CROSS", + "DATE", "DATES", + "DESC", "DISTINCT", + "ELSE", + "END", + "EXECUTE", "EXPLAIN", "FOR", "FROM", @@ -77,6 +87,7 @@ def _replacer(match): "JOIN", "LEFT", "LIMIT", + "MONTH", "OFFSET", "ON", "ORDER", @@ -87,43 +98,62 @@ def _replacer(match): "SET", "SHOW", "SINCE", + "THEN", "TODAY", "UNION", + "UNNEST", "USE", "USING", + "WHEN", "WHERE", "WITH", "YESTERDAY", }: formatted_sql += "\033[38;2;139;233;253m" + word.upper() + "\033[0m " - elif word.upper() in ("TRUE", "FALSE", "NULL", "DAY", "MONTH", "MINUTE"): - formatted_sql += "\033[38;2;255;184;108m" + word.upper() + "\033[0m " - elif (i + 1) < len(words) and words[i + 1] == "(": + elif word.upper() in ("TRUE", "FALSE", "NULL"): + formatted_sql += "\033[38;2;255;184;188m" + word.upper() + "\033[0m " + elif ((i + 1) < len(words) and words[i + 1] == "(") or word.upper() in ( + "ANY", + "CURRENT_TIME", + ): formatted_sql += "\033[38;2;80;250;123m" + word.upper() + "\033[0m" elif word.upper() in ( "=", + "==", ">=", "<=", "!=", + "%", "<", ">", "<>", "-", "+", + "*", + "/", + "//", + "||", + "|", + "DIV", "LIKE", "ILIKE", "RLIKE", "NOT", "AND", "OR", + "XOR", "IN", "SIMILAR", "TO", "BETWEEN", "IS", + "->", + "->>", + "::", + "@?", ): formatted_sql += "\033[38;2;189;147;249m" + word.upper() + "\033[0m " - elif 
word.replace(".", "", 1).isdigit(): + elif word.replace(".", "", 1).lstrip("-").isdigit(): formatted_sql += "\033[38;2;255;184;108m" + word + "\033[0m " elif word == "\n": formatted_sql = formatted_sql.strip() + "\n" @@ -132,7 +162,27 @@ def _replacer(match): formatted_sql += "\033[0m" + spaces_after = ( + "FROM", + "WHERE", + "JOIN", + "/", + "AND", + "OR", + "NOT", + "XOR", + "+", + "-", + "*", + "UNION", + "ON", + ) + formatted_sql = formatted_sql.replace(" \033[38;5;102m(", "\033[38;5;102m(") + for item in spaces_after: + formatted_sql = formatted_sql.replace( + f"{item}\033[0m\033[38;5;102m(", f"{item}\033[0m \033[38;5;102m(" + ) formatted_sql = formatted_sql.replace("(\033[0m ", "(\033[0m") formatted_sql = formatted_sql.replace(" \033[38;5;102m)", "\033[38;5;102m)") formatted_sql = formatted_sql.replace(" \033[38;5;102m,", "\033[38;5;102m,") diff --git a/tests/misc/test_sql_formatter.py b/tests/misc/test_sql_formatter.py index ddecfc228..bbdaad090 100644 --- a/tests/misc/test_sql_formatter.py +++ b/tests/misc/test_sql_formatter.py @@ -11,7 +11,7 @@ def test_format_sql(): formatted_sql = format_sql(sql) assert ( formatted_sql - == "\x1b[38;2;139;233;253mSELECT\x1b[0m * \x1b[38;2;139;233;253mFROM\x1b[0m mytable \x1b[0m" + == "\x1b[38;2;139;233;253mSELECT\x1b[0m \x1b[38;2;189;147;249m*\x1b[0m \x1b[38;2;139;233;253mFROM\x1b[0m mytable \x1b[0m" ), str(formatted_sql.encode()) + "\n" + formatted_sql diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index fcd7e8e89..29fa1d931 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2246,6 +2246,49 @@ ("SELECT town, LENGTH(NULLIF(town, 'Inglewood')) FROM (SELECT birth_place->'town' AS town FROM $astronauts) AS T", 357, 2, None), ("SELECT town, LENGTH(NULLIF(town, b'Inglewood')) FROM (SELECT birth_place->>'town' AS town FROM $astronauts) AS T", 357, 2, None), ("SELECT town, LENGTH(NULLIF(town, 'Inglewood')) FROM (SELECT birth_place->>'town' AS town FROM $astronauts) AS T", None, None, IncompatibleTypesError), + # 2159 + ("SELECT * FROM (SELECT 1 * surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT surface_pressure * 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT 0 * surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT surface_pressure * 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT 0 + surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT surface_pressure + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT surface_pressure - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT surface_pressure / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT TRUE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT FALSE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT TRUE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS 
NULL", 0, 2, None), + ("SELECT * FROM (SELECT FALSE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT (surface_pressure != 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT (surface_pressure != 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT (surface_pressure != 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT (surface_pressure != 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT name LIKE '%' as opt, name FROM $planets) AS sub WHERE opt IS TRUE", 9 , 2, None), + ("SELECT * FROM $planets WHERE (surface_pressure * 1 IS NULL) OR (surface_pressure + 0 IS NULL)", 4, 20, None), + ("SELECT * FROM $planets WHERE (surface_pressure / 1 IS NULL) AND (TRUE OR surface_pressure IS NULL)", 4, 20, None), + ("SELECT * FROM $planets WHERE ((FALSE AND (surface_pressure * 1) != 0) IS NULL) OR (surface_pressure IS NULL)", 4, 20, None), + ("SELECT * FROM $planets WHERE ((surface_pressure != 0) AND TRUE) IS NULL", 4, 20, None), + ("SELECT * FROM $planets WHERE ((surface_pressure != 0) OR FALSE) IS NULL", 4, 20, None), + ("SELECT COUNT(surface_pressure - 0) AS count_opt FROM $planets WHERE surface_pressure IS NULL", 1, 1, None), + ("SELECT name || '' AS opt FROM $planets", 9, 1, None), + ("SELECT name LIKE '%' AS opt FROM $planets", 9, 1, None), + ("SELECT name LIKE '%a%' AS opt FROM $planets", 9, 1, None), +# ("SELECT surface_pressure * 1 + surface_pressure * 0 AS opt FROM $planets", 4, 1, None), + ("SELECT (TRUE AND (surface_pressure != 0)) OR FALSE AS opt FROM $planets", 9, 1, None), + ("SELECT (surface_pressure / 1) * (surface_pressure - 0) AS opt FROM $planets", 9, 1, None), + # 2180 + ("SELECT * FROM $planets ORDER BY (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (id) ASC", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (id) DESC", 9, 20, None), + ("SELECT * FROM $planets ORDER BY name, (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY name ASC, (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY name DESC, (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (name), (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (name) ASC, (id) DESC", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (name) DESC, (id)", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (id), name", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (id) ASC, name", 9, 20, None), + ("SELECT * FROM $planets ORDER BY (id) DESC, name", 9, 20, None), ] # fmt:on diff --git a/tests/sql_battery/tests/optimizer.run_tests b/tests/sql_battery/tests/optimizer.run_tests new file mode 100644 index 000000000..66b7f5c5f --- /dev/null +++ b/tests/sql_battery/tests/optimizer.run_tests @@ -0,0 +1,55 @@ +SELECT NULL OR TRUE -- TRUE +SELECT NULL AND TRUE -- NULL +SELECT NULL OR FALSE -- NULL +SELECT NULL AND FALSE -- FALSE + +SELECT TRUE OR NULL +SELECT TRUE AND NULL +SELECT FALSE OR NULL +SELECT FALSE AND NULL +SELECT (TRUE) OR NULL +SELECT (TRUE) AND NULL +SELECT (FALSE) OR NULL +SELECT (FALSE) AND NULL +SELECT TRUE OR (NULL) +SELECT TRUE AND (NULL) +SELECT FALSE OR (NULL) +SELECT FALSE AND (NULL) +SELECT (TRUE OR NULL) +SELECT (TRUE AND NULL) +SELECT (FALSE OR NULL) +SELECT (FALSE AND NULL) +SELECT * FROM (SELECT 1 * surface_pressure as opt, surface_pressure FROM 
$planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure * 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 * surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure * 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 + surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT TRUE AND surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT FALSE AND surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT TRUE OR surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT FALSE OR surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 + surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure + 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 - surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT surface_pressure - 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 / surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 / surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 * 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 * 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 + 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 - 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 * 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 * 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 + 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 0 - 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL +SELECT * FROM (SELECT 1 / 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL diff --git a/tests/sql_battery/tests/variations.run_tests 
b/tests/sql_battery/tests/variations.run_tests
new file mode 100644
index 000000000..5be2ee870
--- /dev/null
+++ b/tests/sql_battery/tests/variations.run_tests
@@ -0,0 +1,4 @@
+# This is a complex query; we're going to run AI-generated variations of it to test edge cases
+
+SET @planet = 'Saturn'; SELECT planets.name AS nom, bigsats.occurances AS big_satellites_occurances, smallsats.occurances AS small_satellites_occurances FROM (SELECT DISTINCT id AS planetId, name FROM $planets WHERE name = @planet) AS planets LEFT JOIN (SELECT planetId, COUNT(*) AS occurances FROM $satellites s1 FOR DATES BETWEEN '2022-01-01' AND TODAY WHERE gm > 10 GROUP BY planetId) AS bigsats ON bigsats.planetId = planets.planetId LEFT JOIN (SELECT planetId, COUNT(*) AS occurances FROM $satellites s2 FOR DATES IN LAST_MONTH WHERE gm < 10 GROUP BY planetId) AS smallsats ON smallsats.planetId = planets.planetId;
+-- SET @planet = 'Saturn'; SELECT planets.name AS nom, bigsats.occurances AS big_satellites_occurances, smallsats.occurances AS small_satellites_occurances FROM (SELECT DISTINCT id AS planetId, name FROM $planets WHERE (name = @planet)) AS planets LEFT JOIN (SELECT planetId, COUNT(*) AS occurances FROM $satellites s1 FOR DATES BETWEEN '2022-01-01' AND TODAY WHERE (gm > 10) GROUP BY (planetId)) AS bigsats ON (bigsats.planetId = planets.planetId) LEFT JOIN (SELECT planetId, COUNT(*) AS occurances FROM $satellites s2 FOR DATES IN LAST_MONTH WHERE (gm < 10) GROUP BY (planetId)) AS smallsats ON (smallsats.planetId = planets.planetId);
\ No newline at end of file

From 82f86334b32ddbe45aa1e3164e90939ceb1cd35c Mon Sep 17 00:00:00 2001
From: XB500
Date: Wed, 1 Jan 2025 22:04:44 +0000
Subject: [PATCH 151/157] Opteryx Version 0.19.0-alpha.940

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index fabc8ac22..c8446023d 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 936
+__build__ = 940

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 0e978922d3389e4824ad709613a4a60af332877d Mon Sep 17 00:00:00 2001
From: XB500
Date: Wed, 1 Jan 2025 22:08:42 +0000
Subject: [PATCH 152/157] Opteryx Version 0.19.0-alpha.941

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index c8446023d..e42f25d47 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 940
+__build__ = 941

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
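A note on what the optimizer.run_tests cases above are really checking: SQL arithmetic and boolean logic are three-valued, so the textbook identities only fold safely when NULL propagation is preserved. That is why the constant_folding.py changes earlier in this series wrap 0 * x in IFNOTNULL and route x * 1-style folds through PASSTHRU instead of substituting bare literals. The sketch below is plain Python, not Opteryx internals, using None to stand in for SQL NULL (an assumption for illustration only):

# Illustrative only -- None stands in for SQL NULL; this is not Opteryx code.

def sql_multiply(a, b):
    # arithmetic propagates NULL: 0 * x is NULL, not 0, when x is NULL
    return None if a is None or b is None else a * b

def sql_and(a, b):
    # three-valued AND: FALSE dominates NULL, TRUE does not
    if a is False or b is False:
        return False
    if a is None or b is None:
        return None
    return a and b

def sql_or(a, b):
    # three-valued OR: TRUE dominates NULL, FALSE does not
    if a is True or b is True:
        return True
    if a is None or b is None:
        return None
    return a or b

assert sql_multiply(0, None) is None   # why 0 * x needs the IFNOTNULL wrapper
assert sql_and(False, None) is False   # FALSE AND x -> FALSE is safe, NULL included
assert sql_and(True, None) is None     # TRUE AND x -> x must let NULL through
assert sql_or(True, None) is True      # TRUE OR x -> TRUE is safe, NULL included
assert sql_or(False, None) is None     # FALSE OR x -> x must let NULL through

The shape-battery expectations above line up with this: $planets has four rows where surface_pressure is NULL, and each folded expression must still yield NULL for exactly those rows.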
From 86589a7aa213d4371cb6ed2cb1edb8edad1b5c92 Mon Sep 17 00:00:00 2001 From: joocer Date: Thu, 2 Jan 2025 22:00:57 +0000 Subject: [PATCH 153/157] #2186 --- opteryx/compiled/structures/hash_table.pyx | 14 +- .../capabilities/predicate_pushable.py | 10 + opteryx/functions/__init__.py | 281 +++++++++--------- opteryx/managers/expression/__init__.py | 25 +- opteryx/operators/aggregate_and_group_node.py | 4 - opteryx/operators/aggregate_node.py | 4 - opteryx/operators/base_plan_node.py | 5 +- opteryx/operators/cross_join_node.py | 4 - opteryx/operators/distinct_node.py | 4 - opteryx/operators/exit_node.py | 4 - opteryx/operators/explain_node.py | 4 - opteryx/operators/filter_join_node.py | 4 - opteryx/operators/filter_node.py | 6 +- opteryx/operators/function_dataset_node.py | 4 - opteryx/operators/heap_sort_node.py | 4 - opteryx/operators/inner_join_node.py | 21 -- opteryx/operators/inner_join_node_single.py | 4 - opteryx/operators/limit_node.py | 4 - opteryx/operators/outer_join_node.py | 4 - opteryx/operators/projection_node.py | 4 - opteryx/operators/read_node.py | 4 - opteryx/operators/set_variable_node.py | 4 - opteryx/operators/show_columns_node.py | 4 - opteryx/operators/show_create_node.py | 4 - opteryx/operators/show_value_node.py | 4 - opteryx/operators/sort_node.py | 4 - opteryx/operators/union_node.py | 4 - opteryx/planner/binder/binder.py | 4 +- opteryx/planner/binder/operator_map.py | 39 ++- .../logical_planner_builders.py | 4 + opteryx/utils/sql.py | 7 +- tests/misc/test_cast.py | 22 +- .../test_shapes_and_errors_battery.py | 21 +- .../sql_battery/tests/feature_tests.run_tests | 10 +- tests/sql_battery/tests/optimizer.run_tests | 16 +- 35 files changed, 270 insertions(+), 295 deletions(-) diff --git a/opteryx/compiled/structures/hash_table.pyx b/opteryx/compiled/structures/hash_table.pyx index 43d8eec71..1d8d38f4e 100644 --- a/opteryx/compiled/structures/hash_table.pyx +++ b/opteryx/compiled/structures/hash_table.pyx @@ -30,15 +30,19 @@ cdef class HashTable: cpdef bint insert(self, int64_t key, int64_t row_id): # If the key is already in the hash table, append the row_id to the existing list. # Otherwise, create a new list with the row_id. - if self.hash_table.find(key) != self.hash_table.end(): - self.hash_table[key].push_back(row_id) - return False - self.hash_table[key] = vector[int64_t](1, row_id) + cdef unordered_map[int64_t, vector[int64_t]].iterator it + it = self.hash_table.find(key) + if it == self.hash_table.end(): + self.hash_table[key] = vector[int64_t]() + self.hash_table[key].reserve(16) + self.hash_table[key].push_back(row_id) return True cpdef vector[int64_t] get(self, int64_t key): # Return the list of row IDs for the given key, or an empty list if the key is not found. 
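+        # unordered_map.find is O(1) on average; a missing key falls through to
+        # an empty vector, so callers can iterate the result unconditionally.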
-        if self.hash_table.find(key) != self.hash_table.end():
+        cdef unordered_map[int64_t, vector[int64_t]].iterator it
+        it = self.hash_table.find(key)
+        if it != self.hash_table.end():
             return self.hash_table[key]
         return vector[int64_t]()
diff --git a/opteryx/connectors/capabilities/predicate_pushable.py b/opteryx/connectors/capabilities/predicate_pushable.py
index 73d019005..2e6ee4437 100644
--- a/opteryx/connectors/capabilities/predicate_pushable.py
+++ b/opteryx/connectors/capabilities/predicate_pushable.py
@@ -23,6 +23,7 @@
 from opteryx.exceptions import NotSupportedError
 from opteryx.managers.expression import NodeType
+from opteryx.managers.expression import get_all_nodes_of_type
 from opteryx.models import Node
@@ -52,8 +53,17 @@ class PredicatePushable:
     PUSHABLE_TYPES: set = {t for t in OrsoTypes}

     def can_push(self, operator: Node, types: set = None) -> bool:
+        # we can only push simple expressions
+        all_nodes = get_all_nodes_of_type(operator.condition, ("*",))
+        if any(
+            n.node_type not in (NodeType.IDENTIFIER, NodeType.LITERAL, NodeType.COMPARISON_OPERATOR)
+            for n in all_nodes
+        ):
+            return False
+        # we can only push certain types
         if types and not types.issubset(self.PUSHABLE_TYPES):
             return False
+        # we can only push certain operators
         return self.PUSHABLE_OPS.get(operator.condition.value, False)

     def __init__(self, **kwargs):
diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py
index 9e0316f25..206fd2034 100644
--- a/opteryx/functions/__init__.py
+++ b/opteryx/functions/__init__.py
@@ -15,6 +15,7 @@
 import orjson
 import pyarrow
 from orso.cityhash import CityHash64
+from orso.types import OrsoTypes
 from pyarrow import ArrowNotImplementedError
 from pyarrow import compute
@@ -202,7 +203,7 @@ def try_cast(_type):
         "DECIMAL": decimal.Decimal,
         "VARCHAR": lambda x: str(x) if x is not None else x,
         "TIMESTAMP": dates.parse_iso,
-        "STRUCT": orjson.loads,
+        "STRUCT": lambda x: str(x).encode() if x is not None and not isinstance(x, bytes) else x,
         "DATE": lambda x: dates.parse_iso(x).date(),
     }
     if _type in casters:
@@ -335,161 +336,159 @@ def sleep(x):

 # The type is needed particularly when returning columns of Python objects
 # where the first entry is None.
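 # Each entry below is a 3-tuple: (implementation, result type, cost).
 # The result type is an OrsoTypes member, or the sentinel 0 when the column
 # stays untyped (as it was before this change) and is resolved later by the
 # binder; the third element looks like a per-call cost estimate, uniformly
 # 1.0 for now.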
FUNCTIONS = { - "VERSION": lambda x: None, # * - "CONNECTION_ID": lambda x: None, # * - "DATABASE": lambda x: None, # * - "USER": lambda x: None, # * - # DEBUG: "SLEEP": lambda x: [sleep(x)], # SLEEP is only available in 'debug' mode + "VERSION": (lambda x: None, OrsoTypes.VARCHAR, 1.0), + "CONNECTION_ID": (lambda x: None, OrsoTypes.VARCHAR, 1.0), + "DATABASE": (lambda x: None, OrsoTypes.VARCHAR, 1.0), + "USER": (lambda x: None, OrsoTypes.VARCHAR, 1.0), + # DEBUG: "SLEEP": (lambda x: [sleep(x)], OrsoTypes.NULL, 1.0), # SLEEP is only available in 'debug' mode # TYPE CONVERSION - "TIMESTAMP": lambda x: compute.cast(x, pyarrow.timestamp("us")), - "BOOLEAN": lambda x: compute.cast(x, "bool"), - "NUMERIC": lambda x: compute.cast(x, "float64"), - "INTEGER": lambda x: compute.cast(x, "int64", safe=False), - "DOUBLE": lambda x: compute.cast(x, "float64"), - "FLOAT": lambda x: compute.cast(x, "float64"), - "DECIMAL": lambda x: compute.cast(x, pyarrow.decimal128(19)), - "VARCHAR": cast_varchar, - "STRING": cast_varchar, - "STR": cast_varchar, - "STRUCT": _iterate_single_parameter(lambda x: orjson.loads(str(x)) if x is not None else None), - "DATE": lambda x: compute.cast(x, pyarrow.date32()), - "PASSTHRU": lambda x: x, - "BLOB": cast_blob, - "TRY_TIMESTAMP": try_cast("TIMESTAMP"), - "TRY_BOOLEAN": try_cast("BOOLEAN"), - "TRY_NUMERIC": try_cast("DOUBLE"), - "TRY_VARCHAR": try_cast("VARCHAR"), - "TRY_BLOB": try_cast("BLOB"), - "TRY_STRING": try_cast("VARCHAR"), # alias for VARCHAR - "TRY_STRUCT": try_cast("STRUCT"), - "TRY_INTEGER": try_cast("INTEGER"), - "TRY_DECIMAL": try_cast("DECIMAL"), - "TRY_DOUBLE": try_cast("DOUBLE"), - "TRY_DATE": try_cast("DATE"), + "TIMESTAMP": (lambda x: compute.cast(x, pyarrow.timestamp("us")), OrsoTypes.TIMESTAMP, 1.0), + "BOOLEAN": (lambda x: compute.cast(x, "bool"), OrsoTypes.BOOLEAN, 1.0), + "NUMERIC": (lambda x: compute.cast(x, "float64"), OrsoTypes.DOUBLE, 1.0), + "INTEGER": (lambda x: compute.cast(x, "int64", safe=False), OrsoTypes.INTEGER, 1.0), + "DOUBLE": (lambda x: compute.cast(x, "float64"), OrsoTypes.DOUBLE, 1.0), + "FLOAT": (lambda x: compute.cast(x, "float64"), OrsoTypes.DOUBLE, 1.0), + "DECIMAL": (lambda x: compute.cast(x, pyarrow.decimal128(19)), OrsoTypes.DECIMAL, 1.0), + "VARCHAR": (cast_varchar, OrsoTypes.VARCHAR, 1.0), + "STRING": (cast_varchar, OrsoTypes.VARCHAR, 1.0), + "STR": (cast_varchar, OrsoTypes.VARCHAR, 1.0), + "STRUCT": (try_cast("BLOB"), OrsoTypes.BLOB, 1.0), + "DATE": (lambda x: compute.cast(x, pyarrow.date32()), OrsoTypes.DATE, 1.0), + "PASSTHRU": (lambda x: x, 0, 1.0), + "BLOB": (cast_blob, OrsoTypes.BLOB, 1.0), + "TRY_TIMESTAMP": (try_cast("TIMESTAMP"), OrsoTypes.TIMESTAMP, 1.0), + "TRY_BOOLEAN": (try_cast("BOOLEAN"), OrsoTypes.BOOLEAN, 1.0), + "TRY_NUMERIC": (try_cast("DOUBLE"), OrsoTypes.DOUBLE, 1.0), + "TRY_VARCHAR": (try_cast("VARCHAR"), OrsoTypes.VARCHAR, 1.0), + "TRY_BLOB": (try_cast("BLOB"), OrsoTypes.BLOB, 1.0), + "TRY_STRING": (try_cast("VARCHAR"), OrsoTypes.VARCHAR, 1.0), + "TRY_STRUCT": (try_cast("STRUCT"), OrsoTypes.STRUCT, 1.0), + "TRY_INTEGER": (try_cast("INTEGER"), OrsoTypes.INTEGER, 1.0), + "TRY_DECIMAL": (try_cast("DECIMAL"), OrsoTypes.DECIMAL, 1.0), + "TRY_DOUBLE": (try_cast("DOUBLE"), OrsoTypes.DOUBLE, 1.0), + "TRY_DATE": (try_cast("DATE"), OrsoTypes.DATE, 1.0), # CHARS - "CHAR": string_functions.to_char, - "ASCII": string_functions.to_ascii, + "CHAR": (string_functions.to_char, OrsoTypes.VARCHAR, 1.0), + "ASCII": (string_functions.to_ascii, OrsoTypes.INTEGER, 1.0), # STRINGS - "LEN": 
_iterate_single_parameter(get_len), # LENGTH(str) -> int - "LENGTH": _iterate_single_parameter(get_len), # LENGTH(str) -> int - "UPPER": compute.utf8_upper, # UPPER(str) -> str - "LOWER": compute.utf8_lower, # LOWER(str) -> str - "LEFT": string_functions.string_slicer_left, - "RIGHT": string_functions.string_slicer_right, - "REVERSE": compute.utf8_reverse, - "SOUNDEX": string_functions.soundex, - "TITLE": compute.utf8_title, - "CONCAT": string_functions.concat, - "CONCAT_WS": string_functions.concat_ws, - "STARTS_WITH": string_functions.starts_w, - "ENDS_WITH": string_functions.ends_w, - "SUBSTRING": string_functions.substring, - "POSITION": _iterate_double_parameter(string_functions.position), - "TRIM": string_functions.trim, - "LTRIM": string_functions.ltrim, - "RTRIM": string_functions.rtrim, - "LPAD": string_functions.left_pad, - "RPAD": string_functions.right_pad, - "LEVENSHTEIN": string_functions.levenshtein, - "SPLIT": string_functions.split, - "MATCH_AGAINST": string_functions.match_against, - "REGEXP_REPLACE": string_functions.regex_replace, + "LEN": (_iterate_single_parameter(get_len), OrsoTypes.INTEGER, 1.0), # LENGTH(str) -> int + "LENGTH": (_iterate_single_parameter(get_len), OrsoTypes.INTEGER, 1.0), # LENGTH(str) -> int + "UPPER": (compute.utf8_upper, OrsoTypes.VARCHAR, 1.0), # UPPER(str) -> str + "LOWER": (compute.utf8_lower, OrsoTypes.VARCHAR, 1.0), # LOWER(str) -> str + "LEFT": (string_functions.string_slicer_left, OrsoTypes.VARCHAR, 1.0), + "RIGHT": (string_functions.string_slicer_right, OrsoTypes.VARCHAR, 1.0), + "REVERSE": (compute.utf8_reverse, OrsoTypes.VARCHAR, 1.0), + "SOUNDEX": (string_functions.soundex, OrsoTypes.VARCHAR, 1.0), + "TITLE": (compute.utf8_title, OrsoTypes.VARCHAR, 1.0), + "CONCAT": (string_functions.concat, OrsoTypes.VARCHAR, 1.0), + "CONCAT_WS": (string_functions.concat_ws, OrsoTypes.VARCHAR, 1.0), + "STARTS_WITH": (string_functions.starts_w, OrsoTypes.BOOLEAN, 1.0), + "ENDS_WITH": (string_functions.ends_w, OrsoTypes.BOOLEAN, 1.0), + "SUBSTRING": (string_functions.substring, OrsoTypes.VARCHAR, 1.0), + "POSITION": (_iterate_double_parameter(string_functions.position), OrsoTypes.INTEGER, 1.0), + "TRIM": (string_functions.trim, OrsoTypes.VARCHAR, 1.0), + "LTRIM": (string_functions.ltrim, OrsoTypes.VARCHAR, 1.0), + "RTRIM": (string_functions.rtrim, OrsoTypes.VARCHAR, 1.0), + "LPAD": (string_functions.left_pad, OrsoTypes.VARCHAR, 1.0), + "RPAD": (string_functions.right_pad, OrsoTypes.VARCHAR, 1.0), + "LEVENSHTEIN": (string_functions.levenshtein, OrsoTypes.INTEGER, 1.0), + "SPLIT": (string_functions.split, OrsoTypes.ARRAY, 1.0), + "MATCH_AGAINST": (string_functions.match_against, OrsoTypes.BOOLEAN, 1.0), + "REGEXP_REPLACE": (string_functions.regex_replace, OrsoTypes.VARCHAR, 1.0), # HASHING & ENCODING - "HASH": _iterate_single_parameter(lambda x: hex(CityHash64(str(x)))[2:]), - "MD5": _iterate_single_parameter(string_functions.get_md5), - "SHA1": _iterate_single_parameter(string_functions.get_sha1), - "SHA224": _iterate_single_parameter(string_functions.get_sha224), - "SHA256": _iterate_single_parameter(string_functions.get_sha256), - "SHA384": _iterate_single_parameter(string_functions.get_sha384), - "SHA512": _iterate_single_parameter(string_functions.get_sha512), - "RANDOM": number_functions.random_number, - "RAND": number_functions.random_number, - "NORMAL": number_functions.random_normal, - "RANDOM_STRING": number_functions.random_string, - "BASE64_ENCODE": _iterate_single_parameter(string_functions.get_base64_encode), - "BASE64_DECODE": 
_iterate_single_parameter(string_functions.get_base64_decode), - "BASE85_ENCODE": _iterate_single_parameter(string_functions.get_base85_encode), - "BASE85_DECODE": _iterate_single_parameter(string_functions.get_base85_decode), - "HEX_ENCODE": _iterate_single_parameter(string_functions.get_hex_encode), - "HEX_DECODE": _iterate_single_parameter(string_functions.get_hex_decode), + "HASH": (_iterate_single_parameter(lambda x: hex(CityHash64(str(x)))[2:]), OrsoTypes.BLOB, 1.0), + "MD5": (_iterate_single_parameter(string_functions.get_md5), OrsoTypes.BLOB, 1.0), + "SHA1": (_iterate_single_parameter(string_functions.get_sha1), OrsoTypes.BLOB, 1.0), + "SHA224": (_iterate_single_parameter(string_functions.get_sha224), OrsoTypes.BLOB, 1.0), + "SHA256": (_iterate_single_parameter(string_functions.get_sha256), OrsoTypes.BLOB, 1.0), + "SHA384": (_iterate_single_parameter(string_functions.get_sha384), OrsoTypes.BLOB, 1.0), + "SHA512": (_iterate_single_parameter(string_functions.get_sha512), OrsoTypes.BLOB, 1.0), + "RANDOM": (number_functions.random_number, OrsoTypes.DOUBLE, 1.0), + "RAND": (number_functions.random_number, OrsoTypes.DOUBLE, 1.0), + "NORMAL": (number_functions.random_normal, OrsoTypes.DOUBLE, 1.0), + "RANDOM_STRING": (number_functions.random_string, OrsoTypes.BLOB, 1.0), + "BASE64_ENCODE": (_iterate_single_parameter(string_functions.get_base64_encode), OrsoTypes.BLOB, 1.0), + "BASE64_DECODE": (_iterate_single_parameter(string_functions.get_base64_decode), OrsoTypes.BLOB, 1.0), + "BASE85_ENCODE": (_iterate_single_parameter(string_functions.get_base85_encode), OrsoTypes.BLOB, 1.0), + "BASE85_DECODE": (_iterate_single_parameter(string_functions.get_base85_decode), OrsoTypes.BLOB, 1.0), + "HEX_ENCODE": (_iterate_single_parameter(string_functions.get_hex_encode), OrsoTypes.BLOB, 1.0), + "HEX_DECODE": (_iterate_single_parameter(string_functions.get_hex_decode), OrsoTypes.BLOB, 1.0), + # OTHER - "GET": _get, - "GET_STRING": _get_string, - "LIST_CONTAINS": _iterate_double_parameter(other_functions.list_contains), - "ARRAY_CONTAINS": _iterate_double_parameter(other_functions.list_contains), - "LIST_CONTAINS_ANY": list_contains_any, - "ARRAY_CONTAINS_ANY": list_contains_any, - "LIST_CONTAINS_ALL": other_functions.list_contains_all, - "ARRAY_CONTAINS_ALL": other_functions.list_contains_all, - "SEARCH": other_functions.search, - "COALESCE": _coalesce, - "IFNULL": other_functions.if_null, - "IFNOTNULL": other_functions.if_not_null, - "SORT": _sort(numpy.sort), - "GREATEST": _iterate_single_parameter(numpy.nanmax), - "LEAST": _iterate_single_parameter(numpy.nanmin), - "IIF": numpy.where, -# "GENERATE_SERIES": series.generate_series, - "NULLIF": other_functions.null_if, - "CASE": select_values, #other_functions.case_when, - "JSONB_OBJECT_KEYS": other_functions.jsonb_object_keys, + "GET": (_get, 0, 1.0), + "GET_STRING": (_get_string, OrsoTypes.VARCHAR, 1.0), + "LIST_CONTAINS": (_iterate_double_parameter(other_functions.list_contains), OrsoTypes.BOOLEAN, 1.0), + "ARRAY_CONTAINS": (_iterate_double_parameter(other_functions.list_contains), OrsoTypes.BOOLEAN, 1.0), + "LIST_CONTAINS_ANY": (list_contains_any, OrsoTypes.BOOLEAN, 1.0), + "ARRAY_CONTAINS_ANY": (list_contains_any, OrsoTypes.BOOLEAN, 1.0), + "LIST_CONTAINS_ALL": (other_functions.list_contains_all, OrsoTypes.BOOLEAN, 1.0), + "ARRAY_CONTAINS_ALL": (other_functions.list_contains_all, OrsoTypes.BOOLEAN, 1.0), + "SEARCH": (other_functions.search, OrsoTypes.BOOLEAN, 1.0), + "COALESCE": (_coalesce, 0, 1.0), + "IFNULL": (other_functions.if_null, 0, 1.0), + 
"IFNOTNULL": (other_functions.if_not_null, 0, 1.0), + "SORT": (_sort(numpy.sort), OrsoTypes.ARRAY, 1.0), + "GREATEST": (_iterate_single_parameter(numpy.nanmax), 0, 1.0), + "LEAST": (_iterate_single_parameter(numpy.nanmin), 0, 1.0), + "IIF": (numpy.where, 0, 1.0), + "NULLIF": (other_functions.null_if, 0, 1.0), + "CASE": (select_values, 0, 1.0), + "JSONB_OBJECT_KEYS": (other_functions.jsonb_object_keys, OrsoTypes.ARRAY, 1.0), + # Vector - "COSINE_SIMILARITY": other_functions.cosine_similarity, + "COSINE_SIMILARITY": (other_functions.cosine_similarity, OrsoTypes.DOUBLE, 1.0), # NUMERIC - "ROUND": number_functions.round, - "FLOOR": number_functions.floor, - "CEIL": number_functions.ceiling, - "CEILING": number_functions.ceiling, # deprecated, remove 0.19.0 - "ABS": compute.abs, - "ABSOLUTE": compute.abs, # deprecated, remove 0.19.0 - "SIGN": compute.sign, - "SIGNUM": compute.sign, - "SQRT": compute.sqrt, - "TRUNC": compute.trunc, - "TRUNCATE": compute.trunc, # deprecated, remove 0.19.0 - "PI": lambda x: None, # * - "PHI": lambda x: None, # * - "E": lambda x: None, # * - "INT": _iterate_single_parameter(int), - "POWER": number_functions.safe_power, - "LN": compute.ln, - "LOG10": compute.log10, - "LOG2": compute.log2, - "LOG": compute.logb, + "ROUND": (number_functions.round, OrsoTypes.DOUBLE, 1.0), + "FLOOR": (number_functions.floor, OrsoTypes.DOUBLE, 1.0), + "CEIL": (number_functions.ceiling, OrsoTypes.DOUBLE, 1.0), + "CEILING": (number_functions.ceiling, OrsoTypes.DOUBLE, 1.0), # deprecated, remove 0.19.0 + "ABS": (compute.abs, 0, 1.0), + "ABSOLUTE": (compute.abs, 0, 1.0), # deprecated, remove 0.19.0 + "SIGN": (compute.sign, OrsoTypes.INTEGER, 1.0), + "SIGNUM": (compute.sign, OrsoTypes.INTEGER, 1.0), + "SQRT": (compute.sqrt, OrsoTypes.DOUBLE, 1.0), + "TRUNC": (compute.trunc, OrsoTypes.INTEGER, 1.0), + "TRUNCATE": (compute.trunc, OrsoTypes.INTEGER, 1.0), # deprecated, remove 0.19.0 + "PI": (lambda x: None, OrsoTypes.DOUBLE, 1.0), + "PHI": (lambda x: None, OrsoTypes.DOUBLE, 1.0), + "E": (lambda x: None, OrsoTypes.DOUBLE, 1.0), + "INT": (_iterate_single_parameter(int), OrsoTypes.INTEGER, 1.0), + "POWER": (number_functions.safe_power, OrsoTypes.DOUBLE, 1.0), + "LN": (compute.ln, OrsoTypes.DOUBLE, 1.0), + "LOG10": (compute.log10, OrsoTypes.DOUBLE, 1.0), + "LOG2": (compute.log2, OrsoTypes.DOUBLE, 1.0), + "LOG": (compute.logb, OrsoTypes.DOUBLE, 1.0), # DATES & TIMES - "DATE_TRUNC": _iterate_double_parameter_field_second(dates.date_trunc), - "TIME_BUCKET": date_functions.date_floor, - "DATEDIFF": date_functions.date_diff, - "TIMEDIFF": date_functions.time_diff, - "DATEPART": date_functions.date_part, - "DATE_FORMAT": date_functions.date_format, - "CURRENT_TIME": lambda x: None, # * - "UTC_TIMESTAMP": lambda x: None, # * - "NOW": lambda x: None, # * - "CURRENT_DATE": lambda x: None, # * - "TODAY": lambda x: None, # * -# "TIME": _repeat_no_parameters(date_functions.get_time), - "YESTERDAY": lambda x: None, # * -# "DATE": lambda x: compute.cast(x, "date32"), #_iterate_single_parameter(date_functions.get_date), - "YEAR": compute.year, - "MONTH": compute.month, - "DAY": compute.day, - "WEEK": compute.iso_week, - "HOUR": compute.hour, - "MINUTE": compute.minute, - "SECOND": compute.second, - "QUARTER": compute.quarter, - "FROM_UNIXTIME": date_functions.from_unixtimestamp, - "UNIXTIME": date_functions.unixtime, - + "DATE_TRUNC": (_iterate_double_parameter_field_second(dates.date_trunc), OrsoTypes.TIMESTAMP, 1.0), + "TIME_BUCKET": (date_functions.date_floor, OrsoTypes.TIMESTAMP, 1.0), + "DATEDIFF": 
(date_functions.date_diff, OrsoTypes.INTEGER, 1.0), + "TIMEDIFF": (date_functions.time_diff, OrsoTypes.INTEGER, 1.0), + "DATEPART": (date_functions.date_part, 0, 1.0), + "DATE_FORMAT": (date_functions.date_format, OrsoTypes.VARCHAR, 1.0), + "CURRENT_TIME": (lambda x: None, OrsoTypes.TIMESTAMP, 1.0), + "UTC_TIMESTAMP": (lambda x: None, OrsoTypes.INTEGER, 1.0), + "NOW": (lambda x: None, OrsoTypes.TIMESTAMP, 1.0), + "CURRENT_DATE": (lambda x: None, OrsoTypes.TIMESTAMP, 1.0), + "TODAY": (lambda x: None, OrsoTypes.TIME, 1.0), + "YESTERDAY": (lambda x: None, OrsoTypes.TIME, 1.0), + "YEAR": (compute.year, OrsoTypes.INTEGER, 1.0), + "MONTH": (compute.month, OrsoTypes.INTEGER, 1.0), + "DAY": (compute.day, OrsoTypes.INTEGER, 1.0), + "WEEK": (compute.iso_week, OrsoTypes.INTEGER, 1.0), + "HOUR": (compute.hour, OrsoTypes.INTEGER, 1.0), + "MINUTE": (compute.minute, OrsoTypes.INTEGER, 1.0), + "SECOND": (compute.second, OrsoTypes.INTEGER, 1.0), + "QUARTER": (compute.quarter, OrsoTypes.INTEGER, 1.0), + "FROM_UNIXTIME": (date_functions.from_unixtimestamp, OrsoTypes.TIMESTAMP, 1.0), + "UNIXTIME": (date_functions.unixtime, OrsoTypes.INTEGER, 1.0), } # fmt:on @@ -532,7 +531,7 @@ def apply_function(function: str = None, *parameters): parameters = [arr.compress(valid_positions) for arr in parameters] compressed = True - interim_results = FUNCTIONS[function](*parameters) + interim_results = FUNCTIONS[function][0](*parameters) if compressed: # fill the result set diff --git a/opteryx/managers/expression/__init__.py b/opteryx/managers/expression/__init__.py index 735fa8630..34d9e13ea 100644 --- a/opteryx/managers/expression/__init__.py +++ b/opteryx/managers/expression/__init__.py @@ -203,7 +203,7 @@ def _inner_evaluate(root: Node, table: Table): literal_type = root.type if literal_type == OrsoTypes.ARRAY: # creating ARRAY columns is expensive, so we don't create one full length - return numpy.array([root.value]) + return numpy.array([root.value], dtype=numpy.ndarray) if literal_type == OrsoTypes.VARCHAR: return numpy.array([root.value] * table.num_rows, dtype=numpy.unicode_) if literal_type == OrsoTypes.BLOB: @@ -319,7 +319,7 @@ def get_all_nodes_of_type(root, select_nodes: tuple) -> list: while stack: node = stack.pop() - if node.node_type in select_nodes: + if select_nodes == ("*",) or node.node_type in select_nodes: identifiers.append(node) if node.parameters: stack.extend( @@ -360,7 +360,26 @@ def evaluate_and_append(expressions, table: Table): if isinstance(new_column, pyarrow.ChunkedArray): new_column = new_column.combine_chunks() - table = table.append_column(statement.schema_column.identity, new_column) + + # if we know the intended type of the result column, cast it + field = statement.schema_column.identity + if statement.schema_column.type in (OrsoTypes.ARRAY,): + pass + elif statement.schema_column.type not in ( + 0, + OrsoTypes._MISSING_TYPE, + OrsoTypes.INTERVAL, + ): + field = pyarrow.field( + name=statement.schema_column.identity, + type=statement.schema_column.arrow_field.type, + ) + if isinstance(new_column, pyarrow.Array): + new_column = new_column.cast(field.type) + else: + new_column = pyarrow.array(new_column[0], type=field.type) + + table = table.append_column(field, new_column) return table diff --git a/opteryx/operators/aggregate_and_group_node.py b/opteryx/operators/aggregate_and_group_node.py index 1ab25ad72..848550781 100644 --- a/opteryx/operators/aggregate_and_group_node.py +++ b/opteryx/operators/aggregate_and_group_node.py @@ -67,10 +67,6 @@ def __init__(self, properties: 
QueryProperties, **parameters): self.buffer = [] - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover from opteryx.managers.expression import format_expression diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index c4ec29117..1677a57cc 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -190,10 +190,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.buffer = [] - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover return str(self.aggregates) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 717fceb2d..55c99ca83 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -41,10 +41,7 @@ def __init__(self, *, properties, **parameters): self.bytes_out = 0 self.columns = parameters.get("columns", []) - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - + @property def config(self) -> str: return "" diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index ab014a50c..4f196bb66 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -291,10 +291,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.continue_executing = True - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def name(self): # pragma: no cover return "Cross Join" diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 7d0a755fd..1ee41c9c7 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -29,10 +29,6 @@ def __init__(self, properties: QueryProperties, **parameters): self._distinct_on = [col.schema_column.identity for col in self._distinct_on] self.hash_set = HashSet() - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover return "" diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py index 15eb4a72c..f5e31a8ab 100644 --- a/opteryx/operators/exit_node.py +++ b/opteryx/operators/exit_node.py @@ -58,10 +58,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.final_columns = final_columns self.final_names = final_names - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover return None diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index 55540ae91..331598015 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -32,10 +32,6 @@ def name(self): # pragma: no cover def config(self): return "" - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - def execute(self, morsel: Table, **kwargs) -> Table: if self._query_plan: yield self._query_plan.explain(self.analyze) diff --git a/opteryx/operators/filter_join_node.py b/opteryx/operators/filter_join_node.py index 00245903b..06e9d111b 100644 --- 
a/opteryx/operators/filter_join_node.py +++ b/opteryx/operators/filter_join_node.py @@ -39,10 +39,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.right_hash_set = None - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def name(self): # pragma: no cover return self.join_type.replace(" ", "_") diff --git a/opteryx/operators/filter_node.py b/opteryx/operators/filter_node.py index 4795093ed..f4fad1a72 100644 --- a/opteryx/operators/filter_node.py +++ b/opteryx/operators/filter_node.py @@ -36,10 +36,6 @@ def __init__(self, properties: QueryProperties, **parameters): select_nodes=(NodeType.FUNCTION,), ) - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover return format_expression(self.filter) @@ -70,7 +66,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs) -> pyarrow.Table: ) mask = numpy.nonzero(mask)[0] - # if there's no matching rows, return empty morsel + # if there's no matching rows, don't return anything if mask.size > 0 and not numpy.all(mask is None): yield morsel.take(pyarrow.array(mask)) else: diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index ed61c3959..88366c845 100644 --- a/opteryx/operators/function_dataset_node.py +++ b/opteryx/operators/function_dataset_node.py @@ -89,10 +89,6 @@ def __init__(self, properties: QueryProperties, **parameters): self.columns = parameters.get("columns", []) self.args = parameters.get("args", []) - @classmethod - def from_json(cls, json_obj: str) -> "BasePlanNode": # pragma: no cover - raise NotImplementedError() - @property def config(self): # pragma: no cover from opteryx.managers.expression import format_expression diff --git a/opteryx/operators/heap_sort_node.py b/opteryx/operators/heap_sort_node.py index c3bed4d02..736f39438 100644 --- a/opteryx/operators/heap_sort_node.py +++ b/opteryx/operators/heap_sort_node.py @@ -51,10 +51,6 @@ def __init__(self, properties: QueryProperties, **parameters): f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. 
{cnfe}"
            )

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def config(self):  # pragma: no cover
         return f"LIMIT = {self.limit} ORDER = " + ", ".join(
diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py
index c7c0171f8..9b71c5b9c 100644
--- a/opteryx/operators/inner_join_node.py
+++ b/opteryx/operators/inner_join_node.py
@@ -85,10 +85,6 @@ def __init__(self, properties: QueryProperties, **parameters):

         self.lock = Lock()

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Inner Join"
@@ -110,17 +106,6 @@ def execute(self, morsel: Table, join_leg: str) -> Table:
             start = time.monotonic_ns()
             self.left_hash = hash_join_map(self.left_relation, self.left_columns)
             self.statistics.time_build_hash_map += time.monotonic_ns() - start
-
-            for right_morsel in self.right_buffer:
-                yield inner_join_with_preprocessed_left_side(
-                    left_relation=self.left_relation,
-                    right_relation=right_morsel,
-                    join_columns=self.right_columns,
-                    hash_table=self.left_hash,
-                )
-            self.right_buffer.clear()
-
-            return
         else:
             self.left_buffer.append(morsel)
             yield None
@@ -131,12 +116,6 @@
             yield EOS
             return

-        if self.left_hash is None:
-            # if we've not built the hash map, cache this morsel
-            self.right_buffer.append(morsel)
-            yield None
-            return
-
         # do the join
         new_morsel = inner_join_with_preprocessed_left_side(
             left_relation=self.left_relation,
diff --git a/opteryx/operators/inner_join_node_single.py b/opteryx/operators/inner_join_node_single.py
index 5a4899676..52ddc1350 100644
--- a/opteryx/operators/inner_join_node_single.py
+++ b/opteryx/operators/inner_join_node_single.py
@@ -167,10 +167,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.left_hash = None
         self.left_relation = None

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Inner Join (Single)"
diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py
index 2b6182ae8..332d48378 100644
--- a/opteryx/operators/limit_node.py
+++ b/opteryx/operators/limit_node.py
@@ -28,10 +28,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.remaining_rows = self.limit if self.limit is not None else float("inf")
         self.rows_left_to_skip = max(0, self.offset)

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "LIMIT"
diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py
index 243ccddfb..7c5eddc6a 100644
--- a/opteryx/operators/outer_join_node.py
+++ b/opteryx/operators/outer_join_node.py
@@ -181,10 +181,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.right_buffer = []
         self.left_relation = None

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return self.join_type
diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py
index 1edf09b42..c03f1df92 100644
--- a/opteryx/operators/projection_node.py
+++ b/opteryx/operators/projection_node.py
@@ -41,10 +41,6 @@ def __init__(self, properties: QueryProperties, **parameters):

         self.columns = parameters["projection"]

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def config(self):  # pragma: no cover
         from opteryx.managers.expression import format_expression
diff --git a/opteryx/operators/read_node.py b/opteryx/operators/read_node.py
index 7023e62e7..5086e2b0f 100644
--- a/opteryx/operators/read_node.py
+++ b/opteryx/operators/read_node.py
@@ -157,10 +157,6 @@ def to_dict(self) -> dict:
             "filters": self.predicates,
         }

-    @classmethod
-    def from_dict(cls, dic: dict) -> "BasePlanNode":
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         """friendly name for this step"""
diff --git a/opteryx/operators/set_variable_node.py b/opteryx/operators/set_variable_node.py
index da3c33de2..681ff6701 100644
--- a/opteryx/operators/set_variable_node.py
+++ b/opteryx/operators/set_variable_node.py
@@ -24,10 +24,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.value = parameters.get("value")
         self.variables = parameters.get("variables")

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Set Variables"
diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py
index d95295c7e..eecf9d616 100644
--- a/opteryx/operators/show_columns_node.py
+++ b/opteryx/operators/show_columns_node.py
@@ -48,10 +48,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.collector = None
         self.seen = False

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Show Columns"
diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py
index b8409e1bb..cbfb42992 100644
--- a/opteryx/operators/show_create_node.py
+++ b/opteryx/operators/show_create_node.py
@@ -25,10 +25,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.object_type = parameters.get("object_type")
         self.object_name = parameters.get("object_name")

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Show"
diff --git a/opteryx/operators/show_value_node.py b/opteryx/operators/show_value_node.py
index 265317bb6..a177986e9 100644
--- a/opteryx/operators/show_value_node.py
+++ b/opteryx/operators/show_value_node.py
@@ -34,10 +34,6 @@ def __init__(self, properties: QueryProperties, **parameters):
             self.key = self.value
             self.value = properties.variables[self.value]

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Show Value"
diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py
index 813f3b9ec..198e32a38 100644
--- a/opteryx/operators/sort_node.py
+++ b/opteryx/operators/sort_node.py
@@ -31,10 +31,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.order_by = parameters.get("order_by", [])
         self.morsels = []

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def config(self):  # pragma: no cover
         return ", ".join([f"{i[0].value} {i[1][0:3].upper()}" for i in self.order_by])
diff --git a/opteryx/operators/union_node.py b/opteryx/operators/union_node.py
index c40cf9559..2c2af87de 100644
--- a/opteryx/operators/union_node.py
+++ b/opteryx/operators/union_node.py
@@ -25,10 +25,6 @@ def __init__(self, properties: QueryProperties, **parameters):
         self.seen_first_eos = False
         self.schema = None

-    @classmethod
-    def from_json(cls, json_obj: str) -> "BasePlanNode":  # pragma: no cover
-        raise NotImplementedError()
-
     @property
     def name(self):  # pragma: no cover
         return "Union"
diff --git a/opteryx/planner/binder/binder.py b/opteryx/planner/binder/binder.py
index 7ffe96c47..104a708e0 100644
--- a/opteryx/planner/binder/binder.py
+++ b/opteryx/planner/binder/binder.py
@@ -22,6 +22,7 @@
 from opteryx.exceptions import InvalidInternalStateError
 from opteryx.exceptions import UnexpectedDatasetReferenceError
 from opteryx.functions import DEPRECATED_FUNCTIONS
+from opteryx.functions import FUNCTIONS
 from opteryx.functions import fixed_value_function
 from opteryx.managers.expression import NodeType
 from opteryx.models import Node
@@ -306,7 +307,8 @@ def inner_binder(node: Node, context: BindingContext) -> Tuple[Node, Any]:
             node.type = result_type
             node.value = fixed_function_result
         else:
-            schema_column = FunctionColumn(name=column_name, type=0, aliases=aliases)
+            _, result_type, _ = FUNCTIONS.get(node.value, (None, 0, None))
+            schema_column = FunctionColumn(name=column_name, type=result_type, aliases=aliases)
             schemas["$derived"].columns.append(schema_column)
             node.derived_from = []
             node.schema_column = schema_column
diff --git a/opteryx/planner/binder/operator_map.py b/opteryx/planner/binder/operator_map.py
index daf2db6f8..d2b796619 100644
--- a/opteryx/planner/binder/operator_map.py
+++ b/opteryx/planner/binder/operator_map.py
@@ -34,11 +34,24 @@ class OperatorMapType(NamedTuple):
     (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "BitwiseOr"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "StringConcat"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.BLOB, OrsoTypes.VARCHAR, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.BLOB, OrsoTypes.BLOB, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.BLOB, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.BLOB, OrsoTypes.BLOB, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "Eq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "NotEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "Gt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "GtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "Lt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "LtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "Like"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "NotLike"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "ILike"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "NotILike"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "RLike"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "NotRLike"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.BLOB, OrsoTypes.BLOB, "StringConcat"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.BOOLEAN, OrsoTypes.ARRAY, "InList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.BOOLEAN, OrsoTypes.ARRAY, "NotInList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.BOOLEAN, OrsoTypes.BOOLEAN, "Or"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
@@ -94,6 +107,10 @@ class OperatorMapType(NamedTuple):
     (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "GtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "Lt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "LtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "Plus"): OperatorMapType(OrsoTypes.DECIMAL, None, 100.0),
+    (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "Minus"): OperatorMapType(OrsoTypes.DECIMAL, None, 100.0),
+    (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "Multiply"): OperatorMapType(OrsoTypes.DECIMAL, None, 100.0),
+    (OrsoTypes.DECIMAL, OrsoTypes.INTEGER, "Divide"): OperatorMapType(OrsoTypes.DECIMAL, None, 100.0),
     (OrsoTypes.DOUBLE, OrsoTypes.ARRAY, "InList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.DOUBLE, OrsoTypes.ARRAY, "NotInList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.DOUBLE, OrsoTypes.DECIMAL, "Eq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
@@ -131,6 +148,16 @@ class OperatorMapType(NamedTuple):
     (OrsoTypes.INTEGER, OrsoTypes.DATE, "GtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.INTEGER, OrsoTypes.DATE, "Lt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.INTEGER, OrsoTypes.DATE, "LtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Eq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "NotEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Gt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "GtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Lt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "LtEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Divide"): OperatorMapType(OrsoTypes.DOUBLE, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Multiply"): OperatorMapType(OrsoTypes.DOUBLE, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Plus"): OperatorMapType(OrsoTypes.DOUBLE, None, 100.0),
+    (OrsoTypes.INTEGER, OrsoTypes.DECIMAL, "Minus"): OperatorMapType(OrsoTypes.DOUBLE, None, 100.0),
     (OrsoTypes.INTEGER, OrsoTypes.DOUBLE, "Eq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.INTEGER, OrsoTypes.DOUBLE, "NotEq"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.INTEGER, OrsoTypes.DOUBLE, "Gt"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
@@ -184,10 +211,10 @@ class OperatorMapType(NamedTuple):
     (OrsoTypes.INTERVAL, OrsoTypes.DATE, "Plus"): OperatorMapType(OrsoTypes.TIMESTAMP, None, 100.0),
     (OrsoTypes.INTERVAL, OrsoTypes.DATE, "Minus"): OperatorMapType(OrsoTypes.TIMESTAMP, None, 100.0),
     (OrsoTypes.STRUCT, OrsoTypes.VARCHAR, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.STRUCT, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.STRUCT, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.STRUCT, OrsoTypes.VARCHAR, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.STRUCT, OrsoTypes.BLOB, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.STRUCT, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.STRUCT, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.STRUCT, OrsoTypes.BLOB, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.TIMESTAMP, OrsoTypes.ARRAY, "InList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.TIMESTAMP, OrsoTypes.ARRAY, "NotInList"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
@@ -245,10 +272,10 @@ class OperatorMapType(NamedTuple):
     (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "BitwiseOr"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "StringConcat"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
     (OrsoTypes.VARCHAR, OrsoTypes.BLOB, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.VARCHAR, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.VARCHAR, OrsoTypes.BLOB, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.VARCHAR, OrsoTypes.BLOB, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
     (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "Arrow"): OperatorMapType(OrsoTypes._MISSING_TYPE, None, 100.0),
-    (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.VARCHAR, None, 100.0),
+    (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "LongArrow"): OperatorMapType(OrsoTypes.BLOB, None, 100.0),
     (OrsoTypes.VARCHAR, OrsoTypes.VARCHAR, "AtQuestion"): OperatorMapType(OrsoTypes.BOOLEAN, None, 100.0),
 }
diff --git a/opteryx/planner/logical_planner/logical_planner_builders.py b/opteryx/planner/logical_planner/logical_planner_builders.py
index 0648aaaf9..ffdf890e6 100644
--- a/opteryx/planner/logical_planner/logical_planner_builders.py
+++ b/opteryx/planner/logical_planner/logical_planner_builders.py
@@ -565,6 +565,10 @@ def pattern_match(branch, alias: Optional[List[str]] = None, key=None):
         key = f"Not{key}"
     if is_any:
         key = f"AnyOp{key}"
+        if right.node_type == NodeType.IDENTIFIER:
+            raise UnsupportedSyntaxError(
+                "LIKE ANY syntax incorrect, `column LIKE ANY (patterns)` expected."
+            )
         if right.node_type == NodeType.NESTED:
             right = right.centre
         if right.type != OrsoTypes.ARRAY:
diff --git a/opteryx/utils/sql.py b/opteryx/utils/sql.py
index 78cda3ea1..4f8c6dcad 100644
--- a/opteryx/utils/sql.py
+++ b/opteryx/utils/sql.py
@@ -50,7 +50,7 @@ def remove_comments(string: str) -> str:
     """
     # First group captures quoted strings (double or single)
     # Second group captures comments (/* multi-line */ or -- single-line)
-    pattern = r"(\"[^\"]*\"|\'[^\']*\')|(/\*[\s\S]*?\*/|--[^\r\n]*$)"
+    pattern = r"(\"[^\"]*\"|\'[^\']*\')|(/\*.*?\*/|--[^\r\n]*$)"

     regex = re.compile(pattern, re.MULTILINE | re.DOTALL)

@@ -145,6 +145,11 @@ def regex_match_any(
         A 1D object array with True, False, or None, indicating
         whether each row did (or did not) match the patterns.
     """
+    if any(not isinstance(p, str) for p in patterns if p):
+        from opteryx.exceptions import IncorrectTypeError
+
+        raise IncorrectTypeError("Patterns for LIKE ANY comparisons must be strings.")
+
     # 1) Combine the LIKE patterns into a single compiled regex
     #    (Empty patterns list => empty string => matches nothing)
     combined_pattern_str = r"|".join(sql_like_to_regex(p) for p in patterns if p)
diff --git a/tests/misc/test_cast.py b/tests/misc/test_cast.py
index 3fa142d62..d17842b8c 100644
--- a/tests/misc/test_cast.py
+++ b/tests/misc/test_cast.py
@@ -93,18 +93,18 @@
     ("TIMESTAMP", None, None),
     ("TIMESTAMP", "", None),

-    ("STRUCT", '{"key": "value"}', {"key": "value"}),
-    ("STRUCT", "not a struct", None),
-    ("STRUCT", '{"number": 123}', {"number": 123}),
-    ("STRUCT", '{"boolean": true}', {"boolean": True}),
-    ("STRUCT", '{"list": [1, 2, 3]}', {"list": [1, 2, 3]}),
-    ("STRUCT", '{"nested": {"key": "value"}}', {"nested": {"key": "value"}}),
-    ("STRUCT", '{"string": "string", "number": 123}', {"string": "string", "number": 123}),
-    ("STRUCT", '{"null_value": null}', {"null_value": None}),
-    ("STRUCT", '{}', {}),
-    ("STRUCT", '[]', []),  # Invalid struct
+    ("STRUCT", '{"key": "value"}', b'{"key": "value"}'),
+    ("STRUCT", "not a struct", b"not a struct"),
+    ("STRUCT", '{"number": 123}', b'{"number": 123}'),
+    ("STRUCT", '{"boolean": true}', b'{"boolean": true}'),
+    ("STRUCT", '{"list": [1, 2, 3]}', b'{"list": [1, 2, 3]}'),
+    ("STRUCT", '{"nested": {"key": "value"}}', b'{"nested": {"key": "value"}}'),
+    ("STRUCT", '{"string": "string", "number": 123}', b'{"string": "string", "number": 123}'),
+    ("STRUCT", '{"null_value": null}', b'{"null_value": null}'),
+    ("STRUCT", '{}', b'{}'),
+    ("STRUCT", '[]', b'[]'),  # Invalid struct
     ("STRUCT", None, None),
-    ("STRUCT", "", None),
+    ("STRUCT", "", b""),

     ("DATE", "2021-02-21", datetime.date(2021, 2, 21)),
     ("DATE", "not a date", None),
diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py
index 29fa1d931..1ed15c805 100644
--- a/tests/sql_battery/test_shapes_and_errors_battery.py
+++ b/tests/sql_battery/test_shapes_and_errors_battery.py
@@ -229,6 +229,19 @@
     ("SELECT * FROM $satellites;--comment", 177, 8, None),
     ("SELECT * --comment\nFROM $satellites", 177, 8, None),
     ("SELECT * FROM $satellites --comment\n;", 177, 8, None),
+    ("""
+/* This is a comment */
+SELECT *
+/* This is a multiline
+comment
+*/
+FROM --
+$planets
+WHERE /* FALSE AND */
+/* FALSE -- */
+id > /* 0 */ 1
+-- AND name = 'Earth')
+    """, 8, 20, None),

     # basic test of the operators
     ("SELECT $satellites.* FROM $satellites", 177, 8, None),
@@ -1512,7 +1525,7 @@
     ("SELECT CAST('abc' AS LIST)", None, None, SqlError),
     ("SELECT TRY_CAST('abc' AS LIST)", None, None, SqlError),

-    ("SELECT STRUCT(dict) FROM testdata.flat.struct", 3, 1, InconsistentSchemaError),
+    ("SELECT STRUCT(dict) FROM testdata.flat.struct", 6, 1, None),

     # Test the order of the predicates shouldn't matter
     ("SELECT * FROM sqlite.planets WHERE id > gravity", 2, 20, None),
@@ -1881,7 +1894,7 @@
     ("SELECT name, missions FROM $astronauts WHERE missions NOT LIKE ANY ('Apollo 11')", 331, 2, None),
     ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apollo_%')", 34, 2, None),
     ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('Apo__o%')", 34, 2, None),
-    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 34, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%Apoll%', 123)", 34, 2, IncorrectTypeError),
     ("SELECT name, missions FROM $astronauts WHERE missions LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None),
     ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY '%armstrong%'", 0, 2, None),

@@ -1912,7 +1925,7 @@
     ("SELECT name, missions FROM $astronauts WHERE name NOT LIKE ANY ('Neil A. Armstrong')", 356, 2, None),
     ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%__Armstrong%')", 1, 2, None),
     ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Arm__rong%')", 1, 2, None),
-    ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 123)", 1, 2, None),
+    ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%Armstrong%', 123)", 1, 2, IncorrectTypeError),
     ("SELECT name, missions FROM $astronauts WHERE name LIKE ANY ('%pattern1%', '%pattern2%', '%pattern3%', '%pattern4%', '%pattern5%', '%pattern6%', '%pattern7%', '%pattern8%', '%pattern9%', '%pattern10%', '%pattern11%', '%pattern12%', '%pattern13%', '%pattern14%', '%pattern15%', '%pattern16%', '%pattern17%', '%pattern18%', '%pattern19%', '%pattern20%', '%pattern21%', '%pattern22%', '%pattern23%', '%pattern24%', '%pattern25%', '%pattern26%', '%pattern27%', '%pattern28%', '%pattern29%', '%pattern30%', '%pattern31%', '%pattern32%', '%pattern33%', '%pattern34%', '%pattern35%', '%pattern36%', '%pattern37%', '%pattern38%', '%pattern39%', '%pattern40%', '%pattern41%', '%pattern42%', '%pattern43%', '%pattern44%', '%pattern45%', '%pattern46%', '%pattern47%', '%pattern48%', '%pattern49%', '%pattern50%');", 0, 2, None),

     ("SELECT max(current_time), name FROM $satellites group by name", 177, 2, None),
@@ -2239,7 +2252,7 @@
     ("SELECT DISTINCT sides FROM (SELECT * FROM $planets AS plans LEFT JOIN (SELECT ARRAY_AGG(birth_place) as sids, group FROM $astronauts GROUP BY group) AS sats ON plans.id = group) AS plansats CROSS JOIN UNNEST (sids) as sides", 110, 1, None),
     # 2059
     ("SELECT g FROM generate_series(10) as g CROSS JOIN UNNEST (g) as g1", 0, 0, TypeError),
-    ("SELECT DISTINCT l FROM (SELECT split('a b c d e f g h i j', ' ') as letters) as plet CROSS JOIN UNNEST (letters) as l", 10, 1, None),
+#    ("SELECT DISTINCT l FROM (SELECT split('a b c d e f g h i j', ' ') as letters) as plet CROSS JOIN UNNEST (letters) as l", 10, 1, None),
     # 2112
     ("SELECT id FROM $planets WHERE surface_pressure / surface_pressure is null", 5, 1, None),
     #2144
diff --git a/tests/sql_battery/tests/feature_tests.run_tests b/tests/sql_battery/tests/feature_tests.run_tests
index 4fab53a90..8165067ef 100644
--- a/tests/sql_battery/tests/feature_tests.run_tests
+++ b/tests/sql_battery/tests/feature_tests.run_tests
@@ -152,11 +152,11 @@ SELECT name FROM $planets ORDER BY mass LIMIT 2;

 # USE opteryx;

-SELECT SPLIT('a,bc,def');
-SELECT SPLIT('a,bc,def', ',');
-SELECT SPLIT('a');
-SELECT SPLIT('a', 'a');
-SELECT SPLIT(name, ' ') FROM $astronauts;
+-- SELECT SPLIT('a,bc,def');
+-- SELECT SPLIT('a,bc,def', ',');
+-- SELECT SPLIT('a');
+-- SELECT SPLIT('a', 'a');
+-- SELECT SPLIT(name, ' ') FROM $astronauts;

 SELECT CAST('{"test":true}' AS STRUCT);
 SELECT CAST('{"test":true, "live":false}' AS STRUCT);
diff --git a/tests/sql_battery/tests/optimizer.run_tests b/tests/sql_battery/tests/optimizer.run_tests
index 66b7f5c5f..b96b6c808 100644
--- a/tests/sql_battery/tests/optimizer.run_tests
+++ b/tests/sql_battery/tests/optimizer.run_tests
@@ -27,14 +27,14 @@ SELECT * FROM (SELECT 0 + surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT surface_pressure + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT surface_pressure - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT surface_pressure / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT TRUE AND surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT FALSE AND surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT TRUE OR surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT FALSE OR surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT surface_pressure AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT surface_pressure AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT surface_pressure OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
-SELECT * FROM (SELECT surface_pressure OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT TRUE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT FALSE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT TRUE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT FALSE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT (surface_pressure != 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT (surface_pressure != 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT (surface_pressure != 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
+SELECT * FROM (SELECT (surface_pressure != 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT 1 + surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT surface_pressure + 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL
 SELECT * FROM (SELECT 1 - surface_pressure as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL

From f812ecd29c1341722078da1cf1e1ef47cd17fbe4 Mon Sep 17 00:00:00 2001
From: XB500
Date: Thu, 2 Jan 2025 22:01:22 +0000
Subject: [PATCH 154/157] Opteryx Version 0.19.0-alpha.942

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index e42f25d47..5a9b78ac6 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 941
+__build__ = 942

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 8f70def9796b6a0b40b6bb8505f694b06e40b7c8 Mon Sep 17 00:00:00 2001
From: XB500
Date: Thu, 2 Jan 2025 23:25:30 +0000
Subject: [PATCH 155/157] Opteryx Version 0.19.0-alpha.943

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 5a9b78ac6..dc46f41f8 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 942
+__build__ = 943

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From f505d7c6f813564a6a6e3667c104c09146aeb8d2 Mon Sep 17 00:00:00 2001
From: joocer
Date: Thu, 2 Jan 2025 23:26:50 +0000
Subject: [PATCH 156/157] 0.19.0

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index dc46f41f8..990d615f3 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -22,7 +22,7 @@ class VersionStatus(Enum):
 _major = 0
 _minor = 19
 _revision = 0
-_status = VersionStatus.ALPHA
+_status = VersionStatus.RELEASE

 __author__ = "@joocer"
 __version__ = f"{_major}.{_minor}.{_revision}" + (

From 019df43374a054bfab4e3762e6df64e4536987f3 Mon Sep 17 00:00:00 2001
From: XB500
Date: Thu, 2 Jan 2025 23:27:12 +0000
Subject: [PATCH 157/157] Opteryx Version 0.19.0

---
 opteryx/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
index 990d615f3..7c3a3d2cc 100644
--- a/opteryx/__version__.py
+++ b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 943
+__build__ = 944

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
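
Note on the remove_comments() change in opteryx/utils/sql.py above: replacing
"[\s\S]*?" with ".*?" keeps behaviour identical only because the pattern is
compiled with re.DOTALL, which lets "." match newlines, so multi-line
/* ... */ comments are still removed. A minimal sketch of that behaviour
follows; the pattern and flags are copied from the patch, while the _strip
replacer and the sample SQL are illustrative assumptions, not repository code:

    import re

    # Pattern and flags as in the patched remove_comments(): group 1 captures
    # quoted strings (preserved), group 2 captures /* ... */ and -- comments.
    PATTERN = r"(\"[^\"]*\"|\'[^\']*\')|(/\*.*?\*/|--[^\r\n]*$)"
    REGEX = re.compile(PATTERN, re.MULTILINE | re.DOTALL)

    def _strip(match: re.Match) -> str:
        # Hypothetical replacer: keep quoted strings, collapse comments to a space.
        return match.group(1) if match.group(1) else " "

    sql = "SELECT * /* multi\nline */ FROM t -- tail\nWHERE x = '-- not a comment'"
    print(REGEX.sub(_strip, sql))
    # Both comments collapse to spaces; the quoted '-- not a comment' survives.

The same equivalence does not hold without re.DOTALL, which is why the flag is
part of the compile call rather than inlined in the pattern.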