Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Arrow] Properly use the parent's array.offset in many places in the scan #9661

Merged
merged 25 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
180037a
use parent offsets when scanning lists and varchars
Tishj Nov 12, 2023
37f9186
add test for list of lists
Tishj Nov 12, 2023
7ea3c9b
fixed size list
Tishj Nov 12, 2023
a6aa5bc
binary and large_binary
Tishj Nov 12, 2023
b20a83c
add test with blobs
Tishj Nov 13, 2023
d5da5f6
hunt for issues in struct/list combinations - fixed up some obvious i…
Tishj Nov 13, 2023
2197dd0
test conversion of time values, with all units
Tishj Nov 13, 2023
0362442
fix timestamptz conversion
Tishj Nov 13, 2023
ed2fc4c
fix duration
Tishj Nov 13, 2023
3cbbe38
fix month_day_nanos
Tishj Nov 13, 2023
125160f
realize interval[months] can not be created from pyarrow..
Tishj Nov 13, 2023
10ac70c
fix decimals (16,32,64 and 128 bits)
Tishj Nov 13, 2023
d3a4aa7
use the ArrowBufferData abstraction
Tishj Nov 13, 2023
80861c2
undo previous commit, order of operations between the cast and the ad…
Tishj Nov 13, 2023
570ade1
add missing coverage for bools
Tishj Nov 13, 2023
237f93d
add missing coverage for date32 and date64
Tishj Nov 13, 2023
adb1ce0
add coverage for dictionaries
Tishj Nov 13, 2023
02e134f
remove debug breakpoint code
Tishj Nov 13, 2023
f74814a
rename test, it was misleading/wrong
Tishj Nov 13, 2023
1edc1af
fixed a FIXME, turns out it does need parent_offset
Tishj Nov 14, 2023
8ed74e9
use parent offset when setting the validity mask
Tishj Nov 14, 2023
325ed9b
add tests for null
Tishj Nov 14, 2023
12374c1
need to initialize the offset in this test, because we now read it
Tishj Nov 14, 2023
8fc062f
fix enums in structs that are null
Tishj Nov 14, 2023
27d0e58
Merge remote-tracking branch 'upstream/main' into arrow_parent_offsets
Tishj Nov 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix decimals (16,32,64 and 128 bits)
  • Loading branch information
Tishj committed Nov 13, 2023
commit 10ac70c17057249f0c54ca30ae4ac05fe7fe1768
8 changes: 4 additions & 4 deletions src/function/table/arrow_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca
case LogicalTypeId::DECIMAL: {
auto val_mask = FlatVector::Validity(vector);
//! We have to convert from INT128
auto src_ptr = ArrowBufferData<hugeint_t>(array, 1) + scan_state.chunk_offset + array.offset;
auto src_ptr = ArrowBufferData<hugeint_t>(array, 1) + scan_state.chunk_offset + parent_offset + array.offset;
if (nested_offset != -1) {
src_ptr = ArrowBufferData<hugeint_t>(array, 1) + nested_offset + array.offset;
}
Expand Down Expand Up @@ -609,9 +609,9 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca
break;
}
case PhysicalType::INT128: {
FlatVector::SetData(vector,
ArrowBufferData<data_t>(array, 1) + GetTypeIdSize(vector.GetType().InternalType()) *
(scan_state.chunk_offset + array.offset));
FlatVector::SetData(vector, ArrowBufferData<data_t>(array, 1) +
GetTypeIdSize(vector.GetType().InternalType()) *
(scan_state.chunk_offset + parent_offset + array.offset));
break;
}
default:
Expand Down
45 changes: 45 additions & 0 deletions tools/pythonpkg/tests/fast/arrow/test_arrow_offsets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import duckdb
import pytest
import datetime
import decimal
import pytz

pa = pytest.importorskip("pyarrow")
Expand Down Expand Up @@ -42,6 +43,14 @@ def nano_interval(nanos):
return (0, 0, nanos)


def decimal_value(value, precision, scale):
    """Return ``value`` as a Decimal that fits a decimal(precision, scale) column.

    A decimal(precision, scale) column only has ``precision - scale`` digits
    available left of the decimal point.  Values whose textual form exceeds
    that width are clamped to the largest representable integer part
    (all nines); everything else is converted verbatim.
    """
    integer_digits = precision - scale
    text = str(value)
    if len(text) > integer_digits:
        text = '9' * integer_digits
    return decimal.Decimal(text)


class TestArrowOffsets(object):
def test_struct_of_strings(self, duckdb_cursor):
col1 = [str(i) for i in range(0, MAGIC_ARRAY_SIZE)]
Expand Down Expand Up @@ -226,6 +235,42 @@ def test_struct_of_large_blobs(self, duckdb_cursor):
).fetchall()
assert res == [(b'131072', b'131072')]

@pytest.mark.parametrize(
    ["precision_scale", "expected"],
    [
        ((38, 37), decimal.Decimal('9.0000000000000000000000000000000000000')),
        ((38, 24), decimal.Decimal('131072.000000000000000000000000')),
        ((18, 14), decimal.Decimal('9999.00000000000000')),
        ((18, 5), decimal.Decimal('131072.00000')),
        ((9, 7), decimal.Decimal('99.0000000')),
        ((9, 3), decimal.Decimal('131072.000')),
        ((4, 2), decimal.Decimal('99.00')),
        ((4, 0), decimal.Decimal('9999')),
    ],
)
def test_struct_of_decimal(self, duckdb_cursor, precision_scale, expected):
    # Build a plain decimal column plus a struct column whose "a" field mirrors
    # it, then fetch only the last row: the OFFSET forces the scan to honor the
    # parent array's offset when reading the struct child (the bug under test).
    precision, scale = precision_scale
    decimals = [decimal_value(i, precision, scale) for i in range(0, MAGIC_ARRAY_SIZE)]
    wrapped = [{"a": value} for value in decimals]

    decimal_type = pa.decimal128(precision, scale)
    # NOTE: the local name `arrow_table` is significant — duckdb's replacement
    # scan resolves the FROM clause against it.
    arrow_table = pa.Table.from_pydict(
        {"col1": decimals, "col2": wrapped},
        schema=pa.schema([("col1", decimal_type), ("col2", pa.struct({"a": decimal_type}))]),
    )

    res = duckdb_cursor.sql(
        f"""
        SELECT
            col1,
            col2.a
        FROM arrow_table offset {MAGIC_ARRAY_SIZE-1}
        """
    ).fetchall()
    # Both the top-level column and the struct field must yield the same value.
    assert res == [(expected, expected)]

def test_struct_of_small_list(self, duckdb_cursor):
col1 = [str(i) for i in range(0, MAGIC_ARRAY_SIZE)]
# "a" in the struct matches the value for col1
Expand Down