From c970365c06eeadeda2f401a0bd6caf836e266747 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Thu, 1 Feb 2024 16:53:15 +0100 Subject: [PATCH 1/5] reworked MAP creation --- src/common/types/vector.cpp | 84 ++--- src/core_functions/scalar/map/map.cpp | 289 ++++++++---------- src/include/duckdb/common/types/vector.hpp | 11 +- test/sql/cast/string_to_map_cast.test | 2 +- .../map/map_const_and_col_combination.test | 100 ++++-- test/sql/types/nested/map/map_error.test | 72 +++++ test/sql/types/nested/map/test_map.test | 91 ++---- .../nested/map/test_map_and_lambdas.test | 16 + .../sql/types/nested/map/test_map_concat.test | 3 +- test/sql/types/nested/map/test_map_keys.test | 2 +- .../map/test_map_subscript_composite.test | 4 +- 11 files changed, 377 insertions(+), 297 deletions(-) create mode 100644 test/sql/types/nested/map/map_error.test create mode 100644 test/sql/types/nested/map/test_map_and_lambdas.test diff --git a/src/common/types/vector.cpp b/src/common/types/vector.cpp index f9db436e0868..4ef6e9955f95 100644 --- a/src/common/types/vector.cpp +++ b/src/common/types/vector.cpp @@ -1872,60 +1872,76 @@ const Vector &MapVector::GetValues(const Vector &vector) { } MapInvalidReason MapVector::CheckMapValidity(Vector &map, idx_t count, const SelectionVector &sel) { + D_ASSERT(map.GetType().id() == LogicalTypeId::MAP); - UnifiedVectorFormat map_vdata; - map.ToUnifiedFormat(count, map_vdata); - auto &map_validity = map_vdata.validity; + // unify the MAP vector, which is a physical LIST vector + UnifiedVectorFormat map_data; + map.ToUnifiedFormat(count, map_data); + auto map_entries = UnifiedVectorFormat::GetDataNoConst(map_data); + auto maps_length = ListVector::GetListSize(map); - auto list_data = ListVector::GetData(map); + // unify the child vector containing the keys auto &keys = MapVector::GetKeys(map); - UnifiedVectorFormat key_vdata; - keys.ToUnifiedFormat(count, key_vdata); - auto &key_validity = key_vdata.validity; - - for (idx_t row = 0; row < count; row++) { - auto mapped_row = sel.get_index(row); - auto map_idx = map_vdata.sel->get_index(mapped_row); - // map is allowed to be NULL - if (!map_validity.RowIsValid(map_idx)) { + UnifiedVectorFormat key_data; + keys.ToUnifiedFormat(maps_length, key_data); + + for (idx_t row_idx = 0; row_idx < count; row_idx++) { + + auto mapped_row = sel.get_index(row_idx); + auto map_idx = map_data.sel->get_index(mapped_row); + + if (!map_data.validity.RowIsValid(map_idx)) { continue; } + value_set_t unique_keys; - for (idx_t i = 0; i < list_data[map_idx].length; i++) { - auto index = list_data[map_idx].offset + i; - index = key_vdata.sel->get_index(index); - if (!key_validity.RowIsValid(index)) { + auto length = map_entries[map_idx].length; + auto offset = map_entries[map_idx].offset; + + for (idx_t child_idx = 0; child_idx < length; child_idx++) { + auto key_idx = key_data.sel->get_index(offset + child_idx); + + if (!key_data.validity.RowIsValid(key_idx)) { return MapInvalidReason::NULL_KEY; } - auto value = keys.GetValue(index); - auto result = unique_keys.insert(value); - if (!result.second) { + + auto value = keys.GetValue(key_idx); + auto unique = unique_keys.insert(value).second; + if (!unique) { return MapInvalidReason::DUPLICATE_KEY; } } } + return MapInvalidReason::VALID; } void MapVector::MapConversionVerify(Vector &vector, idx_t count) { - auto valid_check = MapVector::CheckMapValidity(vector, count); - switch (valid_check) { + auto reason = MapVector::CheckMapValidity(vector, count); + EvalMapInvalidReason(reason); +} + +void MapVector::EvalMapInvalidReason(MapInvalidReason reason) { + switch (reason) { case MapInvalidReason::VALID: - break; - case MapInvalidReason::DUPLICATE_KEY: { - throw InvalidInputException("Map keys have to be unique"); - } - case MapInvalidReason::NULL_KEY: { - throw InvalidInputException("Map keys can not be NULL"); - } - case MapInvalidReason::NULL_KEY_LIST: { - throw InvalidInputException("The list of map keys is not allowed to be NULL"); - } - default: { + return; + case MapInvalidReason::DUPLICATE_KEY: + throw InvalidInputException("Map keys must be unique."); + case MapInvalidReason::NULL_KEY: + throw InvalidInputException("Map keys can not be NULL."); + case MapInvalidReason::NULL_KEY_LIST: + throw InvalidInputException("The list of map keys must not be NULL."); + case MapInvalidReason::NULL_VALUE_LIST: + throw InvalidInputException("The list of map values must not be NULL."); + case MapInvalidReason::NOT_ALIGNED: + throw InvalidInputException("The map key list does not align with the map value list."); + case MapInvalidReason::INVALID_PARAMS: + throw InvalidInputException("Invalid map argument(s). Valid map arguments are a list of key-value pairs (MAP " + "{'key1': 'val1', ...}), two lists (MAP ([1, 2], [10, 11])), or no arguments."); + default: throw InternalException("MapInvalidReason not implemented"); } - } } //===--------------------------------------------------------------------===// diff --git a/src/core_functions/scalar/map/map.cpp b/src/core_functions/scalar/map/map.cpp index 40bea7a89618..829437ddebbf 100644 --- a/src/core_functions/scalar/map/map.cpp +++ b/src/core_functions/scalar/map/map.cpp @@ -9,211 +9,182 @@ namespace duckdb { -// Example: -// source: [1,2,3], expansion_factor: 4 -// target (result): [1,2,3,1,2,3,1,2,3,1,2,3] -static void CreateExpandedVector(const Vector &source, Vector &target, idx_t expansion_factor) { - idx_t count = ListVector::GetListSize(source); - auto &entry = ListVector::GetEntry(source); - - idx_t target_idx = 0; - for (idx_t copy = 0; copy < expansion_factor; copy++) { - for (idx_t key_idx = 0; key_idx < count; key_idx++) { - target.SetValue(target_idx, entry.GetValue(key_idx)); - target_idx++; - } - } - D_ASSERT(target_idx == count * expansion_factor); -} +struct MapChildInfo {}; -static void AlignVectorToReference(const Vector &original, const Vector &reference, idx_t tuple_count, Vector &result) { - auto original_length = ListVector::GetListSize(original); - auto new_length = ListVector::GetListSize(reference); +static void MapFunctionEmptyInput(Vector &result, const idx_t row_count) { - Vector expanded_const(ListType::GetChildType(original.GetType()), new_length); - - if (new_length != tuple_count * original_length) { - throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different " - "size or incompatible structure"); - } - auto expansion_factor = original_length ? new_length / original_length : original_length; - CreateExpandedVector(original, expanded_const, expansion_factor); + // if no chunk is set in ExpressionExecutor::ExecuteExpression (args.data.empty(), e.g., + // in SELECT MAP()), then we always pass a row_count of 1 + result.SetVectorType(VectorType::CONSTANT_VECTOR); + ListVector::SetListSize(result, 0); - result.Reference(expanded_const); + auto result_data = ListVector::GetData(result); + result_data[0] = list_entry_t(); + result.Verify(row_count); } -static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) { - auto key_count = ListVector::GetListSize(keys); - auto value_count = ListVector::GetListSize(values); - bool same_vector_type = keys.GetVectorType() == values.GetVectorType(); +static void MapFunction(DataChunk &args, ExpressionState &, Vector &result) { - D_ASSERT(keys.GetType().id() == LogicalTypeId::LIST); - D_ASSERT(values.GetType().id() == LogicalTypeId::LIST); + // internal MAP representation + // - LIST-vector that contains STRUCTs as child entries + // - STRUCTs have exactly two fields, a key-field, and a value-field + // - key names are unique - UnifiedVectorFormat keys_data; - UnifiedVectorFormat values_data; + D_ASSERT(result.GetType().id() == LogicalTypeId::MAP); + auto row_count = args.size(); - keys.ToUnifiedFormat(count, keys_data); - values.ToUnifiedFormat(count, values_data); + // early-out, if no data + if (args.data.empty()) { + return MapFunctionEmptyInput(result, row_count); + } + auto &keys = args.data[0]; + auto &values = args.data[1]; + + // a LIST vector, where each row contains a LIST of KEYS + UnifiedVectorFormat keys_data; + keys.ToUnifiedFormat(row_count, keys_data); auto keys_entries = UnifiedVectorFormat::GetData(keys_data); - auto values_entries = UnifiedVectorFormat::GetData(values_data); - if (same_vector_type) { - const auto key_data = keys_data.data; - const auto value_data = values_data.data; + // the KEYs child vector + auto keys_child_vector = ListVector::GetEntry(keys); + UnifiedVectorFormat keys_child_data; + keys_child_vector.ToUnifiedFormat(ListVector::GetListSize(keys), keys_child_data); - if (keys.GetVectorType() == VectorType::CONSTANT_VECTOR) { - D_ASSERT(values.GetVectorType() == VectorType::CONSTANT_VECTOR); - // Only need to compare one entry in this case - return memcmp(key_data, value_data, sizeof(list_entry_t)) == 0; - } + // a LIST vector, where each row contains a LIST of VALUES + UnifiedVectorFormat values_data; + values.ToUnifiedFormat(row_count, values_data); + auto values_entries = UnifiedVectorFormat::GetData(values_data); - // Fast path if the vector types are equal, can just check if the entries are the same - if (key_count != value_count) { - return false; + // the VALUEs child vector + auto values_child_vector = ListVector::GetEntry(values); + UnifiedVectorFormat values_child_data; + values_child_vector.ToUnifiedFormat(ListVector::GetListSize(values), values_child_data); + + // a LIST vector, where each row contains a MAP (LIST of STRUCTs) + UnifiedVectorFormat result_data; + result.ToUnifiedFormat(row_count, result_data); + auto result_entries = UnifiedVectorFormat::GetDataNoConst(result_data); + result_data.validity.SetAllValid(row_count); + + // get the resulting size of the key/value child lists + idx_t result_child_size = 0; + for (idx_t row_idx = 0; row_idx < row_count; row_idx++) { + auto keys_idx = keys_data.sel->get_index(row_idx); + if (!keys_data.validity.RowIsValid(keys_idx)) { + continue; } - return memcmp(key_data, value_data, count * sizeof(list_entry_t)) == 0; + auto keys_entry = keys_entries[keys_idx]; + result_child_size += keys_entry.length; } - // Compare the list_entries one by one - for (idx_t i = 0; i < count; i++) { - auto keys_idx = keys_data.sel->get_index(i); - auto values_idx = values_data.sel->get_index(i); + // we need to slice potential dictionary vectors + SelectionVector sel_keys(result_child_size); + SelectionVector sel_values(result_child_size); + idx_t offset = 0; - if (keys_entries[keys_idx] != values_entries[values_idx]) { - return false; - } - } - return true; -} + for (idx_t row_idx = 0; row_idx < row_count; row_idx++) { -static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) { - auto key_size = ListVector::GetListSize(key); - auto value_size = ListVector::GetListSize(value); - if (key_size > value_size) { - size = key_size; - return ListVector::GetData(key); - } - size = value_size; - return ListVector::GetData(value); -} + auto keys_idx = keys_data.sel->get_index(row_idx); + auto values_idx = values_data.sel->get_index(row_idx); + auto result_idx = result_data.sel->get_index(row_idx); -static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) { - D_ASSERT(result.GetType().id() == LogicalTypeId::MAP); + // empty map + if (!keys_data.validity.RowIsValid(keys_idx) && !values_data.validity.RowIsValid(values_idx)) { + result_entries[result_idx] = list_entry_t(); + continue; + } - auto count = args.size(); + auto keys_entry = keys_entries[keys_idx]; + auto values_entry = values_entries[values_idx]; - auto &map_key_vector = MapVector::GetKeys(result); - auto &map_value_vector = MapVector::GetValues(result); - auto result_data = ListVector::GetData(result); + // validity checks + if (!keys_data.validity.RowIsValid(keys_idx)) { + MapVector::EvalMapInvalidReason(MapInvalidReason::NULL_KEY_LIST); + } + if (!keys_data.validity.RowIsValid(keys_idx)) { + MapVector::EvalMapInvalidReason(MapInvalidReason::NULL_VALUE_LIST); + } + if (keys_entry.length != values_entry.length) { + MapVector::EvalMapInvalidReason(MapInvalidReason::NOT_ALIGNED); + } - result.SetVectorType(VectorType::CONSTANT_VECTOR); - if (args.data.empty()) { - ListVector::SetListSize(result, 0); - result_data->offset = 0; - result_data->length = 0; - result.Verify(count); - return; - } + // set the selection vectors and perform a duplicate key check + value_set_t unique_keys; + for (idx_t child_idx = 0; child_idx < keys_entry.length; child_idx++) { - D_ASSERT(args.ColumnCount() == 2); - auto &key_vector = args.data[0]; - auto &value_vector = args.data[1]; + auto key_idx = keys_child_data.sel->get_index(keys_entry.offset + child_idx); + auto value_idx = values_child_data.sel->get_index(values_entry.offset + child_idx); - if (args.AllConstant()) { - auto key_data = ListVector::GetData(key_vector); - auto value_data = ListVector::GetData(value_vector); - auto key_entry = key_data[0]; - auto value_entry = value_data[0]; - if (key_entry != value_entry) { - throw BinderException("Key and value list sizes don't match"); - } - result_data[0] = key_entry; - ListVector::SetListSize(result, ListVector::GetListSize(key_vector)); - map_key_vector.Reference(ListVector::GetEntry(key_vector)); - map_value_vector.Reference(ListVector::GetEntry(value_vector)); - MapVector::MapConversionVerify(result, count); - result.Verify(count); - return; - } + // NULL check + if (!keys_child_data.validity.RowIsValid(key_idx)) { + MapVector::EvalMapInvalidReason(MapInvalidReason::NULL_KEY); + } + + // unique check + auto value = keys_child_vector.GetValue(key_idx); + auto unique = unique_keys.insert(value).second; + if (!unique) { + MapVector::EvalMapInvalidReason(MapInvalidReason::DUPLICATE_KEY); + } - result.SetVectorType(VectorType::FLAT_VECTOR); - - if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) { - D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR); - Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count); - AlignVectorToReference(key_vector, value_vector, count, expanded_const); - map_key_vector.Reference(expanded_const); - - value_vector.Flatten(count); - map_value_vector.Reference(ListVector::GetEntry(value_vector)); - } else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) { - D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR); - Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count); - AlignVectorToReference(value_vector, key_vector, count, expanded_const); - map_value_vector.Reference(expanded_const); - - key_vector.Flatten(count); - map_key_vector.Reference(ListVector::GetEntry(key_vector)); - } else { - key_vector.Flatten(count); - value_vector.Flatten(count); - - if (!ListEntriesEqual(key_vector, value_vector, count)) { - throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different " - "size or incompatible structure"); + // set selection vectors + sel_keys.set_index(offset + child_idx, key_idx); + sel_values.set_index(offset + child_idx, value_idx); } - map_value_vector.Reference(ListVector::GetEntry(value_vector)); - map_key_vector.Reference(ListVector::GetEntry(key_vector)); + // keys_entry and values_entry have the same length + result_entries[result_idx].length = keys_entry.length; + result_entries[result_idx].offset = offset; + offset += keys_entry.length; } + D_ASSERT(offset == result_child_size); - idx_t list_size; - auto src_data = GetBiggestList(key_vector, value_vector, list_size); - ListVector::SetListSize(result, list_size); + auto &result_key_vector = MapVector::GetKeys(result); + auto &result_value_vector = MapVector::GetValues(result); - result_data = ListVector::GetData(result); - for (idx_t i = 0; i < count; i++) { - result_data[i] = src_data[i]; - } + ListVector::SetListSize(result, offset); + result_key_vector.Slice(keys_child_vector, sel_keys, offset); + result_key_vector.Flatten(offset); + result_value_vector.Slice(values_child_vector, sel_values, offset); + result_value_vector.Flatten(offset); - MapVector::MapConversionVerify(result, count); - result.Verify(count); + if (args.AllConstant()) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); + } + result.Verify(row_count); } -static unique_ptr MapBind(ClientContext &context, ScalarFunction &bound_function, +static unique_ptr MapBind(ClientContext &, ScalarFunction &bound_function, vector> &arguments) { - child_list_t child_types; if (arguments.size() != 2 && !arguments.empty()) { - throw Exception("We need exactly two lists for a map"); - } - if (arguments.size() == 2) { - if (arguments[0]->return_type.id() != LogicalTypeId::LIST) { - throw Exception("First argument is not a list"); - } - if (arguments[1]->return_type.id() != LogicalTypeId::LIST) { - throw Exception("Second argument is not a list"); - } - child_types.push_back(make_pair("key", arguments[0]->return_type)); - child_types.push_back(make_pair("value", arguments[1]->return_type)); + MapVector::EvalMapInvalidReason(MapInvalidReason::INVALID_PARAMS); } + // bind an empty MAP if (arguments.empty()) { - auto empty = LogicalType::LIST(LogicalTypeId::SQLNULL); - child_types.push_back(make_pair("key", empty)); - child_types.push_back(make_pair("value", empty)); + bound_function.return_type = LogicalType::MAP(LogicalTypeId::SQLNULL, LogicalTypeId::SQLNULL); + return make_uniq(bound_function.return_type); + } + + // bind a MAP with key-value pairs + D_ASSERT(arguments.size() == 2); + if (arguments[0]->return_type.id() != LogicalTypeId::LIST) { + MapVector::EvalMapInvalidReason(MapInvalidReason::INVALID_PARAMS); + } + if (arguments[1]->return_type.id() != LogicalTypeId::LIST) { + MapVector::EvalMapInvalidReason(MapInvalidReason::INVALID_PARAMS); } - bound_function.return_type = - LogicalType::MAP(ListType::GetChildType(child_types[0].second), ListType::GetChildType(child_types[1].second)); + auto key_type = ListType::GetChildType(arguments[0]->return_type); + auto value_type = ListType::GetChildType(arguments[1]->return_type); + bound_function.return_type = LogicalType::MAP(key_type, value_type); return make_uniq(bound_function.return_type); } ScalarFunction MapFun::GetFunction() { - //! the arguments and return types are actually set in the binder function ScalarFunction fun({}, LogicalTypeId::MAP, MapFunction, MapBind); fun.varargs = LogicalType::ANY; fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING; diff --git a/src/include/duckdb/common/types/vector.hpp b/src/include/duckdb/common/types/vector.hpp index bf5d339e0c8d..dfb2294079d9 100644 --- a/src/include/duckdb/common/types/vector.hpp +++ b/src/include/duckdb/common/types/vector.hpp @@ -432,7 +432,15 @@ struct FSSTVector { DUCKDB_API static idx_t GetCount(Vector &vector); }; -enum class MapInvalidReason : uint8_t { VALID, NULL_KEY_LIST, NULL_KEY, DUPLICATE_KEY }; +enum class MapInvalidReason : uint8_t { + VALID, + NULL_KEY_LIST, + NULL_KEY, + DUPLICATE_KEY, + NULL_VALUE_LIST, + NOT_ALIGNED, + INVALID_PARAMS +}; struct MapVector { DUCKDB_API static const Vector &GetKeys(const Vector &vector); @@ -441,6 +449,7 @@ struct MapVector { DUCKDB_API static Vector &GetValues(Vector &vector); DUCKDB_API static MapInvalidReason CheckMapValidity(Vector &map, idx_t count, const SelectionVector &sel = *FlatVector::IncrementalSelectionVector()); + DUCKDB_API static void EvalMapInvalidReason(MapInvalidReason reason); DUCKDB_API static void MapConversionVerify(Vector &vector, idx_t count); }; diff --git a/test/sql/cast/string_to_map_cast.test b/test/sql/cast/string_to_map_cast.test index d0d9c350f0fd..926eb467ae89 100644 --- a/test/sql/cast/string_to_map_cast.test +++ b/test/sql/cast/string_to_map_cast.test @@ -36,7 +36,7 @@ SELECT '{"key=A"=Duck, "key=B"="hello=world"}'::MAP(VARCHAR, VARCHAR) statement error SELECT '{a=1, b=2, a=3}'::MAP(VARCHAR, INT) ---- -Invalid Input Error: Map keys have to be unique +Map keys must be unique # Nested Maps diff --git a/test/sql/types/map/map_const_and_col_combination.test b/test/sql/types/map/map_const_and_col_combination.test index aa2675608f2a..deabbdf51020 100644 --- a/test/sql/types/map/map_const_and_col_combination.test +++ b/test/sql/types/map/map_const_and_col_combination.test @@ -1,67 +1,69 @@ # name: test/sql/types/map/map_const_and_col_combination.test -# description: Test creating map when one list is const and the other list columnar data +# description: Test MAP creation if one list is a constant and the other list contains columnar data # group: [map] statement ok -create table ints (i INT); +PRAGMA enable_verification; statement ok -INSERT INTO ints VALUES (1),(2),(3); +CREATE TABLE ints (i INT); + +statement ok +INSERT INTO ints VALUES (1), (2), (3); query I -select map(['name'], [i] ) FROM ints; +SELECT MAP(['name'], [i]) FROM ints; ---- {name=1} {name=2} {name=3} statement error -select map(['x', 'y'], [i] ) FROM ints; +SELECT MAP(['x', 'y'], [i] ) FROM ints; ---- -Error in MAP creation: key list and value list do not align. i.e. different size or incompatible structure +The map key list does not align with the map value list query I -select map([i], ['name'] ) FROM ints; +SELECT MAP([i], ['name'] ) FROM ints; ---- {1=name} {2=name} {3=name} query I -select map([i, i+1], ['x', 'y']) FROM ints; +SELECT MAP([i, i+1], ['x', 'y']) FROM ints; ---- {1=x, 2=y} {2=x, 3=y} {3=x, 4=y} - query I -select map([i, i+1], ['x', 'y']) FROM ints WHERE i > 1; +SELECT MAP([i, i+1], ['x', 'y']) FROM ints WHERE i > 1; ---- {2=x, 3=y} {3=x, 4=y} query I -select map(['x'], [m]) FROM (SELECT map([i], ['y']) m FROM ints WHERE i <> 1); +SELECT MAP(['x'], [m]) FROM (SELECT MAP([i], ['y']) m FROM ints WHERE i <> 1); ---- {x={2=y}} {x={3=y}} query I -select map(['key'], [range]) from range(5) WHERE range > 2; +SELECT MAP(['key'], [range]) FROM range(5) WHERE range > 2; ---- {key=3} {key=4} query I -select map(['🦆', '🦤', '🐓'], [i, i+1, i+2] ) FROM ints; +SELECT MAP(['🦆', '🦤', '🐓'], [i, i+1, i+2]) FROM ints; ---- {🦆=1, 🦤=2, 🐓=3} {🦆=2, 🦤=3, 🐓=4} {🦆=3, 🦤=4, 🐓=5} query I -SELECT map([10, i, i+1, 9], [i, 3.14, 0.12, 8.0]) FROM ints; +SELECT MAP([10, i, i+1, 9], [i, 3.14, 0.12, 8.0]) FROM ints; ---- {10=1.00, 1=3.14, 2=0.12, 9=8.00} {10=2.00, 2=3.14, 3=0.12, 9=8.00} @@ -71,26 +73,26 @@ statement ok CREATE TABLE tbl (v VARCHAR[]); statement ok -INSERT INTO tbl VALUES (ARRAY['test','string']), (ARRAY['foo','bar']) +INSERT INTO tbl VALUES (ARRAY['test', 'string']), (ARRAY['foo', 'bar']); query I -select map(['x', 'y'], v ) FROM tbl; +SELECT MAP(['x', 'y'], v ) FROM tbl; ---- {x=test, y=string} {x=foo, y=bar} -# test mismatched lists of same total length +# test mismatching lists of the same total length + statement ok -CREATE TABLE map_input (keys INT[], values INT[]) +CREATE TABLE MAP_input (keys INT[], values INT[]); statement ok -INSERT INTO map_input VALUES ([1,0],[2]), ([3],[4, 9]) +INSERT INTO MAP_input VALUES ([1, 0], [2]), ([3], [4, 9]); statement error -SELECT map(keys, values) FROM map_input; +SELECT MAP(keys, values) FROM MAP_input; ---- -Error in MAP creation: key list and value list do not align. i.e. different size or incompatible structure - +The map key list does not align with the map value list statement ok CREATE TABLE groups (category INT, score INT); @@ -99,14 +101,62 @@ statement ok INSERT INTO groups VALUES (1,2), (1,8), (1,3), (2,3), (2,4), (2,5), (3,6), (3,1), (3,9) query I -select map(['category', 'min', 'max'], [category, MIN(score), MAX(score)]) FROM groups GROUP BY category ORDER BY ALL; +SELECT MAP(['category', 'min', 'max'], [category, MIN(score), MAX(score)]) FROM groups GROUP BY category ORDER BY ALL; ---- {category=1, min=2, max=8} {category=2, min=3, max=5} {category=3, min=1, max=9} -#Map with larger-than-vector-size cardinalities +# test a MAP with larger-than-vector-size cardinalities + query I -select MAP([range], ['a']) FROM range(10000) WHERE range = 9999; +SELECT MAP([range], ['a']) FROM range(10000) WHERE range = 9999; ---- {9999=a} + +# the constant list has to align + +statement ok +CREATE TABLE align_tbl (i INT[]); + +statement ok +INSERT INTO align_tbl VALUES ([1, 2]), ([100, 200]); + +query I +SELECT MAP(['x', 'y'], i) FROM align_tbl; +---- +{x=1, y=2} +{x=100, y=200} + +statement ok +INSERT INTO align_tbl VALUES ([1, 2, 3, 4, 5, 6]), ([20, 30, 40, 50]); + +statement error +SELECT MAP(['x', 'y'], i) FROM align_tbl; +---- +The map key list does not align with the map value list + +statement error +SELECT MAP(['x', 'y', '1', '2', '3', '4'], i) FROM align_tbl; +---- +The map key list does not align with the map value list + +statement error +SELECT MAP(i, ['x', 'y']) FROM align_tbl; +---- +The map key list does not align with the map value list + +# test all const + +statement ok +CREATE TABLE allconst (i INT); + +statement ok +INSERT INTO allconst VALUES (1), (2), (3); + +query I +SELECT MAP(['name'], [2]) FROM allconst; +---- +{name=2} +{name=2} +{name=2} diff --git a/test/sql/types/nested/map/map_error.test b/test/sql/types/nested/map/map_error.test new file mode 100644 index 000000000000..4c1bae4aad67 --- /dev/null +++ b/test/sql/types/nested/map/map_error.test @@ -0,0 +1,72 @@ +# name: test/sql/types/nested/map/map_error.test +# description: Test map queries triggering various error messages +# group: [map] + +statement ok +PRAGMA enable_verification; + +statement error +SELECT MAP(list_value(NULL, NULL, NULL, NULL, NULL), list_value(10, 9, 10, 11, 13)) +---- +Map keys can not be NULL + +statement error +SELECT MAP(list_value(1, NULL, 3), list_value(6, 5, 4)) +---- +Map keys can not be NULL + +statement error +SELECT MAP(list_value(1, 2, 3, 4, 1), list_value(10, 9, 8, 7, 6)) +---- +Map keys must be unique + +statement error +SELECT MAP(NULL) +---- +Invalid map argument(s) + +# same map keys in a MAP + +statement ok +CREATE TABLE tbl (a INTEGER[], b TEXT[]) + +statement ok +INSERT INTO tbl VALUES (ARRAY[7, 5, 7], ARRAY['a', 'b', 'c']) + +statement error +SELECT MAP(a, b) FROM tbl; +---- +Map keys must be unique + +statement error +SELECT MAP(list_value(10), list_value()) +---- +The map key list does not align with the map value list + +statement error +SELECT MAP(10, 12) +---- +Invalid map argument(s) + +statement error +SELECT MAP(list_value(10), list_value(10), list_value(10)) +---- +Invalid map argument(s) + +statement error +SELECT MAP(list_value(10), 10) +---- +Invalid map argument(s) + +statement error +SELECT MAP(list_value(10, 20), list_value(10)) +---- +The map key list does not align with the map value list + +statement ok +CREATE TABLE t AS SELECT MAP(list_value(1, 2, 3), list_value(10, 9, 10)) AS m; + +statement error +SELECT struct_extract(m, 'key') FROM t; +---- +No function matches the given name and argument types \ No newline at end of file diff --git a/test/sql/types/nested/map/test_map.test b/test/sql/types/nested/map/test_map.test index 135ddc7f6601..a7afe64b5426 100644 --- a/test/sql/types/nested/map/test_map.test +++ b/test/sql/types/nested/map/test_map.test @@ -5,61 +5,47 @@ statement ok PRAGMA enable_verification -# All keys are NULL -statement error -select m[NULL] from (select MAP(LIST_VALUE(NULL, NULL, NULL,NULL,NULL ),LIST_VALUE(10,9,10,11,13)) as m) as T ----- +# constant maps query I -SELECT MAP(LIST_VALUE({'i':1,'j':2},{'i':3,'j':4}),LIST_VALUE({'i':1,'j':2},{'i':3,'j':4})) as a +SELECT MAP(list_value(1, 2, 3), list_value(10, 9, 8)) ---- -{{'i': 1, 'j': 2}={'i': 1, 'j': 2}, {'i': 3, 'j': 4}={'i': 3, 'j': 4}} +{1=10, 2=9, 3=8} -# Some keys are NULL -statement error -select MAP(LIST_VALUE(1,NULL,3), LIST_VALUE(6,5,4)) +query I +SELECT MAP(list_value({'i':1,'j':2}, {'i':3,'j':4}), list_value({'i':1,'j':2}, {'i':3,'j':4})) ---- +{{'i': 1, 'j': 2}={'i': 1, 'j': 2}, {'i': 3, 'j': 4}={'i': 3, 'j': 4}} query I -select MAP(LIST_VALUE(1,2,3), LIST_VALUE(6,NULL,4)) +SELECT MAP(list_value(1, 2, 3), list_value(6, NULL, 4)) ---- {1=6, 2=NULL, 3=4} query I -select MAP(LIST_VALUE(1, 2, 3, 4),LIST_VALUE(10, 9, 8, 7)) +SELECT MAP(list_value(1, 2, 3, 4), list_value(10, 9, 8, 7)) ---- {1=10, 2=9, 3=8, 4=7} +# empty maps -# Keys have to be unique -statement error -select MAP(LIST_VALUE(1,2,3,4,1),LIST_VALUE(10,9,8,7,6)) ----- - -# Empty maps query I -select MAP(LIST_VALUE(),LIST_VALUE()) +SELECT MAP(list_value(), list_value()) ---- {} query I -select MAP() +SELECT MAP() ---- {} -statement error -select MAP(NULL) ----- +# maps from columns -# Test making MAPs from columns statement ok -CREATE TABLE tbl ( - a INTEGER[], - b TEXT[], -); +CREATE TABLE tbl (a INTEGER[], b TEXT[]) statement ok -INSERT INTO tbl VALUES (ARRAY[5,7], ARRAY['test','string']), (ARRAY[6,3], ARRAY['foo','bar']) +INSERT INTO tbl VALUES (ARRAY[5, 7], ARRAY['test', 'string']), (ARRAY[6, 3], ARRAY['foo', 'bar']) query I SELECT MAP(a, b) FROM tbl; @@ -67,11 +53,11 @@ SELECT MAP(a, b) FROM tbl; {5=test, 7=string} {6=foo, 3=bar} -# Check unique exception with columns +# same map keys in different maps (rows) + statement ok -INSERT INTO tbl VALUES (ARRAY[5,7], ARRAY['also_test', 'also_string']) +INSERT INTO tbl VALUES (ARRAY[5, 7], ARRAY['also_test', 'also_string']) -# Unique between rows, should work query I SELECT MAP(a, b) FROM tbl; ---- @@ -79,46 +65,7 @@ SELECT MAP(a, b) FROM tbl; {6=foo, 3=bar} {5=also_test, 7=also_string} -statement ok -INSERT INTO tbl VALUES (ARRAY[7,5,7], ARRAY['a', 'b', 'c']) - -# Keys are not unique for one of the rows -statement error -SELECT MAP(a, b) FROM tbl; ----- - -# Lists with different size are FORBIDDEN -statement error -select MAP(LIST_VALUE(10),LIST_VALUE()) ----- - query I -select MAP(LIST_VALUE([1],[2],[3],[4]),LIST_VALUE(10,9,8,7)) ----- -{[1]=10, [2]=9, [3]=8, [4]=7} - -# Maps that are not initialized empty or with lists are FORBIDDEN -# Map where the 2 entries are not lists -statement error -select MAP(10,12) ----- - -# Map function called with more than 2 lists -statement error -select MAP(LIST_VALUE(10),LIST_VALUE(10),LIST_VALUE(10)) ----- - -# Map where 1 of the 2 entries is not a list -statement error -select MAP(LIST_VALUE(10),10) ----- - -#Map with unbalanced list cardinalities -statement error -select MAP(LIST_VALUE(10,20),LIST_VALUE(10)) ----- - -#Can't struct extract from a map -statement error -select struct_extract(m,'key') from (select MAP(LIST_VALUE(NULL, NULL, NULL,NULL,NULL ),LIST_VALUE(10,9,10,11,13)) as m) as T +SELECT MAP(list_value([1], [2], [3], [4]), list_value(10, 9, 8, 7)) ---- +{[1]=10, [2]=9, [3]=8, [4]=7} \ No newline at end of file diff --git a/test/sql/types/nested/map/test_map_and_lambdas.test b/test/sql/types/nested/map/test_map_and_lambdas.test new file mode 100644 index 000000000000..b57918fd69d4 --- /dev/null +++ b/test/sql/types/nested/map/test_map_and_lambdas.test @@ -0,0 +1,16 @@ +# name: test/sql/types/nested/map/test_map_and_lambdas.test +# description: Test MAP functionality in lambda functions +# group: [map] + +statement ok +PRAGMA enable_verification + +statement ok +CREATE TABLE i AS SELECT str_split('my yay', ' ') AS l, range AS i FROM range(3); + +query I +SELECT list_transform(l, x -> {'map1': MAP {x::VARCHAR:1::VARCHAR, 'b'::VARCHAR: x::VARCHAR}}) FROM i; +---- +[{'map1': {my=1, b=my}}, {'map1': {yay=1, b=yay}}] +[{'map1': {my=1, b=my}}, {'map1': {yay=1, b=yay}}] +[{'map1': {my=1, b=my}}, {'map1': {yay=1, b=yay}}] \ No newline at end of file diff --git a/test/sql/types/nested/map/test_map_concat.test b/test/sql/types/nested/map/test_map_concat.test index 27d154b13851..ce4a9ed5312f 100644 --- a/test/sql/types/nested/map/test_map_concat.test +++ b/test/sql/types/nested/map/test_map_concat.test @@ -1,8 +1,7 @@ # name: test/sql/types/nested/map/test_map_concat.test # group: [map] -#statement ok -#pragma enable_verification; +# NOTE: PRAGMA enable_verification significantly increases the duration of this test # Basic use, 2 maps, same type, no nulls query I diff --git a/test/sql/types/nested/map/test_map_keys.test b/test/sql/types/nested/map/test_map_keys.test index 27cf4359695a..0a5b70c5f4ff 100644 --- a/test/sql/types/nested/map/test_map_keys.test +++ b/test/sql/types/nested/map/test_map_keys.test @@ -2,7 +2,7 @@ # group: [map] statement ok -pragma enable_verification; +PRAGMA enable_verification; # Empty query I diff --git a/test/sql/types/nested/map/test_map_subscript_composite.test b/test/sql/types/nested/map/test_map_subscript_composite.test index 6faa93342c20..d6c320b9ebc5 100644 --- a/test/sql/types/nested/map/test_map_subscript_composite.test +++ b/test/sql/types/nested/map/test_map_subscript_composite.test @@ -29,7 +29,7 @@ select m[NULL] from (select MAP(LIST_VALUE({a:3}, {a:4}, {a:5}, {a:6},{a:7}),LIS statement error select m[2] from (select MAP(LIST_VALUE([2,2], [2], [3,3], [4,4,4],[2]),LIST_VALUE(10, 9, 8, 7,11)) as m) as T ---- -Invalid Input Error: Map keys have to be unique +Map keys must be unique # Check autocast query I @@ -47,7 +47,7 @@ select m[[10,11]] from (select MAP(lst,lst) as m from (SELECT LIST([i,i+1]) as l statement error select m[[1]] from (select MAP(LIST_VALUE([1], [1], [1], [4]),LIST_VALUE(10, 9, 8, 7)) as m) as T ---- -Invalid Input Error: Map keys have to be unique +Map keys must be unique query I select m[['Tenacious D', 'test']] from (select MAP(LIST_VALUE(['Jon Lajoie'], ['test', NULL], ['Tenacious D', 'test'], ['test', 'Tenacious D']),LIST_VALUE(5,10,9,11)) as m) as T From 68758baa256dd8f6ac786474a71603bc70e78324 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:25:49 +0100 Subject: [PATCH 2/5] added missing test --- test/parquet/test_parquet_schema.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parquet/test_parquet_schema.test b/test/parquet/test_parquet_schema.test index a7712718a3d7..5a8a86e7e859 100644 --- a/test/parquet/test_parquet_schema.test +++ b/test/parquet/test_parquet_schema.test @@ -64,7 +64,7 @@ FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map { 0: {name: 'new_column', type: 'UTINYINT', default_value: 43} }) ---- -Invalid Input Error: Map keys have to be unique +Map keys must be unique # cannot duplicate column names statement error From 7312fc33f8fdd2b78341308efcacea64807f97a0 Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:43:08 +0100 Subject: [PATCH 3/5] enum generation --- src/common/enum_util.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/common/enum_util.cpp b/src/common/enum_util.cpp index e728ac80fb24..cf9b40bdc83e 100644 --- a/src/common/enum_util.cpp +++ b/src/common/enum_util.cpp @@ -3446,6 +3446,12 @@ const char* EnumUtil::ToChars(MapInvalidReason value) { return "NULL_KEY"; case MapInvalidReason::DUPLICATE_KEY: return "DUPLICATE_KEY"; + case MapInvalidReason::NULL_VALUE_LIST: + return "NULL_VALUE_LIST"; + case MapInvalidReason::NOT_ALIGNED: + return "NOT_ALIGNED"; + case MapInvalidReason::INVALID_PARAMS: + return "INVALID_PARAMS"; default: throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value)); } @@ -3465,6 +3471,15 @@ MapInvalidReason EnumUtil::FromString(const char *value) { if (StringUtil::Equals(value, "DUPLICATE_KEY")) { return MapInvalidReason::DUPLICATE_KEY; } + if (StringUtil::Equals(value, "NULL_VALUE_LIST")) { + return MapInvalidReason::NULL_VALUE_LIST; + } + if (StringUtil::Equals(value, "NOT_ALIGNED")) { + return MapInvalidReason::NOT_ALIGNED; + } + if (StringUtil::Equals(value, "INVALID_PARAMS")) { + return MapInvalidReason::INVALID_PARAMS; + } throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); } From 4543207c4e8243e94d9632093212a95f438b2b7a Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:46:21 +0100 Subject: [PATCH 4/5] tidying up --- src/core_functions/scalar/map/map.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core_functions/scalar/map/map.cpp b/src/core_functions/scalar/map/map.cpp index 829437ddebbf..fa1f4d00759a 100644 --- a/src/core_functions/scalar/map/map.cpp +++ b/src/core_functions/scalar/map/map.cpp @@ -9,8 +9,6 @@ namespace duckdb { -struct MapChildInfo {}; - static void MapFunctionEmptyInput(Vector &result, const idx_t row_count) { // if no chunk is set in ExpressionExecutor::ExecuteExpression (args.data.empty(), e.g., @@ -78,7 +76,7 @@ static void MapFunction(DataChunk &args, ExpressionState &, Vector &result) { result_child_size += keys_entry.length; } - // we need to slice potential dictionary vectors + // we need to slice potential non-flat vectors SelectionVector sel_keys(result_child_size); SelectionVector sel_values(result_child_size); idx_t offset = 0; From 294f350827cba10b8eb5116c571e992a940e28ea Mon Sep 17 00:00:00 2001 From: taniabogatsch <44262898+taniabogatsch@users.noreply.github.com> Date: Wed, 7 Feb 2024 14:38:03 +0100 Subject: [PATCH 5/5] review fixes and missing test --- src/core_functions/scalar/map/map.cpp | 2 +- test/sql/types/nested/map/map_error.test | 26 +++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/core_functions/scalar/map/map.cpp b/src/core_functions/scalar/map/map.cpp index fa1f4d00759a..e27fe3fd6503 100644 --- a/src/core_functions/scalar/map/map.cpp +++ b/src/core_functions/scalar/map/map.cpp @@ -100,7 +100,7 @@ static void MapFunction(DataChunk &args, ExpressionState &, Vector &result) { if (!keys_data.validity.RowIsValid(keys_idx)) { MapVector::EvalMapInvalidReason(MapInvalidReason::NULL_KEY_LIST); } - if (!keys_data.validity.RowIsValid(keys_idx)) { + if (!values_data.validity.RowIsValid(values_idx)) { MapVector::EvalMapInvalidReason(MapInvalidReason::NULL_VALUE_LIST); } if (keys_entry.length != values_entry.length) { diff --git a/test/sql/types/nested/map/map_error.test b/test/sql/types/nested/map/map_error.test index 4c1bae4aad67..315a1620ea7a 100644 --- a/test/sql/types/nested/map/map_error.test +++ b/test/sql/types/nested/map/map_error.test @@ -25,8 +25,6 @@ SELECT MAP(NULL) ---- Invalid map argument(s) -# same map keys in a MAP - statement ok CREATE TABLE tbl (a INTEGER[], b TEXT[]) @@ -69,4 +67,26 @@ CREATE TABLE t AS SELECT MAP(list_value(1, 2, 3), list_value(10, 9, 10)) AS m; statement error SELECT struct_extract(m, 'key') FROM t; ---- -No function matches the given name and argument types \ No newline at end of file +No function matches the given name and argument types + +statement ok +CREATE TABLE null_keys_list (k INT[], v INT[]); + +statement ok +INSERT INTO null_keys_list VALUES ([1], [2]), (NULL, [4]); + +statement error +SELECT MAP(k, v) FROM null_keys_list; +---- +The list of map keys must not be NULL. + +statement ok +CREATE TABLE null_values_list (k INT[], v INT[]); + +statement ok +INSERT INTO null_values_list VALUES ([1], [2]), ([4], NULL); + +statement error +SELECT MAP(k, v) FROM null_values_list; +---- +The list of map values must not be NULL. \ No newline at end of file