Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore user defined parameters that change names or types of csv columns in sniffer's prompt. #11257

Merged
merged 2 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/execution/operator/csv_scanner/util/csv_reader_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,15 @@ static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
}
return it->second;
}

bool StoreUserDefinedParameter(string &option) {
if (option == "column_types" || option == "types" || option == "dtypes" || option == "auto_detect" ||
option == "auto_type_candidates" || option == "columns" || option == "names") {
// We don't store options related to types, names and auto-detection since these are either irrelevant to our
// prompt or are covered by the columns option.
return false;
}
return true;
}
void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
vector<LogicalType> &return_types, vector<string> &names) {
for (auto &kv : in) {
Expand All @@ -382,7 +390,7 @@ void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientCont
}
auto loption = StringUtil::Lower(kv.first);
// skip variables that are specific to auto detection
if (loption != "auto_detect" && loption != "auto_type_candidates") {
if (StoreUserDefinedParameter(loption)) {
user_defined_parameters += loption + "=" + kv.second.ToSQLString() + ", ";
}
if (loption == "columns") {
Expand Down
6 changes: 6 additions & 0 deletions src/function/table/sniff_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p,
sniffer_options.file_path = data.path;

auto buffer_manager = make_shared<CSVBufferManager>(context, sniffer_options, sniffer_options.file_path, 0);
if (sniffer_options.name_list.empty()) {
sniffer_options.name_list = data.names_csv;
}
if (sniffer_options.sql_type_list.empty()) {
sniffer_options.sql_type_list = data.return_types_csv;
}
CSVSniffer sniffer(sniffer_options, buffer_manager, CSVStateMachineCache::Get(context));
auto sniffer_result = sniffer.SniffCSV(true);
string str_opt;
Expand Down
28 changes: 24 additions & 4 deletions test/sql/copy/csv/test_sniff_csv_options.test
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,18 @@ FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', escape='"');
query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
----
| " " \n 0 false {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16'] FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');


query IIIIIIIIIIIIIIII
FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']) limit 1
FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d') limit 1;
----
1 15519 785 1 17 24386.67 0.04 0.02 N O 1996-03-13 1996-02-12 1996-03-22 DELIVER IN PERSON TRUCK egular courts above the

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
----
| " " \n 0 0 {'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');

# skip rows
query IIIIIIIIIII
Expand Down Expand Up @@ -98,7 +98,7 @@ sniff_csv function does not accept auto_detect variable set to false
query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, auto_detect = true);
----
| " " \n 0 0 {'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');

# auto_type_candidates
query IIIIIIIIIII
Expand Down Expand Up @@ -128,3 +128,23 @@ query IIIIIIIIIII
FROM sniff_csv('data/csv/autotypecandidates.csv', HIVE_PARTITIONING=1);
----
| " " \n 0 false {'column0': 'BIGINT', 'column1': 'DOUBLE', 'column2': 'VARCHAR'} NULL NULL NULL FROM read_csv('data/csv/autotypecandidates.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column0': 'BIGINT', 'column1': 'DOUBLE', 'column2': 'VARCHAR'});

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', types=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', dtypes=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', column_types=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', names=['c01','c02','c03','c04','c5', 'c06','c07','c08','c09', 'c10','c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
----
| " " \n 0 0 {'c01': 'BIGINT', 'c02': 'BIGINT', 'c03': 'BIGINT', 'c04': 'BIGINT', 'c5': 'BIGINT', 'c06': 'DOUBLE', 'c07': 'DOUBLE', 'c08': 'DOUBLE', 'c09': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c01': 'BIGINT', 'c02': 'BIGINT', 'c03': 'BIGINT', 'c04': 'BIGINT', 'c5': 'BIGINT', 'c06': 'DOUBLE', 'c07': 'DOUBLE', 'c08': 'DOUBLE', 'c09': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');
Loading