Skip to content

Commit

Permalink
Merge pull request #11257 from pdet/sniff_csv_ignore
Browse files Browse the repository at this point in the history
Ignore user defined parameters that change names or types of csv columns in sniffer's prompt.
  • Loading branch information
Mytherin authored Mar 20, 2024
2 parents 27fa232 + 1ef83f3 commit e2b1ed8
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 6 deletions.
12 changes: 10 additions & 2 deletions src/execution/operator/csv_scanner/util/csv_reader_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,15 @@ static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
}
return it->second;
}

bool StoreUserDefinedParameter(string &option) {
if (option == "column_types" || option == "types" || option == "dtypes" || option == "auto_detect" ||
option == "auto_type_candidates" || option == "columns" || option == "names") {
// We don't store options related to types, names and auto-detection since these are either irrelevant to our
// prompt or are covered by the columns option.
return false;
}
return true;
}
void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
vector<LogicalType> &return_types, vector<string> &names) {
for (auto &kv : in) {
Expand All @@ -382,7 +390,7 @@ void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientCont
}
auto loption = StringUtil::Lower(kv.first);
// skip variables that are specific to auto detection
if (loption != "auto_detect" && loption != "auto_type_candidates") {
if (StoreUserDefinedParameter(loption)) {
user_defined_parameters += loption + "=" + kv.second.ToSQLString() + ", ";
}
if (loption == "columns") {
Expand Down
6 changes: 6 additions & 0 deletions src/function/table/sniff_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p,
sniffer_options.file_path = data.path;

auto buffer_manager = make_shared<CSVBufferManager>(context, sniffer_options, sniffer_options.file_path, 0);
if (sniffer_options.name_list.empty()) {
sniffer_options.name_list = data.names_csv;
}
if (sniffer_options.sql_type_list.empty()) {
sniffer_options.sql_type_list = data.return_types_csv;
}
CSVSniffer sniffer(sniffer_options, buffer_manager, CSVStateMachineCache::Get(context));
auto sniffer_result = sniffer.SniffCSV(true);
string str_opt;
Expand Down
28 changes: 24 additions & 4 deletions test/sql/copy/csv/test_sniff_csv_options.test
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,18 @@ FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', escape='"');
query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
----
| " " \n 0 false {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16'] FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');


query IIIIIIIIIIIIIIII
FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d', names=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']) limit 1
FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d') limit 1;
----
1 15519 785 1 17 24386.67 0.04 0.02 N O 1996-03-13 1996-02-12 1996-03-22 DELIVER IN PERSON TRUCK egular courts above the

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
----
| " " \n 0 0 {'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');

# skip rows
query IIIIIIIIIII
Expand Down Expand Up @@ -98,7 +98,7 @@ sniff_csv function does not accept auto_detect variable set to false
query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, auto_detect = true);
----
| " " \n 0 0 {'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'BIGINT', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d', columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'});
| " " \n 0 0 {'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c1': 'BIGINT', 'c2': 'BIGINT', 'c3': 'BIGINT', 'c4': 'BIGINT', 'c5': 'BIGINT', 'c6': 'DOUBLE', 'c7': 'DOUBLE', 'c8': 'DOUBLE', 'c9': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');

# auto_type_candidates
query IIIIIIIIIII
Expand Down Expand Up @@ -128,3 +128,23 @@ query IIIIIIIIIII
FROM sniff_csv('data/csv/autotypecandidates.csv', HIVE_PARTITIONING=1);
----
| " " \n 0 false {'column0': 'BIGINT', 'column1': 'DOUBLE', 'column2': 'VARCHAR'} NULL NULL NULL FROM read_csv('data/csv/autotypecandidates.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column0': 'BIGINT', 'column1': 'DOUBLE', 'column2': 'VARCHAR'});

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', types=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', dtypes=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', column_types=['INTEGER','BIGINT','BIGINT','BIGINT','BIGINT', 'DOUBLE','DOUBLE','DOUBLE','VARCHAR', 'VARCHAR','DATE', 'DATE', 'DATE', 'VARCHAR', 'VARCHAR', 'VARCHAR']);
----
| " " \n 0 0 {'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'column00': 'INTEGER', 'column01': 'BIGINT', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'DOUBLE', 'column06': 'DOUBLE', 'column07': 'DOUBLE', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'DATE', 'column11': 'DATE', 'column12': 'DATE', 'column13': 'VARCHAR', 'column14': 'VARCHAR', 'column15': 'VARCHAR'}, dateformat='%Y-%m-%d');

query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', names=['c01','c02','c03','c04','c5', 'c06','c07','c08','c09', 'c10','c11', 'c12', 'c13', 'c14', 'c15', 'c16']);
----
| " " \n 0 0 {'c01': 'BIGINT', 'c02': 'BIGINT', 'c03': 'BIGINT', 'c04': 'BIGINT', 'c5': 'BIGINT', 'c06': 'DOUBLE', 'c07': 'DOUBLE', 'c08': 'DOUBLE', 'c09': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'} %Y-%m-%d NULL NULL FROM read_csv('test/sql/copy/csv/data/real/lineitem_sample.csv', auto_detect=false, delim='|', quote='"', escape='"', new_line='\n', skip=0, header=false, columns={'c01': 'BIGINT', 'c02': 'BIGINT', 'c03': 'BIGINT', 'c04': 'BIGINT', 'c5': 'BIGINT', 'c06': 'DOUBLE', 'c07': 'DOUBLE', 'c08': 'DOUBLE', 'c09': 'VARCHAR', 'c10': 'VARCHAR', 'c11': 'DATE', 'c12': 'DATE', 'c13': 'DATE', 'c14': 'VARCHAR', 'c15': 'VARCHAR', 'c16': 'VARCHAR'}, dateformat='%Y-%m-%d');

0 comments on commit e2b1ed8

Please sign in to comment.