Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve candidate error message and relax constraint of rfc_4180 = false on quotes #15371

Draft
wants to merge 12 commits into
base: main
Choose a base branch
from
Next Next commit
Making sure the csv reader is robust against this sneaky broken line
  • Loading branch information
pdet committed Dec 16, 2024
commit dce0a8e30916c7c5b7ab698b2e17f662eeb90c34
1 change: 1 addition & 0 deletions data/csv/hits_problematic.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5258277067567845909,1,"Смеситет магазин модноэтаж остан ",1,"2013-07-01 21:30:14","2013-07-02",40367,1330195396,229,5768967145066463567,0,2,5,"http://sib1.adriver.ru/blog/101445363034648639/Samara","http:%2F%2Fsc.cheloveplanet.ru/index.ru/yandex.ru/recipes/catalog=72&pages.yandex.ua/?targetUrl=/users/folderfull&Form",0,10282,952,13164,216,1750,938,37,15,7,"700",0,0,22,"4s",1,1,0,0,"","",1762492,2,13,"врагон отзывы туры+в+спб+василово",13,0,2031,885,135,"2013-07-02 08:16:37",0,0,0,0,"windows",1601,0,0,0,4823732634460328886,"",883400972,0,0,0,0,0,"5","2013-07-02 00:13:50",31,2,2,8713,0,2053662724,-1,-1,-1,"�","� ","","",0,0,0,0,722,16,32,0,"",0,"","NH",0,"direct.yandex.ru","501844362310er (Bada OS X 10.1"" (1280x720)/1024+1632Mb) MS Windows 8 (64-bit) [90AT0021200/12"" мощностью 1200055302 Xperia mini Medics 4000Mhz/10.1"" (1920x1200Лм 2,5","38882665528397864181259328149","","trud.com","30533","russia_except","86","","3000005",0,-8134524926569970711,2969999232154769439,0
14 changes: 13 additions & 1 deletion src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,19 @@ string DialectCandidates::Print() {
auto escape_candidate = escape_candidates_map[i];
for (idx_t j = 0; j < quote_candidate.size(); j++) {
for (idx_t k = 0; k < escape_candidate.size(); k++) {
search_space << "[\'" << quote_candidate[j] << "\',\'" << escape_candidate[k] << "\']";
search_space << "[\'";
if (quote_candidate[j] == '\0') {
search_space << "(no quote)";
} else {
search_space << quote_candidate[j];
}
search_space << "\',\'";
if (escape_candidate[k] == '\0') {
search_space << "(no escape)";
} else {
search_space << escape_candidate[k];
}
search_space << "\']";
if (k < escape_candidate.size() - 1) {
search_space << ",";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
if (state_machine_options.quote == state_machine_options.escape) {
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
}
if (state_machine_options.escape == '\0' && state_machine_options.rfc_4180 == false) {
if (state_machine_options.rfc_4180 == false) {
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
}
if (comment != '\0') {
Expand Down
22 changes: 3 additions & 19 deletions test/sql/copy/csv/parallel/csv_parallel_clickbench.test_slow
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,13 @@

mode skip

require parquet

require httpfs

statement ok
pragma threads=4

statement ok
PRAGMA enable_verification


statement ok
CREATE TABLE hits_og
CREATE TABLE hits
(
WatchID BIGINT NOT NULL,
JavaEnable SMALLINT NOT NULL,
Expand Down Expand Up @@ -126,24 +120,14 @@ CREATE TABLE hits_og
PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID)
);


statement ok
INSERT INTO hits_og SELECT * FROM read_parquet('https://github.com/duckdb/duckdb-data/releases/download/v1.0/hits.parquet');

statement ok
COPY hits_og TO '__TEST_DIR__/hits.csv';

statement ok
create table hits as select * from hits_og limit 0;

statement ok
insert into hits from read_csv('__TEST_DIR__/hits.csv', compression = 'none', nullstr = 'null');
insert into hits from read_csv('/Users/holanda/Downloads/hits.csv');

#Q 01
query I
SELECT COUNT(*) FROM hits;
----
10000000
99997497

#Q 02
query I
Expand Down
24 changes: 12 additions & 12 deletions test/sql/copy/csv/relaxed_quotes.test
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ de:08317:12007:2:1 Lahr Schlüssel "Vis-à-Vis Bus" 48.3411985847104 7.879329970
query II
from read_csv('data/csv/unescaped_quote.csv', rfc_4180=false);
----
1 pedro pdet holanda
2 pedro pdet holanda
3 pedro pdet ho;landa
4 pedro pdet holanda
5 pedro pdet holanda
6 pedro pdet holanda
1 pedro "pdet" holanda
2 pedro "pdet" holanda
3 pedro "pdet" ho;landa
4 pedro "pdet" holanda
5 pedro "pdet" holanda
6 pedro "pdet" holanda

query II
from read_csv('data/csv/unescaped_quote.csv', rfc_4180=false, escape = '', quote = '"', delim = ';');
Expand All @@ -57,12 +57,12 @@ loop buffer_size 30 35
query II
from read_csv('data/csv/unescaped_quote.csv', rfc_4180=false, buffer_size = ${buffer_size}, header = 0)
----
1 pedro pdet holanda
2 pedro pdet holanda
3 pedro pdet ho;landa
4 pedro pdet holanda
5 pedro pdet holanda
6 pedro pdet holanda
1 pedro "pdet" holanda
2 pedro "pdet" holanda
3 pedro "pdet" ho;landa
4 pedro "pdet" holanda
5 pedro "pdet" holanda
6 pedro "pdet" holanda

endloop

Expand Down
15 changes: 15 additions & 0 deletions test/sql/copy/csv/test_hits_problematic.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# name: test/sql/copy/csv/test_hits_problematic.test
# description: Test read CSV function on problematic Clickbench
# group: [csv]

statement ok
FROM read_csv('data/csv/hits_problematic.csv', auto_detect=false, delim=',', quote='"', escape='\', new_line='\n', skip=0, comment='', header=false, columns={'column000': 'BIGINT', 'column001': 'BIGINT', 'column002': 'VARCHAR', 'column003': 'BIGINT', 'column004': 'TIMESTAMP', 'column005': 'DATE', 'column006': 'BIGINT', 'column007': 'BIGINT', 'column008': 'BIGINT', 'column009': 'BIGINT', 'column010': 'BIGINT', 'column011': 'BIGINT', 'column012': 'BIGINT', 'column013': 'VARCHAR', 'column014': 'VARCHAR', 'column015': 'BIGINT', 'column016': 'BIGINT', 'column017': 'BIGINT', 'column018': 'BIGINT', 'column019': 'BIGINT', 'column020': 'BIGINT', 'column021': 'BIGINT', 'column022': 'BIGINT', 'column023': 'BIGINT', 'column024': 'BIGINT', 'column025': 'DOUBLE', 'column026': 'BIGINT', 'column027': 'BIGINT', 'column028': 'BIGINT', 'column029': 'VARCHAR', 'column030': 'BIGINT', 'column031': 'BIGINT', 'column032': 'BIGINT', 'column033': 'BIGINT', 'column034': 'VARCHAR', 'column035': 'VARCHAR', 'column036': 'BIGINT', 'column037': 'BIGINT', 'column038': 'BIGINT', 'column039': 'VARCHAR', 'column040': 'BIGINT', 'column041': 'BIGINT', 'column042': 'BIGINT', 'column043': 'BIGINT', 'column044': 'BIGINT', 'column045': 'TIMESTAMP', 'column046': 'BIGINT', 'column047': 'BIGINT', 'column048': 'BIGINT', 'column049': 'BIGINT', 'column050': 'VARCHAR', 'column051': 'BIGINT', 'column052': 'BIGINT', 'column053': 'BIGINT', 'column054': 'BIGINT', 'column055': 'BIGINT', 'column056': 'VARCHAR', 'column057': 'BIGINT', 'column058': 'BIGINT', 'column059': 'BIGINT', 'column060': 'BIGINT', 'column061': 'BIGINT', 'column062': 'BIGINT', 'column063': 'VARCHAR', 'column064': 'TIMESTAMP', 'column065': 'BIGINT', 'column066': 'BIGINT', 'column067': 'BIGINT', 'column068': 'BIGINT', 'column069': 'BIGINT', 'column070': 'BIGINT', 'column071': 'BIGINT', 'column072': 'BIGINT', 'column073': 'BIGINT', 'column074': 'VARCHAR', 'column075': 'VARCHAR', 'column076': 'VARCHAR', 'column077': 'VARCHAR', 'column078': 'BIGINT', 'column079': 'BIGINT', 'column080': 'BIGINT', 'column081': 'BIGINT', 'column082': 'BIGINT', 'column083': 'BIGINT', 'column084': 'BIGINT', 'column085': 'BIGINT', 'column086': 'VARCHAR', 'column087': 'BIGINT', 'column088': 'VARCHAR', 'column089': 'VARCHAR', 'column090': 'BIGINT', 'column091': 'VARCHAR', 'column092': 'VARCHAR', 'column093': 'VARCHAR', 'column094': 'VARCHAR', 'column095': 'VARCHAR', 'column096': 'VARCHAR', 'column097': 'VARCHAR', 'column098': 'VARCHAR', 'column099': 'VARCHAR', 'column100': 'VARCHAR', 'column101': 'BIGINT', 'column102': 'BIGINT', 'column103': 'BIGINT', 'column104': 'BIGINT'}, dateformat='%Y-%m-%d');

statement ok
FROM read_csv('data/csv/hits_problematic.csv', auto_detect=false, delim=',', quote='"', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column000': 'BIGINT', 'column001': 'BIGINT', 'column002': 'VARCHAR', 'column003': 'BIGINT', 'column004': 'TIMESTAMP', 'column005': 'DATE', 'column006': 'BIGINT', 'column007': 'BIGINT', 'column008': 'BIGINT', 'column009': 'BIGINT', 'column010': 'BIGINT', 'column011': 'BIGINT', 'column012': 'BIGINT', 'column013': 'VARCHAR', 'column014': 'VARCHAR', 'column015': 'BIGINT', 'column016': 'BIGINT', 'column017': 'BIGINT', 'column018': 'BIGINT', 'column019': 'BIGINT', 'column020': 'BIGINT', 'column021': 'BIGINT', 'column022': 'BIGINT', 'column023': 'BIGINT', 'column024': 'BIGINT', 'column025': 'DOUBLE', 'column026': 'BIGINT', 'column027': 'BIGINT', 'column028': 'BIGINT', 'column029': 'VARCHAR', 'column030': 'BIGINT', 'column031': 'BIGINT', 'column032': 'BIGINT', 'column033': 'BIGINT', 'column034': 'VARCHAR', 'column035': 'VARCHAR', 'column036': 'BIGINT', 'column037': 'BIGINT', 'column038': 'BIGINT', 'column039': 'VARCHAR', 'column040': 'BIGINT', 'column041': 'BIGINT', 'column042': 'BIGINT', 'column043': 'BIGINT', 'column044': 'BIGINT', 'column045': 'TIMESTAMP', 'column046': 'BIGINT', 'column047': 'BIGINT', 'column048': 'BIGINT', 'column049': 'BIGINT', 'column050': 'VARCHAR', 'column051': 'BIGINT', 'column052': 'BIGINT', 'column053': 'BIGINT', 'column054': 'BIGINT', 'column055': 'BIGINT', 'column056': 'VARCHAR', 'column057': 'BIGINT', 'column058': 'BIGINT', 'column059': 'BIGINT', 'column060': 'BIGINT', 'column061': 'BIGINT', 'column062': 'BIGINT', 'column063': 'VARCHAR', 'column064': 'TIMESTAMP', 'column065': 'BIGINT', 'column066': 'BIGINT', 'column067': 'BIGINT', 'column068': 'BIGINT', 'column069': 'BIGINT', 'column070': 'BIGINT', 'column071': 'BIGINT', 'column072': 'BIGINT', 'column073': 'BIGINT', 'column074': 'VARCHAR', 'column075': 'VARCHAR', 'column076': 'VARCHAR', 'column077': 'VARCHAR', 'column078': 'BIGINT', 'column079': 'BIGINT', 'column080': 'BIGINT', 'column081': 'BIGINT', 'column082': 'BIGINT', 'column083': 'BIGINT', 'column084': 'BIGINT', 'column085': 'BIGINT', 'column086': 'VARCHAR', 'column087': 'BIGINT', 'column088': 'VARCHAR', 'column089': 'VARCHAR', 'column090': 'BIGINT', 'column091': 'VARCHAR', 'column092': 'VARCHAR', 'column093': 'VARCHAR', 'column094': 'VARCHAR', 'column095': 'VARCHAR', 'column096': 'VARCHAR', 'column097': 'VARCHAR', 'column098': 'VARCHAR', 'column099': 'VARCHAR', 'column100': 'VARCHAR', 'column101': 'BIGINT', 'column102': 'BIGINT', 'column103': 'BIGINT', 'column104': 'BIGINT'}, dateformat='%Y-%m-%d');

statement ok
FROM read_csv('data/csv/hits_problematic.csv', auto_detect=false, delim=',', quote='', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column000': 'BIGINT', 'column001': 'BIGINT', 'column002': 'VARCHAR', 'column003': 'BIGINT', 'column004': 'TIMESTAMP', 'column005': 'DATE', 'column006': 'BIGINT', 'column007': 'BIGINT', 'column008': 'BIGINT', 'column009': 'BIGINT', 'column010': 'BIGINT', 'column011': 'BIGINT', 'column012': 'BIGINT', 'column013': 'VARCHAR', 'column014': 'VARCHAR', 'column015': 'BIGINT', 'column016': 'BIGINT', 'column017': 'BIGINT', 'column018': 'BIGINT', 'column019': 'BIGINT', 'column020': 'BIGINT', 'column021': 'BIGINT', 'column022': 'BIGINT', 'column023': 'BIGINT', 'column024': 'BIGINT', 'column025': 'DOUBLE', 'column026': 'BIGINT', 'column027': 'BIGINT', 'column028': 'BIGINT', 'column029': 'VARCHAR', 'column030': 'BIGINT', 'column031': 'BIGINT', 'column032': 'BIGINT', 'column033': 'BIGINT', 'column034': 'VARCHAR', 'column035': 'VARCHAR', 'column036': 'BIGINT', 'column037': 'BIGINT', 'column038': 'BIGINT', 'column039': 'VARCHAR', 'column040': 'BIGINT', 'column041': 'BIGINT', 'column042': 'BIGINT', 'column043': 'BIGINT', 'column044': 'BIGINT', 'column045': 'TIMESTAMP', 'column046': 'BIGINT', 'column047': 'BIGINT', 'column048': 'BIGINT', 'column049': 'BIGINT', 'column050': 'VARCHAR', 'column051': 'BIGINT', 'column052': 'BIGINT', 'column053': 'BIGINT', 'column054': 'BIGINT', 'column055': 'BIGINT', 'column056': 'VARCHAR', 'column057': 'BIGINT', 'column058': 'BIGINT', 'column059': 'BIGINT', 'column060': 'BIGINT', 'column061': 'BIGINT', 'column062': 'BIGINT', 'column063': 'VARCHAR', 'column064': 'TIMESTAMP', 'column065': 'BIGINT', 'column066': 'BIGINT', 'column067': 'BIGINT', 'column068': 'BIGINT', 'column069': 'BIGINT', 'column070': 'BIGINT', 'column071': 'BIGINT', 'column072': 'BIGINT', 'column073': 'BIGINT', 'column074': 'VARCHAR', 'column075': 'VARCHAR', 'column076': 'VARCHAR', 'column077': 'VARCHAR', 'column078': 'BIGINT', 'column079': 'BIGINT', 'column080': 'BIGINT', 'column081': 'BIGINT', 'column082': 'BIGINT', 'column083': 'BIGINT', 'column084': 'BIGINT', 'column085': 'BIGINT', 'column086': 'VARCHAR', 'column087': 'BIGINT', 'column088': 'VARCHAR', 'column089': 'VARCHAR', 'column090': 'BIGINT', 'column091': 'VARCHAR', 'column092': 'VARCHAR', 'column093': 'VARCHAR', 'column094': 'VARCHAR', 'column095': 'VARCHAR', 'column096': 'VARCHAR', 'column097': 'VARCHAR', 'column098': 'VARCHAR', 'column099': 'VARCHAR', 'column100': 'VARCHAR', 'column101': 'BIGINT', 'column102': 'BIGINT', 'column103': 'BIGINT', 'column104': 'BIGINT'}, dateformat='%Y-%m-%d');

statement ok
FROM read_csv('data/csv/hits_problematic.csv', auto_detect=false, delim=',', quote='', escape='\', new_line='\n', skip=0, comment='', header=false, columns={'column000': 'BIGINT', 'column001': 'BIGINT', 'column002': 'VARCHAR', 'column003': 'BIGINT', 'column004': 'TIMESTAMP', 'column005': 'DATE', 'column006': 'BIGINT', 'column007': 'BIGINT', 'column008': 'BIGINT', 'column009': 'BIGINT', 'column010': 'BIGINT', 'column011': 'BIGINT', 'column012': 'BIGINT', 'column013': 'VARCHAR', 'column014': 'VARCHAR', 'column015': 'BIGINT', 'column016': 'BIGINT', 'column017': 'BIGINT', 'column018': 'BIGINT', 'column019': 'BIGINT', 'column020': 'BIGINT', 'column021': 'BIGINT', 'column022': 'BIGINT', 'column023': 'BIGINT', 'column024': 'BIGINT', 'column025': 'DOUBLE', 'column026': 'BIGINT', 'column027': 'BIGINT', 'column028': 'BIGINT', 'column029': 'VARCHAR', 'column030': 'BIGINT', 'column031': 'BIGINT', 'column032': 'BIGINT', 'column033': 'BIGINT', 'column034': 'VARCHAR', 'column035': 'VARCHAR', 'column036': 'BIGINT', 'column037': 'BIGINT', 'column038': 'BIGINT', 'column039': 'VARCHAR', 'column040': 'BIGINT', 'column041': 'BIGINT', 'column042': 'BIGINT', 'column043': 'BIGINT', 'column044': 'BIGINT', 'column045': 'TIMESTAMP', 'column046': 'BIGINT', 'column047': 'BIGINT', 'column048': 'BIGINT', 'column049': 'BIGINT', 'column050': 'VARCHAR', 'column051': 'BIGINT', 'column052': 'BIGINT', 'column053': 'BIGINT', 'column054': 'BIGINT', 'column055': 'BIGINT', 'column056': 'VARCHAR', 'column057': 'BIGINT', 'column058': 'BIGINT', 'column059': 'BIGINT', 'column060': 'BIGINT', 'column061': 'BIGINT', 'column062': 'BIGINT', 'column063': 'VARCHAR', 'column064': 'TIMESTAMP', 'column065': 'BIGINT', 'column066': 'BIGINT', 'column067': 'BIGINT', 'column068': 'BIGINT', 'column069': 'BIGINT', 'column070': 'BIGINT', 'column071': 'BIGINT', 'column072': 'BIGINT', 'column073': 'BIGINT', 'column074': 'VARCHAR', 'column075': 'VARCHAR', 'column076': 'VARCHAR', 'column077': 'VARCHAR', 'column078': 'BIGINT', 'column079': 'BIGINT', 'column080': 'BIGINT', 'column081': 'BIGINT', 'column082': 'BIGINT', 'column083': 'BIGINT', 'column084': 'BIGINT', 'column085': 'BIGINT', 'column086': 'VARCHAR', 'column087': 'BIGINT', 'column088': 'VARCHAR', 'column089': 'VARCHAR', 'column090': 'BIGINT', 'column091': 'VARCHAR', 'column092': 'VARCHAR', 'column093': 'VARCHAR', 'column094': 'VARCHAR', 'column095': 'VARCHAR', 'column096': 'VARCHAR', 'column097': 'VARCHAR', 'column098': 'VARCHAR', 'column099': 'VARCHAR', 'column100': 'VARCHAR', 'column101': 'BIGINT', 'column102': 'BIGINT', 'column103': 'BIGINT', 'column104': 'BIGINT'}, dateformat='%Y-%m-%d');