Skip to content

Commit

Permalink
chore: Update vendored sources to duckdb/duckdb@0ce8631 (#439)
Browse files Browse the repository at this point in the history
[CSV Sniffer] Slight change of rules for dialect detection (duckdb/duckdb#14013)
fix minor typos in comments of aggregate function tests (duckdb/duckdb#14007)
Disable swift linux tests (duckdb/duckdb#14019)

Co-authored-by: krlmlr <krlmlr@users.noreply.github.com>
  • Loading branch information
github-actions[bot] and krlmlr authored Sep 27, 2024
1 parent 47cb18b commit d9800f4
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,32 @@ void ColumnCountResult::AddValue(ColumnCountResult &result, idx_t buffer_pos) {
}

// Finishes the row currently being counted: stores its column count
// (current_column_count + 1) in the slot at result_position, counts the row in
// rows_per_column_count under that column count, and resets current_column_count
// to 0 for the next row.
// NOTE(review): number_of_columns is assigned twice with the same value (once
// directly, once via column_count); the first assignment looks redundant — confirm
// before removing.
inline void ColumnCountResult::InternalAddRow() {
column_counts[result_position].number_of_columns = current_column_count + 1;
const idx_t column_count = current_column_count + 1;
column_counts[result_position].number_of_columns = column_count;
// Track how many rows were seen with this exact column count
rows_per_column_count[column_count]++;
current_column_count = 0;
}

// Returns the column count that the largest number of recorded rows share.
// When several column counts are equally frequent, the largest column count wins.
// Returns 1 when rows_per_column_count holds no entries.
idx_t ColumnCountResult::GetMostFrequentColumnCount() const {
	if (rows_per_column_count.empty()) {
		// Nothing recorded yet: fall back to a single column
		return 1;
	}
	idx_t best_column_count = 0;
	idx_t best_frequency = 0;
	for (const auto &entry : rows_per_column_count) {
		const idx_t candidate_column_count = entry.first;
		const idx_t frequency = entry.second;
		// A candidate wins if it covers strictly more rows, or if it covers the same
		// number of rows but has more columns (tie-break towards the largest count)
		const bool more_frequent = frequency > best_frequency;
		const bool wider_on_tie = frequency == best_frequency && candidate_column_count > best_column_count;
		if (more_frequent || wider_on_tie) {
			best_frequency = frequency;
			best_column_count = candidate_column_count;
		}
	}
	return best_column_count;
}

bool ColumnCountResult::AddRow(ColumnCountResult &result, idx_t buffer_pos) {
result.InternalAddRow();
if (!result.states.EmptyLastValue()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,29 +221,35 @@ SnifferResult CSVSniffer::SniffCSV(bool force_match) {
// If the header exists it should match
string header_error = "The Column names set by the user do not match the ones found by the sniffer. \n";
auto &set_names = *set_columns.names;
for (idx_t i = 0; i < set_columns.Size(); i++) {
if (set_names[i] != names[i]) {
header_error += "Column at position: " + to_string(i) + " Set name: " + set_names[i] +
" Sniffed Name: " + names[i] + "\n";
match = false;
if (set_names.size() == names.size()) {
for (idx_t i = 0; i < set_columns.Size(); i++) {
if (set_names[i] != names[i]) {
header_error += "Column at position: " + to_string(i) + " Set name: " + set_names[i] +
" Sniffed Name: " + names[i] + "\n";
match = false;
}
}
}

if (!match) {
error += header_error;
}
}
match = true;
string type_error = "The Column types set by the user do not match the ones found by the sniffer. \n";
auto &set_types = *set_columns.types;
for (idx_t i = 0; i < set_columns.Size(); i++) {
if (set_types[i] != detected_types[i]) {
type_error += "Column at position: " + to_string(i) + " Set type: " + set_types[i].ToString() +
" Sniffed type: " + detected_types[i].ToString() + "\n";
detected_types[i] = set_types[i];
manually_set[i] = true;
match = false;
if (detected_types.size() == set_columns.Size()) {
for (idx_t i = 0; i < set_columns.Size(); i++) {
if (set_types[i] != detected_types[i]) {
type_error += "Column at position: " + to_string(i) + " Set type: " + set_types[i].ToString() +
" Sniffed type: " + detected_types[i].ToString() + "\n";
detected_types[i] = set_types[i];
manually_set[i] = true;
match = false;
}
}
}

if (!match) {
error += type_error;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ string DialectCandidates::Print() {
}
search_space << "\n";
search_space << "Quote/Escape Candidates: ";
for (uint8_t i = 0; i < static_cast<uint8_t>(quoterule_candidates.size()); i++) {
for (uint8_t i = 0; i < static_cast<uint8_t>(quote_rule_candidates.size()); i++) {
auto quote_candidate = quote_candidates_map[i];
auto escape_candidate = escape_candidates_map[i];
for (idx_t j = 0; j < quote_candidate.size(); j++) {
Expand All @@ -60,7 +60,7 @@ string DialectCandidates::Print() {
search_space << ",";
}
}
if (i < quoterule_candidates.size() - 1) {
if (i < quote_rule_candidates.size() - 1) {
search_space << ",";
}
}
Expand Down Expand Up @@ -111,7 +111,7 @@ DialectCandidates::DialectCandidates(const CSVStateMachineOptions &options) {
for (auto &quote_rule : default_quote_rule) {
quote_candidates_map[static_cast<uint8_t>(quote_rule)] = {options.quote.GetValue()};
}
// also add it as a escape rule
// also add it as an escape rule
if (!IsQuoteDefault(options.quote.GetValue())) {
escape_candidates_map[static_cast<uint8_t>(QuoteRule::QUOTES_RFC)].emplace_back(options.quote.GetValue());
}
Expand All @@ -124,14 +124,14 @@ DialectCandidates::DialectCandidates(const CSVStateMachineOptions &options) {
if (options.escape.IsSetByUser()) {
// user provided escape: use that escape rule
if (options.escape == '\0') {
quoterule_candidates = {QuoteRule::QUOTES_RFC};
quote_rule_candidates = {QuoteRule::QUOTES_RFC};
} else {
quoterule_candidates = {QuoteRule::QUOTES_OTHER};
quote_rule_candidates = {QuoteRule::QUOTES_OTHER};
}
escape_candidates_map[static_cast<uint8_t>(quoterule_candidates[0])] = {options.escape.GetValue()};
escape_candidates_map[static_cast<uint8_t>(quote_rule_candidates[0])] = {options.escape.GetValue()};
} else {
// no escape provided: try standard/common escapes
quoterule_candidates = default_quote_rule;
quote_rule_candidates = default_quote_rule;
}
}

Expand All @@ -146,12 +146,12 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
}
CSVIterator first_iterator;
bool iterator_set = false;
for (const auto quoterule : dialect_candidates.quoterule_candidates) {
const auto &quote_candidates = dialect_candidates.quote_candidates_map.at(static_cast<uint8_t>(quoterule));
for (const auto quote_rule : dialect_candidates.quote_rule_candidates) {
const auto &quote_candidates = dialect_candidates.quote_candidates_map.at(static_cast<uint8_t>(quote_rule));
for (const auto &quote : quote_candidates) {
for (const auto &delimiter : dialect_candidates.delim_candidates) {
const auto &escape_candidates =
dialect_candidates.escape_candidates_map.at(static_cast<uint8_t>(quoterule));
dialect_candidates.escape_candidates_map.at(static_cast<uint8_t>(quote_rule));
for (const auto &escape : escape_candidates) {
for (const auto &comment : dialect_candidates.comment_candidates) {
D_ASSERT(buffer_manager);
Expand Down Expand Up @@ -181,7 +181,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc

// Returns true if a comment is acceptable
bool AreCommentsAcceptable(const ColumnCountResult &result, idx_t num_cols, bool comment_set_by_user) {
// For a comment to be acceptable, we want 3/5th's majority of unmatches in the columns
// For a comment to be acceptable, we want 3/5th's the majority of unmatched in the columns
constexpr double min_majority = 0.6;
// detected comments are all lines that started with a comment character.
double detected_comments = 0;
Expand Down Expand Up @@ -226,6 +226,12 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
}
idx_t consistent_rows = 0;
idx_t num_cols = sniffed_column_counts.result_position == 0 ? 1 : sniffed_column_counts[0].number_of_columns;
const bool ignore_errors = options.ignore_errors.GetValue();
// If we are ignoring errors and null_padding is not set, we pick the most frequent number of columns as the right one
bool use_most_frequent_columns = ignore_errors && !options.null_padding;
if (use_most_frequent_columns) {
num_cols = sniffed_column_counts.GetMostFrequentColumnCount();
}
idx_t padding_count = 0;
idx_t comment_rows = 0;
idx_t ignored_rows = 0;
Expand All @@ -234,16 +240,15 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
if (sniffed_column_counts.result_position > rows_read) {
rows_read = sniffed_column_counts.result_position;
}
if (set_columns.IsCandidateUnacceptable(num_cols, options.null_padding, options.ignore_errors.GetValue(),
if (set_columns.IsCandidateUnacceptable(num_cols, options.null_padding, ignore_errors,
sniffed_column_counts[0].last_value_always_empty)) {
// Not acceptable
return;
}
idx_t header_idx = 0;
for (idx_t row = 0; row < sniffed_column_counts.result_position; row++) {
if (set_columns.IsCandidateUnacceptable(sniffed_column_counts[row].number_of_columns, options.null_padding,
options.ignore_errors.GetValue(),
sniffed_column_counts[row].last_value_always_empty)) {
ignore_errors, sniffed_column_counts[row].last_value_always_empty)) {
// Not acceptable
return;
}
Expand All @@ -258,7 +263,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
consistent_rows++;
} else if (num_cols < sniffed_column_counts[row].number_of_columns &&
(!options.dialect_options.skip_rows.IsSetByUser() || comment_rows > 0) &&
(!set_columns.IsSet() || options.null_padding)) {
(!set_columns.IsSet() || options.null_padding) && (!first_valid || (!use_most_frequent_columns))) {
// all rows up to this point will need padding
if (!first_valid) {
first_valid = true;
Expand All @@ -268,15 +273,14 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
// we use the maximum amount of num_cols that we find
num_cols = sniffed_column_counts[row].number_of_columns;
dirty_notes = row;
// sniffed_column_counts.state_machine.dialect_options.rows_until_header = dirty_notes;
dirty_notes_minus_comments = dirty_notes - comment_rows;
header_idx = row;
consistent_rows = 1;
} else if (sniffed_column_counts[row].number_of_columns == num_cols ||
(options.ignore_errors.GetValue() && !options.null_padding)) {
} else if (sniffed_column_counts[row].number_of_columns == num_cols || (use_most_frequent_columns)) {
if (!first_valid) {
first_valid = true;
sniffed_column_counts.state_machine.dialect_options.rows_until_header = row;
dirty_notes = row;
}
if (sniffed_column_counts[row].number_of_columns != num_cols) {
ignored_rows++;
Expand Down Expand Up @@ -404,7 +408,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<ColumnCountScanner> scanner,
}
}

bool CSVSniffer::RefineCandidateNextChunk(ColumnCountScanner &candidate) {
bool CSVSniffer::RefineCandidateNextChunk(ColumnCountScanner &candidate) const {
auto &sniffed_column_counts = candidate.ParseChunk();
for (idx_t i = 0; i < sniffed_column_counts.result_position; i++) {
if (set_columns.IsSet()) {
Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "1-dev236"
#define DUCKDB_PATCH_VERSION "1-dev243"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 1
Expand All @@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.1.1-dev236"
#define DUCKDB_VERSION "v1.1.1-dev243"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "f9e96b1910"
#define DUCKDB_SOURCE_ID "0ce8631130"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ class ColumnCountResult : public ScannerResult {
bool error = false;
idx_t result_position = 0;
bool cur_line_starts_as_comment = false;

//! How many rows fit a given column count
map<idx_t, idx_t> rows_per_column_count;
//! Adds a Value to the result
static inline void AddValue(ColumnCountResult &result, idx_t buffer_pos);
//! Adds a Row to the result
Expand All @@ -57,6 +58,9 @@ class ColumnCountResult : public ScannerResult {

static inline void SetComment(ColumnCountResult &result, idx_t buffer_pos);

//! Returns the column count shared by the most rows (ties pick the largest count; 1 if no rows were counted)
idx_t GetMostFrequentColumnCount() const;

inline void InternalAddRow();
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ struct DialectCandidates {
//! Candidates for the comment
vector<char> comment_candidates;
//! Quote-Rule Candidates
vector<QuoteRule> quoterule_candidates;
vector<QuoteRule> quote_rule_candidates;
//! Candidates for the quote option
unordered_map<uint8_t, vector<char>> quote_candidates_map;
//! Candidates for the escape option
Expand Down Expand Up @@ -181,7 +181,7 @@ class CSVSniffer {
void RefineCandidates();

//! Checks if candidate still produces good values for the next chunk
bool RefineCandidateNextChunk(ColumnCountScanner &candidate);
bool RefineCandidateNextChunk(ColumnCountScanner &candidate) const;

//! ------------------------------------------------------//
//! ------------------- Type Detection ------------------ //
Expand Down

0 comments on commit d9800f4

Please sign in to comment.