Skip to content

Commit

Permalink
chore(planner): improve cardinality estimation (#16938)
Browse files Browse the repository at this point in the history
* chore(planner): improve cardinality estimation

* chore(planner): improve histogram cardinality estimation

* chore(planner): improve join cardinality

* chore(test): update sqllogictest

* chore(test): update sqllogictest

* chore(code): refine code

* chore(test): update sqllogictest

* chore(test): test ci tpch

* chore(code): fix typos

* chore(test): remove accurate_his test

* chore(test): fix sqllogictest

* chore(query): fix sub overflow

* chore(planner): refine scan histogram

* chore(test): update sqllogictest

* chore(test): update sqllogictest
  • Loading branch information
Dousir9 authored Nov 26, 2024
1 parent 4dd4290 commit 6cc2535
Show file tree
Hide file tree
Showing 21 changed files with 345 additions and 1,467 deletions.
4 changes: 2 additions & 2 deletions src/query/sql/src/planner/optimizer/property/histogram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,13 @@ pub fn histogram_from_ndv(
let mut buckets: Vec<HistogramBucket> = Vec::with_capacity(num_buckets);
let sample_set = UniformSampleSet { min, max };

for idx in 0..num_buckets + 1 {
let upper_bound = sample_set.get_upper_bound(num_buckets, idx)?;
for idx in 0..num_buckets {
let lower_bound = if idx == 0 {
sample_set.min.clone()
} else {
buckets[idx - 1].upper_bound().clone()
};
let upper_bound = sample_set.get_upper_bound(num_buckets, idx + 1)?;
let bucket = HistogramBucket::new(
lower_bound,
upper_bound,
Expand Down
162 changes: 106 additions & 56 deletions src/query/sql/src/planner/optimizer/property/selectivity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::cmp::max;
use std::cmp::Ordering;
use std::collections::HashSet;

Expand All @@ -27,8 +28,10 @@ use databend_common_expression::FunctionContext;
use databend_common_expression::Scalar;
use databend_common_functions::BUILTIN_FUNCTIONS;
use databend_common_storage::Datum;
use databend_common_storage::DEFAULT_HISTOGRAM_BUCKETS;
use databend_common_storage::F64;

use crate::optimizer::histogram_from_ndv;
use crate::optimizer::ColumnStat;
use crate::optimizer::Statistics;
use crate::plans::BoundColumnRef;
Expand Down Expand Up @@ -323,8 +326,10 @@ impl<'a> SelectivityEstimator<'a> {
column_stat.ndv = new_ndv;
if let Some(histogram) = &mut column_stat.histogram {
if histogram.accuracy {
// If selectivity < 0.2, most buckets are invalid and
// the accuracy histogram can be discarded.
// Todo: find a better way to update histogram.
if selectivity < 0.8 {
if selectivity < 0.2 {
column_stat.histogram = None;
}
continue;
Expand All @@ -349,13 +354,14 @@ impl<'a> SelectivityEstimator<'a> {
column_stat: &mut ColumnStat,
updated_column_indexes: &mut HashSet<IndexType>,
) -> Result<f64> {
let col_hist = column_stat.histogram.as_ref();
let histogram = column_stat.histogram.as_ref();

if col_hist.is_none() && const_datum.is_numeric() {
if histogram.is_none() && const_datum.is_numeric() {
// If there is no histogram and the column isn't numeric, return default selectivity.
if !column_stat.min.is_numeric() {
if !column_stat.min.is_numeric() || !column_stat.max.is_numeric() {
return Ok(DEFAULT_SELECTIVITY);
}

let min = column_stat.min.to_double()?;
let max = column_stat.max.to_double()?;
let ndv = column_stat.ndv;
Expand Down Expand Up @@ -427,69 +433,93 @@ impl<'a> SelectivityEstimator<'a> {
return Ok(percent);
}

if col_hist.is_none() {
let Some(histogram) = histogram else {
return Ok(DEFAULT_SELECTIVITY);
}
let col_hist = col_hist.unwrap();
let (mut num_greater, new_min, new_max) = match comparison_op {
ComparisonOp::GT | ComparisonOp::GTE => {
let new_min = const_datum.clone();
let new_max = column_stat.max.clone();
(0.0, new_min, new_max)
}
ComparisonOp::LT | ComparisonOp::LTE => {
let new_max = const_datum.clone();
let new_min = column_stat.min.clone();
(0.0, new_min, new_max)
}
_ => unreachable!(),
};

for bucket in col_hist.buckets_iter() {
if let Ok(ord) = bucket.upper_bound().compare(const_datum) {
match comparison_op {
ComparisonOp::GT => {
if ord == Ordering::Less || ord == Ordering::Equal {
num_greater += bucket.num_values();
} else {
break;
}
}
ComparisonOp::GTE => {
if ord == Ordering::Less {
num_greater += bucket.num_values();
} else {
break;
}
}
ComparisonOp::LT => {
if ord == Ordering::Less {
num_greater += bucket.num_values();
let mut num_selected = 0.0;
for bucket in histogram.buckets_iter() {
let lower_bound = bucket.lower_bound();
let upper_bound = bucket.upper_bound();

let const_gte_upper_bound = matches!(
const_datum.compare(upper_bound)?,
Ordering::Greater | Ordering::Equal
);
let (no_overlap, complete_overlap) = match comparison_op {
ComparisonOp::LT => (
matches!(
const_datum.compare(lower_bound)?,
Ordering::Less | Ordering::Equal
),
const_gte_upper_bound,
),
ComparisonOp::LTE => (
matches!(const_datum.compare(lower_bound)?, Ordering::Less),
const_gte_upper_bound,
),
ComparisonOp::GT => (
const_gte_upper_bound,
matches!(const_datum.compare(lower_bound)?, Ordering::Less),
),
ComparisonOp::GTE => (
const_gte_upper_bound,
matches!(
const_datum.compare(lower_bound)?,
Ordering::Less | Ordering::Equal
),
),
_ => unreachable!(),
};

if complete_overlap {
num_selected += bucket.num_values();
} else if !no_overlap && const_datum.is_numeric() {
let ndv = bucket.num_distinct();
let lower_bound = lower_bound.to_double()?;
let upper_bound = upper_bound.to_double()?;
let const_value = const_datum.to_double()?;

let bucket_range = upper_bound - lower_bound;
let bucket_selectivity = match comparison_op {
ComparisonOp::LT => (const_value - lower_bound) / bucket_range,
ComparisonOp::LTE => {
if const_value == lower_bound {
1.0 / ndv
} else {
break;
(const_value - lower_bound + 1.0) / bucket_range
}
}
ComparisonOp::LTE => {
if ord == Ordering::Less || ord == Ordering::Equal {
num_greater += bucket.num_values();
ComparisonOp::GT => {
if const_value == lower_bound {
1.0 - 1.0 / ndv
} else {
break;
(upper_bound - const_value - 1.0).max(0.0) / bucket_range
}
}
ComparisonOp::GTE => (upper_bound - const_value) / bucket_range,
_ => unreachable!(),
}
} else {
return Ok(DEFAULT_SELECTIVITY);
};
num_selected += bucket.num_values() * bucket_selectivity;
}
}

let selectivity = match comparison_op {
ComparisonOp::GT | ComparisonOp::GTE => 1.0 - num_greater / col_hist.num_values(),
ComparisonOp::LT | ComparisonOp::LTE => num_greater / col_hist.num_values(),
_ => unreachable!(),
};
let selectivity = num_selected / histogram.num_values();

if update {
let (new_min, new_max) = match comparison_op {
ComparisonOp::GT | ComparisonOp::GTE => {
let new_min = const_datum.clone();
let new_max = column_stat.max.clone();
(new_min, new_max)
}
ComparisonOp::LT | ComparisonOp::LTE => {
let new_max = const_datum.clone();
let new_min = column_stat.min.clone();
(new_min, new_max)
}
_ => unreachable!(),
};
update_statistic(column_stat, new_min, new_max, selectivity)?;
updated_column_indexes.insert(column_ref.column.index);
}
Expand Down Expand Up @@ -575,11 +605,31 @@ fn update_statistic(
new_min = Datum::Float(F64::from(new_min.to_double()?));
new_max = Datum::Float(F64::from(new_max.to_double()?));
}
if selectivity < 0.8 {
// Todo: support unfixed buckets number for histogram and prune the histogram.
column_stat.histogram = None;
}
column_stat.min = new_min.clone();
column_stat.max = new_max.clone();

if let Some(histogram) = &column_stat.histogram {
// If selectivity < 0.2, most buckets are invalid and
// the accuracy histogram can be discarded.
// Todo: support unfixed buckets number for histogram and prune the histogram.
column_stat.histogram = if histogram.accuracy && selectivity >= 0.2 {
Some(histogram.clone())
} else {
let num_values = histogram.num_values();
let new_num_values = (num_values * selectivity).ceil() as u64;
let new_ndv = new_ndv as u64;
if new_ndv <= 2 {
column_stat.histogram = None;
return Ok(());
}
Some(histogram_from_ndv(
new_ndv,
max(new_num_values, new_ndv),
Some((new_min, new_max)),
DEFAULT_HISTOGRAM_BUCKETS,
)?)
}
}

Ok(())
}
Loading

0 comments on commit 6cc2535

Please sign in to comment.