sql: partial table statistics are not used in row count estimates #134461
Closed
Description
In the example below, I would expect the estimate row count
in the output of the last query to be greater than zero.
CREATE TABLE t (
k INT PRIMARY KEY,
a INT,
INDEX (a)
);
INSERT INTO t
SELECT i, i FROM generate_series(1, 1000) AS g(i);
CREATE STATISTICS __auto__ FROM t;
SHOW STATISTICS FOR TABLE t;
-- statistics_name | column_names | created | row_count | distinct_count | null_count | avg_size | partial_predicate | histogram_id | full_histogram_id
-- ------------------+--------------+-------------------------------+-----------+----------------+------------+----------+-------------------+---------------------+--------------------
-- __auto__ | {k} | 2024-11-06 20:52:20.858821+00 | 1000 | 1000 | 0 | 3 | NULL | 1018612747011162113 | 0
-- __auto__ | {a} | 2024-11-06 20:52:20.858821+00 | 1000 | 1000 | 0 | 3 | NULL | 1018612747014635521 | 0
-- (2 rows)
SELECT pg_sleep(5);
INSERT INTO t
SELECT i, i FROM generate_series(1001, 2000) AS g(i);
CREATE STATISTICS s FROM t USING EXTREMES;
SELECT pg_sleep(5);
SHOW STATISTICS FOR TABLE t;
-- statistics_name | column_names | created | row_count | distinct_count | null_count | avg_size | partial_predicate | histogram_id | full_histogram_id
-- ------------------+--------------+-------------------------------+-----------+----------------+------------+----------+------------------------------------------------------+---------------------+----------------------
-- __auto__ | {k} | 2024-11-06 20:52:20.858821+00 | 1000 | 1000 | 0 | 3 | NULL | 1018612747011162113 | 0
-- __auto__ | {a} | 2024-11-06 20:52:20.858821+00 | 1000 | 1000 | 0 | 3 | NULL | 1018612747014635521 | 0
-- s | {k} | 2024-11-06 20:52:25.921077+00 | 1000 | 1000 | 0 | 3 | (k IS NULL) OR ((k < 1:::INT8) OR (k > 1000:::INT8)) | 1018612763596947457 | 1018612747011162113
-- s | {a} | 2024-11-06 20:52:25.932229+00 | 1000 | 1000 | 0 | 3 | (a IS NULL) OR ((a < 1:::INT8) OR (a > 1000:::INT8)) | 1018612763632566273 | 1018612747014635521
-- (4 rows)
EXPLAIN ANALYZE
SELECT * FROM t WHERE a > 1002;
-- info
-- ---------------------------------------------------------------------------------------------------------------------------
-- planning time: 518µs
-- execution time: 3ms
-- distribution: local
-- vectorized: true
-- plan type: custom
-- rows decoded from KV: 998 (31 KiB, 1 gRPC calls)
-- cumulative time spent in KV: 1ms
-- maximum memory usage: 50 KiB
-- network usage: 0 B (0 messages)
-- regions: us-east1
-- sql cpu time: 542µs
-- isolation level: serializable
-- priority: normal
-- quality of service: regular
--
-- • scan
-- sql nodes: n1
-- kv nodes: n1
-- regions: us-east1
-- actual row count: 998
-- KV time: 1ms
-- KV contention time: 0µs
-- KV rows decoded: 998
-- KV bytes read: 31 KiB
-- KV gRPC calls: 1
-- estimated max memory allocated: 50 KiB
-- sql cpu time: 542µs
-- estimated row count: 0 (<0.01% of the table; stats collected 5 seconds ago)
-- table: t@t_a_idx ---------------------- WARNING: the row count estimate is inaccurate, consider running 'ANALYZE t'
-- spans: [/1003 - ]
--
-- WARNING: the row count estimate on table "t" is inaccurate, consider running 'ANALYZE t'
-- (32 rows)
Jira issue: CRDB-44090
Metadata
Assignees
Labels
SQL logical planning and optimizations.Table statistics (and their automatic refresh).Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior.SQL Queries TeamFailures and bugs on the master branch.Used to mark GA and release blockers, technical advisories, and bugs for 24.3
Type
Projects
Status
Done