maria-developers team mailing list archive

Thread
Date
Re: [Commits] 1ed2d8b98ad: MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY

To: maria-developers@xxxxxxxxxxxxxxxxxxx, Varun <varunraiko1803@xxxxxxxxx>
From: Sergey Petrunia <sergey@xxxxxxxxxxx>
Date: Sun, 25 Nov 2018 13:19:27 +0300
Cc: commits@xxxxxxxxxxx
In-reply-to: <20181116141108.24CE1F41D86@Varuns-MacBook-Pro.local>
User-agent: Mutt/1.9.4 (2018-02-28)
Hi Varun,

On Fri, Nov 16, 2018 at 07:41:07PM +0530, Varun wrote:
> revision-id: 1ed2d8b98ade099fe23b7d5c00d23364388e15aa (mariadb-10.0.36-80-g1ed2d8b98ad)
> parent(s): a84d87fde8c0bc325c8e00f06ea02bcd84a75d55
> author: Varun Gupta
> committer: Varun Gupta
> timestamp: 2018-11-16 19:40:47 +0530
> message:
> 
> MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY
> 
> The problem here is EITS statistics does not calculate statistics for the partitions of the table.
> So a temporary solution would be to not read EITS statistics for partitioned tables.
> 
> Also disabling reading of EITS for columns that participate in the partition list of a table.
> 
> ---
>  mysql-test/r/partition.result | 88 +++++++++++++++++++++++++++++++++++++++++++
>  mysql-test/t/partition.test   | 58 ++++++++++++++++++++++++++++
>  sql/opt_range.cc              | 12 +++++-
>  sql/partition_info.cc         | 26 +++++++++++++
>  sql/partition_info.h          |  1 +
>  sql/sql_statistics.cc         | 16 ++++++++
>  6 files changed, 199 insertions(+), 2 deletions(-)
> 
> diff --git a/mysql-test/r/partition.result b/mysql-test/r/partition.result
> index c6669176b3d..aedf9f89f0e 100644
> --- a/mysql-test/r/partition.result
> +++ b/mysql-test/r/partition.result
> @@ -2645,3 +2645,91 @@ Warnings:
>  Note	1517	Duplicate partition name p2
>  DEALLOCATE PREPARE stmt;
>  DROP TABLE t1;
> +#
> +# MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY
> +#
> +create table t0(a int);
> +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
> +create table t1(a int);
> +insert into t1 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
> +create table t2 (
> +part_key int,
> +a int,
> +b int
> +) partition by list(part_key) (
> +partition p0 values in (0),
> +partition p1 values in (1),
> +partition p2 values in (2),
> +partition p3 values in (3),
> +partition p4 values in (4)
> +);
> +insert into t2
> +select mod(a,5), a/100, a from t1;
> +set @save_use_stat_tables= @@use_stat_tables;
> +set @save_optimizer_use_condition_selectivity=@@optimizer_use_condition_selectivity;
> +#
> +# Tests using stats provided by the storage engine
> +#
> +explain extended select * from t2 where part_key=1;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	200	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` = 1)
> +explain partitions select * from t2 where part_key=1;
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1	ALL	NULL	NULL	NULL	NULL	200	Using where
> +explain extended select * from t2 where  part_key in (1,2);
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	400	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` in (1,2))
> +explain partitions select * from t2 where part_key=1;
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1	ALL	NULL	NULL	NULL	NULL	200	Using where
> +explain extended select * from t2 where  part_key in (1,2);
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	400	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` in (1,2))
> +explain partitions select * from t2 where  part_key in (1,2);
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1,p2	ALL	NULL	NULL	NULL	NULL	400	Using where
> +set @save_histogram_size=@@histogram_size;
> +set @@histogram_size=100;
> +set @@use_stat_tables= PREFERABLY;
> +set @@optimizer_use_condition_selectivity=4;
> +analyze table t2;
> +Table	Op	Msg_type	Msg_text
> +test.t2	analyze	status	Engine-independent statistics collected
> +test.t2	analyze	status	OK
> +#
> +# Tests using EITS
> +#
> +explain extended select * from t2 where part_key=1;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	200	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` = 1)
> +explain partitions select * from t2 where part_key=1;
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1	ALL	NULL	NULL	NULL	NULL	200	Using where
> +explain extended select * from t2 where  part_key in (1,2);
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	400	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` in (1,2))
> +explain partitions select * from t2 where part_key=1;
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1	ALL	NULL	NULL	NULL	NULL	200	Using where
> +explain extended select * from t2 where  part_key in (1,2);
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
> +1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	400	100.00	Using where
> +Warnings:
> +Note	1003	select `test`.`t2`.`part_key` AS `part_key`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`part_key` in (1,2))
> +explain partitions select * from t2 where  part_key in (1,2);
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t2	p1,p2	ALL	NULL	NULL	NULL	NULL	400	Using where
> +set @@use_stat_tables= @save_use_stat_tables;
> +set @@optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
> +set @@histogram_size= @save_histogram_size;
> +drop table t0,t1,t2;
> diff --git a/mysql-test/t/partition.test b/mysql-test/t/partition.test
> index 1c8cd0375d6..00c6f1ce77c 100644
> --- a/mysql-test/t/partition.test
> +++ b/mysql-test/t/partition.test
> @@ -2897,3 +2897,61 @@ EXECUTE stmt;
>  DEALLOCATE PREPARE stmt;
>  DROP TABLE t1;
>  
> +--echo #
> +--echo # MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY
> +--echo #
> +
> +create table t0(a int);
> +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
> +
> +create table t1(a int);
> +insert into t1 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
> +
> +
> +create table t2 (
> +  part_key int,
> +  a int,
> +  b int
> +) partition by list(part_key) (
> +  partition p0 values in (0),
> +  partition p1 values in (1),
> +  partition p2 values in (2),
> +  partition p3 values in (3),
> +  partition p4 values in (4)
> +);
> +insert into t2
> +select mod(a,5), a/100, a from t1;
> +
> +set @save_use_stat_tables= @@use_stat_tables;
> +set @save_optimizer_use_condition_selectivity=@@optimizer_use_condition_selectivity;
> +--echo #
> +--echo # Tests using stats provided by the storage engine
> +--echo #
> +explain extended select * from t2 where part_key=1;
> +explain partitions select * from t2 where part_key=1;
> +explain extended select * from t2 where  part_key in (1,2);
> +explain partitions select * from t2 where part_key=1;
^^ Isn't this query the same like one two lines above?

> +explain extended select * from t2 where  part_key in (1,2);
> +explain partitions select * from t2 where  part_key in (1,2);
> +
> +set @save_histogram_size=@@histogram_size;
> +set @@histogram_size=100;
> +set @@use_stat_tables= PREFERABLY;
> +set @@optimizer_use_condition_selectivity=4;
> +analyze table t2;
> +--echo #
> +--echo # Tests using EITS
> +--echo #
> +# filtered should be 100

Please change to use '--echo #' so that the above is visible in the .result
file as well.

> +explain extended select * from t2 where part_key=1;
> +explain partitions select * from t2 where part_key=1;
> +# filtered should be 100
> +explain extended select * from t2 where  part_key in (1,2);
> +explain partitions select * from t2 where part_key=1;
> +# filtered should be 100
> +explain extended select * from t2 where  part_key in (1,2);
> +explain partitions select * from t2 where  part_key in (1,2);

Please add a test that shows that condition selectivity is taken into account
for non-partitioned columns.

> +set @@use_stat_tables= @save_use_stat_tables;
> +set @@optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
> +set @@histogram_size= @save_histogram_size;
> +drop table t0,t1,t2;
> diff --git a/sql/opt_range.cc b/sql/opt_range.cc
> index 3bcaa72e32f..59aa2f3f280 100644
> --- a/sql/opt_range.cc
> +++ b/sql/opt_range.cc
> @@ -3322,6 +3322,10 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
>  {
>    Field **field_ptr;
>    TABLE *table= param->table;
> +  partition_info *part_info= NULL;
> +  #ifdef WITH_PARTITION_STORAGE_ENGINE
> +    part_info= table->part_info;
> +  #endif
>    uint parts= 0;
>  
>    for (field_ptr= table->field; *field_ptr; field_ptr++)
> @@ -3329,7 +3333,9 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
>      Column_statistics* col_stats= (*field_ptr)->read_stats;
>      if (bitmap_is_set(used_fields, (*field_ptr)->field_index)
>         && col_stats && !col_stats->no_stat_values_provided()
> -       && !((*field_ptr)->type() == MYSQL_TYPE_GEOMETRY))
> +       && !((*field_ptr)->type() == MYSQL_TYPE_GEOMETRY)
> +       && (!part_info ||
> +        part_info->field_in_partition_expr(*field_ptr)))
>        parts++;
>    }
>  
> @@ -3350,7 +3356,9 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param,
>      if (bitmap_is_set(used_fields, (*field_ptr)->field_index))
>      {
>        Field *field= *field_ptr;
> -      if (field->type() == MYSQL_TYPE_GEOMETRY)
> +      if (field->type() == MYSQL_TYPE_GEOMETRY
> +         || (!part_info ||
> +            part_info->field_in_partition_expr(field)))
>          continue;

This condition is wrong.

    (field->type() == MYSQL_TYPE_GEOMETRY
      || (!part_info ||
         part_info->field_in_partition_expr(field)))
   
   is the same as 

    (field->type() == MYSQL_TYPE_GEOMETRY ||
     !part_info || ...

which means if there is no partitioning, we won't count the selectivity.
Indeed, if I apply this patch, all EITS tests start to fail, selectivity is no
longer counted:

--- /home/psergey/dev-git/10.1/mysql-test/r/selectivity_innodb.result   2018-11-16 14:06:27.231202907 +0300
+++ /home/psergey/dev-git/10.1/mysql-test/r/selectivity_innodb.reject   2018-11-25 01:03:14.661647893 +0300
@@ -28,13 +28,13 @@
 explain extended 
 select * from t1 where a is null;
 id     select_type     table   type    possible_keys   key     key_len ref     rows    filtered        Extra
-1      SIMPLE  t1      ALL     NULL    NULL    NULL    NULL    10      40.00   Using where
+1      SIMPLE  t1      ALL     NULL    NULL    NULL    NULL    10      100.00  Using where
 Warnings:
 Note   1003    select `test`.`t1`.`a` AS `a` from `test`.`t1` where isnull(`test`.`t1`.`a`)
 explain extended 
 select * from t1 where a is not null;
 id     select_type     table   type    possible_keys   key     key_len ref     rows    filtered        Extra
-1      SIMPLE  t1      ALL     NULL    NULL    NULL    NULL    10      60.00   Using where
+1      SIMPLE  t1      ALL     NULL    NULL    NULL    NULL    10      100.00  Using where
 Warnings:

See also the comment below about the return value of field_in_partition_expr.

>  
>        uint16 store_length;
> diff --git a/sql/partition_info.cc b/sql/partition_info.cc
> index 52bda560c1c..0111fc1451d 100644
> --- a/sql/partition_info.cc
> +++ b/sql/partition_info.cc
> @@ -3164,6 +3164,32 @@ void partition_info::print_debug(const char *str, uint *value)
>      DBUG_PRINT("info", ("parser: %s", str));
>    DBUG_VOID_RETURN;
>  }
> +

> +/*
> +
> + Disabling reading EITS statistics for columns involved in the
> + partition list of a table.
> + We assume the selecticivity for such columns would be handled
> + during partition pruning.
> +
> +*/
> +
This comment here doesn't make any sense anymore. Please move it to
create_key_parts_for_pseudo_indexes.

Also, the name and return value of the function is confusing now. If 
one calls the function for the partitioning column:

  field_in_partition_expr(partition_column) == false.

Please change the return value to be true in this case, and false otherwise.

> +bool partition_info::field_in_partition_expr(Field *field) const
> +{
> +  uint i;
> +  for (i= 0; i < num_part_fields; i++)
> +  {
> +    if (field->eq(part_field_array[i]))
> +      return FALSE;
> +  }
> +  for (i= 0; i < num_subpart_fields; i++)
> +  {
> +    if (field->eq(subpart_field_array[i]))
> +      return FALSE;
> +  }
> +  return TRUE;
> +}
> +
>  #else /* WITH_PARTITION_STORAGE_ENGINE */
>   /*
>     For builds without partitioning we need to define these functions
> diff --git a/sql/partition_info.h b/sql/partition_info.h
> index f250c5496bf..10b8954ace7 100644
> --- a/sql/partition_info.h
> +++ b/sql/partition_info.h
> @@ -384,6 +384,7 @@ class partition_info : public Sql_alloc
>    bool is_full_part_expr_in_fields(List<Item> &fields);
>  public:
>    bool has_unique_name(partition_element *element);
> +  bool field_in_partition_expr(Field *field) const;
>  };
>  
>  uint32 get_next_partition_id_range(struct st_partition_iter* part_iter);
> diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
> index cb75a5c2176..02dd4970c99 100644
> --- a/sql/sql_statistics.cc
> +++ b/sql/sql_statistics.cc
> @@ -3589,6 +3589,22 @@ void set_statistics_for_table(THD *thd, TABLE *table)
>      (use_stat_table_mode <= COMPLEMENTARY ||
>       !table->stats_is_read || read_stats->cardinality_is_null) ?
>      table->file->stats.records : read_stats->cardinality;
> +
> +  /*
> +    For partitioned table, EITS statistics is based on data from all partitions.
> +
> +    On the other hand, Partition Pruning figures which partitions will be
> +    accessed and then computes the estimate of rows in used_partitions.
> +
> +    Use the estimate from Partition Pruning as it is typically more precise.
> +    Ideally, EITS should provide per-partition statistics but this is not
> +    implemented currently.
> +  */

Good. 

> +  #ifdef WITH_PARTITION_STORAGE_ENGINE
> +    if (table->part_info)
> +      table->used_stat_records= table->file->stats.records;
> +  #endif
> +
>    KEY *key_info, *key_info_end;
>    for (key_info= table->key_info, key_info_end= key_info+table->s->keys;
>         key_info < key_info_end; key_info++)

-- 
BR
 Sergei
-- 
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog