maria-developers team mailing list archive

Thread
Date
Re: [Commits] 2950feb5c09: MDEV-9959: A serious MariaDB server performance bug

To: maria-developers@xxxxxxxxxxxxxxxxxxx, Varun <varunraiko1803@xxxxxxxxx>
From: Sergey Petrunia <sergey@xxxxxxxxxxx>
Date: Mon, 26 Mar 2018 12:41:15 +0300
Cc: commits@xxxxxxxxxxx
In-reply-to: <20180323114758.2167B101B09AA1@Varuns-MacBook-Pro.local>
User-agent: Mutt/1.5.21 (2010-09-15)
Hi Varun,

Please find my feedback below.

As disucssed before: please also run the MDEV's test case with this patch and
post a comment with the result.

On Fri, Mar 23, 2018 at 05:17:57PM +0530, Varun wrote:
> revision-id: 2950feb5c09033843a6a32c7431421b45d5812dd (mariadb-10.3.0-646-g2950feb5c09)
> parent(s): 9ef7ab221d70630155206910de8a6053db097164
> author: Varun Gupta
> committer: Varun Gupta
> timestamp: 2018-03-23 17:17:09 +0530
> message:
> 
> MDEV-9959: A serious MariaDB server performance bug
> 
> step#1: if a derived table has SELECT DISTINCT, provide index statistics for it so that the join optimizer in the
> upper select knows that ref access to the table will produce one row.
> 
> Added handling for multiple selects in the derived table
> 
> ---
>  mysql-test/r/mdev9959.result | 64 ++++++++++++++++++++++++++++++++++++++++++++
>  mysql-test/t/mdev9959.test   | 28 +++++++++++++++++++
>  sql/sql_lex.h                |  2 ++
>  sql/sql_union.cc             | 36 +++++++++++++++++++++++++
>  sql/table.cc                 | 26 +++++++++++++++---
>  5 files changed, 153 insertions(+), 3 deletions(-)
> 
> diff --git a/mysql-test/r/mdev9959.result b/mysql-test/r/mdev9959.result
> new file mode 100644
> index 00000000000..5269ba9b734
> --- /dev/null
> +++ b/mysql-test/r/mdev9959.result
> @@ -0,0 +1,64 @@
> +create table t1(a int);
> +insert into t1 values (1),(2),(3),(4),(5),(6);
> +create table t2(a int, b int,c int);
> +insert into t2(a,b,c) values (1,1,2),(2,2,3),(3,1,4),(4,2,2),(5,1,1),(6,2,5);
> +create table t3(a int, b int);
> +insert into t3(a,b) values (1,1),(2,2),(2,1),(1,2),(5,1),(9,2);
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
> +1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using where
> +1	PRIMARY	<derived2>	ref	key0	key0	5	test.t1.a	1	0.83	100.00	100.00	
> +2	DERIVED	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort

It is not at all clear which parts of the above output matter and which don't,
for example, does #rows matter for table t1?

Please add a comment (into .test file with --echo # ) describing that table
"<derived2>" should have type=ref and rows=1 

> +select * from t1 , ( (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +a	a
> +1	1
> +2	2
> +3	3
> +5	5
> +6	6
> +analyze select * from t1 , ( (select t2.a from t2 group by c))q  where t1.a=q.a;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
> +1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using where
> +1	PRIMARY	<derived2>	ref	key0	key0	5	test.t1.a	2	0.83	100.00	100.00	
> +2	DERIVED	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort
> +select * from t1 , ( (select t2.a from t2 group by c))q  where t1.a=q.a;
> +a	a
> +1	1
> +2	2
> +3	3
> +5	5
> +6	6
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
> +1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using where
> +1	PRIMARY	<derived2>	ref	key0	key0	5	test.t1.a	2	1.67	100.00	100.00	
> +2	DERIVED	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort
> +3	UNION	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort

What does the above test show?
We've got type=ref, #rows = 2 for the derived table.
The estimate is correct but where does it come from? 
As far as I understand this patch, it only sets the estimate rows=1 for derived
tables.  How do we arrive at the value of 2? 

(If we would have gotten rows=2 without this patch, what's the point of having
the testcase as part of this patch?)

> +select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +a	a
> +1	1
> +1	1
> +2	2
> +2	2
> +3	3
> +3	3
> +5	5
> +5	5
> +6	6
> +6	6
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q  where t1.a=q.a;
> +id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
> +1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using where
> +1	PRIMARY	<derived2>	ref	key0	key0	5	test.t1.a	2	0.83	100.00	100.00	
> +2	DERIVED	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort
> +3	UNION	t2	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort
> +4	UNION	t3	ALL	NULL	NULL	NULL	NULL	6	6.00	100.00	100.00	Using temporary; Using filesort
> +NULL	UNION RESULT	<union2,3,4>	ALL	NULL	NULL	NULL	NULL	NULL	5.00	NULL	NULL	
> +select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q  where t1.a=q.a;
> +a	a
> +1	1
> +2	2
> +3	3
> +5	5
> +6	6
> +drop table t1,t2,t3;
> diff --git a/mysql-test/t/mdev9959.test b/mysql-test/t/mdev9959.test
> new file mode 100644
> index 00000000000..77970f363ac
> --- /dev/null
> +++ b/mysql-test/t/mdev9959.test
> @@ -0,0 +1,28 @@
> +create table t1(a int);
> +insert into t1 values (1),(2),(3),(4),(5),(6);
> +create table t2(a int, b int,c int);
> +insert into t2(a,b,c) values (1,1,2),(2,2,3),(3,1,4),(4,2,2),(5,1,1),(6,2,5);
> +create table t3(a int, b int);
> +insert into t3(a,b) values (1,1),(2,2),(2,1),(1,2),(5,1),(9,2);
> +
> +# one select in derived table
> +
> +#with distinct
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c))q  where t1.a=q.a; 
> +select * from t1 , ( (select distinct t2.a from t2 group by c))q  where t1.a=q.a; 
> +# without distinct
> +analyze select * from t1 , ( (select t2.a from t2 group by c))q  where t1.a=q.a; 
> +select * from t1 , ( (select t2.a from t2 group by c))q  where t1.a=q.a; 
> +
> +# multiple selects in derived table
> +
> +#UNION ALL
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q  where t1.a=q.a;
> +
> +#UNION
> +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q  where t1.a=q.a;
> +select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q  where t1.a=q.a;
> +
> +drop table t1,t2,t3;
> +
> diff --git a/sql/sql_lex.h b/sql/sql_lex.h
> index f0241a32acf..4128c1dd300 100644
> --- a/sql/sql_lex.h
> +++ b/sql/sql_lex.h
> @@ -761,6 +761,7 @@ class st_select_lex_unit: public st_select_lex_node {
>    Procedure *last_procedure;	 /* Pointer to procedure, if such exists */
>  
>    bool columns_are_renamed;
> +  bool union_all;           /* TRUE if we have UNION ALL operation */

This variable is checked but it is never set?

>  
>    void init_query();
>    st_select_lex* outer_select();
> @@ -800,6 +801,7 @@ class st_select_lex_unit: public st_select_lex_node {
>    bool union_needs_tmp_table();
>  
>    void set_unique_exclude();
> +  bool check_distinct_in_union();
>  
>    friend struct LEX;
>    friend int subselect_union_engine::exec();
> diff --git a/sql/sql_union.cc b/sql/sql_union.cc
> index 857c9a117f5..43ce0418fe5 100644
> --- a/sql/sql_union.cc
> +++ b/sql/sql_union.cc
> @@ -1960,3 +1960,39 @@ void st_select_lex_unit::set_unique_exclude()
>      }
>    }
>  }
> +
> +/*
> +  Check if the selects in the derived table can give distinct rows

I think "can" is the wrong word here. UNION/INTERSECT/EXCEPT output "can" 
be distinct if the underlying selects produce distinct sets of rows.

What we are checking is whether the output will be distinct regardless of what
data is there in the queried tables.

> +
> +  for example:
> +  select * from 
> +  ((select t1.a from t1) op (select t2.a from t2) op (select t3.a from t3));
> +  the op here being UNION/INTERSECT/EXCEPT
> +
> +  so this function would check if the derived table like the case above
> +  can give distinct rows or not.

Please describe the criteria and provide a couple of examples (either here or
in the comments inside the function).

> +  
> +  @retval false Distinct rows are not guarenteed
> +  @retval true  Distinct rows are guanrenteed
Typos in the above lines.

> +  
> +*/
> +
> +bool st_select_lex_unit::check_distinct_in_union()
> +{
> +  bool is_intersect_present=FALSE;
> +  st_select_lex* first= first_select();
> +  for(st_select_lex *sl=first; sl ; sl=sl->next_select())
> +    is_intersect_present|= (sl->linkage == INTERSECT_TYPE);
> +      
> +  if (!union_all)
> +    return true;
> +  else
> +  {
> +    if (union_select)
> +    {
> +      if (!is_intersect_present && !union_select->next_select())
> +        return true;
> +    }
> +  }
> +  return false;
> +}
> diff --git a/sql/table.cc b/sql/table.cc
> index f42455a8413..c474d2ed4d7 100644
> --- a/sql/table.cc
> +++ b/sql/table.cc
> @@ -7113,9 +7113,29 @@ bool TABLE::add_tmp_key(uint key, uint key_parts,
>    if (!keyinfo->rec_per_key)
>      return TRUE;
>    bzero(keyinfo->rec_per_key, sizeof(ulong)*key_parts);
> -  st_select_lex* lex= pos_in_table_list ? pos_in_table_list->derived->first_select(): NULL;
> -  if (lex && (lex->options & SELECT_DISTINCT))
> -    keyinfo->rec_per_key[key_parts-1]=1;
> +  
> +  /*
> +    For the case when there is a derived table that would give distinct rows,
> +    the index statistics are passed to the join optimizer to tell that
> +    a ref access to the derived table will produce only one row.
> +  */
> +
> +  st_select_lex_unit* derived= pos_in_table_list ? pos_in_table_list->derived: NULL;
> +  if (derived)
> +  {
> +    /*
> +      This handles the case when we have a single select in the derived table
> +    */
> +    st_select_lex* first= derived->first_select();
> +    if (first && !first->is_part_of_union() && 
> +        first->options & SELECT_DISTINCT)
> +      keyinfo->rec_per_key[key_parts-1]=1;
> +    else
> +    {
> +      if (check_distinct_in_union())
> +        keyinfo->rec_per_key[key_parts-1]=1;
> +    }

I think it would be better if there was a single "if", like so:

  if ((this is a non-union select with SELECT_DISTINCT || 
      check_distinct_in_union())
  {
    keyinfo->rec_per_key[key_parts-1]=1;
  }

> +  }
>    keyinfo->read_stats= NULL;
>    keyinfo->collected_stats= NULL;
>  

BR
 Sergei
-- 
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog