← Back to team overview

maria-developers team mailing list archive

Igor please review: [Commits] Rev 3464: BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view in file:///home/psergey/dev2/5.3-look60/]

 

Hello Igor, 

Could you please review the below:

----- Forwarded message from Sergey Petrunya <psergey@xxxxxxxxxxxx> -----

From: Sergey Petrunya <psergey@xxxxxxxxxxxx>
To: commits@xxxxxxxxxxx
X-Mailer: mail (GNU Mailutils 1.2)
Date: Sat, 24 Mar 2012 01:40:02 +0400 (GST)
Subject: [Commits] Rev 3464: BUG#951937: Wrong result (missing rows) with
	semijoin+materialization, IN subquery, InnoDB,
	TEMPTABLE view in file:///home/psergey/dev2/5.3-look60/

At file:///home/psergey/dev2/5.3-look60/

------------------------------------------------------------
revno: 3464
revision-id: psergey@xxxxxxxxxxxx-20120323213956-ohxb9jitekne7yya
parent: psergey@xxxxxxxxxxxx-20120321071820-t2azo8xxlm73mezd
committer: Sergey Petrunya <psergey@xxxxxxxxxxxx>
branch nick: 5.3-look60
timestamp: Sat 2012-03-24 01:39:56 +0400
message:
  BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
  - Fix equality propagation to work with SJM nests and OR clauses (full descirption of problem and 
  solution in the comment in the patch)
=== modified file 'mysql-test/r/subselect_sj2.result'
--- a/mysql-test/r/subselect_sj2.result	2012-02-25 00:50:22 +0000
+++ b/mysql-test/r/subselect_sj2.result	2012-03-23 21:39:56 +0000
@@ -903,5 +903,48 @@
 3	1	1
 4	1	1
 DROP TABLE t1,t2;
+#
+# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
+#
+CREATE TABLE t1 (
+a VARCHAR(1),
+b VARCHAR(1) NOT NULL,
+KEY(a)
+) ENGINE=InnoDB;
+INSERT INTO t1 VALUES
+('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
+('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
+('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
+('q','q'),('w','w'),('d','d'),('e','e');
+CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
+SELECT * FROM v1
+WHERE ( a, a ) IN (
+SELECT alias2.b, alias2.a
+FROM t1 AS alias1, t1 AS alias2
+WHERE alias2.b = alias1.a
+AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
+);
+a	b
+j	j
+v	v
+c	c
+m	m
+m	m
+d	d
+d	d
+d	d
+y	y
+t	t
+s	s
+r	r
+b	b
+x	x
+g	g
+p	p
+q	q
+w	w
+e	e
+DROP VIEW v1;
+DROP TABLE t1;
 # This must be the last in the file:
 set optimizer_switch=@subselect_sj2_tmp;

=== modified file 'mysql-test/r/subselect_sj2_jcl6.result'
--- a/mysql-test/r/subselect_sj2_jcl6.result	2012-02-25 00:50:22 +0000
+++ b/mysql-test/r/subselect_sj2_jcl6.result	2012-03-23 21:39:56 +0000
@@ -917,6 +917,49 @@
 3	1	1
 4	1	1
 DROP TABLE t1,t2;
+#
+# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
+#
+CREATE TABLE t1 (
+a VARCHAR(1),
+b VARCHAR(1) NOT NULL,
+KEY(a)
+) ENGINE=InnoDB;
+INSERT INTO t1 VALUES
+('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
+('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
+('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
+('q','q'),('w','w'),('d','d'),('e','e');
+CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
+SELECT * FROM v1
+WHERE ( a, a ) IN (
+SELECT alias2.b, alias2.a
+FROM t1 AS alias1, t1 AS alias2
+WHERE alias2.b = alias1.a
+AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
+);
+a	b
+j	j
+v	v
+c	c
+m	m
+m	m
+d	d
+d	d
+d	d
+y	y
+t	t
+s	s
+r	r
+b	b
+x	x
+g	g
+p	p
+q	q
+w	w
+e	e
+DROP VIEW v1;
+DROP TABLE t1;
 # This must be the last in the file:
 set optimizer_switch=@subselect_sj2_tmp;
 #

=== modified file 'mysql-test/r/subselect_sj2_mat.result'
--- a/mysql-test/r/subselect_sj2_mat.result	2012-02-25 00:50:22 +0000
+++ b/mysql-test/r/subselect_sj2_mat.result	2012-03-23 21:39:56 +0000
@@ -905,6 +905,49 @@
 3	1	1
 4	1	1
 DROP TABLE t1,t2;
+#
+# BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
+#
+CREATE TABLE t1 (
+a VARCHAR(1),
+b VARCHAR(1) NOT NULL,
+KEY(a)
+) ENGINE=InnoDB;
+INSERT INTO t1 VALUES
+('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
+('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
+('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
+('q','q'),('w','w'),('d','d'),('e','e');
+CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
+SELECT * FROM v1
+WHERE ( a, a ) IN (
+SELECT alias2.b, alias2.a
+FROM t1 AS alias1, t1 AS alias2
+WHERE alias2.b = alias1.a
+AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
+);
+a	b
+j	j
+v	v
+c	c
+m	m
+m	m
+d	d
+d	d
+d	d
+y	y
+t	t
+s	s
+r	r
+b	b
+x	x
+g	g
+p	p
+q	q
+w	w
+e	e
+DROP VIEW v1;
+DROP TABLE t1;
 # This must be the last in the file:
 set optimizer_switch=@subselect_sj2_tmp;
 set optimizer_switch=default;

=== modified file 'mysql-test/t/subselect_sj2.test'
--- a/mysql-test/t/subselect_sj2.test	2012-01-19 19:44:43 +0000
+++ b/mysql-test/t/subselect_sj2.test	2012-03-23 21:39:56 +0000
@@ -1090,5 +1090,32 @@
 
 DROP TABLE t1,t2;
 
+--echo #
+--echo # BUG#951937: Wrong result (missing rows) with semijoin+materialization, IN subquery, InnoDB, TEMPTABLE view
+--echo #
+CREATE TABLE t1 (
+  a VARCHAR(1),
+  b VARCHAR(1) NOT NULL,
+  KEY(a)
+) ENGINE=InnoDB;
+INSERT INTO t1 VALUES
+('j','j'),('v','v'),('c','c'),('m','m'),('d','d'),
+('y','y'),('t','t'),('d','d'),('s','s'),('r','r'),
+('m','m'),('b','b'),('x','x'),('g','g'),('p','p'),
+('q','q'),('w','w'),('d','d'),('e','e');
+
+CREATE ALGORITHM=TEMPTABLE VIEW v1 AS SELECT * FROM t1;
+
+# This query returns 6 rows instead of 19
+
+SELECT * FROM v1
+WHERE ( a, a ) IN (
+  SELECT alias2.b, alias2.a
+  FROM t1 AS alias1, t1 AS alias2
+  WHERE alias2.b = alias1.a
+    AND ( alias1.b >= alias1.a OR alias2.b = 'z' )
+);
+DROP VIEW v1;
+DROP TABLE t1;
 --echo # This must be the last in the file:
 set optimizer_switch=@subselect_sj2_tmp;

=== modified file 'sql/item_cmpfunc.cc'
--- a/sql/item_cmpfunc.cc	2012-02-28 14:41:55 +0000
+++ b/sql/item_cmpfunc.cc	2012-03-23 21:39:56 +0000
@@ -5388,7 +5388,7 @@
 */
 
 Item_equal::Item_equal(Item *f1, Item *f2, bool with_const_item)
-  : Item_bool_func(), eval_item(0), cond_false(0)
+  : Item_bool_func(), eval_item(0), cond_false(0), context_field(NULL)
 {
   const_item_cache= 0;
   with_const= with_const_item;
@@ -5411,7 +5411,7 @@
 */
 
 Item_equal::Item_equal(Item_equal *item_equal)
-  : Item_bool_func(), eval_item(0), cond_false(0)
+  : Item_bool_func(), eval_item(0), cond_false(0), context_field(NULL)
 {
   const_item_cache= 0;
   List_iterator_fast<Item> li(item_equal->equal_items);

=== modified file 'sql/item_cmpfunc.h'
--- a/sql/item_cmpfunc.h	2012-01-19 22:11:53 +0000
+++ b/sql/item_cmpfunc.h	2012-03-23 21:39:56 +0000
@@ -1700,9 +1700,16 @@
     as datetimes. The comparator is used only if compare_as_dates=TRUE
   */
   Arg_comparator cmp;
+ 
+  /*
+    For Item_equal objects inside an OR clause: one of the fields that were
+    used in the original comparison.
+  */
+  Item_field *context_field;
 public:
   inline Item_equal()
-    : Item_bool_func(), with_const(FALSE), eval_item(0), cond_false(0)
+    : Item_bool_func(), with_const(FALSE), eval_item(0), cond_false(0),
+      context_field(NULL)
   { const_item_cache=0 ;}
   Item_equal(Item *f1, Item *f2, bool with_const_item);
   Item_equal(Item_equal *item_equal);
@@ -1729,6 +1736,8 @@
   Item *transform(Item_transformer transformer, uchar *arg);
   virtual void print(String *str, enum_query_type query_type);
   CHARSET_INFO *compare_collation();
+
+  void set_context_field(Item_field *ctx_field) { context_field= ctx_field; }
   friend class Item_equal_fields_iterator;
   friend Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
                            Item_equal *item_equal);

=== modified file 'sql/opt_subselect.cc'
--- a/sql/opt_subselect.cc	2012-03-18 19:58:20 +0000
+++ b/sql/opt_subselect.cc	2012-03-23 21:39:56 +0000
@@ -171,6 +171,246 @@
 
 */
 
+/*
+EqualityPropagationAndSjmNests
+******************************
+
+The process of equality propagation has two parts
+P1. Equality propagation 
+P2. Equality substitution [for a certain join order]
+
+The propagation part is not affected by SJM nests. In fact, it is done before
+we determine the execution plan, i.e. before we even know we will use
+SJM-nests for execution.
+
+The substitution part is affected. 
+
+Substitution without SJMs
+=========================
+When one doesn't have SJM nests, tables have a strict join order:
+
+  ---------------------------------> 
+    t1 -- t2 -- t3 -- t4 --- t5 
+
+
+       ?  ^
+           \
+            --(part-of-WHERE)
+
+
+parts WHERE/ON and ref. expressions are attached at some point along the axis.
+Expression is allowed to refer to a table column if the table is to the left of
+the attachment point. For any given expression, we have a goal: 
+
+  "Move leftmost allowed attachment point as much as possible to the left"
+
+Substitution with SJMs - task setting
+=====================================
+
+When SJM nests are present, there is no global strict table ordering anymore:
+
+   
+  ---------------------------------> 
+
+    ot1 -- ot2 --- sjm -- ot4 --- ot5 
+                   |
+                   |                Main execution
+   - - - - - - - - - - - - - - - - - - - - - - - -                 
+                   |                 Materialization
+      it1 -- it2 --/    
+
+
+Besides that, we must take into account that
+ - values for outer table columns, otN.col, are inaccessible at
+   materialization step                                           (SJM-RULE)
+ - values for inner table columns, itN.col, are inaccessible at Main execution
+   step, except for SJ-Materialization-Scan and columns that are in the 
+   subquery's select list.                                        (SJM-RULE)
+
+Substitution with SJMs - solution
+=================================
+
+First, we introduce global strict table ordering like this:
+
+  ot1 - ot2 --\                    /--- ot3 -- ot5 
+               \--- it1 --- it2 --/
+
+Now, let's see how to meet (SJM-RULE).
+
+SJ-Materialization is only applicable for uncorrelated subqueries. From this, it
+follows that any multiple equality will either
+1. include only columns of outer tables, or
+2. include only columns of inner tables, or
+3. include columns of inner and outer tables, joined together through one 
+   of IN-equalities.
+
+Cases #1 and #2 can be handled in the same way as with regular inner joins.
+
+Case #3 requires special handling, so that we don't construct violations of
+(SJM-RULE). Let's consider possible ways to build violations.
+
+Equality propagation starts with the clause in this form
+
+   top_query_where AND subquery_where AND in_equalities
+
+First, it builds multi-equalities. It can also build a mixed multi-equality
+
+  multiple-equal(ot1.col, ot2.col, ... it1.col, itN.col) 
+
+Multi-equalities are pushed down the OR-clauses in top_query_where and in
+subquery_where, so it's possible that clauses like this one are built:
+
+   subquery_cond OR (multiple-equal(it1.col, ot1.col,...) AND ...)
+   ^^^^^^^^^^^^^                                 \
+         |                                        this must be evaluated
+         \- can only be invaluated                on the main phase.
+            on materialization phase
+
+Finally, equality substitution is started. It does two operations:
+
+
+1. Field reference substitution 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+(In the code, this is Item_field::replace_equal_field)
+
+This is a process of replacing each reference to "tblX.col" 
+with the first element of the multi-equality.          (REF-SUBST-ORIG)
+
+This behaviour can cause problems with Semi-join nests. Suppose, we have a
+condition: 
+
+  func(it1.col, it2.col)
+
+and a multi-equality(ot1.col, it1.col). Then, reference to "it1.col" will be 
+replaced with "ot1.col", constructing a condition
+   
+   func(ot1.col, it2.col)
+
+which will be a vioation of (SJM-RULE).
+
+In order to avoid this, (REF-SUBST-ORIG) is amended as follows: 
+
+- references to tables "itX.col" that are inner wrt some SJM nest, are
+  replaced with references to the first inner table from the same SJM nest.
+
+- references to top-level tables "otX.col" are replaced with references to
+  the first element of the multi-equality, no matter if that first element is
+  a column of a top-level table or of table from some SJM nest.
+                                                              (REF-SUBST-SJM)
+
+  The case where the first element is a table from an SJM nest $SJM is ok, 
+  because it can be proven that $SJM uses SJ-Materialization-Scan, and 
+  "unpacks" correct column values to the first element during the main
+  execution phase.
+
+2. Item_equal elimination
+~~~~~~~~~~~~~~~~~~~~~~~~~
+(In the code: eliminate_item_equal) This is a process of taking 
+
+  multiple-equal(a,b,c,d,e)
+
+and replacing it with an equivalent expression which is an AND of pair-wise 
+equalities:
+
+  a=b AND a=c AND ...
+
+The equalities are picked such that for any given join prefix (t1,t2...) the
+subset of equalities that can be evaluated gives the most restrictive
+filtering. 
+
+Without SJM nests, it is sufficient to compare every multi-equality member
+with the first one:
+
+  elem1=elem2 AND elem1=elem3 AND elem1=elem4 ... 
+
+When SJM nests are present, we should take care not to construct equalities
+that violate the (SJM-RULE). This is achieved by generating separate sets of
+equalites for top-level tables and for inner tables. That is, for the join
+order 
+
+  ot1 - ot2 --\                    /--- ot3 -- ot5 
+               \--- it1 --- it2 --/
+
+we will generate
+   ot1.col=ot2.col
+   ot1.col=ot3.col
+   ot1.col=ot5.col
+   it2.col=it1.col
+
+
+2.1 The problem with Item_equals and ORs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+As has been mentioned above, multiple equalities are pushed down into OR
+clauses, possibly building clauses like this:
+
+   func(it.col2) OR multiple-equal(it1.col1, it1.col2, ot1.col)      (1)
+
+where the first part of the clause has references to inner tables, while the
+second has references to the top-level tables, which is a violation of
+(SJM-RULE).
+
+AND-clauses of this kind do not create problems, because make_cond_for_table()
+will take them apart. OR-clauses will not be split. It is possible to
+split-out the part that's dependent on the inner table:
+
+   func(it.col2) OR it1.col1=it1.col2
+
+but this is a less-restrictive condition than condition (1). Current execution
+scheme will still try to generate the "remainder" condition:
+
+   func(it.col2) OR it1.col1=ot1.col
+
+which is a violation of (SJM-RULE).
+
+QQ: "ot1.col=it1.col" is checked at the upper level. Why was it not removed
+here?
+AA: because has a proper subset of conditions that are found on this level.
+    consider a join order of  ot, sjm(it)
+    and a condition
+      ot.col=it.col AND ( ot.col=it.col='foo' OR it.col2='bar')
+
+    we will produce: 
+       table ot:  nothing
+       table it:  ot.col=it.col AND (ot.col='foo' OR it.col2='bar')
+                                     ^^^^        ^^^^^^^^^^^^^^^^       
+                                      |          \ the problem is that 
+                                      |            this part condition didnt
+                                      |            receive a substitution
+                                      |
+                                      +--- it was correct to subst, 'ot' is 
+                                           the left-most.
+
+
+Is there any merit in pushing inner=outer down into ORs?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Yes, there is. Consider the query:
+
+  select * from ot 
+  where ot.col in (select it.col from it where (it.col='foo' OR it.col='bar'))
+
+here, it may be useful to infer that 
+
+   (ot.col='foo' OR ot.col='bar')       (CASE-FOR-SUBST)
+
+and attach that condition to the table 'ot'.
+
+Solution #1
+~~~~~~~~~~~
+Let make_cond_for_table() chop off the bad parts of produced OR clause. If it
+has produced an OR clause:
+- check if all of its branches are evaluable for the context join_tab
+- If some aren't, destroy the OR clause
+
+Solution #2
+~~~~~~~~~~~
+Before the equality propagation phase, none of the OR clauses violate the
+(SJM-RULE). This way, if we remember which tables the original equality
+referred to, we can only generate equalities that refer to outer or inner
+tables. Note that this will disallow handling of cases like (CASE-FOR-SUBST).
+*/
+
 
 static
 bool subquery_types_allow_materialization(Item_in_subselect *in_subs);

=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc	2012-03-13 20:49:18 +0000
+++ b/sql/sql_select.cc	2012-03-23 21:39:56 +0000
@@ -10666,6 +10666,9 @@
     acceptable, as this happens rarely. The implementation without
     copying would be much more complicated.
 
+    For description of how equality propagation works with SJM nests, grep 
+    for EqualityPropagationAndSjmNests.
+
   @param left_item   left term of the quality to be checked
   @param right_item  right term of the equality to be checked
   @param item        equality item if the equality originates from a condition
@@ -10739,12 +10742,14 @@
     {
       /* left_item_equal of an upper level contains left_item */
       left_item_equal= new Item_equal(left_item_equal);
+      left_item_equal->set_context_field(((Item_field*) left_item));
       cond_equal->current_level.push_back(left_item_equal);
     }
     if (right_copyfl)
     {
       /* right_item_equal of an upper level contains right_item */
       right_item_equal= new Item_equal(right_item_equal);
+      right_item_equal->set_context_field(((Item_field*) right_item));
       cond_equal->current_level.push_back(right_item_equal);
     }
 
@@ -10830,6 +10835,7 @@
       {
         item_equal= new Item_equal(item_equal);
         cond_equal->current_level.push_back(item_equal);
+        item_equal->set_context_field(field_item);
       }
       if (item_equal)
       {
@@ -11480,6 +11486,8 @@
         Item_equal::get_first() also takes similar measures for dealing with
         equality substitution in presense of SJM nests.
 
+    Grep for EqualityPropagationAndSjmNests for a more verbose description.
+
   @return
     - The condition with generated simple equalities or
     a pointer to the simple generated equality, if success.
@@ -11543,9 +11551,13 @@
       on upper AND-levels.
     */
     if (upper)
-    { 
+    {
+      TABLE_LIST *native_sjm= embedding_sjm(item_equal->context_field);
       if (item_const && upper->get_const())
+      {
+        /* Upper item also has "field_item=const". Don't produce equality here */
         item= 0;
+      }
       else
       {
         Item_equal_fields_iterator li(*item_equal);
@@ -11556,6 +11568,8 @@
             break;
         }
       }
+      if (embedding_sjm(field_item) != native_sjm)
+        item= NULL; /* Don't produce equality */
     }
     
     bool produce_equality= test(item == field_item);

_______________________________________________
commits mailing list
commits@xxxxxxxxxxx
https://lists.askmonty.org/cgi-bin/mailman/listinfo/commits

----- End forwarded message -----

-- 
BR
 Sergei
-- 
Sergei Petrunia, Software Developer
Monty Program AB, http://askmonty.org
Blog: http://s.petrunia.net/blog


Follow ups