← Back to team overview

maria-developers team mailing list archive

FW: Rev 3758: MDEV-5470: Cost-based subquery item pushdown must take into account semi-joins

 

Hi Timour,

Please find below a patch that should implement execution-time protection
against malfunction between condition moving and semi-joins.

There needs also to be an optimization-time counterpart, which should have the
same logic. Will you code it yourself or want me to?

----- Forwarded message from Sergei Petrunia <psergey@xxxxxxxxxxxx> -----

Date: Thu, 19 Dec 2013 02:49:22 +0400
From: Sergei Petrunia <psergey@xxxxxxxxxxxx>
To: commits@xxxxxxxxxxx
Subject: Rev 3758: MDEV-5470: Cost-based subquery item pushdown must take into account semi-joins

At http://bazaar.launchpad.net/~maria-captains/maria/10.0-mdev83
 
------------------------------------------------------------

revno: 3758
committer: Sergey Petrunya <psergey@xxxxxxxxxxxx>
branch nick: 10.0-mdev83
timestamp: Thu 2013-12-19 06:50:28 +0400
message:
  MDEV-5470: Cost-based subquery item pushdown must take into account semi-joins
  - When moving expensive conditions, take into account that semi-joins limit
    where they can be moved.
  - This is only execution part in JOIN::setup_dynamic_conditions(). Optimization
    part is not yet present.
  - No testcases yet.
  - Moving inside SJ-Materialization nests doesn't seem to be supported (either 
    before this patch or after. This patch itself does take SJ-Materialization 
    into account) 

=== modified file 'sql/item_cmpfunc.cc'
--- sql/item_cmpfunc.cc	2013-11-01 11:10:38 +0000
+++ sql/item_cmpfunc.cc	2013-12-19 02:50:28 +0000
@@ -6591,6 +6591,8 @@ void Item_func_collect_stats::dbug_print
 }
 #endif /*DBUG_OFF*/
 
+JOIN_TAB *find_last_tab_for_dynamic_cond(table_map cond_tables,
+                                         JOIN *join, JOIN_TAB *join_tab);
 
 Item_func_dynamic_cond::Item_func_dynamic_cond(Item *cond, JOIN_TAB *atab, JOIN_TAB **ctab)
   : Item_func_collect_stats(cond)
@@ -6624,13 +6626,15 @@ Item_func_dynamic_cond::Item_func_dynami
   if (active_tab->bush_root_tab)
   {
     first_tab= active_tab->bush_root_tab->bush_children->start;
-    last_tab= active_tab->bush_root_tab->bush_children->end;
+    //last_tab= active_tab->bush_root_tab->bush_children->end;
   }
   else
   {
     first_tab= join->join_tab + join->const_tables;
-    last_tab= join->join_tab + join->top_join_tab_count;
+    //last_tab= join->join_tab + join->top_join_tab_count;
   }
+  last_tab= find_last_tab_for_dynamic_cond(cond->used_tables(), 
+                                           active_tab->join, active_tab);
 
   DBUG_ASSERT(active_tab >= first_tab && active_tab < last_tab);
 

=== modified file 'sql/item_cmpfunc.h'
--- sql/item_cmpfunc.h	2013-10-18 19:56:55 +0000
+++ sql/item_cmpfunc.h	2013-12-19 02:50:28 +0000
@@ -561,14 +561,20 @@ class Item_func_collect_stats : public I
 
 class Item_func_dynamic_cond : public Item_func_collect_stats
 {
-protected:
+protected: // as if there were any ancestor classes?
   List<Item_subselect> subqueries;
   /* The join_tab where the condition is currently 'pushed'.*/
   st_join_table *active_tab;
   /* The join_tab currently being executed. */
   st_join_table **exec_tab;
   /* The boundaries of the sub-plan where this predicate can be pushed. */
-  st_join_table *first_tab, *last_tab;
+  st_join_table *first_tab;
+  st_join_table *last_tab;  //psergey-todo: this should point to the last tab.
+                            // conditions depending on semi-join tables have 
+                            // limits re. how far they can be moved.
+public:
+  inline st_join_table * get_last_tab() { return last_tab; }
+protected:
   /*
     The relative access cost based on optimizer estimates. This is the cost
     of all access methods per one record from the first table in the plan.

=== modified file 'sql/opt_subselect.cc'
--- sql/opt_subselect.cc	2013-11-04 15:56:09 +0000
+++ sql/opt_subselect.cc	2013-12-19 02:50:28 +0000
@@ -1480,6 +1480,7 @@ static bool convert_subq_to_sj(JOIN *par
   sj_nest->sj_subq_pred= subq_pred;
   sj_nest->original_subq_pred_used_tables= subq_pred->used_tables() |
                                            subq_pred->left_expr->used_tables();
+  sj_nest->sj_strategys_join_tab= uint(-1);
   /* Nests do not participate in those 'chains', so: */
   /* sj_nest->next_leaf= sj_nest->next_local= sj_nest->next_global == NULL*/
   emb_join_list->push_back(sj_nest);
@@ -4321,6 +4322,21 @@ int init_dups_weedout(JOIN *join, uint f
 
 
 /*
+  For each SJ-inner table in [start_tab; end_tab], set emb_sj_nest to idx
+*/
+static void update_sj_strategys_join_tab(JOIN_TAB *start_tab, 
+                                         JOIN_TAB *end_tab,
+                                         uint idx)
+{
+  for (JOIN_TAB *j= start_tab; j < end_tab; j++)
+  {
+    TABLE_LIST *emb_nest;
+    if ((emb_nest= j->emb_sj_nest))
+      emb_nest->sj_strategys_join_tab= idx;
+  }
+}
+
+/*
   Setup the strategies to eliminate semi-join duplicates.
   
   SYNOPSIS
@@ -4448,6 +4464,10 @@ int setup_semijoin_dups_elimination(JOIN
 
         for (uint j= i; j < i + pos->n_sj_tables; j++)
           join->join_tab[j].inside_loosescan_range= TRUE;
+        
+        update_sj_strategys_join_tab(join->join_tab + i, 
+                                     join->join_tab + i + pos->n_sj_tables,
+                                     i + pos->n_sj_tables - 1);
 
         /* Calculate key length */
         keylen= 0;
@@ -4461,6 +4481,7 @@ int setup_semijoin_dups_elimination(JOIN
           tab[pos->n_sj_tables - 1].do_firstmatch= tab;
         i+= pos->n_sj_tables;
         pos+= pos->n_sj_tables;
+
         break;
       }
       case SJ_OPT_DUPS_WEEDOUT:
@@ -4503,6 +4524,9 @@ int setup_semijoin_dups_elimination(JOIN
             break;
           }
         }
+        update_sj_strategys_join_tab(join->join_tab + i, 
+                                     join->join_tab + i + pos->n_sj_tables,
+                                     i + pos->n_sj_tables - 1);
 
         init_dups_weedout(join, first_table, i, i + pos->n_sj_tables - first_table);
         i+= pos->n_sj_tables;
@@ -4558,6 +4582,9 @@ int setup_semijoin_dups_elimination(JOIN
           }
         }
         j[-1].do_firstmatch= jump_to;
+        update_sj_strategys_join_tab(join->join_tab + i, 
+                                     join->join_tab + i + pos->n_sj_tables,
+                                     i + pos->n_sj_tables - 1);
         i+= pos->n_sj_tables;
         pos+= pos->n_sj_tables;
 

=== modified file 'sql/sql_select.cc'
--- sql/sql_select.cc	2013-11-08 12:36:02 +0000
+++ sql/sql_select.cc	2013-12-19 02:50:28 +0000
@@ -24608,8 +24608,8 @@ double JOIN::static_pushdown_cost(uint i
   @param pred      the predicate to be wrapped in Item_func_dynamic_cond
   @param init_tab  the initial 'active' JOIN_TAB where the dynamic predicate
                    is pushed to
-  @param cur_tab   pointer to the JOIN_TAB* that stores the current JOIN_TAB
-                   during execution
+  @param active_tab_ptr   pointer to the JOIN_TAB* that stores the current JOIN_TAB
+                          during execution
 */
 static Item_func_collect_stats *
 wrap_with_dynamic_conds(Item *pred, JOIN_TAB *init_tab, JOIN_TAB **active_tab_ptr,
@@ -24674,6 +24674,81 @@ wrap_with_dynamic_conds(Item *pred, JOIN
 }
 
 
+/*
+  @brief Find the last JOIN_TAB that a condition can be attached.
+  
+
+  @param cond_tables  used_tables() of the condition that we intend to move.
+  @param join_tab     The left-most join tab where the condition can be
+                      attached (in other words, default attachment point)
+
+  @detail
+  In a regular inner join, one can take a condition that is attached to a
+  table $T, and move it to any table that comes after $T.
+
+  When semi-joins are present, this is no longer true. Detailed explanation
+  can be found in maria-developers@, email "Movable conditions and semi-joins".
+  In short:
+
+  - Inside an SJ-Materialization nest, a condition can be moved to any table to
+    the right, but it must stay within the nest. (We don't support nested 
+    semi-joins, so one will not find semi-joins inside an SJ-M nest)
+
+  - All other Semi-Join strategies have a "range" where they apply. If a
+    condition depends on a table that is within a semi-join, it must not be
+    moved over the right edge of the range.
+*/
+
+JOIN_TAB *find_last_tab_for_dynamic_cond(table_map cond_tables,
+                                         JOIN *join, JOIN_TAB *join_tab)
+{
+  if (join_tab->bush_root_tab)
+  {
+    /* 
+      We are inside SJ-Materialization nest. There are no semi-joins here, 
+      return the last tab in the nest.
+    */
+    return join_tab->bush_root_tab->bush_children->end;
+  }
+  
+  /* Start from the right-most bound */
+  uint last_tab = join->top_join_tab_count;
+
+  cond_tables= cond_tables & ~(PSEUDO_TABLE_BITS | join->const_table_map);
+  /*
+    Walk through semi-joins that this table depends on, and check their last
+    join tabs. The smallest last join tab sets the limit about how far right
+    the condition can be moved.
+  */
+  Table_map_iterator tm_it(cond_tables);
+  int tableno;
+  while ((tableno = tm_it.next_bit()) != Table_map_iterator::BITMAP_END)
+  {
+    TABLE_LIST *sj_nest;
+    if ((sj_nest= join->map2table[tableno]->emb_sj_nest))
+    {
+      if (sj_nest->is_active_sjm())
+      {
+        /*
+          Whoa this condition also depends on a table from within
+          SJ-Materialization nest. (Can happen possible because of equality
+          propagation/substitution).  If we didn't return earlier in this
+          function, we're not inside a semi-join nest.  A value that depends
+          on a table within a SJ-Materialization nest means that it depends on
+          the materialized table columns.
+        */
+        continue;
+      }
+
+      if (sj_nest->sj_strategys_join_tab != uint(-1) && 
+          sj_nest->sj_strategys_join_tab < last_tab)
+        last_tab= sj_nest->sj_strategys_join_tab;
+    }
+  }
+  return join->join_tab + last_tab;
+}
+
+
 /**
   Make expensive subqueries dynamically pushdownable.
 */
@@ -24703,7 +24778,12 @@ bool JOIN::setup_dynamic_conditions()
     Update the least cardinality join_tab for each tab for the chosen join order.
     Check if there are any conditions at all that need to be made dynamic.
   */
-  min_tab= NULL;
+  min_tab= NULL; 
+  /*
+    psergey-todo: the following loop should walk inside SJ-Materialization
+    nests, too. Maybe, this whole function should be invoked for each
+    SJ-Materilization nest.
+  */
   for (tab= last_tab - 1; tab >= first_tab; tab--)
   {
     if (optimizer_flag(thd, OPTIMIZER_SWITCH_STATIC_COND_PUSHDOWN))
@@ -24746,7 +24826,8 @@ bool JOIN::setup_dynamic_conditions()
   */
   for (tab= first_tab; tab < last_tab; tab++)
   {
-    Item_func_collect_stats *wrapped_select_cond= NULL, *cur_wrapped_cond;
+    Item_func_collect_stats *wrapped_select_cond= NULL;
+    Item_func_dynamic_cond *cur_wrapped_cond;
 
     if (tab->select_cond &&
         !(wrapped_select_cond=
@@ -24800,14 +24881,29 @@ bool JOIN::setup_dynamic_conditions()
       Add all dynamic conditions pushed to previous JOIN_TABs also to the
       current JOIN_TAB. This allows to "move" a dynamic condition
       from one tab to another by enabling it for the corrseponding tab.
+
+      psergey-todo: conditions have limits on how far to the right they can be
+      moved (Item_func_dynamic_cond::last_tab).  Don't attach conditions to
+      where they can't be moved.
     */
     if (!prev_dynamic_conds.is_empty())
     {
-      List_iterator_fast<Item_func_dynamic_cond> dyn_cond_it(prev_dynamic_conds);
-      if (!wrapped_select_cond)
-        wrapped_select_cond= dyn_cond_it++;
+      List_iterator<Item_func_dynamic_cond> dyn_cond_it(prev_dynamic_conds);
       while ((cur_wrapped_cond= dyn_cond_it++))
-        wrapped_select_cond->add_pred(cur_wrapped_cond);
+      {
+        if (!wrapped_select_cond)
+          wrapped_select_cond= cur_wrapped_cond;
+        else
+          wrapped_select_cond->add_pred(cur_wrapped_cond);
+        
+        /*
+          psergey-todo: conditions have limits on how far to the right they can be
+          moved (Item_func_dynamic_cond::last_tab).  Don't attach conditions to
+          where they can't be moved.
+        */
+        if (cur_wrapped_cond->get_last_tab() == tab)
+          dyn_cond_it.remove();
+      }
     }
     prev_dynamic_conds.concat(&cur_dynamic_conds);
     cur_dynamic_conds.empty();

=== modified file 'sql/table.h'
--- sql/table.h	2013-09-25 18:07:06 +0000
+++ sql/table.h	2013-12-19 02:50:28 +0000
@@ -1721,6 +1721,9 @@ struct TABLE_LIST
   table_map     sj_inner_tables;
   /* Number of IN-compared expressions */
   uint          sj_in_exprs;
+
+  //psergey-todo: add the index of last table in the join order
+  uint          sj_strategys_join_tab;
   
   /* If this is a non-jtbm semi-join nest: corresponding subselect predicate */
   Item_in_subselect  *sj_subq_pred;


----- End forwarded message -----

-- 
BR
 Sergei
-- 
Sergei Petrunia, Software Developer
MariaDB | Skype: sergefp | Blog: http://s.petrunia.net/blog