← Back to team overview

maria-developers team mailing list archive

review: part 1 of MWL#90 combined patch for review

 

Hi!

Here is part 1 of the review; part 2 should come later today.

>>>>> "Sergey" == Sergey Petrunya <psergey@xxxxxxxxxxxx> writes:



Sergey> On Tue, Feb 22, 2011 at 05:24:37PM +0300, Sergey Petrunya wrote:
>> Hi Monty,
>> 
>> Please find attached the complete patch of MWL#90 for review.  The tree with
>> the code is on launchpad/buildbot:
>> https://code.launchpad.net/~maria-captains/maria/5.3-subqueries-mwl90
>> http://buildbot.askmonty.org/buildbot/grid?branch=5.3-subqueries-mwl90

<cut>

> +++ maria-5.3-subqueries-r36-noc/mysql-test/include/mix1.inc	2011-02-16 14:42:52.000000000 +0300
> @@ -1532,6 +1532,12 @@
>  
>  SELECT 1 FROM (SELECT COUNT(DISTINCT c1) 
>                   FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x;
> +--echo # MariaDB note:
> +--echo #  This will show 2 for table which has 5 rows. 
> +--echo #  This is because the access method employed is actually range access
> +--echo #  which scans 2 records (yes, EXPLAIN displays it incorrectly).
> +--echo #  our correct printing is an artifact of changing in select_describe()
> +--echo #  from printing table->starts.records() to tab->records.

Please remove comment as this is not an issue anymore.
 
> diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.cc maria-5.3-subqueries-r36-noc/sql/item_subselect.cc
> 
> @@ -4067,6 +4078,17 @@
>  }
>  
>  
> +int subselect_hash_sj_engine::optimize()
> +{
> +  int res= 0;

Don't reset res

> +  SELECT_LEX *save_select= thd->lex->current_select;
> +  thd->lex->current_select= materialize_join->select_lex;
> +  res= materialize_join->optimize();
> +  thd->lex->current_select= save_select;
> +
> +  return res;
> +}
> +

> diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.h maria-5.3-subqueries-r36-noc/sql/item_subselect.h
> --- 5.3-noc/sql/item_subselect.h	2011-01-27 21:39:33.000000000 +0300
> +++ maria-5.3-subqueries-r36-noc/sql/item_subselect.h	2011-02-16 14:42:52.000000000 +0300
> @@ -46,16 +46,17 @@
>            < child_join->prepare
>          < engine->prepare
>          *ref= substitution;
> +        substitution= NULL;
>        < Item_subselect::fix_fields
>    */
> -  Item *substitution;
>  public:
> +  Item *substitution;
>    /* unit of subquery */
>    st_select_lex_unit *unit;
> -protected:
>    Item *expr_cache;
>    /* engine that perform execution of subselect (single select or union) */
>    subselect_engine *engine;
> +protected:
>    /* old engine if engine was changed */
>    subselect_engine *old_engine;
>    /* cache of used external tables */
> @@ -148,6 +149,7 @@
>    bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item);
>    void fix_after_pullout(st_select_lex *new_parent, Item **ref);
>    void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
> +  virtual int optimize();
>    virtual bool exec();
>    virtual void fix_length_and_dec();
>    table_map used_tables() const;
> @@ -351,7 +353,9 @@
>      all JOIN in UNION
>    */
>    Item *expr;
> +public:
>    Item_in_optimizer *optimizer;
> +protected:

Move the above to after Item *substitions; No reason to have many
:public :protected blocks

> @@ -397,6 +401,16 @@
>    };
>    enum_exec_method exec_method;
>  
> +  /*
> +    TRUE<=>this is a flattenable semi-join, false overwise.
> +  */
> +  bool is_flattenable_semijoin;

You should set this to zero in Item_subselect::Item_subselect() !

> +  
> +  /*
> +    Cost to populate the temporary table (set on if-needed basis).
> +  */
> +  //double startup_cost;
> +

Remove startup_cost as it's not used

<cut>

> @@ -752,7 +767,7 @@
>  
>  class subselect_hash_sj_engine : public subselect_engine
>  {
> -protected:
> +public:
>    /* The table into which the subquery is materialized. */
>    TABLE *tmp_table;
>    /* TRUE if the subquery was materialized into a temp table. */
> @@ -764,14 +779,16 @@
>      of subselect_single_select_engine::[prepare | cols].
>    */
>    subselect_single_select_engine *materialize_engine;
> +protected:
>    /* The engine used to compute the IN predicate. */
>    subselect_engine *lookup_engine;
>    /*
>      QEP to execute the subquery and materialize its result into a
>      temporary table. Created during the first call to exec().
>    */
> +public:
>    JOIN *materialize_join;
> -
> +protected:
>    /* Keyparts of the only non-NULL composite index in a rowid merge. */
>    MY_BITMAP non_null_key_parts;
>    /* Keyparts of the single column indexes with NULL, one keypart per index. */
> @@ -784,7 +801,9 @@
>      IN results because index lookups sometimes match values that are actually
>      not equal to the search key in SQL terms.
>   */
> +public:
>    Item_cond_and *semi_join_conds;
> +protected:
>    /* Possible execution strategies that can be used to compute hash semi-join.*/

Please move all public variables to one block.

> +++ maria-5.3-subqueries-r36-noc/sql/opt_subselect.cc	2011-02-16 14:42:52.000000000 +0300
> @@ -166,6 +328,7 @@
>        (void)subquery_types_allow_materialization(in_subs);
>  
>        in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
> +      in_subs->is_flattenable_semijoin= TRUE;
>  
>        /* Register the subquery for further processing in flatten_subqueries() */
>        select_lex->
> @@ -220,10 +383,24 @@
>            (in_subs->is_top_level_item() ||
>             optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) ||
>             optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) &&//4
> -          !in_subs->is_correlated &&                                  // 5
> -          in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
> +          !in_subs->is_correlated)                                  // 5
>        {
> +        if (in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED)
>            in_subs->exec_method= Item_in_subselect::MATERIALIZATION;

You removed //6 comment without updating the comment for the block
that describes 6. Please update comment!

> +
> +        /*
> +          If the subquery is an AND-part of WHERE register for being processed
> +          with jtbm strategy
> +        */
> +        if (in_subs->exec_method == Item_in_subselect::MATERIALIZATION &&
> +            thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 &&
> +            optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN))

Instead of doing:

thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 please create an
inline function with a meningfull name to test test or at least
replace (TABLE_LIST*) 0x with a define or both.

> +        {
> +          in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;

Here you could make things cleared and set in_subs->emb_on_expr_nest
to a define.

<cut>

> @@ -339,6 +516,69 @@
>  
>  
>  /*
> +  Finalize IN->EXISTS conversion in case we couldn't use materialization.
> +
> +  DESCRIPTION  Invoke the IN->EXISTS converter
> +    Replace the Item_in_subselect with its wrapper Item_in_optimizer in WHERE.
> +
> +  RETURN 
> +    FALSE - Ok
> +    TRUE  - Fatal error
> +*/
> +
> +static 
> +bool make_in_exists_conversion(THD *thd, JOIN *join, Item_in_subselect *item)
> +{
> +  DBUG_ENTER("make_in_exists_conversion");
> +  JOIN *child_join= item->unit->first_select()->join;
> +  Item_subselect::trans_res res;
> +  item->changed= 0;

Please add a comment why you set item->fixed to 0 as this is not obvious

> +  item->fixed= 0;
> +
> +  SELECT_LEX *save_select_lex= thd->lex->current_select;
> +  thd->lex->current_select= item->unit->first_select();
> +
> +  res= item->select_transformer(child_join);
> +
> +  thd->lex->current_select= save_select_lex;
> +
> +  if (res == Item_subselect::RES_ERROR)
> +    DBUG_RETURN(TRUE);
> +
> +  item->changed= 1;
> +  item->fixed= 1;
> +
> +  Item *substitute= item->substitution;
> +  bool do_fix_fields= !item->substitution->fixed;

item->substitution -> substitute

> +  /*
> +    The Item_subselect has already been wrapped with Item_in_optimizer, so we
> +    should search for item->optimizer, not 'item'.
> +  */
> +  Item *replace_me= item->optimizer;
> +  DBUG_ASSERT(replace_me==substitute);
> +
> +  Item **tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)?
> +                 &join->conds : &(item->emb_on_expr_nest->on_expr);
> +  if (replace_where_subcondition(join, tree, replace_me, substitute, 
> +                                 do_fix_fields))
> +    DBUG_RETURN(TRUE);
> +  item->substitution= NULL;
> +   

The code belove makes a permanent change to the prepared statement
tree. Please add a comment about this!

Also please add a test where you do a prepared statement that hits his
code and excecute it twice with different parameters.

> +  if (!thd->stmt_arena->is_conventional())
> +  {
> +    tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)?
> +           &join->select_lex->prep_where : 
> +           &(item->emb_on_expr_nest->prep_on_expr);
> +
> +    if (replace_where_subcondition(join, tree, replace_me, substitute, 
> +                                   FALSE))
> +      DBUG_RETURN(TRUE);
> +  }
> +  DBUG_RETURN(FALSE);
> +}
> +
> +
> +/*
>    Convert semi-join subquery predicates into semi-join join nests
>  
>    SYNOPSIS
> @@ -445,25 +685,41 @@
>    // #tables-in-parent-query + #tables-in-subquery < MAX_TABLES
>    /* Replace all subqueries to be flattened with Item_int(1) */
>    arena= thd->activate_stmt_arena_if_needed(&backup);
> -  for (in_subq= join->sj_subselects.front(); 
> -       in_subq != in_subq_end && 
> -       join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
> -       in_subq++)
> -  {
> -    Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
> -                   &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> -    if (replace_where_subcondition(join, tree, *in_subq, new Item_int(1),
> -                                   FALSE))
> -      DBUG_RETURN(TRUE); /* purecov: inspected */
> -  }
>   
>    for (in_subq= join->sj_subselects.front(); 
>         in_subq != in_subq_end && 
>         join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
>         in_subq++)
>    {
> -    if (convert_subq_to_sj(join, *in_subq))
> -      DBUG_RETURN(TRUE);
> +    bool remove_item= TRUE;
> +    if ((*in_subq)->is_flattenable_semijoin) 
> +    {
> +      if (convert_subq_to_sj(join, *in_subq))
> +        DBUG_RETURN(TRUE);
> +    }
> +    else
> +    {
> +      if (convert_subq_to_jtbm(join, *in_subq, &remove_item))
> +        DBUG_RETURN(TRUE);
> +    }
> +    if (remove_item)
> +    {
> +      Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
> +                     &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> +      Item *replace_me= *in_subq;
> +      /*
> +        JTBM: the subquery was already mapped with Item_in_optimizer, so we
> +        should search for that, not for original Item_in_subselect.
> +        TODO: what about delaying that rewrite until here?
> +      */
> +      if (!(*in_subq)->is_flattenable_semijoin)
> +      {
> +        replace_me= (*in_subq)->optimizer;
> +      }

Consider doing a function:

Item_in_subselect::original_item()
{
  return is_flattenable_semijoin ? self : item->optimizer;
}

This would make the above code a bit easier to understand by doing:

Item *replace_me= (*in_subq)->original_item()

This would thus be a bit like real_item()

> +      if (replace_where_subcondition(join, tree, replace_me, new Item_int(1),
> +                                     FALSE))
> +        DBUG_RETURN(TRUE); /* purecov: inspected */
> +    }
>    }
>  skip_conversion:
>    /* 
> @@ -494,7 +750,19 @@
>      bool do_fix_fields= !(*in_subq)->substitution->fixed;
>      Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
>                     &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> -    if (replace_where_subcondition(join, tree, *in_subq, substitute, 
> +
> +    Item *replace_me= *in_subq;
> +    /*
> +      JTBM: the subquery was already mapped with Item_in_optimizer, so we
> +      should search for that, not for original Item_in_subselect.
> +      TODO: what about delaying that rewrite until here?
> +    */
> +    if (!(*in_subq)->is_flattenable_semijoin)
> +    {
> +      replace_me= (*in_subq)->optimizer;
> +    }

The above code can also be made easier with
(*in_subq)->original_item(). The above comment is a good one for
original_item()!

> +
> +/*
> +  Get #output_rows and scan_time estimates for a "delayed" table.
> +
> +  SYNOPSIS
> +    get_delayed_table_estimates()
> +      table         IN    Table to get estimates for
> +      out_rows      OUT   E(#rows in the table)
> +      scan_time     OUT   E(scan_time).
> +      startup_cost  OUT   cost to populate the table.
> +
> +  DESCRIPTION
> +    Get #output_rows and scan_time estimates for a "delayed" table. By
> +    "delayed" here we mean that the table is filled at the start of query
> +    execution. This means that the optimizer can't use table statistics to 
> +    get #rows estimate for it, it has to call this function instead.
> +
> +    This function is expected to make different actions depending on the nature
> +    of the table. At the moment there is only one kind of delayed tables,
> +    non-flattenable semi-joins.
> +*/
> +
> +void get_delayed_table_estimates(TABLE *table,
> +                                 ha_rows *out_rows, 
> +                                 double *scan_time,
> +                                 double *startup_cost)
> +{
> +  Item_in_subselect *item= table->pos_in_table_list->jtbm_subselect;
> +  item->optimize();
> +
> +  DBUG_ASSERT(item->engine->engine_type() ==
> +              subselect_engine::HASH_SJ_ENGINE);
> +
> +  subselect_hash_sj_engine *hash_sj_engine=
> +    ((subselect_hash_sj_engine*)item->engine);
> +  JOIN *join= hash_sj_engine->materialize_join;
> +
> +  double rows= 1;
> +  double read_time= 0.0;
> +
> +  /* Calculate #rows and cost of join execution */
> +  for (uint i= join->const_tables; i < join->tables; i++)
> +  {
> +    rows      *= join->best_positions[i].records_read;
> +    read_time += join->best_positions[i].read_time;
> +  }
> +  *out_rows= (ha_rows)rows;
> +  *startup_cost= read_time;

The above loop is identical to doing:

    get_partial_join_cost(join, join->table, startup_cost, &rows);
    *out_rows= (ha_rows) rows;


> +  /* Calculate cost of scanning the temptable */
> +  double data_size= rows * hash_sj_engine->tmp_table->s->reclength;
> +  /* Do like in handler::read_time */
> +  *scan_time= data_size/IO_SIZE + 2;
> +} 

> +
> +/*
> +  Convert subquery predicate into non-mergeable semi-join nest.
> +
> +  TODO: 
> +    why does this do IN-EXISTS conversion? Can't we unify it with mergeable
> +    semi-joins? currently, convert_subq_to_sj() cannot fail to convert (unless
> +    fatal errors)
> +
> +    
> +  RETURN 
> +    FALSE - Ok
> +    TRUE  - Fatal error
> +*/
> +
> +static bool convert_subq_to_jtbm(JOIN *parent_join, 
> +                                 Item_in_subselect *subq_pred, 
> +                                 bool *remove_item)
> +{
> +  SELECT_LEX *parent_lex= parent_join->select_lex;
> +  List<TABLE_LIST> *emb_join_list= &parent_lex->top_join_list;
> +  TABLE_LIST *emb_tbl_nest= NULL; // will change when we learn to handle outer joins
> +  TABLE_LIST *tl;
> +  DBUG_ENTER("convert_subq_to_jtbm");
> +
> +  if (subq_pred->setup_engine(TRUE))
> +    DBUG_RETURN(TRUE);
> +

  /* Please ensure that the code below is covered with some test case !*/

> +  if (subq_pred->engine->engine_type() != subselect_engine::HASH_SJ_ENGINE)
> +  {
> +    *remove_item= FALSE;
> +    make_in_exists_conversion(parent_join->thd, parent_join, subq_pred);

You should return value of make_in_exists_conversion() as this may fail...

> +    DBUG_RETURN(FALSE);
> +  }

> +  *remove_item= TRUE;
> +
> +  TABLE_LIST *jtbm;
> +  char *tbl_alias;
> +  const char alias_mask[]="<subquery%d>";
> +  if (!(tbl_alias= (char*)parent_join->thd->calloc(sizeof(alias_mask)+5)) || 
> +      !(jtbm= alloc_join_nest(parent_join->thd))) //todo: this is not a join nest!
> +  {
> +    DBUG_RETURN(TRUE);
> +  }
> +
> +  jtbm->join_list= emb_join_list;
> +  jtbm->embedding= emb_tbl_nest;
> +  jtbm->jtbm_subselect= subq_pred;
> +  jtbm->nested_join= NULL;
> +
> +  /* Nests do not participate in those 'chains', so: */
> +  /* jtbm->next_leaf= jtbm->next_local= jtbm->next_global == NULL*/
> +  emb_join_list->push_back(jtbm);
> +  
> +  /* 
> +    Inject the jtbm table into TABLE_LIST::next_leaf list, so that 
> +    make_join_statistics() and co. can find it.
> +  */
> +  for (tl= parent_lex->leaf_tables; tl->next_leaf; tl= tl->next_leaf) ;

Add an explicit {} on an empty row, as otherwise you may get compiler
warnings from some compilers

> +  tl->next_leaf= jtbm;
> +
> +  /*
> +    Same as above for TABLE_LIST::next_local chain
> +    (a theory: a next_local chain always starts with ::leaf_tables
> +     because view's tables are inserted after the view)
> +  */
> +  for (tl= parent_lex->leaf_tables; tl->next_local; tl= tl->next_local) ;

Add an explicit {} on an empty row...

> +  tl->next_local= jtbm;

Do you really have to put the table last ?
It would be much esier to put it first...

> +
> +  /* A theory: no need to re-connect the next_global chain */
> +
> +  subselect_hash_sj_engine *hash_sj_engine=
> +    ((subselect_hash_sj_engine*)subq_pred->engine);
> +  jtbm->table= hash_sj_engine->tmp_table;
> +
> +  jtbm->table->tablenr= parent_join->tables;
> +  jtbm->table->map= table_map(1) << (parent_join->tables);
> +
> +  parent_join->tables++;
> +

Where is it checked that this doesn't exceed MAX_TABLES?
Add a comment for the function that guarantees this and an DBUG_ASSERT

> +  Item *conds= hash_sj_engine->semi_join_conds;
> +  conds->fix_after_pullout(parent_lex, &conds);
> +
> +  DBUG_EXECUTE("where", print_where(conds,"SJ-EXPR", QT_ORDINARY););
> +  
> +  my_snprintf(tbl_alias, sizeof(alias_mask)+5, alias_mask, 
> +    hash_sj_engine->materialize_join->select_lex->select_number);

A fast way to generate the name would be to use the following
function:

#define subquery_table_name_length= sizeof("<subquery9999>");

void create_temporary_subquery_table_name(char *to, uint number)
{
  DBUG_ASSERT(number < 10000);       
  to= strmov(to, "<subquery");
  to= int10_to_str((int) number, to, 10);
  to[0]= '>';
  to[1]= 0;
}

This is significantly faster than my_snprintf(). This would also
simplify the other code you have when you generate the name.

> +  jtbm->alias= tbl_alias; 
> +
> +  /* Inject sj_on_expr into the parent's WHERE or ON */
> +  if (emb_tbl_nest)
> +  {
> +    DBUG_ASSERT(0);
> +    /*emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr, 
> +                                     sj_nest->sj_on_expr);
> +    emb_tbl_nest->on_expr->fix_fields(parent_join->thd, &emb_tbl_nest->on_expr);
> +    */

Please remove the comment and change the above to:

DBUG_ASSERT(!emb_tbl_nest);

>  
> +enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab)
> +{

<cut>

> +  else if (tab->bush_children)
> +  {
> +    /* It's a merged SJM nest */
> +    enum_nested_loop_state rc;
> +    JOIN *join= tab->join;
> +    SJ_MATERIALIZATION_INFO *sjm= tab->bush_children->start->emb_sj_nest->sj_mat_info;
> +    JOIN_TAB *join_tab= tab->bush_children->start;
> +    JOIN_TAB *save_return_tab= join->return_tab;

move setting of the last 2 variables inside the loop belpw

> --- 5.3-noc/sql/sql_base.cc	2011-01-27 21:39:33.000000000 +0300
> +++ maria-5.3-subqueries-r36-noc/sql/sql_base.cc	2011-02-21 18:15:06.000000000 +0300
> @@ -7778,6 +7778,18 @@
>        if (res)
>          DBUG_RETURN(1);
>      }
> +    if (table_list->jtbm_subselect)
> +    {
> +      Item *item= table_list->jtbm_subselect;
> +      if (item->fix_fields(thd, &item))
> +      {
> +        my_error(ER_TOO_MANY_TABLES,MYF(0),MAX_TABLES);
> +        DBUG_RETURN(1);
> +      }
> +      DBUG_ASSERT(item == table_list->jtbm_subselect);
> +      if (table_list->jtbm_subselect->setup_engine(FALSE))
> +        DBUG_RETURN(1);
> +    }
>    }

Regards,
Monty



References