maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #03957
review: part 1 of MWL#90 combined patch for review
Hi!
Here is part 1 of the review; part 2 should come later today.
>>>>> "Sergey" == Sergey Petrunya <psergey@xxxxxxxxxxxx> writes:
Sergey> On Tue, Feb 22, 2011 at 05:24:37PM +0300, Sergey Petrunya wrote:
>> Hi Monty,
>>
>> Please find attached the complete patch of MWL#90 for review. The tree with
>> the code is on launchpad/buildbot:
>> https://code.launchpad.net/~maria-captains/maria/5.3-subqueries-mwl90
>> http://buildbot.askmonty.org/buildbot/grid?branch=5.3-subqueries-mwl90
<cut>
> +++ maria-5.3-subqueries-r36-noc/mysql-test/include/mix1.inc 2011-02-16 14:42:52.000000000 +0300
> @@ -1532,6 +1532,12 @@
>
> SELECT 1 FROM (SELECT COUNT(DISTINCT c1)
> FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x;
> +--echo # MariaDB note:
> +--echo # This will show 2 for table which has 5 rows.
> +--echo # This is because the access method employed is actually range access
> +--echo # which scans 2 records (yes, EXPLAIN displays it incorrectly).
> +--echo # our correct printing is an artifact of changing in select_describe()
> +--echo # from printing table->starts.records() to tab->records.
Please remove comment as this is not an issue anymore.
> diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.cc maria-5.3-subqueries-r36-noc/sql/item_subselect.cc
>
> @@ -4067,6 +4078,17 @@
> }
>
>
> +int subselect_hash_sj_engine::optimize()
> +{
> + int res= 0;
Don't reset res
> + SELECT_LEX *save_select= thd->lex->current_select;
> + thd->lex->current_select= materialize_join->select_lex;
> + res= materialize_join->optimize();
> + thd->lex->current_select= save_select;
> +
> + return res;
> +}
> +
> diff -urN --exclude='.*' 5.3-noc/sql/item_subselect.h maria-5.3-subqueries-r36-noc/sql/item_subselect.h
> --- 5.3-noc/sql/item_subselect.h 2011-01-27 21:39:33.000000000 +0300
> +++ maria-5.3-subqueries-r36-noc/sql/item_subselect.h 2011-02-16 14:42:52.000000000 +0300
> @@ -46,16 +46,17 @@
> < child_join->prepare
> < engine->prepare
> *ref= substitution;
> + substitution= NULL;
> < Item_subselect::fix_fields
> */
> - Item *substitution;
> public:
> + Item *substitution;
> /* unit of subquery */
> st_select_lex_unit *unit;
> -protected:
> Item *expr_cache;
> /* engine that perform execution of subselect (single select or union) */
> subselect_engine *engine;
> +protected:
> /* old engine if engine was changed */
> subselect_engine *old_engine;
> /* cache of used external tables */
> @@ -148,6 +149,7 @@
> bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item);
> void fix_after_pullout(st_select_lex *new_parent, Item **ref);
> void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
> + virtual int optimize();
> virtual bool exec();
> virtual void fix_length_and_dec();
> table_map used_tables() const;
> @@ -351,7 +353,9 @@
> all JOIN in UNION
> */
> Item *expr;
> +public:
> Item_in_optimizer *optimizer;
> +protected:
Move the above to after Item *substitions; No reason to have many
:public :protected blocks
> @@ -397,6 +401,16 @@
> };
> enum_exec_method exec_method;
>
> + /*
> + TRUE<=>this is a flattenable semi-join, false overwise.
> + */
> + bool is_flattenable_semijoin;
You should set this to zero in Item_subselect::Item_subselect() !
> +
> + /*
> + Cost to populate the temporary table (set on if-needed basis).
> + */
> + //double startup_cost;
> +
Remove startup_cost as it's not used
<cut>
> @@ -752,7 +767,7 @@
>
> class subselect_hash_sj_engine : public subselect_engine
> {
> -protected:
> +public:
> /* The table into which the subquery is materialized. */
> TABLE *tmp_table;
> /* TRUE if the subquery was materialized into a temp table. */
> @@ -764,14 +779,16 @@
> of subselect_single_select_engine::[prepare | cols].
> */
> subselect_single_select_engine *materialize_engine;
> +protected:
> /* The engine used to compute the IN predicate. */
> subselect_engine *lookup_engine;
> /*
> QEP to execute the subquery and materialize its result into a
> temporary table. Created during the first call to exec().
> */
> +public:
> JOIN *materialize_join;
> -
> +protected:
> /* Keyparts of the only non-NULL composite index in a rowid merge. */
> MY_BITMAP non_null_key_parts;
> /* Keyparts of the single column indexes with NULL, one keypart per index. */
> @@ -784,7 +801,9 @@
> IN results because index lookups sometimes match values that are actually
> not equal to the search key in SQL terms.
> */
> +public:
> Item_cond_and *semi_join_conds;
> +protected:
> /* Possible execution strategies that can be used to compute hash semi-join.*/
Please move all public variables to one block.
> +++ maria-5.3-subqueries-r36-noc/sql/opt_subselect.cc 2011-02-16 14:42:52.000000000 +0300
> @@ -166,6 +328,7 @@
> (void)subquery_types_allow_materialization(in_subs);
>
> in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
> + in_subs->is_flattenable_semijoin= TRUE;
>
> /* Register the subquery for further processing in flatten_subqueries() */
> select_lex->
> @@ -220,10 +383,24 @@
> (in_subs->is_top_level_item() ||
> optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) ||
> optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) &&//4
> - !in_subs->is_correlated && // 5
> - in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
> + !in_subs->is_correlated) // 5
> {
> + if (in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED)
> in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
You removed //6 comment without updating the comment for the block
that describes 6. Please update comment!
> +
> + /*
> + If the subquery is an AND-part of WHERE register for being processed
> + with jtbm strategy
> + */
> + if (in_subs->exec_method == Item_in_subselect::MATERIALIZATION &&
> + thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 &&
> + optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN))
Instead of doing:
thd->thd_marker.emb_on_expr_nest == (TABLE_LIST*)0x1 please create an
inline function with a meningfull name to test test or at least
replace (TABLE_LIST*) 0x with a define or both.
> + {
> + in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
Here you could make things cleared and set in_subs->emb_on_expr_nest
to a define.
<cut>
> @@ -339,6 +516,69 @@
>
>
> /*
> + Finalize IN->EXISTS conversion in case we couldn't use materialization.
> +
> + DESCRIPTION Invoke the IN->EXISTS converter
> + Replace the Item_in_subselect with its wrapper Item_in_optimizer in WHERE.
> +
> + RETURN
> + FALSE - Ok
> + TRUE - Fatal error
> +*/
> +
> +static
> +bool make_in_exists_conversion(THD *thd, JOIN *join, Item_in_subselect *item)
> +{
> + DBUG_ENTER("make_in_exists_conversion");
> + JOIN *child_join= item->unit->first_select()->join;
> + Item_subselect::trans_res res;
> + item->changed= 0;
Please add a comment why you set item->fixed to 0 as this is not obvious
> + item->fixed= 0;
> +
> + SELECT_LEX *save_select_lex= thd->lex->current_select;
> + thd->lex->current_select= item->unit->first_select();
> +
> + res= item->select_transformer(child_join);
> +
> + thd->lex->current_select= save_select_lex;
> +
> + if (res == Item_subselect::RES_ERROR)
> + DBUG_RETURN(TRUE);
> +
> + item->changed= 1;
> + item->fixed= 1;
> +
> + Item *substitute= item->substitution;
> + bool do_fix_fields= !item->substitution->fixed;
item->substitution -> substitute
> + /*
> + The Item_subselect has already been wrapped with Item_in_optimizer, so we
> + should search for item->optimizer, not 'item'.
> + */
> + Item *replace_me= item->optimizer;
> + DBUG_ASSERT(replace_me==substitute);
> +
> + Item **tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)?
> + &join->conds : &(item->emb_on_expr_nest->on_expr);
> + if (replace_where_subcondition(join, tree, replace_me, substitute,
> + do_fix_fields))
> + DBUG_RETURN(TRUE);
> + item->substitution= NULL;
> +
The code belove makes a permanent change to the prepared statement
tree. Please add a comment about this!
Also please add a test where you do a prepared statement that hits his
code and excecute it twice with different parameters.
> + if (!thd->stmt_arena->is_conventional())
> + {
> + tree= (item->emb_on_expr_nest == (TABLE_LIST*)1)?
> + &join->select_lex->prep_where :
> + &(item->emb_on_expr_nest->prep_on_expr);
> +
> + if (replace_where_subcondition(join, tree, replace_me, substitute,
> + FALSE))
> + DBUG_RETURN(TRUE);
> + }
> + DBUG_RETURN(FALSE);
> +}
> +
> +
> +/*
> Convert semi-join subquery predicates into semi-join join nests
>
> SYNOPSIS
> @@ -445,25 +685,41 @@
> // #tables-in-parent-query + #tables-in-subquery < MAX_TABLES
> /* Replace all subqueries to be flattened with Item_int(1) */
> arena= thd->activate_stmt_arena_if_needed(&backup);
> - for (in_subq= join->sj_subselects.front();
> - in_subq != in_subq_end &&
> - join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
> - in_subq++)
> - {
> - Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
> - &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> - if (replace_where_subcondition(join, tree, *in_subq, new Item_int(1),
> - FALSE))
> - DBUG_RETURN(TRUE); /* purecov: inspected */
> - }
>
> for (in_subq= join->sj_subselects.front();
> in_subq != in_subq_end &&
> join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
> in_subq++)
> {
> - if (convert_subq_to_sj(join, *in_subq))
> - DBUG_RETURN(TRUE);
> + bool remove_item= TRUE;
> + if ((*in_subq)->is_flattenable_semijoin)
> + {
> + if (convert_subq_to_sj(join, *in_subq))
> + DBUG_RETURN(TRUE);
> + }
> + else
> + {
> + if (convert_subq_to_jtbm(join, *in_subq, &remove_item))
> + DBUG_RETURN(TRUE);
> + }
> + if (remove_item)
> + {
> + Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
> + &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> + Item *replace_me= *in_subq;
> + /*
> + JTBM: the subquery was already mapped with Item_in_optimizer, so we
> + should search for that, not for original Item_in_subselect.
> + TODO: what about delaying that rewrite until here?
> + */
> + if (!(*in_subq)->is_flattenable_semijoin)
> + {
> + replace_me= (*in_subq)->optimizer;
> + }
Consider doing a function:
Item_in_subselect::original_item()
{
return is_flattenable_semijoin ? self : item->optimizer;
}
This would make the above code a bit easier to understand by doing:
Item *replace_me= (*in_subq)->original_item()
This would thus be a bit like real_item()
> + if (replace_where_subcondition(join, tree, replace_me, new Item_int(1),
> + FALSE))
> + DBUG_RETURN(TRUE); /* purecov: inspected */
> + }
> }
> skip_conversion:
> /*
> @@ -494,7 +750,19 @@
> bool do_fix_fields= !(*in_subq)->substitution->fixed;
> Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
> &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
> - if (replace_where_subcondition(join, tree, *in_subq, substitute,
> +
> + Item *replace_me= *in_subq;
> + /*
> + JTBM: the subquery was already mapped with Item_in_optimizer, so we
> + should search for that, not for original Item_in_subselect.
> + TODO: what about delaying that rewrite until here?
> + */
> + if (!(*in_subq)->is_flattenable_semijoin)
> + {
> + replace_me= (*in_subq)->optimizer;
> + }
The above code can also be made easier with
(*in_subq)->original_item(). The above comment is a good one for
original_item()!
> +
> +/*
> + Get #output_rows and scan_time estimates for a "delayed" table.
> +
> + SYNOPSIS
> + get_delayed_table_estimates()
> + table IN Table to get estimates for
> + out_rows OUT E(#rows in the table)
> + scan_time OUT E(scan_time).
> + startup_cost OUT cost to populate the table.
> +
> + DESCRIPTION
> + Get #output_rows and scan_time estimates for a "delayed" table. By
> + "delayed" here we mean that the table is filled at the start of query
> + execution. This means that the optimizer can't use table statistics to
> + get #rows estimate for it, it has to call this function instead.
> +
> + This function is expected to make different actions depending on the nature
> + of the table. At the moment there is only one kind of delayed tables,
> + non-flattenable semi-joins.
> +*/
> +
> +void get_delayed_table_estimates(TABLE *table,
> + ha_rows *out_rows,
> + double *scan_time,
> + double *startup_cost)
> +{
> + Item_in_subselect *item= table->pos_in_table_list->jtbm_subselect;
> + item->optimize();
> +
> + DBUG_ASSERT(item->engine->engine_type() ==
> + subselect_engine::HASH_SJ_ENGINE);
> +
> + subselect_hash_sj_engine *hash_sj_engine=
> + ((subselect_hash_sj_engine*)item->engine);
> + JOIN *join= hash_sj_engine->materialize_join;
> +
> + double rows= 1;
> + double read_time= 0.0;
> +
> + /* Calculate #rows and cost of join execution */
> + for (uint i= join->const_tables; i < join->tables; i++)
> + {
> + rows *= join->best_positions[i].records_read;
> + read_time += join->best_positions[i].read_time;
> + }
> + *out_rows= (ha_rows)rows;
> + *startup_cost= read_time;
The above loop is identical to doing:
get_partial_join_cost(join, join->table, startup_cost, &rows);
*out_rows= (ha_rows) rows;
> + /* Calculate cost of scanning the temptable */
> + double data_size= rows * hash_sj_engine->tmp_table->s->reclength;
> + /* Do like in handler::read_time */
> + *scan_time= data_size/IO_SIZE + 2;
> +}
> +
> +/*
> + Convert subquery predicate into non-mergeable semi-join nest.
> +
> + TODO:
> + why does this do IN-EXISTS conversion? Can't we unify it with mergeable
> + semi-joins? currently, convert_subq_to_sj() cannot fail to convert (unless
> + fatal errors)
> +
> +
> + RETURN
> + FALSE - Ok
> + TRUE - Fatal error
> +*/
> +
> +static bool convert_subq_to_jtbm(JOIN *parent_join,
> + Item_in_subselect *subq_pred,
> + bool *remove_item)
> +{
> + SELECT_LEX *parent_lex= parent_join->select_lex;
> + List<TABLE_LIST> *emb_join_list= &parent_lex->top_join_list;
> + TABLE_LIST *emb_tbl_nest= NULL; // will change when we learn to handle outer joins
> + TABLE_LIST *tl;
> + DBUG_ENTER("convert_subq_to_jtbm");
> +
> + if (subq_pred->setup_engine(TRUE))
> + DBUG_RETURN(TRUE);
> +
/* Please ensure that the code below is covered with some test case !*/
> + if (subq_pred->engine->engine_type() != subselect_engine::HASH_SJ_ENGINE)
> + {
> + *remove_item= FALSE;
> + make_in_exists_conversion(parent_join->thd, parent_join, subq_pred);
You should return value of make_in_exists_conversion() as this may fail...
> + DBUG_RETURN(FALSE);
> + }
> + *remove_item= TRUE;
> +
> + TABLE_LIST *jtbm;
> + char *tbl_alias;
> + const char alias_mask[]="<subquery%d>";
> + if (!(tbl_alias= (char*)parent_join->thd->calloc(sizeof(alias_mask)+5)) ||
> + !(jtbm= alloc_join_nest(parent_join->thd))) //todo: this is not a join nest!
> + {
> + DBUG_RETURN(TRUE);
> + }
> +
> + jtbm->join_list= emb_join_list;
> + jtbm->embedding= emb_tbl_nest;
> + jtbm->jtbm_subselect= subq_pred;
> + jtbm->nested_join= NULL;
> +
> + /* Nests do not participate in those 'chains', so: */
> + /* jtbm->next_leaf= jtbm->next_local= jtbm->next_global == NULL*/
> + emb_join_list->push_back(jtbm);
> +
> + /*
> + Inject the jtbm table into TABLE_LIST::next_leaf list, so that
> + make_join_statistics() and co. can find it.
> + */
> + for (tl= parent_lex->leaf_tables; tl->next_leaf; tl= tl->next_leaf) ;
Add an explicit {} on an empty row, as otherwise you may get compiler
warnings from some compilers
> + tl->next_leaf= jtbm;
> +
> + /*
> + Same as above for TABLE_LIST::next_local chain
> + (a theory: a next_local chain always starts with ::leaf_tables
> + because view's tables are inserted after the view)
> + */
> + for (tl= parent_lex->leaf_tables; tl->next_local; tl= tl->next_local) ;
Add an explicit {} on an empty row...
> + tl->next_local= jtbm;
Do you really have to put the table last ?
It would be much esier to put it first...
> +
> + /* A theory: no need to re-connect the next_global chain */
> +
> + subselect_hash_sj_engine *hash_sj_engine=
> + ((subselect_hash_sj_engine*)subq_pred->engine);
> + jtbm->table= hash_sj_engine->tmp_table;
> +
> + jtbm->table->tablenr= parent_join->tables;
> + jtbm->table->map= table_map(1) << (parent_join->tables);
> +
> + parent_join->tables++;
> +
Where is it checked that this doesn't exceed MAX_TABLES?
Add a comment for the function that guarantees this and an DBUG_ASSERT
> + Item *conds= hash_sj_engine->semi_join_conds;
> + conds->fix_after_pullout(parent_lex, &conds);
> +
> + DBUG_EXECUTE("where", print_where(conds,"SJ-EXPR", QT_ORDINARY););
> +
> + my_snprintf(tbl_alias, sizeof(alias_mask)+5, alias_mask,
> + hash_sj_engine->materialize_join->select_lex->select_number);
A fast way to generate the name would be to use the following
function:
#define subquery_table_name_length= sizeof("<subquery9999>");
void create_temporary_subquery_table_name(char *to, uint number)
{
DBUG_ASSERT(number < 10000);
to= strmov(to, "<subquery");
to= int10_to_str((int) number, to, 10);
to[0]= '>';
to[1]= 0;
}
This is significantly faster than my_snprintf(). This would also
simplify the other code you have when you generate the name.
> + jtbm->alias= tbl_alias;
> +
> + /* Inject sj_on_expr into the parent's WHERE or ON */
> + if (emb_tbl_nest)
> + {
> + DBUG_ASSERT(0);
> + /*emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr,
> + sj_nest->sj_on_expr);
> + emb_tbl_nest->on_expr->fix_fields(parent_join->thd, &emb_tbl_nest->on_expr);
> + */
Please remove the comment and change the above to:
DBUG_ASSERT(!emb_tbl_nest);
>
> +enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab)
> +{
<cut>
> + else if (tab->bush_children)
> + {
> + /* It's a merged SJM nest */
> + enum_nested_loop_state rc;
> + JOIN *join= tab->join;
> + SJ_MATERIALIZATION_INFO *sjm= tab->bush_children->start->emb_sj_nest->sj_mat_info;
> + JOIN_TAB *join_tab= tab->bush_children->start;
> + JOIN_TAB *save_return_tab= join->return_tab;
move setting of the last 2 variables inside the loop belpw
> --- 5.3-noc/sql/sql_base.cc 2011-01-27 21:39:33.000000000 +0300
> +++ maria-5.3-subqueries-r36-noc/sql/sql_base.cc 2011-02-21 18:15:06.000000000 +0300
> @@ -7778,6 +7778,18 @@
> if (res)
> DBUG_RETURN(1);
> }
> + if (table_list->jtbm_subselect)
> + {
> + Item *item= table_list->jtbm_subselect;
> + if (item->fix_fields(thd, &item))
> + {
> + my_error(ER_TOO_MANY_TABLES,MYF(0),MAX_TABLES);
> + DBUG_RETURN(1);
> + }
> + DBUG_ASSERT(item == table_list->jtbm_subselect);
> + if (table_list->jtbm_subselect->setup_engine(FALSE))
> + DBUG_RETURN(1);
> + }
> }
Regards,
Monty
References