← Back to team overview

maria-developers team mailing list archive

Re: [Commits] Rev 2983: MWL#89 - Automatic merge with 5.3 in file:///home/tsk/mprog/src/5.3-mwl89-merge/

 

Hi Timour,

Congratulations on having pushed MWL#89 into 5.3-main! This is quite an
achievement which really moves us forward. 

I've been studying the new code and has got some feedback which I thought 
I need to share. Please find the notes below, marked with 'psergey'. 

> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/filesort.cc 5.3-timours-wl/sql/filesort.cc
> --- 5.3-timours-wl-clean/sql/filesort.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/filesort.cc	2011-05-11 20:08:50.000000000 +0100
> @@ -612,10 +612,34 @@
>        }
>        DBUG_RETURN(HA_POS_ERROR);		/* purecov: inspected */
>      }
> +
> +    bool write_record= false;
>      if (error == 0)
> +    {
>        param->examined_rows++;
> -   
> -    if (error == 0 && (!select || select->skip_record(thd) > 0))
> +      if (select && select->cond)
> +      {
> +        /*
> +          If the condition 'select->cond' contains a subquery, restore the
> +          original read/write sets of the table 'sort_form' because when
> +          SQL_SELECT::skip_record evaluates this condition. it may include a
psergey: 
- The last sentence doesn't parse, please fix.
- Why is the check done *for each record we enumerate*? Please move it outside
  of the loop.
- I suspect that now, with Sanja's subquery code, each subquery has a list of 
  references it makes to outside, so this shortcut is not necessary (let's 
  discuss that on the next optimizer call)

> +          correlated subquery predicate, such that some field in the subquery
> +          refers to 'sort_form'.
> +        */
> +        if (select->cond->with_subselect)
> +          sort_form->column_bitmaps_set(save_read_set, save_write_set,
> +                                        save_vcol_set);
> +        write_record= (select->skip_record(thd) > 0);
> +        if (select->cond->with_subselect)
> +          sort_form->column_bitmaps_set(&sort_form->tmp_set,
> +                                        &sort_form->tmp_set,
> +                                        &sort_form->tmp_set);
> +      }
> +      else
> +        write_record= true;
> +    }
> +
> +    if (write_record)
>      {
>        if (idx == param->keys)
>        {
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/item_cmpfunc.cc 5.3-timours-wl/sql/item_cmpfunc.cc
> --- 5.3-timours-wl-clean/sql/item_cmpfunc.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/item_cmpfunc.cc	2011-05-11 20:08:50.000000000 +0100
> @@ -2072,6 +2072,18 @@
>  }
>  
>  
> +bool Item_in_optimizer::is_expensive_processor(uchar *arg)
> +{
> +  return args[1]->is_expensive_processor(arg);
> +}
> +
> +
> +bool Item_in_optimizer::is_expensive()
> +{
> +  return args[1]->is_expensive();
> +}
> +
psergey: what about the cases when args[0] is expensive? It should either be
checked, or a comment here is needed with explanation why we don't need to check it.

>  longlong Item_func_eq::val_int()
>  {
>    DBUG_ASSERT(fixed == 1);
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/item.h 5.3-timours-wl/sql/item.h
> --- 5.3-timours-wl-clean/sql/item.h	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/item.h	2011-05-11 20:08:50.000000000 +0100
> @@ -510,7 +521,7 @@
>               SUBSELECT_ITEM, ROW_ITEM, CACHE_ITEM, TYPE_HOLDER,
>               PARAM_ITEM, TRIGGER_FIELD_ITEM, DECIMAL_ITEM,
>               XPATH_NODESET, XPATH_NODESET_CMP,
> -             VIEW_FIXER_ITEM, EXPR_CACHE_ITEM};
> +             VIEW_FIXER_ITEM, EXPR_CACHE_ITEM, UNKNOWN_ITEM};
psergey: When I grep for UNKNOWN_ITEM, I find only this occurence. Why add it?

>  
>    enum cond_result { COND_UNDEF,COND_OK,COND_TRUE,COND_FALSE };
>  
...
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/item_subselect.cc 5.3-timours-wl/sql/item_subselect.cc
> --- 5.3-timours-wl-clean/sql/item_subselect.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/item_subselect.cc	2011-05-11 20:08:50.000000000 +0100
...
> @@ -1659,49 +1696,40 @@
>          {
>            if (!(new_having= new Item_func_trig_cond(new_having,
>                                                      get_cond_guard(0))))
> -            DBUG_RETURN(RES_ERROR);
> +            DBUG_RETURN(true);
>          }
> -        new_having->name= (char*)in_having_cond;
> -	select_lex->having= join->having= new_having;
> -	select_lex->having_fix_field= 1;
> -        
> -        /*
> -          we do not check join->having->fixed, because comparison function
> -          (from func->create) can't be fixed after creation
> -        */
> -	tmp= join->having->fix_fields(thd, 0);
> -        select_lex->having_fix_field= 0;
> -        if (tmp)
> -	  DBUG_RETURN(RES_ERROR);
> +
> +        new_having->name= (char*) in_having_cond;
> +        if (fix_having(new_having, select_lex))
> +          DBUG_RETURN(true);
> +        *having_item= new_having;
>        }
>        else
> -      {
> -	// it is single select without tables => possible optimization
> -        // remove the dependence mark since the item is moved to upper
> -        // select and is not outer anymore.
> -        item->walk(&Item::remove_dependence_processor, 0,
> -                           (uchar *) select_lex->outer_select());
> -	item= func->create(left_expr, item);
> -	// fix_field of item will be done in time of substituting
> -	substitution= item;
> -	have_to_be_excluded= 1;
> -	if (thd->lex->describe)
> -	{
> -	  char warn_buff[MYSQL_ERRMSG_SIZE];
> -	  sprintf(warn_buff, ER(ER_SELECT_REDUCED), select_lex->select_number);
> -	  push_warning(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
> -		       ER_SELECT_REDUCED, warn_buff);
> -	}
> -	DBUG_RETURN(RES_REDUCE);
> -      }
> +        DBUG_ASSERT(FALSE);
psergey: (one)
>      }
>    }
>  
> -  DBUG_RETURN(RES_OK);
> +  DBUG_RETURN(false);
psergey: (two).  I think using both 'false' and 'FALSE' in the same patch is
too much. Please change to TRUE/FALSE since we have it everywhere else.

>  }
>  
>  
> -Item_subselect::trans_res
> +/**
> +  Wrap a multi-column IN/ALL/ANY subselect into an Item_in_optimizer.
> +
> +  @param join  Join object of the subquery (i.e. 'child' join).
> +
> +  @details
> +  The subquery predicate is wrapped into an Item_in_optimizer. Later the query
> +  optimization phase chooses whether the subquery under the Item_in_optimizer
> +  will be further transformed into an equivalent correlated EXISTS by injecting
> +  additional predicates, or will be executed via subquery materialization in its
> +  unmodified form.
> +
> +  @retval false  The subquery was transformed
> +  @retval true   Error
> +*/
> +
> +bool
>  Item_in_subselect::row_value_transformer(JOIN *join)
>  {
>    SELECT_LEX *select_lex= join->select_lex;
...
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/opt_subselect.cc 5.3-timours-wl/sql/opt_subselect.cc
> --- 5.3-timours-wl-clean/sql/opt_subselect.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/opt_subselect.cc	2011-05-11 20:08:50.000000000 +0100
> @@ -175,63 +189,80 @@
>      else
>      {
>        DBUG_PRINT("info", ("Subquery can't be converted to semi-join"));
> -      /*
> -        Check if the subquery predicate can be executed via materialization.
> -        The required conditions are:
> -        1. Subquery predicate is an IN/=ANY subq predicate
> -        2. Subquery is a single SELECT (not a UNION)
> -        3. Subquery is not a table-less query. In this case there is no
> -           point in materializing.
> -          3A The upper query is not a table-less SELECT ... FROM DUAL. We
> +      /* Test if the user has set a legal combination of optimizer switches. */
> +      if (!optimizer_flag(thd, OPTIMIZER_SWITCH_IN_TO_EXISTS) &&
> +          !optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION))
> +        my_error(ER_ILLEGAL_SUBQUERY_OPTIMIZER_SWITCHES, MYF(0));
> +
> +      if (in_subs)
> +      {
> +        /* Subquery predicate is an IN/=ANY predicate. */
> +        if (optimizer_flag(thd, OPTIMIZER_SWITCH_IN_TO_EXISTS))
> +          in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
> +        if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION))
> +          in_subs->in_strategy|= SUBS_MATERIALIZATION;
> +
> +        /*
> +          Check if the subquery predicate can be executed via materialization.
> +          The required conditions are:
> +          1. Subquery is a single SELECT (not a UNION)
> +          2. Subquery is not a table-less query. In this case there is no
> +             point in materializing.
> +          2A The upper query is not a table-less SELECT ... FROM DUAL. We
>               can't do materialization for SELECT .. FROM DUAL because it
>               does not call setup_subquery_materialization(). We could make 
>               SELECT ... FROM DUAL call that function but that doesn't seem
>               to be the case that is worth handling.
> -        4. Either the subquery predicate is a top-level predicate, or at
> -           least one partial match strategy is enabled. If no partial match
> -           strategy is enabled, then materialization cannot be used for
> -           non-top-level queries because it cannot handle NULLs correctly.
> -        5. Subquery is non-correlated
> -           TODO:
> -           This is an overly restrictive condition. It can be extended to:
> -           (Subquery is non-correlated ||
> -            Subquery is correlated to any query outer to IN predicate ||
> -            (Subquery is correlated to the immediate outer query &&
> -             Subquery !contains {GROUP BY, ORDER BY [LIMIT],
> -             aggregate functions}) && subquery predicate is not under "NOT IN"))
> -        6. No execution method was already chosen (by a prepared statement).
> -
> -        (*) The subquery must be part of a SELECT statement. The current
> -             condition also excludes multi-table update statements.
> -
> -        Determine whether we will perform subquery materialization before
> -        calling the IN=>EXISTS transformation, so that we know whether to
> -        perform the whole transformation or only that part of it which wraps
> -        Item_in_subselect in an Item_in_optimizer.
> -      */
> -      if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION)  && 
> -          in_subs  &&                                                   // 1
> -          !select_lex->is_part_of_union() &&                            // 2
> -          select_lex->master_unit()->first_select()->leaf_tables &&     // 3
> -          thd->lex->sql_command == SQLCOM_SELECT &&                     // *
> -          select_lex->outer_select()->leaf_tables &&                    // 3A
> -          subquery_types_allow_materialization(in_subs) &&
> -          // psergey-todo: duplicated_subselect_card_check: where it's done?
> -          (in_subs->is_top_level_item() ||
> -           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) ||
> -           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) &&//4
> -          !in_subs->is_correlated &&                                  // 5
> -          in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
> -      {
> -          in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
> -      }
> -
> -      Item_subselect::trans_res trans_res;
> -      if ((trans_res= subselect->select_transformer(join)) !=
> -          Item_subselect::RES_OK)
> -      {
> -        DBUG_RETURN((trans_res == Item_subselect::RES_ERROR));
> +          3. Either the subquery predicate is a top-level predicate, or at
> +             least one partial match strategy is enabled. If no partial match
> +             strategy is enabled, then materialization cannot be used for
> +             non-top-level queries because it cannot handle NULLs correctly.
> +          4. Subquery is non-correlated
> +             TODO:
> +             This is an overly restrictive condition. It can be extended to:
> +             (Subquery is non-correlated ||
> +              Subquery is correlated to any query outer to IN predicate ||
> +              (Subquery is correlated to the immediate outer query &&
> +               Subquery !contains {GROUP BY, ORDER BY [LIMIT],
> +               aggregate functions}) && subquery predicate is not under "NOT IN"))
> +
> +          (*) The subquery must be part of a SELECT statement. The current
> +               condition also excludes multi-table update statements.
> +        */
> +        if (!(in_subs->in_strategy & SUBS_MATERIALIZATION && 
> +              !select_lex->is_part_of_union() &&                            // 1
> +              parent_unit->first_select()->leaf_tables &&                   // 2
> +              thd->lex->sql_command == SQLCOM_SELECT &&                     // *
> +              select_lex->outer_select()->leaf_tables &&                    // 2A
> +              subquery_types_allow_materialization(in_subs) &&
> +              // psergey-todo: duplicated_subselect_card_check: where it's done?
> +              (in_subs->is_top_level_item() ||                               //3
> +               optimizer_flag(thd,
> +                              OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) || //3
> +               optimizer_flag(thd,
> +                              OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) && //3
> +              !in_subs->is_correlated))                                      //4
> +        {
> +          /* Materialization is not possible based on syntactic properties. */
> +          in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
> +        }
> +
> +        if (!in_subs->in_strategy)
> +        {
> +          /*
> +            If neither materialization is possible, nor the user chose
> +            IN-TO-EXISTS, choose IN-TO-EXISTS as the only universal strategy.
> +          */
> +          in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
> +        }
>        }
psergey: The above seems unneccessarily convoluted. This is not apprent from
the patch, but the resulting code looks like this:

      if (in_subs)
      {
        /* Subquery predicate is an IN/=ANY predicate. */
        if (optimizer_flag(thd, OPTIMIZER_SWITCH_IN_TO_EXISTS))
          in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
        if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION))
          in_subs->in_strategy|= SUBS_MATERIALIZATION;
        
        /* a long comment */
        if (!(long condition))
        {
          /* Materialization is not possible based on syntactic properties. */
          in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
        } 
      ...

Is there any particular reason to first set SUBS_MATERIALIZATION flag, and
then optionally clear it? It seems it would be more straightforward to only 
set it when materialization is really applicable? 

> +
> +      /*
> +        Transform each subquery predicate according to its overloaded
> +        transformer.
> +      */
> +      if (subselect->select_transformer(join))
> +        DBUG_RETURN(-11);
>      }
>    }
>    DBUG_RETURN(0);
...
> @@ -1830,15 +1919,15 @@
>          - sj_inner_fanout*sj_outer_fanout  lookups.
>  
>        */
> -      double one_lookup_cost;
> -      if (sj_outer_fanout*temptable_rec_size > 
> -          join->thd->variables.max_heap_table_size)
> -        one_lookup_cost= DISK_TEMPTABLE_LOOKUP_COST;
> -      else
> -        one_lookup_cost= HEAP_TEMPTABLE_LOOKUP_COST;
> +      double one_lookup_cost= get_tmp_table_lookup_cost(join->thd,
> +                                                        sj_outer_fanout,
> +                                                        temptable_rec_size);
> +      double one_write_cost= get_tmp_table_write_cost(join->thd,
> +                                                      sj_outer_fanout,
> +                                                      temptable_rec_size);
>  
>        double write_cost= join->positions[first_tab].prefix_record_count* 
> -                         sj_outer_fanout * one_lookup_cost;
> +                         sj_outer_fanout * one_write_cost;
>        double full_lookup_cost= join->positions[first_tab].prefix_record_count* 
>                                 sj_outer_fanout* sj_inner_fanout * 
>                                 one_lookup_cost;
> @@ -3360,9 +3449,23 @@
>    JOIN_TAB* join_tab=join->join_tab;
>    SELECT_LEX_UNIT *unit= join->unit;
>    DBUG_ENTER("rewrite_to_index_subquery_engine");
> +
>    /*
>      is this simple IN subquery?
>    */
> +  /* TODO: In order to use these more efficient subquery engines in more cases,
> +     the following problems need to be solved:
> +     - the code that removes GROUP BY (group_list), also adds an ORDER BY
> +       (order), thus GROUP BY queries (almost?) never pass through this branch.
> +       Solution: remove the test below '!join->order', because we remove the
> +       ORDER clase for subqueries anyway.
> +     - in order to set a more efficient engine, the optimizer needs to both
> +       decide to remove GROUP BY, *and* select one of the JT_[EQ_]REF[_OR_NULL]
> +       access methods, *and* loose scan should be more expensive or
> +       inapliccable. When is that possible?
> +     - Consider expanding the applicability of this rewrite for loose scan
> +       for group by queries.
> +  */
>    if (!join->group_list && !join->order &&
>        join->unit->item && 
>        join->unit->item->substype() == Item_subselect::IN_SUBS &&
> @@ -3503,3 +3606,349 @@
>  }
>  
>  
> +/**
> +  Optimize all subqueries of a query that have were flattened into a semijoin.
psergey: a typo on the above line?
> +
> +  @details
> +  Optimize all immediate children subqueries of a query.
> +
> +  This phase must be called after substitute_for_best_equal_field() because
> +  that function may replace items with other items from a multiple equality,
> +  and we need to reference the correct items in the index access method of the
> +  IN predicate.
> +
> +  @return Operation status
> +  @retval FALSE     success.
> +  @retval TRUE      error occurred.
> +*/
> +
> +bool JOIN::optimize_unflattened_subqueries()
> +{
> +  return select_lex->optimize_unflattened_subqueries();
> +}
> +
> +
> +/**
> +  Choose an optimal strategy to execute an IN/ALL/ANY subquery predicate
> +  based on cost.
> +
> +  @param join_tables  the set of tables joined in the subquery
> +
> +  @notes
> +  The method chooses between the materialization and IN=>EXISTS rewrite
> +  strategies for the execution of a non-flattened subquery IN predicate.
> +  The cost-based decision is made as follows:
> +
> +  1. compute materialize_strategy_cost based on the unmodified subquery
> +  2. reoptimize the subquery taking into account the IN-EXISTS predicates
> +  3. compute in_exists_strategy_cost based on the reoptimized plan
> +  4. compare and set the cheaper strategy
> +     if (materialize_strategy_cost >= in_exists_strategy_cost)
> +       in_strategy = MATERIALIZATION
> +     else
> +       in_strategy = IN_TO_EXISTS
> +  5. if in_strategy = MATERIALIZATION and it is not possible to initialize it
> +       revert to IN_TO_EXISTS
> +  6. if (in_strategy == MATERIALIZATION)
> +       revert the subquery plan to the original one before reoptimizing
> +     else
> +       inject the IN=>EXISTS predicates into the new EXISTS subquery plan
> +
> +  The implementation itself is a bit more complicated because it takes into
> +  account two more factors:
> +  - whether the user allowed both strategies through an optimizer_switch, and
> +  - if materialization was the cheaper strategy, whether it can be executed
> +    or not.
> +
> +  @retval FALSE     success.
> +  @retval TRUE      error occurred.
> +*/
> +
> +bool JOIN::choose_subquery_plan(table_map join_tables)
> +{
> +  Query_plan_state save_qep; /* The original QEP of the subquery. */
> +  enum_reopt_result reopt_result= REOPT_NONE;
> +  Item_in_subselect *in_subs;
> +
> +  if (select_lex->master_unit()->item &&
> +      select_lex->master_unit()->item->is_in_predicate())
> +  {
> +    in_subs= (Item_in_subselect*) select_lex->master_unit()->item;
> +    if (in_subs->create_in_to_exists_cond(this))
> +      return true;
> +  }
> +  else
> +    return false;
> +
> +  DBUG_ASSERT(in_subs->in_strategy); /* A strategy must be chosen earlier. */
> +  DBUG_ASSERT(in_to_exists_where || in_to_exists_having);
> +  DBUG_ASSERT(!in_to_exists_where || in_to_exists_where->fixed);
> +  DBUG_ASSERT(!in_to_exists_having || in_to_exists_having->fixed);
> +
> +  /*
> +    Compute and compare the costs of materialization and in-exists if both
> +    strategies are possible and allowed by the user (checked during the prepare
> +    phase.
> +  */
> +  if (in_subs->in_strategy & SUBS_MATERIALIZATION &&
> +      in_subs->in_strategy & SUBS_IN_TO_EXISTS)
psergey: when I write conditions like the one above, Monty tells me to change
it to

  in_subs->in_strategy & (SUBS_MATERIALIZATION |SUBS_IN_TO_EXISTS)

Did he not tell the same to you?

> +  {
> +    JOIN *outer_join;
> +    JOIN *inner_join= this;
> +    /* Number of (partial) rows of the outer JOIN filtered by the IN predicate. */
> +    double outer_record_count;
> +    /* Number of unique value combinations filtered by the IN predicate. */
> +    double outer_lookup_keys;
> +    /* Cost and row count of the unmodified subquery. */
> +    double inner_read_time_1, inner_record_count_1;
> +    /* Cost of the subquery with injected IN-EXISTS predicates. */
> +    double inner_read_time_2;
> +    /* The cost to compute IN via materialization. */
> +    double materialize_strategy_cost;
> +    /* The cost of the IN->EXISTS strategy. */
> +    double in_exists_strategy_cost;
> +    double dummy;
> +
> +    /*
> +      A. Estimate the number of rows of the outer table that will be filtered
> +      by the IN predicate.
> +    */
> +    outer_join= unit->outer_select() ? unit->outer_select()->join : NULL;
> +    if (outer_join)
> +    {
> +      uint outer_partial_plan_len;
> +      /*
> +        Make_cond_for_table is called for predicates only in the WHERE/ON
> +        clauses. In all other cases, predicates are not pushed to any
> +        JOIN_TAB, and their joi_tab_idx remains MAX_TABLES. Such predicates
> +        are evaluated for each complete row of the outer join.
> +      */
> +      outer_partial_plan_len= (in_subs->get_join_tab_idx() == MAX_TABLES) ?
> +                               outer_join->tables :
> +                               in_subs->get_join_tab_idx() + 1;
> +      outer_join->get_partial_join_cost(outer_partial_plan_len, &dummy,
> +                                        &outer_record_count);
> +      if (outer_join->tables > outer_join->const_tables)
> +        outer_lookup_keys= prev_record_reads(outer_join->best_positions,
> +                                             outer_partial_plan_len,
> +                                             in_subs->used_tables());
> +      else
> +      {
> +        /* If all tables are constant, positions is undefined. */
> +        outer_lookup_keys= 1;
> +      }
> +    }
> +    else
> +    {
> +      /*
> +        TODO: outer_join can be NULL for DELETE statements.
> +        How to compute its cost?
> +      */
> +      outer_record_count= 1;
> +      outer_lookup_keys=1;
> +    }
> +    /*
> +      There cannot be more lookup keys than the total number of records.
> +      TODO: this a temporary solution until we find a better way to compute
> +      get_partial_join_cost() and prev_record_reads() in a consitent manner,
> +      where it is guaranteed that (outer_lookup_keys <= outer_record_count).
> +    */
> +    if (outer_lookup_keys > outer_record_count)
> +      outer_lookup_keys= outer_record_count;
> +
> +    /*
> +      B. Estimate the cost and number of records of the subquery both
> +      unmodified, and with injected IN->EXISTS predicates.
> +    */
> +    inner_read_time_1= inner_join->best_read;
> +    inner_record_count_1= inner_join->record_count;
> +
> +    if (in_to_exists_where && const_tables != tables)
> +    {
> +      /*
> +        Re-optimize and cost the subquery taking into account the IN-EXISTS
> +        conditions.
> +      */
> +      reopt_result= reoptimize(in_to_exists_where, join_tables, &save_qep);
> +      if (reopt_result == REOPT_ERROR)
> +        return TRUE;
> +
> +      /* Get the cost of the modified IN-EXISTS plan. */
> +      inner_read_time_2= inner_join->best_read;
> +
> +    }
> +    else
> +    {
> +      /* Reoptimization would not produce any better plan. */
> +      inner_read_time_2= inner_read_time_1;
> +    }
> +
> +    /*
> +      C. Compute execution costs.
> +    */
> +    /* C.1 Compute the cost of the materialization strategy. */
> +    uint rowlen= get_tmp_table_rec_length(unit->first_select()->item_list);
> +    /* The cost of writing one row into the temporary table. */
> +    double write_cost= get_tmp_table_write_cost(thd, inner_record_count_1,
> +                                                rowlen);
> +    /* The cost of a lookup into the unique index of the materialized table. */
> +    double lookup_cost= get_tmp_table_lookup_cost(thd, inner_record_count_1,
> +                                                  rowlen);
> +    /*
> +      The cost of executing the subquery and storing its result in an indexed
> +      temporary table.
> +    */
> +    double materialization_cost= inner_read_time_1 +
> +                                 write_cost * inner_record_count_1;
> +
> +    materialize_strategy_cost= materialization_cost +
> +                               outer_record_count * lookup_cost;
> +
> +    /* C.2 Compute the cost of the IN=>EXISTS strategy. */
> +    in_exists_strategy_cost= outer_lookup_keys * inner_read_time_2;
> +
> +    /* C.3 Compare the costs and choose the cheaper strategy. */
> +    if (materialize_strategy_cost >= in_exists_strategy_cost)
> +      in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
> +    else
> +      in_subs->in_strategy&= ~SUBS_IN_TO_EXISTS;
> +  }
> +
> +  /*
> +    If (1) materialization is a possible strategy based on semantic analysis
> +    during the prepare phase, then if
> +      (2) it is more expensive than the IN->EXISTS transformation, and
> +      (3) it is not possible to create usable indexes for the materialization
> +          strategy,
> +      fall back to IN->EXISTS.
> +    otherwise
> +      use materialization.
> +  */
> +  if (in_subs->in_strategy & SUBS_MATERIALIZATION &&
> +      in_subs->setup_mat_engine())
> +  {
> +    /*
> +      If materialization was the cheaper or the only user-selected strategy,
> +      but it is not possible to execute it due to limitations in the
> +      implementation, fall back to IN-TO-EXISTS.
> +    */
> +    in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
> +    in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
> +  }
> +
> +  if (in_subs->in_strategy & SUBS_MATERIALIZATION)
> +  {
> +    /* Restore the original query plan used for materialization. */
> +    if (reopt_result == REOPT_NEW_PLAN)
> +      restore_query_plan(&save_qep);
> +
> +    /* TODO: should we set/unset this flag for both select_lex and its unit? */
> +    in_subs->unit->uncacheable&= ~UNCACHEABLE_DEPENDENT;
> +    select_lex->uncacheable&= ~UNCACHEABLE_DEPENDENT;
> +
> +    /*
> +      Reset the "LIMIT 1" set in Item_exists_subselect::fix_length_and_dec.
> +      TODO:
> +      Currently we set the subquery LIMIT to infinity, and this is correct
> +      because we forbid at parse time LIMIT inside IN subqueries (see
> +      Item_in_subselect::test_limit). However, once we allow this, here
> +      we should set the correct limit if given in the query.
> +    */
> +    in_subs->unit->global_parameters->select_limit= NULL;
> +    in_subs->unit->set_limit(unit->global_parameters);
> +    /*
> +      Set the limit of this JOIN object as well, because normally its being
> +      set in the beginning of JOIN::optimize, which was already done.
> +    */
> +    select_limit= in_subs->unit->select_limit_cnt;
> +  }
> +  else if (in_subs->in_strategy & SUBS_IN_TO_EXISTS)
> +  {
> +    if (reopt_result == REOPT_NONE && in_to_exists_where &&
> +        const_tables != tables)
> +    {
> +      /*
> +        The subquery was not reoptimized either because the user allowed only the
> +        IN-EXISTS strategy, or because materialization was not possible based on
> +        semantic analysis. Clenup the original plan and reoptimize.
> +      */
> +      for (uint i= 0; i < tables; i++)
> +      {
> +        join_tab[i].keyuse= NULL;
> +        join_tab[i].checked_keys.clear_all();
> +      }
> +      if ((reopt_result= reoptimize(in_to_exists_where, join_tables, NULL)) ==
> +          REOPT_ERROR)
> +        return TRUE;
> +    }
> +
> +    if (in_subs->inject_in_to_exists_cond(this))
> +      return TRUE;
> +  }
> +  else
> +    DBUG_ASSERT(FALSE);
> +
> +  return FALSE;
> +}
> +
> +
> +/**
> +  Choose a query plan for a table-less subquery.
> +
> +  @notes
> +
> +  @retval FALSE     success.
> +  @retval TRUE      error occurred.
> +*/
> +
> +bool JOIN::choose_tableless_subquery_plan()
> +{
> +  DBUG_ASSERT(!tables_list || !tables);
> +  if (select_lex->master_unit()->item)
> +  {
> +    DBUG_ASSERT(select_lex->master_unit()->item->type() ==
> +                Item::SUBSELECT_ITEM);
> +    Item_subselect *subs_predicate= select_lex->master_unit()->item;
> +
> +    /*
> +      If the optimizer determined that his query has an empty result,
> +      in most cases the subquery predicate is a known constant value -
> +      either FALSE or NULL. The implementation of Item_subselect::reset()
> +      determines which one.
> +    */
> +    if (zero_result_cause)
> +    {
> +      if (!implicit_grouping)
> +      {
> +        /*
> +          Both group by queries and non-group by queries without aggregate
> +          functions produce empty subquery result.
> +        */
> +        subs_predicate->reset();
> +        subs_predicate->make_const();
> +        return FALSE;
> +      }
> +
> +      /* TODO:
> +         A further optimization is possible when a non-group query with
> +         MIN/MAX/COUNT is optimized by opt_sum_query. Then, if there are
> +         only MIN/MAX functions over an empty result set, the subquery
> +         result is a NULL value/row, thus the value of subs_predicate is
> +         NULL.
> +      */
> +    }
> +
> +    if (subs_predicate->is_in_predicate())
> +    {
> +      Item_in_subselect *in_subs;
> +      in_subs= (Item_in_subselect*) subs_predicate;
> +      in_subs->in_strategy= SUBS_IN_TO_EXISTS;
> +      if (in_subs->create_in_to_exists_cond(this) ||
> +          in_subs->inject_in_to_exists_cond(this))
> +        return TRUE;
> +      tmp_having= having;
> +    }
> +  }
> +  return FALSE;
> +}
> +
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/sql_lex.cc 5.3-timours-wl/sql/sql_lex.cc
> --- 5.3-timours-wl-clean/sql/sql_lex.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/sql_lex.cc	2011-05-11 20:08:50.000000000 +0100
> @@ -1684,6 +1684,31 @@
>    slave= 0;
>  }
>  
> +
> +void st_select_lex_node::add_slave(st_select_lex_node *slave_arg)
> +{
> +  for (; slave; slave= slave->next)
> +    if (slave == slave_arg)
> +      return;
> +
> +  if (slave)
> +  {
> +    st_select_lex_node *slave_arg_slave= slave_arg->slave;
> +    /* Insert in the front of list of slaves if any. */
> +    slave_arg->include_neighbour(slave);
> +    /* include_neighbour() sets slave_arg->slave=0, restore it. */
> +    slave_arg->slave= slave_arg_slave;
> +    /* Count on include_neighbour() setting the master. */
> +    DBUG_ASSERT(slave_arg->master == this);
> +  }
> +  else
> +  {
> +    slave= slave_arg;
> +    slave_arg->master= this;
> +  }
> +}

psergey: 
1. Suppose slave_arg is already in the list of slaves (based on the function's
code, this seems to be possible) Why does this function adjust this->slave in
this case?

2. I've made the following change:

=== modified file 'sql/sql_lex.cc'
--- sql/sql_lex.cc	2011-05-05 12:24:28 +0000
+++ sql/sql_lex.cc	2011-05-13 21:34:49 +0000
@@ -1693,6 +1693,7 @@ void st_select_lex_node::add_slave(st_se
 
   if (slave)
   {
+    DBUG_ASSERT(0);
     st_select_lex_node *slave_arg_slave= slave_arg->slave;
     /* Insert in the front of list of slaves if any. */
     slave_arg->include_neighbour(slave);


and all t/subselect*.test still passed. I think the code inside if () { ... } 
is deadcode. If yes, please remove it, if not, please add coverage to the
testsuite.


> +
> +
>  /*
>    include on level down (but do not link)
>  
...
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/sql_list.h 5.3-timours-wl/sql/sql_list.h
> --- 5.3-timours-wl-clean/sql/sql_list.h	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/sql_list.h	2011-05-11 20:08:50.000000000 +0100
> @@ -260,7 +260,7 @@
>      list_node *node= first;
>      list_node *list_first= list->first;
>      elements=0;
> -    while (node && node != list_first)
> +    while (node->info && node != list_first)
psergey:
Why the above change? Does this mean that List<T> now can contain (T*)NULL ? 

>      {
>        prev= &node->next;
>        node= node->next;
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/sql_select.cc 5.3-timours-wl/sql/sql_select.cc
> --- 5.3-timours-wl-clean/sql/sql_select.cc	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/sql_select.cc	2011-05-11 20:08:50.000000000 +0100
...
> @@ -882,6 +887,7 @@
>                             "Impossible HAVING" : "Impossible WHERE";
>        tables= 0;
>        error= 0;
> +      choose_tableless_subquery_plan();
>        goto setup_subq_exit;
>      }
psergey: there are three instances of this piece of code:

    if (.. we've figured out this is a degenerate join which produces no rows ...)
    {
      ...
      choose_tableless_subquery_plan();
      goto setup_subq_exit;
    }
  
I'm afraid if somebody invents a fourth way to figure out that the join is
degenerate, they will forget to make call to choose_tableless_subquery_plan(). 
Could you prevent this by introducing 

  empty_join_output:
    choose_tableless_subquery_plan();
    ... whatever else ...
  setup_subq_exit:
   
and using appropriate goto's?  

>    }
> @@ -926,12 +932,13 @@
>      */
>      if ((res=opt_sum_query(select_lex->leaf_tables, all_fields, conds)))
>      {
> -      if (res == HA_ERR_KEY_NOT_FOUND)
> +      if (res == HA_ERR_KEY_NOT_FOUND || res < 0)

psergey: What is this for? I am looking at opt_sum_query code and I don't 
see any cases where it will return a negative value?

>        {
>          DBUG_PRINT("info",("No matching min/max row"));
>  	zero_result_cause= "No matching min/max row";
>          tables= 0;
>  	error=0;
> +        choose_tableless_subquery_plan();
>          goto setup_subq_exit;
>        }
>        if (res > 1)
...
> @@ -9036,11 +9070,16 @@
>        *simple_order=0;				// Must do a temp table to sort
>      else if (!(order_tables & not_const_tables))
>      {
> -      if (order->item[0]->with_subselect && 
> -          !(join->select_lex->options & SELECT_DESCRIBE))
> -        order->item[0]->val_str(&order->item[0]->str_value);
> +      if (order->item[0]->with_subselect)
> +      {
> +        /*
> +          Delay the evaluation of constant ORDER and/or GROUP expressions that
> +          contain subqueries until the execution phase.
> +        */
> +        join->exec_const_order_group_cond.push_back(order->item[0]);
psergey:  I'm wondering why would one want to evaluate them at all in that
case?

If we have

  SELECT ... GROUP BY const_subquery_expression, ...

we don't need to evaluate const_subquery_expression at all, do we?

> +      }
>        DBUG_PRINT("info",("removing: %s", order->item[0]->full_name()));
> -      continue;					// skip const item
> +      continue;
>      }
>      else
>      {
...
> @@ -20316,5 +20363,154 @@
>  
>  
>  /**
> +  Save a query execution plan so that the caller can revert to it if needed,
> +  and reset the current query plan so that it can be reoptimized.
> +
> +  @param save_to  The object into which the current query plan state is saved
> +*/
> +
> +void JOIN::save_query_plan(Query_plan_state *save_to)
> +{
> +  if (keyuse.elements)
> +  {
> +    DYNAMIC_ARRAY tmp_keyuse;
> +    /* Swap the current and the backup keyuse internal arrays. */
> +    tmp_keyuse= keyuse;
> +    keyuse= save_to->keyuse; /* keyuse is reset to an empty array. */
> +    save_to->keyuse= tmp_keyuse;
> +
> +    for (uint i= 0; i < tables; i++)
> +    {
> +      save_to->join_tab_keyuse[i]= join_tab[i].keyuse;
> +      join_tab[i].keyuse= NULL;
> +      save_to->join_tab_checked_keys[i]= join_tab[i].checked_keys;
> +      join_tab[i].checked_keys.clear_all();
> +    }
> +  }
> +  memcpy((uchar*) save_to->best_positions, (uchar*) best_positions,
> +         sizeof(POSITION) * (tables + 1));
> +  memset(best_positions, 0, sizeof(POSITION) * (tables + 1));
> +}
> +
> +
> +/**
> +  Restore a query execution plan previously saved by the caller.
> +
> +  @param The object from which the current query plan state is restored.
> +*/
> +
> +void JOIN::restore_query_plan(Query_plan_state *restore_from)
> +{
> +  if (restore_from->keyuse.elements)
> +  {
> +    DYNAMIC_ARRAY tmp_keyuse;
> +    tmp_keyuse= keyuse;
> +    keyuse= restore_from->keyuse;
> +    restore_from->keyuse= tmp_keyuse;
> +
> +    for (uint i= 0; i < tables; i++)
> +    {
> +      join_tab[i].keyuse= restore_from->join_tab_keyuse[i];
> +      join_tab[i].checked_keys= restore_from->join_tab_checked_keys[i];
> +    }
> +
> +  }
> +  memcpy((uchar*) best_positions, (uchar*) restore_from->best_positions,
> +         sizeof(POSITION) * (tables + 1));
> +}
> +
> +
> +/**
> +  Reoptimize a query plan taking into account an additional conjunct to the
> +  WHERE clause.
> +
> +  @param added_where  An extra conjunct to the WHERE clause to reoptimize with
> +  @param join_tables  The set of tables to reoptimize
> +  @param save_to      If != NULL, save here the state of the current query plan
> +
> +  @notes
> +  Given a query plan that was already optimized taking into account some WHERE
> +  clause 'C', reoptimize this plan with a new WHERE clause 'C AND added_where'.
> +  The reoptimization works as follows:
> +
> +  1. Call update_ref_and_keys *only* for the new conditions 'added_where'
> +     that are about to be injected into the query.
> +  2. Expand if necessary the original KEYUSE array JOIN::keyuse to
> +     accommodate the new REF accesses computed for the 'added_where' condition.
> +  3. Add the new KEYUSEs into JOIN::keyuse.
> +  4. Re-sort and re-filter the JOIN::keyuse array with the newly added
> +     KEYUSE elements. 
psergey: 
I think the above scheme has some flaws.  What if the initial filtering 
has discarded elements that would be useful with the new injected condition?

Consider this example:

create table ten (a int);
insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table one_k (a int);
insert into one_k select A.a + B.a*10 + C.a*100 from ten A, ten B, ten C;
create table t11 (kp1 int, kp2 int, a int, filler char(100), key(kp1, kp2));
insert into t11 select a/100, a, a, 'filler' from one_k;

If I debug this query:

explain select a from one_k where a in (select kp1 from t11 where kp2=10 and a=4) or a+1 < 100000;

I see that the optimizer considers access to t11 via lookup on t11.kp1=...
but not on  (t11.kp1=... AND t11.kp2=..).

Please note that from user point of view this will look like a regression.

> + 
> +  @retval REOPT_NEW_PLAN  there is a new plan.
> +  @retval REOPT_OLD_PLAN  no new improved plan was produced, use the old one.
> +  @retval REOPT_ERROR     an irrecovarable error occured during reoptimization.
> +*/
> +
> +JOIN::enum_reopt_result
> +JOIN::reoptimize(Item *added_where, table_map join_tables,
> +                 Query_plan_state *save_to)
> +{
psergey: 
This function looks very odd. We touch "keyuse" only when 

  restore_from->keyuse.elements!=NULL


What about the case when

  this->keyuse.elements!=NULL && restore_from->keyuse.elements==NULL
  
? 
This looks like a possible scenario to me. Suppose that 
 - original (i.e. Materialization) plan produces no KEYUSE-s
 - IN-to-EXISTS plan does produce KEYUSE

and we're restoring from IN-to-EXISTS to Materialization plan?


> +  DYNAMIC_ARRAY added_keyuse;
> +  SARGABLE_PARAM *sargables= 0; /* Used only as a dummy parameter. */
> +  uint org_keyuse_elements;
> +
> +  /* Re-run the REF optimizer to take into account the new conditions. */
> +  if (update_ref_and_keys(thd, &added_keyuse, join_tab, tables, added_where,
> +                          ~outer_join, select_lex, &sargables))
> +  {
> +    delete_dynamic(&added_keyuse);
> +    return REOPT_ERROR;
> +  }
> +
> +  if (!added_keyuse.elements)
> +  {
> +    delete_dynamic(&added_keyuse);
> +    return REOPT_OLD_PLAN;
> +  }
> +
> +  if (save_to)
> +    save_query_plan(save_to);
> +
> +  if (!keyuse.buffer &&
> +      my_init_dynamic_array(&keyuse, sizeof(KEYUSE), 20, 64))
> +  {
> +    delete_dynamic(&added_keyuse);
> +    return REOPT_ERROR;
> +  }
> +
> +  org_keyuse_elements= save_to ? save_to->keyuse.elements : keyuse.elements;
> +  allocate_dynamic(&keyuse, org_keyuse_elements + added_keyuse.elements);
> +
> +  /* If needed, add the access methods from the original query plan. */
> +  if (save_to)
> +  {
> +    DBUG_ASSERT(!keyuse.elements);
> +    memcpy(keyuse.buffer,
> +           save_to->keyuse.buffer,
> +           (size_t) save_to->keyuse.elements * keyuse.size_of_element);
> +    keyuse.elements= save_to->keyuse.elements;
> +  }
> +
> +  /* Add the new access methods to the keyuse array. */
> +  memcpy(keyuse.buffer + keyuse.elements * keyuse.size_of_element,
> +         added_keyuse.buffer,
> +         (size_t) added_keyuse.elements * added_keyuse.size_of_element);
> +  keyuse.elements+= added_keyuse.elements;
> +  /* added_keyuse contents is copied, and it is no longer needed. */
> +  delete_dynamic(&added_keyuse);
> +
> +  if (sort_and_filter_keyuse(&keyuse))
> +    return REOPT_ERROR;
> +  optimize_keyuse(this, &keyuse);
> +
> +  /* Re-run the join optimizer to compute a new query plan. */
> +  if (choose_plan(this, join_tables))
> +    return REOPT_ERROR;
> +
> +  return REOPT_NEW_PLAN;
> +}
> +
> +
> +/**
>    @} (end of group Query_Optimizer)
>  */
> diff -urN --exclude='.*' 5.3-timours-wl-clean/sql/sql_select.h 5.3-timours-wl/sql/sql_select.h
> --- 5.3-timours-wl-clean/sql/sql_select.h	2011-05-11 20:12:50.000000000 +0100
> +++ 5.3-timours-wl/sql/sql_select.h	2011-05-11 20:08:50.000000000 +0100
> @@ -603,8 +603,53 @@
>  
>  class JOIN :public Sql_alloc
>  {
> +private:
>    JOIN(const JOIN &rhs);                        /**< not implemented */
>    JOIN& operator=(const JOIN &rhs);             /**< not implemented */
> +
> +protected:
> +
> +  /**
> +    The subset of the state of a JOIN that represents an optimized query
> +    execution plan. Allows saving/restoring different plans for the same query.
> +  */
> +  class Query_plan_state {
psergey:  I have a concern about naming. This is really a join plan state, not
a query plan state. We may get into mess if at some point we'll need to save 
plan state for the whole query.
Is it still possible to rename this to something like Join_plan_state?


> +  public:
> +    DYNAMIC_ARRAY keyuse; /* Copy of the JOIN::keyuse array. */
> +    POSITION best_positions[MAX_TABLES+1]; /* Copy of JOIN::best_positions */
> +    /* Copies of the JOIN_TAB::keyuse pointers for each JOIN_TAB. */
> +    KEYUSE *join_tab_keyuse[MAX_TABLES];
> +    /* Copies of JOIN_TAB::checked_keys for each JOIN_TAB. */
> +    key_map join_tab_checked_keys[MAX_TABLES];
> +  public:
> +    Query_plan_state()
> +    {   
> +      keyuse.elements= 0;
> +      keyuse.buffer= NULL;
> +    }
> +    Query_plan_state(JOIN *join);
> +    ~Query_plan_state()
> +    {
> +      delete_dynamic(&keyuse);
> +    }
> +  };
> +
> +  /* Results of reoptimizing a JOIN via JOIN::reoptimize(). */
> +  enum enum_reopt_result {
> +    REOPT_NEW_PLAN, /* there is a new reoptimized plan */
> +    REOPT_OLD_PLAN, /* no new improved plan can be found, use the old one */
> +    REOPT_ERROR,    /* an irrecovarable error occured during reoptimization */
> +    REOPT_NONE      /* not yet reoptimized */
> +  };
> +
> +  /* Support for plan reoptimization with rewritten conditions. */
> +  enum_reopt_result reoptimize(Item *added_where, table_map join_tables,
> +                               Query_plan_state *save_to);
> +  void save_query_plan(Query_plan_state *save_to);
> +  void restore_query_plan(Query_plan_state *restore_from);
> +  /* Choose a subquery plan for a table-less subquery. */
> +  bool choose_tableless_subquery_plan();
> +
>  public:
>    JOIN_TAB *join_tab,**best_ref;
>    JOIN_TAB **map2table;    ///< mapping between table indexes and JOIN_TABs
> @@ -702,6 +747,13 @@
>      account the changes made by test_if_skip_sort_order()).
>    */
>    double   best_read;
> +  /*
> +    Estimated result rows (fanout) of the whole query. If this is a subquery
psergey: I think it would be better to change "whole query" to "join
operation" or "the select" here.

> +    that is reexecuted multiple times, this value includes the estiamted # of
> +    reexecutions. This value is equal to the multiplication of all
> +    join->positions[i].records_read of a JOIN.
> +  */
> +  double   record_count;
>    List<Item> *fields;
>    List<Cached_item> group_fields, group_fields_cache;
>    TABLE    *tmp_table;
..
> @@ -1233,6 +1313,9 @@
>      if (!inited)
>      {
>        inited=1;
psergey: I don't understand why this wasn't needed before MWL#89, but is
needed after MWL#89.  Do you still remember why you've added it?

> +      TABLE *table= to_field->table;
> +      my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
> +                                                       table->write_set);
>        if ((res= item->save_in_field(to_field, 1)))
>        {       
>          if (!err)

BR
 Sergey
-- 
Sergey Petrunia, Software Developer
Monty Program AB, http://askmonty.org
Blog: http://s.petrunia.net/blog


Follow ups