← Back to team overview

maria-developers team mailing list archive

Review of PERCENTILE_CONT && PERCENTILE_DISC

 

Hi Varun,

Here's the review for your patch.

> diff --git a/mysql-test/t/percentile.test b/mysql-test/t/percentile.test
> new file mode 100644
> index 00000000000..0958fc05e7d
> --- /dev/null
> +++ b/mysql-test/t/percentile.test
> @@ -0,0 +1,41 @@
Whenever you are adding a test case, please add the result file too.
It makes things easier to review.
> +CREATE TABLE student (name CHAR(10), test double, score TINYINT);
> +
> +INSERT INTO student VALUES
> +('Chun', 0, null), ('Chun', 0, 4),
> +('Esben', 1, null), ('Esben', 1, null),
> +('Kaolin', 0.5, 56), ('Kaolin', 0.5, 88),
> +('Tatiana', 0.8, 2), ('Tatiana', 0.8, 1);
> +
> +
> +
> +select name, percentile_disc(0.6)  within group(order by score) over
(partition by name) from student;
> +select name, percentile_disc(test)  within group(order by score) over
(partition by name) from student;
> +select name, percentile_disc(0.4)  within group(order by score) over
(partition by name) from student;
> +
> +
> +#select name, percentile_cont(null)  within group(order by score) over
(partition by name) from student;
> +#select name, cume_dist()  over (partition by name order by score) from
student;
> +
> +
> +#normal parsing
> +#select percentile_cont(0.5) within group(order by score)  over w1  from
student
> +#window w1 AS (partition by name);
> +
> +# no partition clause
> +#select percentile_cont(0.5) within group(order by score)  over ()  from
student;
> +
> +
> +# only one sort allowed
> +#select percentile_cont(0.5) within group(order by score) over
(partition by name);
> +
> +#parameter value should be in the range of 0 to 1
> +#select percentile_cont(1.5) within group(order by score) over
(partition by name);
> +
> +
> +#
> +#select rank() over (partition by name order by score  ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) from student;
> +
> +
> +
> +drop table student;
> +
> diff --git a/sql/item.h b/sql/item.h
> index 76ce4aa935f..3df4ec4f358 100644
> --- a/sql/item.h
> +++ b/sql/item.h
> @@ -5215,6 +5217,14 @@ class Cached_item_item : public Cached_item
>      cmp();
>      item= save;
>    }
This is not used in the patch. Is there a need for it any more?
> +  Item* get_item()
> +  {
> +    return item;
> +  }
Add an empty line here here.
> +  void clear()
> +  {
> +    null_value= false;
> +  }
>  };
>
>  class Cached_item_str :public Cached_item_item
> @@ -5236,6 +5250,12 @@ class Cached_item_real :public Cached_item_item
>    Cached_item_real(Item *item_par)
:Cached_item_item(item_par),value(0.0) {}
>    bool cmp(void);
>    int  cmp_read_only();
Space after ). Mark this function as const
> +  double get_value(){ return value;}
I don't quite get what this clear is supposed to do. I'd expect clear to set
the null_value to true, not to false. Same for other functions in the diff.
Add a comment explaining this.
> +  void clear()
> +  {
Space after =
> +    value=0.0;
> +    null_value= false;
> +  }
>  };
>
>  class Cached_item_int :public Cached_item_item
> @@ -5245,6 +5265,12 @@ class Cached_item_int :public Cached_item_item
>    Cached_item_int(Item *item_par) :Cached_item_item(item_par),value(0) {}
>    bool cmp(void);
>    int  cmp_read_only();
Space after ). Mark this function as const.
> +  longlong get_value(){ return value;}
> +  void clear()
> +  {
Space after =
> +    value=0.0;
> +    null_value= false;
> +  }
>  };
>
>
> @@ -5255,6 +5281,12 @@ class Cached_item_decimal :public Cached_item_item
>    Cached_item_decimal(Item *item_par);
>    bool cmp(void);
>    int  cmp_read_only();
Space after ) Mark this function as const.
> +  my_decimal *get_value(){ return &value;};
> +  void clear()
> +  {
> +    null_value= false;
> +    my_decimal_set_zero(&value);
> +  }
>  };
>
>  class Cached_item_field :public Cached_item
> diff --git a/sql/item_windowfunc.cc b/sql/item_windowfunc.cc
> index 6ab903a81bb..b711642c52c 100644
> --- a/sql/item_windowfunc.cc
> +++ b/sql/item_windowfunc.cc
> @@ -93,6 +93,17 @@ Item_window_func::fix_fields(THD *thd, Item **ref)
>      my_error(ER_NO_ORDER_LIST_IN_WINDOW_SPEC, MYF(0),
window_func()->func_name());
>      return true;
>    }
> +
> +  if (only_single_element_order_list())
> +  {
Mark such comments as TODO, like so:
TODO(varunt): <message>
> +    // need to change the error, the error should say that we have more
than one element in the order list
> +    if (window_spec->order_list->elements != 1)
> +    {
> +      my_error(ER_NO_ORDER_LIST_IN_WINDOW_SPEC, MYF(0),
window_func()->func_name());
> +      return true;
> +    }
> +  }
> +
>    /*
>      TODO: why the last parameter is 'ref' in this call? What if
window_func
>      decides to substitute itself for something else and does *ref=.... ?
> @@ -179,6 +190,16 @@ void Item_sum_dense_rank::setup_window_func(THD
*thd, Window_spec *window_spec)
>    clear();
>  }
>
>
I don't see any added value for this extra setup_percentile_func, when it's
only
I guess you can just put the code in directly.
> +void Item_sum_percentile_disc::setup_window_func(THD *thd, Window_spec
*window_spec)
> +{
> +  setup_percentile_func(thd, window_spec->order_list);
> +}
> +
This function does nothing. get_handler_by_cmp_type returns an item and
then you
don't do anything with it.
> +void Item_sum_percentile_disc::set_type_handler(Window_spec *window_spec)
> +{
> +
 type_handler()->get_handler_by_cmp_type(window_spec->order_list->first->item[0]->result_type());
> +}
> +
>  bool Item_sum_dense_rank::add()
>  {
>    if (peer_tracker->check_if_next_group() || first_add)
> diff --git a/sql/item_windowfunc.h b/sql/item_windowfunc.h
> index 77cbd556e60..5179d1832e8 100644
> --- a/sql/item_windowfunc.h
> +++ b/sql/item_windowfunc.h
> @@ -16,10 +17,13 @@ class Group_bound_tracker
>
>    Group_bound_tracker(THD *thd, SQL_I_List<ORDER> *list)
>    {
As discussed on slack, lets keep this Group_bound_tracker as is.
Better yet, add a DBUG_ASSERT(list).
> -    for (ORDER *curr = list->first; curr; curr=curr->next)
> +    if (list)
>      {
> -      Cached_item *tmp= new_Cached_item(thd, curr->item[0], TRUE);
> -      group_fields.push_back(tmp);
> +      for (ORDER *curr = list->first; curr; curr=curr->next)
> +      {
> +        Cached_item *tmp= new_Cached_item(thd, curr->item[0], TRUE);
> +        group_fields.push_back(tmp);
> +      }
>      }
>    }
>
> @@ -71,6 +75,19 @@ class Group_bound_tracker
>      return 0;
>    }
>
Add a comment as to what this function does. I don't quite like how this
function works. cmp has a side effect of overwriting the cached item with
the
compared value. In addition this function stops if we encounter a null value
and the group fields are now partially updated.

I assume this is only for a single element order list. Why not make
something
specialised for that and not use this cached_item construct.
> +  bool compare_with_cache_for_null_values()
> +  {
> +    List_iterator<Cached_item> li(group_fields);
> +    Cached_item *ptr;
> +    while ((ptr= li++))
> +    {
> +      ptr->cmp();
> +      if (ptr->null_value)
> +        return true;
> +    }
> +    return false;
> +  }
> +
>  private:
>    List<Cached_item> group_fields;
>    /*
> @@ -678,6 +704,138 @@ class Item_sum_ntile : public
Item_sum_window_with_row_count
>    ulong current_row_count_;
>  };
>
> +class Item_sum_percentile_disc : public Item_sum_cume_dist,
> +                                 public Type_handler_hybrid_field_type
> +{
> +public:
> +  Item_sum_percentile_disc(THD *thd, Item* arg) :
Item_sum_cume_dist(thd, arg),
> +
Type_handler_hybrid_field_type(&type_handler_longlong),
> +                           value(NULL), result_value(NULL),
val_calculated(FALSE) {}
> +
> +  double val_real()
> +  {
> +    if (get_row_count() == 0 || get_arg(0)->is_null())
> +    {
> +      null_value= true;
> +      return 0;
> +    }
> +    null_value= false;
Look at Item_sum_hybrid_simple. See how Item_cache * works there. Make use
of
that instead of Cached_item_xxx.
> +    return ((Cached_item_real*) result_value)->get_value();
> +  }
> +
> +  longlong val_int()
> +  {
> +    if (get_row_count() == 0 || get_arg(0)->is_null())
> +    {
> +      null_value= true;
> +      return 0;
> +    }
> +    null_value= false;
Same comment as for val_real()
> +    return ((Cached_item_int*) result_value)->get_value();
> +  }
> +
> +  my_decimal* val_decimal(my_decimal* dec)
> +  {
> +    if (get_row_count() == 0 || get_arg(0)->is_null())
> +    {
> +      null_value= true;
> +      return 0;
> +    }
> +    null_value= false;
Same comment as for val_decimal()
> +    return ((Cached_item_decimal*) result_value)->get_value();
> +  }
> +
Lets try implementing this with Item_cache, instead of cached_item.
Cached_item
is designed for comparison, while Item_cache is used for storing something
just
to cache it.
> +  bool add()
> +  {
> +    Item *arg = get_arg(0);
> +    if (arg->is_null())
> +      return true;
> +    /*
> +      need to ensure that the Item arg is constant across the entire
partition
> +      and its value ranges between [0,1]
> +    */
> +    value->cmp();
> +
> +    /* for the null values of the row, we dont count take those rows in
account for calculating
> +    the CUME_DIST */
> +
> +    if(value->null_value)
> +      return false;
> +
> +    Item_sum_cume_dist::add();
> +    double val1= Item_sum_cume_dist::val_real();
> +    /* need to check type and return value accordingly*/
> +    double val2 =arg->val_real_from_decimal();
> +
> +    /* use Cached_item to do the comparision using cmp_read_only() */
> +
> +    if( val1 >= val2 && !val_calculated)
> +    {
> +      val_calculated= true;
> +      result_value->cmp();
> +      return false;
> +    }
> +    return false;
> +  }
> +
> +  enum Sumfunctype sum_func() const
> +  {
> +    return PERCENTILE_DISC_FUNC;
> +  }
> +
> +  void clear()
> +  {
> +    val_calculated= false;
> +    value->clear();
> +    result_value->clear();
> +    Item_sum_cume_dist::clear();
> +  }
> +
> +  const char*func_name() const
> +  {
> +    return "percentile_disc";
> +  }
> +
> +  void update_field() {}
> +  void set_type_handler(Window_spec *window_spec);
> +  const Type_handler *type_handler() const
> +  {return Type_handler_hybrid_field_type::type_handler();}
> +
> +  void fix_length_and_dec()
> +  {
> +    decimals = 10;  // TODO-cvicentiu find out how many decimals the
standard
> +                    // requires.
> +  }
> +
> +  Item *get_copy(THD *thd, MEM_ROOT *mem_root)
> +  { return get_item_copy<Item_sum_percentile_disc>(thd, mem_root, this);
}
> +  void setup_window_func(THD *thd, Window_spec *window_spec);
> +  void setup_percentile_func(THD *thd, SQL_I_List<ORDER> *list)
> +  {
> +    value= new_Cached_item(thd, list->first->item[0], FALSE);
> +    result_value= new_Cached_item(thd, list->first->item[0], FALSE);
> +  }
> +  void cleanup()
> +  {
> +    if (value)
> +    {
> +      delete value;
> +      value= NULL;
> +    }
> +    if(result_value)
> +    {
> +      delete result_value;
> +      result_value= NULL;
> +    }
> +    Item_sum_num::cleanup();
> +  }
> +
> +private:
> +  Cached_item *value;
> +  Cached_item *result_value;
> +  bool val_calculated;
> +};
> +
>
>  class Item_window_func : public Item_func_or_sum
>  {
> diff --git a/sql/sql_window.cc b/sql/sql_window.cc
> index 74e1ad25183..9fda4520a05 100644
> --- a/sql/sql_window.cc
> +++ b/sql/sql_window.cc
> @@ -868,9 +873,39 @@ class Partition_read_cursor : public
Table_read_cursor
>      }
>      return 0;
>    }
This is not something we want in Partition_read_cursor. Move the contents
of it to the most derived class that needs it.
> +  bool next_func(ha_rows *counter)
> +  {
> +    if (next())
> +      return true;
> +    if (!check_for_null_row())
> +    {
> +      (*counter)++;
> +    }
> +    return false;
> +  }
This is also not something we want in Partition_read_cursor.
> +  bool fetch_func(ha_rows *counter)
> +  {
> +    if (fetch())
> +      return true;
> +    if (!check_for_null_row())
> +    {
> +      (*counter)++;
> +    }
> +    return false;
> +  }
Null row here has no meaning. Partition_read_cursor looks at the
bound_tracker
for partitions. What "row" are we talking about here. Then again,
this logic can probably go into our "special" frame cursor below.
> +  bool check_for_null_row()
> +  {
> +    if (!end_of_partition)
> +    {
Check compare_with_cache_for_null_values comment.
> +      if (order_tracker.compare_with_cache_for_null_values())
> +        return true;
> +    }
> +    return false;
> +  }
>
>  private:
>    Group_bound_tracker bound_tracker;
Not something we want in Partition_read_cursor. If you need an order_tracker
define a different class.
> +  Group_bound_tracker order_tracker;
>    bool end_of_partition;
>  };
>
> @@ -1102,7 +1137,7 @@ class Frame_range_n_top : public Frame_cursor
>                      SQL_I_List<ORDER> *partition_list,
>                      SQL_I_List<ORDER> *order_list,
>                      bool is_preceding_arg, Item *n_val_arg) :
So this gets reverted as discussed on slack.
> -    cursor(thd, partition_list), n_val(n_val_arg), item_add(NULL),
> +    cursor(thd, partition_list, NULL), n_val(n_val_arg), item_add(NULL),
>      is_preceding(is_preceding_arg)
>    {
>      DBUG_ASSERT(order_list->elements == 1);
> @@ -1241,7 +1276,7 @@ class Frame_range_n_bottom: public Frame_cursor
>                         SQL_I_List<ORDER> *partition_list,
>                         SQL_I_List<ORDER> *order_list,
>                         bool is_preceding_arg, Item *n_val_arg) :
So this gets reverted as discussed on slack.
> -    cursor(thd, partition_list), n_val(n_val_arg), item_add(NULL),
> +    cursor(thd, partition_list, NULL), n_val(n_val_arg), item_add(NULL),
>      is_preceding(is_preceding_arg), added_values(false)
>    {
>      DBUG_ASSERT(order_list->elements == 1);
> @@ -1371,7 +1406,7 @@ class Frame_range_current_row_bottom: public
Frame_cursor
>    Frame_range_current_row_bottom(THD *thd,
>                                   SQL_I_List<ORDER> *partition_list,
>                                   SQL_I_List<ORDER> *order_list) :
So this gets reverted as discussed on slack.
> -    cursor(thd, partition_list), peer_tracker(thd, order_list)
> +    cursor(thd, partition_list, NULL), peer_tracker(thd, order_list)
>    {
>    }
>
> @@ -1658,6 +1693,35 @@ class Frame_unbounded_following_set_count : public
Frame_unbounded_following
>    }
>  };
>
As discussed on slack, this needs to be renamed to something that makes
sense.
> +class Frame_unbounded_following_set_count_special : public
Frame_unbounded_following_set_count
> +{
> +public:
> +  Frame_unbounded_following_set_count_special(
> +      THD *thd,
> +      SQL_I_List<ORDER> *partition_list, SQL_I_List<ORDER> *order_list) :
> +    Frame_unbounded_following_set_count(thd, partition_list, order_list)
> +    {}
> +
next_partition can have the logic of fetch_func directly in it.
> +  void next_partition(ha_rows rownum)
> +  {
> +    ha_rows num_rows_in_partition= 0;
> +    if (cursor.fetch_func(&num_rows_in_partition))
> +      return;
> +
Dangerous to have a while loop like this. Lets put next_func code here
directly.
> +    /* Walk to the end of the partition, find how many rows there are. */
> +    while (!cursor.next_func(&num_rows_in_partition));
> +
> +    List_iterator_fast<Item_sum> it(sum_functions);
> +    Item_sum* item;
> +    while ((item= it++))
> +    {
> +      Item_sum_window_with_row_count* item_with_row_count =
> +        static_cast<Item_sum_window_with_row_count *>(item);
> +      item_with_row_count->set_row_count(num_rows_in_partition);
> +    }
> +  }
> +};
> +
>
 /////////////////////////////////////////////////////////////////////////////
>  // ROWS-type frame bounds
>
 /////////////////////////////////////////////////////////////////////////////
> diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
> index a0bbf39b138..6c64477f08d 100644
> --- a/sql/sql_yacc.yy
> +++ b/sql/sql_yacc.yy
> @@ -10671,6 +10676,45 @@ simple_window_func:
>            }
>          ;
>
> +inverse_distribution_function:
> +        inverse_distribution_function_def WITHIN GROUP_SYM
> +        '('
> +         { Select->prepare_add_window_spec(thd); }
> +         order_by_single_element_list ')' OVER_SYM
> +        '(' opt_window_ref opt_window_partition_clause ')'
> +         {
> +           LEX *lex= Lex;
> +           if (Select->add_window_spec(thd, lex->win_ref,
> +                                      Select->group_list,
> +                                      Select->order_list,
> +                                      NULL))
> +             MYSQL_YYABORT;
> +           $$= new (thd->mem_root) Item_window_func(thd, (Item_sum *) $1,
> +                                                    thd->lex->win_spec);
> +           if ($$ == NULL)
> +             MYSQL_YYABORT;
> +           if (Select->add_window_func((Item_window_func *) $$))
> +             MYSQL_YYABORT;
> +         }
> +        ;
> +
> +inverse_distribution_function_def:
> +         PERCENTILE_CONT_SYM '(' expr ')'
Indentation for { and } is not correct here. Please check this file and do
the same.
> +       {
mark it as TODO(varun):
> +         //Not yet started implementing
> +       }
> +       | PERCENTILE_DISC_SYM '(' expr ')'
Same, check indentation for { and }
> +       {
> +          $$= new (thd->mem_root) Item_sum_percentile_disc(thd, $3);
> +          if ($$ == NULL)
> +            MYSQL_YYABORT;
> +       }
> +      ;
> +
Indentation check!
> +order_by_single_element_list:
> +    ORDER_SYM BY order_list
> +    ;
> +
>  window_name:
>            ident
>            {