maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #10789
Review of PERCENTILE_CONT && PERCENTILE_DISC
Hi Varun,
Here's the review for your patch.
> diff --git a/mysql-test/t/percentile.test b/mysql-test/t/percentile.test
> new file mode 100644
> index 00000000000..0958fc05e7d
> --- /dev/null
> +++ b/mysql-test/t/percentile.test
> @@ -0,0 +1,41 @@
Whenever you are adding a test case, please add the result file too.
It makes things easier to review.
> +CREATE TABLE student (name CHAR(10), test double, score TINYINT);
> +
> +INSERT INTO student VALUES
> +('Chun', 0, null), ('Chun', 0, 4),
> +('Esben', 1, null), ('Esben', 1, null),
> +('Kaolin', 0.5, 56), ('Kaolin', 0.5, 88),
> +('Tatiana', 0.8, 2), ('Tatiana', 0.8, 1);
> +
> +
> +
> +select name, percentile_disc(0.6) within group(order by score) over
(partition by name) from student;
> +select name, percentile_disc(test) within group(order by score) over
(partition by name) from student;
> +select name, percentile_disc(0.4) within group(order by score) over
(partition by name) from student;
> +
> +
> +#select name, percentile_cont(null) within group(order by score) over
(partition by name) from student;
> +#select name, cume_dist() over (partition by name order by score) from
student;
> +
> +
> +#normal parsing
> +#select percentile_cont(0.5) within group(order by score) over w1 from
student
> +#window w1 AS (partition by name);
> +
> +# no partition clause
> +#select percentile_cont(0.5) within group(order by score) over () from
student;
> +
> +
> +# only one sort allowed
> +#select percentile_cont(0.5) within group(order by score) over
(partition by name);
> +
> +#parameter value should be in the range of 0 to 1
> +#select percentile_cont(1.5) within group(order by score) over
(partition by name);
> +
> +
> +#
> +#select rank() over (partition by name order by score ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) from student;
> +
> +
> +
> +drop table student;
> +
> diff --git a/sql/item.h b/sql/item.h
> index 76ce4aa935f..3df4ec4f358 100644
> --- a/sql/item.h
> +++ b/sql/item.h
> @@ -5215,6 +5217,14 @@ class Cached_item_item : public Cached_item
> cmp();
> item= save;
> }
This is not used in the patch. Is there a need for it any more?
> + Item* get_item()
> + {
> + return item;
> + }
Add an empty line here here.
> + void clear()
> + {
> + null_value= false;
> + }
> };
>
> class Cached_item_str :public Cached_item_item
> @@ -5236,6 +5250,12 @@ class Cached_item_real :public Cached_item_item
> Cached_item_real(Item *item_par)
:Cached_item_item(item_par),value(0.0) {}
> bool cmp(void);
> int cmp_read_only();
Space after ). Mark this function as const
> + double get_value(){ return value;}
I don't quite get what this clear is supposed to do. I'd expect clear to set
the null_value to true, not to false. Same for other functions in the diff.
Add a comment explaining this.
> + void clear()
> + {
Space after =
> + value=0.0;
> + null_value= false;
> + }
> };
>
> class Cached_item_int :public Cached_item_item
> @@ -5245,6 +5265,12 @@ class Cached_item_int :public Cached_item_item
> Cached_item_int(Item *item_par) :Cached_item_item(item_par),value(0) {}
> bool cmp(void);
> int cmp_read_only();
Space after ). Mark this function as const.
> + longlong get_value(){ return value;}
> + void clear()
> + {
Space after =
> + value=0.0;
> + null_value= false;
> + }
> };
>
>
> @@ -5255,6 +5281,12 @@ class Cached_item_decimal :public Cached_item_item
> Cached_item_decimal(Item *item_par);
> bool cmp(void);
> int cmp_read_only();
Space after ) Mark this function as const.
> + my_decimal *get_value(){ return &value;};
> + void clear()
> + {
> + null_value= false;
> + my_decimal_set_zero(&value);
> + }
> };
>
> class Cached_item_field :public Cached_item
> diff --git a/sql/item_windowfunc.cc b/sql/item_windowfunc.cc
> index 6ab903a81bb..b711642c52c 100644
> --- a/sql/item_windowfunc.cc
> +++ b/sql/item_windowfunc.cc
> @@ -93,6 +93,17 @@ Item_window_func::fix_fields(THD *thd, Item **ref)
> my_error(ER_NO_ORDER_LIST_IN_WINDOW_SPEC, MYF(0),
window_func()->func_name());
> return true;
> }
> +
> + if (only_single_element_order_list())
> + {
Mark such comments as TODO, like so:
TODO(varunt): <message>
> + // need to change the error, the error should say that we have more
than one element in the order list
> + if (window_spec->order_list->elements != 1)
> + {
> + my_error(ER_NO_ORDER_LIST_IN_WINDOW_SPEC, MYF(0),
window_func()->func_name());
> + return true;
> + }
> + }
> +
> /*
> TODO: why the last parameter is 'ref' in this call? What if
window_func
> decides to substitute itself for something else and does *ref=.... ?
> @@ -179,6 +190,16 @@ void Item_sum_dense_rank::setup_window_func(THD
*thd, Window_spec *window_spec)
> clear();
> }
>
>
I don't see any added value for this extra setup_percentile_func, when it's
only
I guess you can just put the code in directly.
> +void Item_sum_percentile_disc::setup_window_func(THD *thd, Window_spec
*window_spec)
> +{
> + setup_percentile_func(thd, window_spec->order_list);
> +}
> +
This function does nothing. get_handler_by_cmp_type returns an item and
then you
don't do anything with it.
> +void Item_sum_percentile_disc::set_type_handler(Window_spec *window_spec)
> +{
> +
type_handler()->get_handler_by_cmp_type(window_spec->order_list->first->item[0]->result_type());
> +}
> +
> bool Item_sum_dense_rank::add()
> {
> if (peer_tracker->check_if_next_group() || first_add)
> diff --git a/sql/item_windowfunc.h b/sql/item_windowfunc.h
> index 77cbd556e60..5179d1832e8 100644
> --- a/sql/item_windowfunc.h
> +++ b/sql/item_windowfunc.h
> @@ -16,10 +17,13 @@ class Group_bound_tracker
>
> Group_bound_tracker(THD *thd, SQL_I_List<ORDER> *list)
> {
As discussed on slack, lets keep this Group_bound_tracker as is.
Better yet, add a DBUG_ASSERT(list).
> - for (ORDER *curr = list->first; curr; curr=curr->next)
> + if (list)
> {
> - Cached_item *tmp= new_Cached_item(thd, curr->item[0], TRUE);
> - group_fields.push_back(tmp);
> + for (ORDER *curr = list->first; curr; curr=curr->next)
> + {
> + Cached_item *tmp= new_Cached_item(thd, curr->item[0], TRUE);
> + group_fields.push_back(tmp);
> + }
> }
> }
>
> @@ -71,6 +75,19 @@ class Group_bound_tracker
> return 0;
> }
>
Add a comment as to what this function does. I don't quite like how this
function works. cmp has a side effect of overwriting the cached item with
the
compared value. In addition this function stops if we encounter a null value
and the group fields are now partially updated.
I assume this is only for a single element order list. Why not make
something
specialised for that and not use this cached_item construct.
> + bool compare_with_cache_for_null_values()
> + {
> + List_iterator<Cached_item> li(group_fields);
> + Cached_item *ptr;
> + while ((ptr= li++))
> + {
> + ptr->cmp();
> + if (ptr->null_value)
> + return true;
> + }
> + return false;
> + }
> +
> private:
> List<Cached_item> group_fields;
> /*
> @@ -678,6 +704,138 @@ class Item_sum_ntile : public
Item_sum_window_with_row_count
> ulong current_row_count_;
> };
>
> +class Item_sum_percentile_disc : public Item_sum_cume_dist,
> + public Type_handler_hybrid_field_type
> +{
> +public:
> + Item_sum_percentile_disc(THD *thd, Item* arg) :
Item_sum_cume_dist(thd, arg),
> +
Type_handler_hybrid_field_type(&type_handler_longlong),
> + value(NULL), result_value(NULL),
val_calculated(FALSE) {}
> +
> + double val_real()
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
Look at Item_sum_hybrid_simple. See how Item_cache * works there. Make use
of
that instead of Cached_item_xxx.
> + return ((Cached_item_real*) result_value)->get_value();
> + }
> +
> + longlong val_int()
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
Same comment as for val_real()
> + return ((Cached_item_int*) result_value)->get_value();
> + }
> +
> + my_decimal* val_decimal(my_decimal* dec)
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
Same comment as for val_decimal()
> + return ((Cached_item_decimal*) result_value)->get_value();
> + }
> +
Lets try implementing this with Item_cache, instead of cached_item.
Cached_item
is designed for comparison, while Item_cache is used for storing something
just
to cache it.
> + bool add()
> + {
> + Item *arg = get_arg(0);
> + if (arg->is_null())
> + return true;
> + /*
> + need to ensure that the Item arg is constant across the entire
partition
> + and its value ranges between [0,1]
> + */
> + value->cmp();
> +
> + /* for the null values of the row, we dont count take those rows in
account for calculating
> + the CUME_DIST */
> +
> + if(value->null_value)
> + return false;
> +
> + Item_sum_cume_dist::add();
> + double val1= Item_sum_cume_dist::val_real();
> + /* need to check type and return value accordingly*/
> + double val2 =arg->val_real_from_decimal();
> +
> + /* use Cached_item to do the comparision using cmp_read_only() */
> +
> + if( val1 >= val2 && !val_calculated)
> + {
> + val_calculated= true;
> + result_value->cmp();
> + return false;
> + }
> + return false;
> + }
> +
> + enum Sumfunctype sum_func() const
> + {
> + return PERCENTILE_DISC_FUNC;
> + }
> +
> + void clear()
> + {
> + val_calculated= false;
> + value->clear();
> + result_value->clear();
> + Item_sum_cume_dist::clear();
> + }
> +
> + const char*func_name() const
> + {
> + return "percentile_disc";
> + }
> +
> + void update_field() {}
> + void set_type_handler(Window_spec *window_spec);
> + const Type_handler *type_handler() const
> + {return Type_handler_hybrid_field_type::type_handler();}
> +
> + void fix_length_and_dec()
> + {
> + decimals = 10; // TODO-cvicentiu find out how many decimals the
standard
> + // requires.
> + }
> +
> + Item *get_copy(THD *thd, MEM_ROOT *mem_root)
> + { return get_item_copy<Item_sum_percentile_disc>(thd, mem_root, this);
}
> + void setup_window_func(THD *thd, Window_spec *window_spec);
> + void setup_percentile_func(THD *thd, SQL_I_List<ORDER> *list)
> + {
> + value= new_Cached_item(thd, list->first->item[0], FALSE);
> + result_value= new_Cached_item(thd, list->first->item[0], FALSE);
> + }
> + void cleanup()
> + {
> + if (value)
> + {
> + delete value;
> + value= NULL;
> + }
> + if(result_value)
> + {
> + delete result_value;
> + result_value= NULL;
> + }
> + Item_sum_num::cleanup();
> + }
> +
> +private:
> + Cached_item *value;
> + Cached_item *result_value;
> + bool val_calculated;
> +};
> +
>
> class Item_window_func : public Item_func_or_sum
> {
> diff --git a/sql/sql_window.cc b/sql/sql_window.cc
> index 74e1ad25183..9fda4520a05 100644
> --- a/sql/sql_window.cc
> +++ b/sql/sql_window.cc
> @@ -868,9 +873,39 @@ class Partition_read_cursor : public
Table_read_cursor
> }
> return 0;
> }
This is not something we want in Partition_read_cursor. Move the contents
of it to the most derived class that needs it.
> + bool next_func(ha_rows *counter)
> + {
> + if (next())
> + return true;
> + if (!check_for_null_row())
> + {
> + (*counter)++;
> + }
> + return false;
> + }
This is also not something we want in Partition_read_cursor.
> + bool fetch_func(ha_rows *counter)
> + {
> + if (fetch())
> + return true;
> + if (!check_for_null_row())
> + {
> + (*counter)++;
> + }
> + return false;
> + }
Null row here has no meaning. Partition_read_cursor looks at the
bound_tracker
for partitions. What "row" are we talking about here. Then again,
this logic can probably go into our "special" frame cursor below.
> + bool check_for_null_row()
> + {
> + if (!end_of_partition)
> + {
Check compare_with_cache_for_null_values comment.
> + if (order_tracker.compare_with_cache_for_null_values())
> + return true;
> + }
> + return false;
> + }
>
> private:
> Group_bound_tracker bound_tracker;
Not something we want in Partition_read_cursor. If you need an order_tracker
define a different class.
> + Group_bound_tracker order_tracker;
> bool end_of_partition;
> };
>
> @@ -1102,7 +1137,7 @@ class Frame_range_n_top : public Frame_cursor
> SQL_I_List<ORDER> *partition_list,
> SQL_I_List<ORDER> *order_list,
> bool is_preceding_arg, Item *n_val_arg) :
So this gets reverted as discussed on slack.
> - cursor(thd, partition_list), n_val(n_val_arg), item_add(NULL),
> + cursor(thd, partition_list, NULL), n_val(n_val_arg), item_add(NULL),
> is_preceding(is_preceding_arg)
> {
> DBUG_ASSERT(order_list->elements == 1);
> @@ -1241,7 +1276,7 @@ class Frame_range_n_bottom: public Frame_cursor
> SQL_I_List<ORDER> *partition_list,
> SQL_I_List<ORDER> *order_list,
> bool is_preceding_arg, Item *n_val_arg) :
So this gets reverted as discussed on slack.
> - cursor(thd, partition_list), n_val(n_val_arg), item_add(NULL),
> + cursor(thd, partition_list, NULL), n_val(n_val_arg), item_add(NULL),
> is_preceding(is_preceding_arg), added_values(false)
> {
> DBUG_ASSERT(order_list->elements == 1);
> @@ -1371,7 +1406,7 @@ class Frame_range_current_row_bottom: public
Frame_cursor
> Frame_range_current_row_bottom(THD *thd,
> SQL_I_List<ORDER> *partition_list,
> SQL_I_List<ORDER> *order_list) :
So this gets reverted as discussed on slack.
> - cursor(thd, partition_list), peer_tracker(thd, order_list)
> + cursor(thd, partition_list, NULL), peer_tracker(thd, order_list)
> {
> }
>
> @@ -1658,6 +1693,35 @@ class Frame_unbounded_following_set_count : public
Frame_unbounded_following
> }
> };
>
As discussed on slack, this needs to be renamed to something that makes
sense.
> +class Frame_unbounded_following_set_count_special : public
Frame_unbounded_following_set_count
> +{
> +public:
> + Frame_unbounded_following_set_count_special(
> + THD *thd,
> + SQL_I_List<ORDER> *partition_list, SQL_I_List<ORDER> *order_list) :
> + Frame_unbounded_following_set_count(thd, partition_list, order_list)
> + {}
> +
next_partition can have the logic of fetch_func directly in it.
> + void next_partition(ha_rows rownum)
> + {
> + ha_rows num_rows_in_partition= 0;
> + if (cursor.fetch_func(&num_rows_in_partition))
> + return;
> +
Dangerous to have a while loop like this. Lets put next_func code here
directly.
> + /* Walk to the end of the partition, find how many rows there are. */
> + while (!cursor.next_func(&num_rows_in_partition));
> +
> + List_iterator_fast<Item_sum> it(sum_functions);
> + Item_sum* item;
> + while ((item= it++))
> + {
> + Item_sum_window_with_row_count* item_with_row_count =
> + static_cast<Item_sum_window_with_row_count *>(item);
> + item_with_row_count->set_row_count(num_rows_in_partition);
> + }
> + }
> +};
> +
>
/////////////////////////////////////////////////////////////////////////////
> // ROWS-type frame bounds
>
/////////////////////////////////////////////////////////////////////////////
> diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
> index a0bbf39b138..6c64477f08d 100644
> --- a/sql/sql_yacc.yy
> +++ b/sql/sql_yacc.yy
> @@ -10671,6 +10676,45 @@ simple_window_func:
> }
> ;
>
> +inverse_distribution_function:
> + inverse_distribution_function_def WITHIN GROUP_SYM
> + '('
> + { Select->prepare_add_window_spec(thd); }
> + order_by_single_element_list ')' OVER_SYM
> + '(' opt_window_ref opt_window_partition_clause ')'
> + {
> + LEX *lex= Lex;
> + if (Select->add_window_spec(thd, lex->win_ref,
> + Select->group_list,
> + Select->order_list,
> + NULL))
> + MYSQL_YYABORT;
> + $$= new (thd->mem_root) Item_window_func(thd, (Item_sum *) $1,
> + thd->lex->win_spec);
> + if ($$ == NULL)
> + MYSQL_YYABORT;
> + if (Select->add_window_func((Item_window_func *) $$))
> + MYSQL_YYABORT;
> + }
> + ;
> +
> +inverse_distribution_function_def:
> + PERCENTILE_CONT_SYM '(' expr ')'
Indentation for { and } is not correct here. Please check this file and do
the same.
> + {
mark it as TODO(varun):
> + //Not yet started implementing
> + }
> + | PERCENTILE_DISC_SYM '(' expr ')'
Same, check indentation for { and }
> + {
> + $$= new (thd->mem_root) Item_sum_percentile_disc(thd, $3);
> + if ($$ == NULL)
> + MYSQL_YYABORT;
> + }
> + ;
> +
Indentation check!
> +order_by_single_element_list:
> + ORDER_SYM BY order_list
> + ;
> +
> window_name:
> ident
> {