← Back to team overview

maria-developers team mailing list archive

Review: MDEV-12985: Support percentile and media window functions

 

Hi Varun!


Here is the in-detail review for the percentile functions. Please add a
grammar definition for median too and test it. All in all, not many
changes, but please address my comments. If you disagree with some of them,
let's discuss them.

> diff --git a/mysql-test/r/win_percentile_cont.result
b/mysql-test/r/win_percentile_cont.result
> new file mode 100644
> index 00000000000..61f70892887
> --- /dev/null
> +++ b/mysql-test/r/win_percentile_cont.result
> diff --git a/mysql-test/t/win_percentile_cont.test
b/mysql-test/t/win_percentile_cont.test
> new file mode 100644
> index 00000000000..75fde963b2a
> --- /dev/null
> +++ b/mysql-test/t/win_percentile_cont.test
> @@ -0,0 +1,55 @@
Please put more tests here, as we discussed.
Also, to make test cases easier to follow, please add cume_dist as a
function
in the select to all these use cases.
> +CREATE TABLE student (name CHAR(10), test double, score DECIMAL(19,4));
> +INSERT INTO student VALUES
> +('Chun', 0, 3), ('Chun', 0, 7),
> +('Kaolin', 0.5, 3), ('Kaolin', 0.6, 7),
> +('Kaolin', 0.5, 4),
> +('Tatiana', 0.8, 4), ('Tata', 0.8, 4);
> +
Make this --echo # No partition clause
> +#no partition clause
> +select name, percentile_disc(0.5) within group(order by score) over ()
from student;
> +select name, percentile_cont(0.5) within group(order by score) over ()
from student;
> +
Same as before, use --echo # so we can see it in the result set too.
> +# argument set to null
> +select name, percentile_cont(null) within group(order by score) over
(partition by name) from student;
> +select name, percentile_disc(null) within group(order by score) over
(partition by name) from student;
> +
Use --echo #
> +# complete query with partition column
> +select name, percentile_cont(0.5) within group(order by score) over
(partition by name) as c from student;
> +select name, percentile_disc(0.5) within group(order by score) over
(partition by name) as c from student;
> +
Use --echo #
> +#subqueries having percentile functions
> +
> +select * from ( select name , percentile_cont(0.5) within group ( order
by score) over (partition by name ) from student ) as t;
> +select * from ( select name , percentile_disc(0.5) within group ( order
by score) over (partition by name ) from student ) as t;
> +select name from student a where (select percentile_disc(0.5) within
group (order by score) over (partition by name) from student b limit 1) >=
0.5;
> +
USe --echo
> +# WITH STORED PROCEDURES
> +
> +
USe --echo
> +#DISALLOWED FIELDS IN ORDER BY CLAUSE
> +--error ER_WRONG_TYPE_FOR_PERCENTILE_CONT
> +select score, percentile_cont(0.5) within group(order by name) over
(partition by score) from student;
> +select score, percentile_disc(0.5) within group(order by name) over
(partition by score) from student;
>
USe --echo
> +
> +# error with 2 order by elements
> +
> +--error ER_NOT_SINGLE_ELEMENT_ORDER_LIST
> +select percentile_disc(0.5) within group(order by score,test) over
(partition by name) from student;
> +--error ER_NOT_SINGLE_ELEMENT_ORDER_LIST
> +select percentile_cont(0.5) within group(order by score,test) over
(partition by name) from student;
> +
Use --echo and fix that comment. We thought of a way for this right?
> +#parameter value should be in the range of 0 to 1( NEED TO THINK A WAY
FOR THIS)
> +--error ER_ARGUMENT_OUT_OF_RANGE
> +select percentile_disc(1.5) within group(order by score) over (partition
by name) from student;
> +--error ER_ARGUMENT_OUT_OF_RANGE
> +select percentile_cont(1.5) within group(order by score) over (partition
by name) from student;
> +
> +--error ER_ARGUMENT_NOT_CONSTANT
> +select name,percentile_cont(test) within group(order by score) over
(partition by name) from student;
> +--error ER_ARGUMENT_NOT_CONSTANT
> +select name, percentile_disc(test) within group(order by score) over
(partition by name) from student;
> +
This is commented out. Not good, fix it please. And use --echo #
> +#CHECK TYPE OF THE ARGUMENT, SHOULD BE ONLY NUMERICAL
> +#select name, percentile_cont(name) within group(order by score) over
(partition by name) from student;
> +
>
> +drop table student;
> diff --git a/sql/item.h b/sql/item.h
> index 76ce4aa935f..108b0838dfc 100644
> --- a/sql/item.h
> +++ b/sql/item.h
> @@ -5245,6 +5246,7 @@ class Cached_item_int :public Cached_item_item
> Cached_item_int(Item *item_par) :Cached_item_item(item_par),value(0) {}
> bool cmp(void);
> int cmp_read_only();
Delete this, it's not used.
> + longlong get_value(){ return value;}
> };
>
>
> @@ -5255,6 +5257,7 @@ class Cached_item_decimal :public Cached_item_item
> Cached_item_decimal(Item *item_par);
> bool cmp(void);
> int cmp_read_only();
Delete this, it's not used.
> + my_decimal *get_value(){ return &value;};
> };
>
> class Cached_item_field :public Cached_item
> diff --git a/sql/item_windowfunc.cc b/sql/item_windowfunc.cc
> index 6ab903a81bb..916f4113682 100644
> --- a/sql/item_windowfunc.cc
> +++ b/sql/item_windowfunc.cc
> diff --git a/sql/item_windowfunc.h b/sql/item_windowfunc.h
> index 77cbd556e60..ca2ed7136ec 100644
> --- a/sql/item_windowfunc.h
> +++ b/sql/item_windowfunc.h
> @@ -678,6 +687,275 @@ class Item_sum_ntile : public
Item_sum_window_with_row_count
> ulong current_row_count_;
> };
>
> +class Item_sum_percentile_disc : public Item_sum_cume_dist,
> + public Type_handler_hybrid_field_type
> +{
> +public:
> + Item_sum_percentile_disc(THD *thd, Item* arg) : Item_sum_cume_dist(thd,
arg),
> + Type_handler_hybrid_field_type(&type_handler_longlong),
> + value(NULL), val_calculated(FALSE), first_call(TRUE),
> + prev_value(0), order_item(NULL){}
> +
> + double val_real()
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
> + return value->val_real();
> + }
> +
> + longlong val_int()
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
> + return value->val_int();
> + }
> +
> + my_decimal* val_decimal(my_decimal* dec)
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
> + return value->val_decimal(dec);
> + }
> +
> + String* val_str(String *str)
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
> + return value->val_str(str);
> + }
> +
> + bool add()
> + {
An extra space here before =
> + Item *arg = get_arg(0);
> + if (arg->is_null())
> + return false;
> +
> + if (first_call)
> + {
> + prev_value= arg->val_real();
We are missing a space after >.
> + if (prev_value >1 || prev_value < 0)
> + {
> + my_error(ER_ARGUMENT_OUT_OF_RANGE, MYF(0));
> + return true;
> + }
> + first_call= false;
> + }
> +
> + double arg_val= arg->val_real();
> +
There is one extra space here.
> + if (prev_value != arg_val)
> + {
> + my_error(ER_ARGUMENT_NOT_CONSTANT, MYF(0));
> + return true;
> + }
> +
> + if (val_calculated)
> + return false;
> +
> + value->store(order_item);
> + value->cache_value();
> + if (value->null_value)
> + return false;
> +
> + Item_sum_cume_dist::add();
> + double val= Item_sum_cume_dist::val_real();
> +
> + if (val >= prev_value && !val_calculated)
> + val_calculated= true;
> + return false;
> + }
> +
> + enum Sumfunctype sum_func() const
> + {
> + return PERCENTILE_DISC_FUNC;
> + }
> +
> + void clear()
> + {
> + val_calculated= false;
> + first_call= true;
> + value->clear();
> + Item_sum_cume_dist::clear();
> + }
> +
> + const char*func_name() const
> + {
> + return "percentile_disc";
> + }
> +
> + void update_field() {}
> + void set_type_handler(Window_spec *window_spec);
> + const Type_handler *type_handler() const
> + {return Type_handler_hybrid_field_type::type_handler();}
> +
> + void fix_length_and_dec()
> + {
> + decimals = 10; // TODO-cvicentiu find out how many decimals the standard
> + // requires.
> + }
> +
> + Item *get_copy(THD *thd, MEM_ROOT *mem_root)
> + { return get_item_copy<Item_sum_percentile_disc>(thd, mem_root, this); }
> + void setup_window_func(THD *thd, Window_spec *window_spec);
> + void setup_hybrid(THD *thd, Item *item);
> +
> +private:
> + Item_cache *value;
> + bool val_calculated;
> + bool first_call;
> + double prev_value;
> + Item *order_item;
> +};
> +
> +class Item_sum_percentile_cont : public Item_sum_cume_dist,
> + public Type_handler_hybrid_field_type
> +{
> +public:
> + Item_sum_percentile_cont(THD *thd, Item* arg) : Item_sum_cume_dist(thd,
arg),
> + Type_handler_hybrid_field_type(&type_handler_double),
> + floor_value(NULL), ceil_value(NULL), first_call(TRUE),prev_value(0),
> + ceil_val_calculated(FALSE), floor_val_calculated(FALSE),
order_item(NULL){}
> +
> + double val_real()
> + {
> + if (get_row_count() == 0 || get_arg(0)->is_null())
> + {
> + null_value= true;
> + return 0;
> + }
> + null_value= false;
> + double val= 1 + prev_value * (get_row_count()-1);
> +
> + /*
> + Applying the formula to get the value
> + If (CRN = FRN = RN) then the result is (value of expression from row at
RN)
> + Otherwise the result is
> + (CRN - RN) * (value of expression for row at FRN) +
> + (RN - FRN) * (value of expression for row at CRN)
> + */
> +
> + if(ceil(val) == floor(val))
> + return floor_value->val_real();
> +
> + double ret_val= ((val - floor(val)) * ceil_value->val_real()) +
> + ((ceil(val) - val) * floor_value->val_real());
> +
> + return ret_val;
> + }
> +
> + bool add()
> + {
> + Item *arg = get_arg(0);
> + if (arg->is_null())
> + return false;
> +
> + if (first_call)
> + {
> + first_call= false;
> + prev_value= arg->val_real();
> + if (prev_value >1 || prev_value < 0)
> + {
> + my_error(ER_ARGUMENT_OUT_OF_RANGE, MYF(0));
> + return true;
> + }
> + }
> +
> + double arg_val= arg->val_real();
> + if (prev_value != arg_val)
> + {
> + my_error(ER_ARGUMENT_NOT_CONSTANT, MYF(0));
> + return true;
> + }
> +
> + if (!floor_val_calculated)
> + {
> + floor_value->store(order_item);
> + floor_value->cache_value();
> + if (floor_value->null_value)
> + return false;
> + }
> + if (floor_val_calculated && !ceil_val_calculated)
> + {
> + ceil_value->store(order_item);
> + ceil_value->cache_value();
> + if (ceil_value->null_value)
> + return false;
> + }
> +
> + Item_sum_cume_dist::add();
> + double val= 1 + prev_value * (get_row_count()-1);
> +
> + if (!floor_val_calculated && get_row_number() == floor(val))
> + floor_val_calculated= true;
> +
> + if (!ceil_val_calculated && get_row_number() == ceil(val))
> + ceil_val_calculated= true;
> + return false;
> + }
> +
> + enum Sumfunctype sum_func() const
> + {
> + return PERCENTILE_CONT_FUNC;
> + }
> +
> + void clear()
> + {
> + first_call= true;
> + floor_value->clear();
> + ceil_value->clear();
> + floor_val_calculated= false;
> + ceil_val_calculated= false;
> + Item_sum_cume_dist::clear();
> + }
> +
> + const char*func_name() const
> + {
> + return "percentile_cont";
> + }
> + void update_field() {}
> + void set_type_handler(Window_spec *window_spec);
> + const Type_handler *type_handler() const
> + {return Type_handler_hybrid_field_type::type_handler();}
> +
> + void fix_length_and_dec()
> + {
> + decimals = 10; // TODO-cvicentiu find out how many decimals the standard
> + // requires.
> + }
> +
> + Item *get_copy(THD *thd, MEM_ROOT *mem_root)
> + { return get_item_copy<Item_sum_percentile_cont>(thd, mem_root, this); }
> + void setup_window_func(THD *thd, Window_spec *window_spec);
> + void setup_hybrid(THD *thd, Item *item);
> +
> +private:
> + Item_cache *floor_value;
> + Item_cache *ceil_value;
> + bool first_call;
> + double prev_value;
> + bool ceil_val_calculated;
> + bool floor_val_calculated;
> + Item *order_item;
> +};
> +
> +
> +
>
> class Item_window_func : public Item_func_or_sum
> {
> @@ -781,12 +1063,40 @@ class Item_window_func : public Item_func_or_sum
> case Item_sum::DENSE_RANK_FUNC:
> case Item_sum::PERCENT_RANK_FUNC:
> case Item_sum::CUME_DIST_FUNC:
> + case Item_sum::PERCENTILE_CONT_FUNC:
> + case Item_sum::PERCENTILE_DISC_FUNC:
> return true;
> default:
> return false;
> }
> }
Let's make the grammar enforce this. This is silly. Is there a big
limitation
why we can't define a special rule for these functions?
There already is order_by_single_element_list in sql_yacc.yy, it just
isn't defined properly. We can get rid of these checks altogether then.
> + bool only_single_element_order_list() const
> + {
> + switch (window_func()->sum_func()){
> + case Item_sum::PERCENTILE_CONT_FUNC:
> + case Item_sum::PERCENTILE_DISC_FUNC:
> + return true;
> + default:
> + return false;
> + }
> + }
> +
This is not needed. Put this in Item_sum_percentile_disc's fix_fields.
> + void setting_handler_for_percentile_functions(Item_result rtype) const
> + {
> + switch (window_func()->sum_func()){
> + case Item_sum::PERCENTILE_DISC_FUNC:
> + ((Item_sum_percentile_disc* )
window_func())->set_handler_by_cmp_type(rtype);
> + break;
> + default:
> + return;
> + }
> + }
> +
Get rid of this function here. Put the logic of checking this result type in
fix_fields specific to Item_sum_percentile_cont / disc.
> + bool check_result_type_of_order_item();
> +
> +
> +
> /*
> Computation functions.
> TODO: consoder merging these with class Group_bound_tracker.
> diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
> index 1f282e6aee5..d8d6975d92c 100644
> --- a/sql/share/errmsg-utf8.txt
> +++ b/sql/share/errmsg-utf8.txt
> @@ -7490,3 +7490,11 @@ ER_WRONG_INSERT_INTO_SEQUENCE
> eng "Wrong INSERT into a SEQUENCE. One can only do single table INSERT
into a squence object (like with mysqldump). If you want to change the
SEQUENCE, use ALTER SEQUENCE instead."
> ER_SP_STACK_TRACE
> eng "At line %u in %s"
This one will go away when we enforce the limitation in the grammar, make
sure
to remove it.
> +ER_NOT_SINGLE_ELEMENT_ORDER_LIST
> + eng "Incorrect number of elements in the order list for '%s'"
> +ER_WRONG_TYPE_FOR_PERCENTILE_CONT
> + eng "Numeric datatype is required for Percentile_CONT function"
As discussed have these two errors print the function name to which the
error is for, just like
ER_NOT_SINGLE_ELEMENT_ORDER_LIST uses now.
> +ER_ARGUMENT_NOT_CONSTANT
> + eng "Argument to the percentile functions is not a constant"
> +ER_ARGUMENT_OUT_OF_RANGE
> + eng "Argument to the percentile functions does not belong to the range
[0,1]"
> diff --git a/sql/sql_window.cc b/sql/sql_window.cc
> index 74e1ad25183..127d6d73ee6 100644
> --- a/sql/sql_window.cc
> +++ b/sql/sql_window.cc
> @@ -304,6 +304,12 @@ setup_windows(THD *thd, Ref_ptr_array
ref_pointer_array, TABLE_LIST *tables,
> win_func_item->update_used_tables();
> }
>
2 problems with this code.
1. We can probably move this logic in
Item_sum_percentile_[cont|disc]::fix_fields.
See if that works.
2. There already is an identical while loop before this. USe that one,
don't have
an extra loop here for no reason.
> + li.rewind();
> + while ((win_func_item= li++))
> + {
> + if (win_func_item->check_result_type_of_order_item())
> + DBUG_RETURN(1);
> + }
> DBUG_RETURN(0);
> }
>
> @@ -1658,6 +1666,49 @@ class Frame_unbounded_following_set_count : public
Frame_unbounded_following
> }
> };
>
This cursor counts all rows in the partition that don't have the order_item
set to null.
Let's rewrite this like so:
Extract this logic:
"""
List_iterator_fast<Item_sum> it(sum_functions);
Item_sum* item;
while ((item= it++))
{
Item_sum_window_with_row_count* item_with_row_count =
static_cast<Item_sum_window_with_row_count *>(item);
item_with_row_count->set_row_count(num_rows_in_partition);
}
"""
1. Nake it a protected method for Frame_unbounded_following_set_count. Lets
call it
set_win_funcs_row_count.
2. Rewrite Frame_unbounded_following_set_count to use that function.
3. Make a subclass Frame_unbounded_following_set_count_no_nulls
4. Make use of the protected method in the
Frame_unbounded_following_set_count_no_nulls
>
> +class Frame_unbounded_following_set_count_special: public
Frame_unbounded_following_set_count
> +{
> +
> +public:
> + Frame_unbounded_following_set_count_special(THD *thd,
> + SQL_I_List<ORDER> *partition_list,
> + SQL_I_List<ORDER> *order_list, Item* arg) :
> + Frame_unbounded_following_set_count(thd,partition_list, order_list)
> + {
> + order_item= order_list->first->item[0];
> + }
> + void next_partition(ha_rows rownum)
> + {
> + ha_rows num_rows_in_partition= 0;
> + if (cursor.fetch())
> + return;
> +
> + /* Walk to the end of the partition, find how many rows there are. */
> + do
> + {
> + if (!order_item->is_null())
> + num_rows_in_partition++;
You are missing a space after }
> + }while (!cursor.next());
> +
> + List_iterator_fast<Item_sum> it(sum_functions);
> + Item_sum* item;
> + while ((item= it++))
> + {
> + Item_sum_window_with_row_count* item_with_row_count =
> + static_cast<Item_sum_window_with_row_count *>(item);
> + item_with_row_count->set_row_count(num_rows_in_partition);
> + }
> + }
> +
> + ha_rows get_curr_rownum() const
> + {
> + return cursor.get_rownum();
> + }
> +
> +private:
> + Item* order_item;
> +};
> +
>
/////////////////////////////////////////////////////////////////////////////
> // ROWS-type frame bounds
>
/////////////////////////////////////////////////////////////////////////////
> diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
> index a0bbf39b138..b3b34fce3a4 100644
> --- a/sql/sql_yacc.yy
> +++ b/sql/sql_yacc.yy
> @@ -10210,6 +10215,11 @@ geometry_function:
> Geometry::wkb_polygon,
> Geometry::wkb_linestring));
> }
I think this is something that should not be here right?
What is this?
> + | WITHIN '(' expr ',' expr ')'
> + {
> + $$= GEOM_NEW(thd, Item_func_spatial_precise_rel(thd, $3, $5,
> + Item_func::SP_WITHIN_FUNC));
> + }
> ;
>
> /*
> @@ -10671,6 +10681,47 @@ simple_window_func:
> }
> ;
>
> +inverse_distribution_function:
> + inverse_distribution_function_def WITHIN GROUP_SYM
> + '('
> + { Select->prepare_add_window_spec(thd); }
> + order_by_single_element_list ')' OVER_SYM
> + '(' opt_window_ref opt_window_partition_clause ')'
> + {
> + LEX *lex= Lex;
> + if (Select->add_window_spec(thd, lex->win_ref,
> + Select->group_list,
> + Select->order_list,
> + NULL))
> + MYSQL_YYABORT;
> + $$= new (thd->mem_root) Item_window_func(thd, (Item_sum *) $1,
> + thd->lex->win_spec);
> + if ($$ == NULL)
> + MYSQL_YYABORT;
> + if (Select->add_window_func((Item_window_func *) $$))
> + MYSQL_YYABORT;
> + }
> + ;
> +
> +inverse_distribution_function_def:
> + PERCENTILE_CONT_SYM '(' expr ')'
> + {
> + $$= new (thd->mem_root) Item_sum_percentile_cont(thd, $3);
> + if ($$ == NULL)
> + MYSQL_YYABORT;
> + }
> + | PERCENTILE_DISC_SYM '(' expr ')'
> + {
> + $$= new (thd->mem_root) Item_sum_percentile_disc(thd, $3);
> + if ($$ == NULL)
> + MYSQL_YYABORT;
> + }
> + ;
> +
>
See if you can make this accept only one order element.
> +order_by_single_element_list:
> + ORDER_SYM BY order_list
> + ;
> +
> window_name:
> ident
> {