maria-developers team mailing list archive

Thread
Date
Fwd: mixing of user-defined data types with other data types

To: Alexander Barkov <bar@xxxxxxxxxxx>, maria-developers <maria-developers@xxxxxxxxxxxxxxxxxxx>
From: Vicențiu Ciorbaru <vicentiu@xxxxxxxxxxx>
Date: Mon, 12 Dec 2016 21:57:52 +0200
In-reply-to: <5841EA8B.3030902@mariadb.org>
Hi Alexander!

Comments inline. Most are stylistic comments. I think the patch is
great. The Item_type_holder is really bugging me though. It feels
poorly designed in the current place.

> diff --git a/sql/field.cc b/sql/field.cc
> index 6f273e6..7876a6f 100644
> --- a/sql/field.cc
> +++ b/sql/field.cc
> @@ -84,9 +84,13 @@ const char field_separator=',';
>  */
>  #define FIELDTYPE_TEAR_FROM (MYSQL_TYPE_BIT + 1)
>  #define FIELDTYPE_TEAR_TO   (MYSQL_TYPE_NEWDECIMAL - 1)
> -#define FIELDTYPE_NUM (FIELDTYPE_TEAR_FROM + (255 - FIELDTYPE_TEAR_TO))
> +#define FIELDTYPE_LAST      254
> +#define FIELDTYPE_NUM (FIELDTYPE_TEAR_FROM + (FIELDTYPE_LAST - FIELDTYPE_TEAR_TO))
>
These defines here look sketchy. I know it's not from your patch but can we not
turn them to const ints instead?
Also, the naming is horibly inconsistent, one uses 2 and others use to.
I vote for _to_. We can have the full function definition as:

const int FIELDTYPE_TEAR_FROM= MYSQL_TYPE_BIT + 1;
const int FIELDTYPE_TEAR_TO=   MYSQL_TYPE_NEWDECIMAL - 1;
const int FIELDTYPE_LAST=      254;
const int FIELDTYPE_NUM=       FIELDTYPE_TEAR_FROM + (FIELDTYPE_LAST -
                                                     FIELDTYPE_TEAR_TO);
static inline int field_type_to_index (enum_field_types field_type)
{
 DBUG_ASSERT(real_type_to_type(field_type) < FIELDTYPE_TEAR_FROM ||
             real_type_to_type(field_type) > FIELDTYPE_TEAR_TO);
 DBUG_ASSERT(field_type <= FIELDTYPE_LAST);
 field_type= real_type_to_type(field_type);
 if (field_type < FIELDTYPE_TEAR_FROM)
   return field_type;
 return FIELDTYPE_TEAR_FROM + (field_type - FIELDTYPE_TEAR_TO) - 1;
}

The ternary operator provides no benefit in this case, let's replace it with
regular if statement. Same number of lines but the code is cleaner I'd say.
>
>  static inline int field_type2index (enum_field_types field_type)
>  {
> +  DBUG_ASSERT(real_type_to_type(field_type) < FIELDTYPE_TEAR_FROM ||
> +              real_type_to_type(field_type) > FIELDTYPE_TEAR_TO);
> +  DBUG_ASSERT(field_type <= FIELDTYPE_LAST);
>    field_type= real_type_to_type(field_type);
>    return (field_type < FIELDTYPE_TEAR_FROM ?
>            field_type :
> @@ -94,6 +98,12 @@ static inline int field_type2index (enum_field_types field_type)
>  }
>
>
> +/**
> +  Implements data type merge rules for the built-in traditional data types.
> +  - UNION
> +  - CASE and its abbreviations COALESCE, IF, IFNULL
> +  - LEAST/GREATEST
> +*/
>
I don't like this comment here as it's not complete. I'd much rather have it
also explain how to interpret this table. I had to look up a calling spot
to figure out exactly how it's done. I'd also have it say:
Ex:
Given Fields A and B of real_types a and b, we find the result type of
COALESCE(A, B) by querying:
field_types_merge_rules[field_type_to_index(a)][field_type_to_index(b)].
>
>
>  static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]=
>  {
>    /* MYSQL_TYPE_DECIMAL -> */
> @@ -948,46 +927,18 @@ static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]=
>  enum_field_types Field::field_type_merge(enum_field_types a,
>                                           enum_field_types b)
>  {
> -  DBUG_ASSERT(real_type_to_type(a) < FIELDTYPE_TEAR_FROM ||
> -              real_type_to_type(a) > FIELDTYPE_TEAR_TO);
> -  DBUG_ASSERT(real_type_to_type(b) < FIELDTYPE_TEAR_FROM ||
> -              real_type_to_type(b) > FIELDTYPE_TEAR_TO);
>    return field_types_merge_rules[field_type2index(a)]
>                                  [field_type2index(b)];
>  }
>
> -
> -static Item_result field_types_result_type [FIELDTYPE_NUM]=
> +const Type_handler *
> +Type_handler::aggregate_for_result_traditional(const Type_handler *a,
> +                                               const Type_handler *b)
>  {
> -  //MYSQL_TYPE_DECIMAL      MYSQL_TYPE_TINY
> -  DECIMAL_RESULT,           INT_RESULT,
> -  //MYSQL_TYPE_SHORT        MYSQL_TYPE_LONG
> -  INT_RESULT,               INT_RESULT,
> -  //MYSQL_TYPE_FLOAT        MYSQL_TYPE_DOUBLE
> -  REAL_RESULT,              REAL_RESULT,
> -  //MYSQL_TYPE_NULL         MYSQL_TYPE_TIMESTAMP
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_LONGLONG     MYSQL_TYPE_INT24
> -  INT_RESULT,               INT_RESULT,
> -  //MYSQL_TYPE_DATE         MYSQL_TYPE_TIME
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_DATETIME     MYSQL_TYPE_YEAR
> -  STRING_RESULT,            INT_RESULT,
> -  //MYSQL_TYPE_NEWDATE      MYSQL_TYPE_VARCHAR
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_BIT          <16>-<245>
> -  STRING_RESULT,
> -  //MYSQL_TYPE_NEWDECIMAL   MYSQL_TYPE_ENUM
> -  DECIMAL_RESULT,           STRING_RESULT,
> -  //MYSQL_TYPE_SET          MYSQL_TYPE_TINY_BLOB
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_MEDIUM_BLOB  MYSQL_TYPE_LONG_BLOB
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_BLOB         MYSQL_TYPE_VAR_STRING
> -  STRING_RESULT,            STRING_RESULT,
> -  //MYSQL_TYPE_STRING       MYSQL_TYPE_GEOMETRY
> -  STRING_RESULT,            STRING_RESULT
> -};
> +  enum_field_types ta= a->real_field_type();
> +  enum_field_types tb= b->real_field_type();
> +  return Type_handler::get_handler_by_real_type(Field::field_type_merge(ta, tb));
Nit: this line is over 80 characters. Do we care? Most code tends to comply
to it. It adds one extra line to split so it might not be worth it to adhere
to a full 100% 80 character limit.
> +}
>
>
>  /*
> diff --git a/sql/field.h b/sql/field.h
> index 541da5a..fa84e7d 100644
> --- a/sql/field.h
> +++ b/sql/field.h
> @@ -835,7 +835,6 @@ class Field: public Value_source
>    virtual Item_result cmp_type () const { return result_type(); }
>    static bool type_can_have_key_part(enum_field_types);
>
The field_type_merge function breaks all the other naming patterns. We
have result_type, cmp_type real_type and now field_type_merge. Wouldn't it be
better to name it field_merge_type, to be consistent? Or since this
is a lookup kind of operation, perhaps name it (get|lookup)_merge_field_type?
I know it's not _exactly_ like result_type and compare_type, but it generally
is used in a simillar context.
>
>    static enum_field_types field_type_merge(enum_field_types, enum_field_types);
> -  static Item_result result_merge_type(enum_field_types);
>    virtual bool eq(Field *field)
>    {
>      return (ptr == field->ptr && null_ptr == field->null_ptr &&
> diff --git a/sql/item.cc b/sql/item.cc
> index c97f41f..c9b6155 100644
> --- a/sql/item.cc
> +++ b/sql/item.cc
> @@ -9657,20 +9657,8 @@ Item_type_holder::Item_type_holder(THD *thd, Item *item)
>    maybe_null= item->maybe_null;
>    collation.set(item->collation);
>    get_full_info(item);
> -  /**
> -    Field::result_merge_type(real_field_type()) should be equal to
> -    result_type(), with one exception when "this" is a Item_field for
> -    a BIT field:
> -    - Field_bit::result_type() returns INT_RESULT, so does its Item_field.
> -    - Field::result_merge_type(MYSQL_TYPE_BIT) returns STRING_RESULT.
> -    Perhaps we need a new method in Type_handler to cover these type
> -    merging rules for UNION.
> -  */
> -  DBUG_ASSERT(real_field_type() == MYSQL_TYPE_BIT ||
> -              Item_type_holder::result_type()  ==
> -              Field::result_merge_type(Item_type_holder::real_field_type()));
>    /* fix variable decimals which always is NOT_FIXED_DEC */
> -  if (Field::result_merge_type(real_field_type()) == INT_RESULT)
>
Alright so this seems to be fixed here, looking at Type_handler_bit,
inheriting from Type_handler_int_result. Do we test this somewhere though?
I couldn't find it in the test case, perhaps you can point it out to me.
>
> +  if (Item_type_holder::result_type() == INT_RESULT)
>      decimals= 0;
>    prev_decimal_int_part= item->decimal_int_part();
>  #ifdef HAVE_SPATIAL
> @@ -9785,12 +9773,21 @@ bool Item_type_holder::join_types(THD *thd, Item *item)
>    DBUG_PRINT("info:", ("in type %d len %d, dec %d",
>                         get_real_type(item),
>                         item->max_length, item->decimals));
> -  set_handler_by_real_type(Field::field_type_merge(real_field_type(),
> -                                                   get_real_type(item)));
> +  const Type_handler *item_type_handler=
> +    Type_handler::get_handler_by_real_type(get_real_type(item));
> +  if (aggregate_for_result(item_type_handler))
> +  {
> +    my_error(ER_CANT_AGGREGATE_2TYPES, MYF(0),
> +             Item_type_holder::type_handler()->name().ptr(),
> +             item_type_handler->name().ptr(),
> +             "UNION");
> +    DBUG_RETURN(true);
> +  }
> +
>    {
>      uint item_decimals= item->decimals;
>      /* fix variable decimals which always is NOT_FIXED_DEC */
> -    if (Field::result_merge_type(real_field_type()) == INT_RESULT)
> +    if (Item_type_holder::result_type() == INT_RESULT)
>
Ok, we're avoiding a virtual function call here. What I don't like is that
this Item_type_holder class basically "abuses" the Item interface to such a
degree that it even has to directly declare functions such as:
double Item_type_holder::val_real()
{
  DBUG_ASSERT(0); // should never be called
  return 0.0;
}

Let's discuss about cleaning this up later. To me it feels like this Item does
not really belong in the Item class and should be factored out. Probably
a whole project on its own :)
>
>
>        item_decimals= 0;
>      decimals= MY_MAX(decimals, item_decimals);
>    }
> diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
> index 98b179b..e5e366e 100644
> --- a/sql/item_cmpfunc.cc
> +++ b/sql/item_cmpfunc.cc
> @@ -180,32 +179,40 @@ static int agg_cmp_type(Item_result *type, Item **items, uint nitems)
>    @return aggregated field type.
>  */
>
> -enum_field_types agg_field_type(Item **items, uint nitems,
> -                                bool treat_bit_as_number)
>
Why is this function in item_cmpfunc.cc and not in sql_type.cc?
> +bool
> +Type_handler_hybrid_field_type::aggregate_for_result(const char *funcname,
> +                                                     Item **items, uint nitems,
> +                                                     bool treat_bit_as_number)
>  {
I would move uint i to be a local for variable. This is a C-style loop.
Is there a compiler that doesn't support this in one of our builders?
Also, how about size_t instead of uint? (Probably not necessary but
wlad made a point of prefering to use that for iterators and such).
>    uint i;
>    if (!nitems || items[0]->result_type() == ROW_RESULT)
>    {
>      DBUG_ASSERT(0);
> -    return MYSQL_TYPE_NULL;
> +    set_handler(&type_handler_null);
> +    return true;
>    }
> -  enum_field_types res= items[0]->field_type();
> +  set_handler(items[0]->type_handler());
>    uint unsigned_count= items[0]->unsigned_flag;
>    for (i= 1 ; i < nitems ; i++)
>    {
> -    enum_field_types cur= items[i]->field_type();
> +    const Type_handler *cur= items[i]->type_handler();
>      if (treat_bit_as_number &&
> -        ((res == MYSQL_TYPE_BIT) ^ (cur == MYSQL_TYPE_BIT)))
> +        ((type_handler() == &type_handler_bit) ^ (cur == &type_handler_bit)))
>      {
> -      if (res == MYSQL_TYPE_BIT)
> -        res= MYSQL_TYPE_LONGLONG; // BIT + non-BIT
> +      if (type_handler() == &type_handler_bit)
> +        set_handler(&type_handler_longlong); // BIT + non-BIT
>        else
> -        cur= MYSQL_TYPE_LONGLONG; // non-BIT + BIT
> +        cur= &type_handler_longlong; // non-BIT + BIT
> +    }
> +    if (aggregate_for_result(cur))
> +    {
> +      my_error(ER_CANT_AGGREGATE_2TYPES, MYF(0),
> +               type_handler()->name().ptr(), cur->name().ptr(), funcname);
> +      return true;
>      }
> -    res= Field::field_type_merge(res, cur);
>      unsigned_count+= items[i]->unsigned_flag;
>    }
> -  switch (res) {
> +  switch (field_type()) {
>    case MYSQL_TYPE_TINY:
>    case MYSQL_TYPE_SHORT:
>    case MYSQL_TYPE_LONG:
> diff --git a/sql/sql_type.cc b/sql/sql_type.cc
> index 8746595..397b5cf 100644
> --- a/sql/sql_type.cc
> +++ b/sql/sql_type.cc
> @@ -54,6 +51,41 @@ static Type_handler_set         type_handler_set;
>  Type_handler_null        type_handler_null;
>  Type_handler_row         type_handler_row;
>  Type_handler_varchar     type_handler_varchar;
> +Type_handler_newdecimal  type_handler_newdecimal;
> +Type_handler_longlong    type_handler_longlong;
> +Type_handler_bit         type_handler_bit;
> +
> +
I'm sure there's a better way to write this so that it gets initialized at
compile time instead of at runtime (before main).
Perhaps we can define the Type_aggregator differently. I need to look this
Up. For now it will work.
Standard offers no guarantees regarding the order, but it shouldn't matter
for us as the address shouldn't change for global objects during
initialization.
"It is implementation-defined whether or not the dynamic initialization
(8.5, 9.4, 12.1, 12.6.1) of an object of namespace scope is done before the
first statement of main. If the initialization is deferred to some point in
time after the first statement of main, it shall occur before the first use
of any function or object defined in the same translation unit as the object
to be initialized."
> +Type_aggregator type_aggregator_for_result;
> +
> +
> +class Static_data_initializer
> +{
> +public:
> +  static Static_data_initializer m_singleton;
> +  Static_data_initializer()
> +  {
> +#ifdef HAVE_SPATIAL
> +    type_aggregator_for_result.add(&type_handler_geometry,
> +                                   &type_handler_null,
> +                                   &type_handler_geometry);
> +    type_aggregator_for_result.add(&type_handler_geometry,
> +                                   &type_handler_geometry,
> +                                   &type_handler_geometry);
> +    type_aggregator_for_result.add(&type_handler_geometry,
> +                                   &type_handler_blob,
> +                                   &type_handler_long_blob);
> +    type_aggregator_for_result.add(&type_handler_geometry,
> +                                   &type_handler_varchar,
> +                                   &type_handler_long_blob);
> +    type_aggregator_for_result.add(&type_handler_geometry,
> +                                   &type_handler_string,
> +                                   &type_handler_long_blob);
> +#endif
> +  }
> +};
> +
> +Static_data_initializer Static_data_initializer::m_singleton;
>
>
>  /**
> diff --git a/sql/sql_type.h b/sql/sql_type.h
> index 92dee61..4bc5542 100644
> --- a/sql/sql_type.h
> +++ b/sql/sql_type.h
> @@ -214,6 +214,31 @@ class Type_std_attributes
>  };
>
>
I don't like this class too much. One can easily break it by either changing
LEX_CSTRING::str or LEX_CSTRING::length without changing the other one.
I suggest we make the inheritance private so that the only way to access the
members is through the methods available.
A suggestion I have is to create a generic "String" class that has this same
behaviour, without calling it Name. Afterwards typedefing it to Name.
>
> +class Name: public LEX_CSTRING
> +{
> +public:
> +  Name(const char *str_arg, uint length_arg)
> +  {
> +    LEX_CSTRING::str= str_arg;
> +    LEX_CSTRING::length= length_arg;
> +  }
Also, these ought to be removed if they are commented out.
> +  /*
> +  Name()
> +  {
> +    LEX_CSTRING::str= NULL;
> +    LEX_CSTRING::length= 0;
> +  }
> +  Name(const LEX_STRING &name)
> +  {
> +    LEX_CSTRING::str= name.str;
> +    LEX_CSTRING::length= name.length;
> +  }
> +  */
> +  const char *ptr() const { return LEX_CSTRING::str; }
> +  uint length() const { return LEX_CSTRING::length; }
> +};
> +
> +
>  class Type_handler
>  {
>  protected:
> @@ -235,6 +260,11 @@ class Type_handler
>      DBUG_ASSERT(type != TIME_RESULT);
>      return get_handler_by_cmp_type(type);
>    }
> +  static const
> +  Type_handler *aggregate_for_result_traditional(const Type_handler *h1,
> +                                                 const Type_handler *h2);
> +
This function can return a const reference from a const static object
within the class's namespace. Why create an object every time this is called?
Compiler might optimize it, but why risk it?
This goes for all the implementations.
> +  virtual const Name name() const= 0;
>    virtual enum_field_types field_type() const= 0;
>    virtual enum_field_types real_field_type() const { return field_type(); }
>    virtual Item_result result_type() const= 0;
> @@ -884,5 +958,63 @@ class Type_handler_hybrid_real_field_type:
>  extern Type_handler_row   type_handler_row;
>  extern Type_handler_null  type_handler_null;
>  extern Type_handler_varchar type_handler_varchar;
> +extern Type_handler_newdecimal type_handler_newdecimal;
> +extern Type_handler_longlong type_handler_longlong;
> +extern Type_handler_bit type_handler_bit;
> +
> +class Type_aggregator
> +{

I'm not sure if Element is a good name for this. Aggregate_result_pair maybe?

> +  class Element
> +  {
> +  public:
> +    const Type_handler *m_handler1;
> +    const Type_handler *m_handler2;
> +    const Type_handler *m_result;
> +    Element() { }
> +    Element(const Type_handler *handler1,
> +            const Type_handler *handler2,
> +            const Type_handler *result)
> +     :m_handler1(handler1), m_handler2(handler2), m_result(result)
> +    { }
> +    bool eq(const Type_handler *handler1, const Type_handler *handler2) const
> +    {
> +      return m_handler1 == handler1 && m_handler2 == handler2;
> +    }
> +  };
So, we've discussed this before to change to a dynamic array. I prefer the
template Dynamic_array over DYNAMIC_ARRAY.
> +  Element m_element_array[256];
> +  uint m_element_count;
> +  const Element *element(uint i) const { return &m_element_array[i]; }
> +public:
> +  Type_aggregator()
> +   :m_element_count(0)
> +  { }
> +  bool add(const Type_handler *handler1,
> +           const Type_handler *handler2,
> +           const Type_handler *result)
> +  {
> +    m_element_array[m_element_count]= Element(handler1, handler2, result);
> +    m_element_count++;
> +    return false;
> +  }
> +  const Element *find_element(const Type_handler *handler1,
> +                              const Type_handler *handler2) const
> +  {
> +    for (uint i= 0; i < m_element_count; i++)
> +    {
> +      const Element *el= element(i);
> +      if (el->eq(handler1, handler2) || el->eq(handler2, handler1))
> +        return el;
> +    }
> +    return NULL;
> +  }
> +  const Type_handler *find_handler(const Type_handler *handler1,
> +                                   const Type_handler *handler2) const
> +  {
> +    const Element *el= find_element(handler1, handler2);
> +    return el ? el->m_result : NULL;
> +  }
> +};
> +
> +extern Type_aggregator type_aggregator_for_result;
>
>  #endif /* SQL_TYPE_H_INCLUDED */
Follow ups

Re: Fwd: mixing of user-defined data types with other data types
From: Alexander Barkov, 2016-12-13