maria-developers team mailing list archive

Thread
Date
Re: d1ccc60: MDEV-14005 Remove need for Partition Key to be part of Primary Key.

To: maria-developers@xxxxxxxxxxxxxxxxxxx
From: Sergei Golubchik <serg@xxxxxxxxxxx>
Date: Fri, 5 Oct 2018 23:59:27 +0200
Cc: holyfoot@xxxxxxxxxxx
In-reply-to: <20180821104921.65149140C2D@nebo.localdomain>
User-agent: Mutt/1.7.2 (2016-11-26)
Hi, Alexey!

Some minor comments, questions, see below...

On Aug 21, Alexey Botchkov wrote:
> revision-id: d1ccc60360c79cd456abbebafc1c80c8043b7dce (mariadb-10.3.6-106-gd1ccc60)
> parent(s): ead9a34a3e934f607c2ea7a6c68f7f6d9d29b5bd
> committer: Alexey Botchkov
> timestamp: 2018-08-21 14:47:45 +0400
> message:
> 
> MDEV-14005 Remove need for Partition Key to be part of Primary Key.
> 
> The limitation was removed, so now we check all the partition
> for unique key duplicates in these cases.
> 
> diff --git a/mysql-test/main/partition_unique.result b/mysql-test/main/partition_unique.result
> new file mode 100644
> index 0000000..17a9550
> --- /dev/null
> +++ b/mysql-test/main/partition_unique.result
> @@ -0,0 +1,51 @@
> +CREATE TABLE t1 (
> +id int(10) NOT NULL,
> +status varchar(1) DEFAULT NULL,
> +PRIMARY KEY (`id`)
> +)
> +PARTITION BY LIST  COLUMNS(`status`)
> +(
> +PARTITION `a` VALUES IN ('A'),
> +PARTITION `b` VALUES IN ('B'),
> +PARTITION `c` VALUES IN ('C'),
> +PARTITION `d` DEFAULT);
> +INSERT INTO t1 VALUES (4, 'A');
> +INSERT INTO t1 VALUES (6, 'A');
> +INSERT INTO t1 VALUES (4, 'C');
> +ERROR 23000: Duplicate entry '4' for key 'PRIMARY'
> +INSERT INTO t1 VALUES (5, 'C');
> +UPDATE t1 SET id=4 WHERE id=5;
> +ERROR 23000: Duplicate entry '4' for key 'PRIMARY'
> +UPDATE t1 SET id=4 WHERE id=5 AND status='C';
> +ERROR 23000: Duplicate entry '4' for key 'PRIMARY'
> +UPDATE t1 SET id=6 WHERE id=4 AND status='A';
> +ERROR 23000: Duplicate entry '6' for key 'PRIMARY'
> +select * from t1;
> +id	status
> +4	A
> +6	A
> +5	C
> +connect  con1,localhost,root,,test;
> +connect  con2,localhost,root,,test;
> +connection con1;
> +SET DEBUG_SYNC= 'berfore_part_unique_check SIGNAL bpu_hit WAIT_FOR bpu_flushed';

may be s/berfore/before/ ? :)

> +INSERT INTO t1 VALUES(7, 'A');
> +connection con2;
> +SET DEBUG_SYNC= 'now WAIT_FOR bpu_hit';
> +INSERT INTO t1 VALUES(7, 'C');
> +connection default;
> +SET DEBUG_SYNC= 'now SIGNAL bpu_flushed';
> +connection con1;
> +connection con2;
> +ERROR 23000: Duplicate entry '7' for key 'PRIMARY'
> +disconnect con1;
> +disconnect con2;
> +connection default;
> +select * from t1;
> +id	status
> +4	A
> +6	A
> +7	A
> +5	C
> +DROP TABLE t1;
> +set debug_sync= "RESET";
> diff --git a/mysql-test/main/partition_unique.test b/mysql-test/main/partition_unique.test
> new file mode 100644
> index 0000000..f292359
> --- /dev/null
> +++ b/mysql-test/main/partition_unique.test
> @@ -0,0 +1,58 @@
> +--source include/have_innodb.inc
> +--source include/have_partition.inc
> +
> +CREATE TABLE t1 (
> +      id int(10) NOT NULL,
> +      status varchar(1) DEFAULT NULL,
> +      PRIMARY KEY (`id`)
> +    )
> +   PARTITION BY LIST  COLUMNS(`status`)
> +  (
> +    PARTITION `a` VALUES IN ('A'),
> +    PARTITION `b` VALUES IN ('B'),
> +    PARTITION `c` VALUES IN ('C'),
> +    PARTITION `d` DEFAULT);

you don't seem to test innodb, this looks like a partitioned myisam table

> +
> +INSERT INTO t1 VALUES (4, 'A');
> +INSERT INTO t1 VALUES (6, 'A');
> +--error ER_DUP_ENTRY
> +INSERT INTO t1 VALUES (4, 'C');
> +INSERT INTO t1 VALUES (5, 'C');
> +--error ER_DUP_ENTRY
> +UPDATE t1 SET id=4 WHERE id=5;
> +--error ER_DUP_ENTRY
> +UPDATE t1 SET id=4 WHERE id=5 AND status='C';
> +--error ER_DUP_ENTRY
> +UPDATE t1 SET id=6 WHERE id=4 AND status='A';
> +select * from t1;
> +
> +connect (con1,localhost,root,,test);
> +connect (con2,localhost,root,,test);
> +connection con1;
> +SET DEBUG_SYNC= 'berfore_part_unique_check SIGNAL bpu_hit WAIT_FOR bpu_flushed';
> +send INSERT INTO t1 VALUES(7, 'A');
> +
> +connection con2;
> +SET DEBUG_SYNC= 'now WAIT_FOR bpu_hit';
> +send INSERT INTO t1 VALUES(7, 'C');

add also a test that you can insert some other value in 'A' and 'C'
partitions. That is, partitions aren't completely locked.

> +
> +connection default;
> +SET DEBUG_SYNC= 'now SIGNAL bpu_flushed';
> +
> +connection con1;
> +--reap
> +connection con2;
> +--error ER_DUP_ENTRY
> +--reap
> +
> +disconnect con1;
> +disconnect con2;
> +
> +connection default;
> +
> +select * from t1;
> +
> +DROP TABLE t1;
> +
> +set debug_sync= "RESET";
> +
> diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
> index 262e791..da07075 100644
> --- a/sql/ha_partition.cc
> +++ b/sql/ha_partition.cc
> @@ -368,6 +368,7 @@ void ha_partition::init_handler_variables()
>    part_share= NULL;
>    m_new_partitions_share_refs.empty();
>    m_part_ids_sorted_by_num_of_records= NULL;
> +  m_cu_errkey= (uint) -1;

rather confusing name. Could you de-confuse it?
Why not simply m_errkey? or m_err_dupkey, or whatever

and why did you make it unsigned? you need to cast in every second
occurence of it :)

>    m_partitions_to_open= NULL;
>  
>    m_range_info= NULL;
> @@ -4198,6 +4199,95 @@ void ha_partition::try_semi_consistent_read(bool yes)
>  }
>  
>  
> +int ha_partition::check_files_for_key(uchar *key, int n_key,
> +                                      int part_begin, int part_end,
> +                                      int part_to_skip,
> +                                      int *res)
> +{
> +  DBUG_ASSERT(inited == NONE ||
> +              (inited == RND && !m_scan_value));
> +
> +  for (int n=part_begin; n < part_end; n++)
> +  {
> +    handler *f= m_file[n];
> +    init_stat sav_inited;
> +
> +    if ((int) n == part_to_skip)
> +      continue;
> +
> +    if ((sav_inited= f->inited) == RND)
> +      f->ha_rnd_end();

When can this happen?

> +
> +    *res= f->index_read_idx_map(m_part_info->table->record[0],
> +        n_key, key, HA_WHOLE_KEY, HA_READ_KEY_EXACT);
> +
> +    f->ha_end_keyread();
> +
> +    if (sav_inited == RND)
> +    {
> +      int ires= f->ha_rnd_init(FALSE);
> +      if (ires)
> +        *res= ires;
> +    }
> +
> +    if (*res == HA_ERR_KEY_NOT_FOUND)
> +      continue;
> +
> +    if (*res == 0)
> +      *res= HA_ERR_FOUND_DUPP_KEY;
> +
> +    m_last_part= n;
> +    m_cu_errkey= n_key;
> +    return 1;
> +  }
> +
> +  *res= 0;
> +  return 0;
> +}
> +
> +
> +int ha_partition::check_uniques_insert(uchar *buf,
> +                                       int part_begin, int part_end,
> +                                       KEY **dup_key,
> +                                       int *res)
> +{
> +  uint n_key;
> +
> +  for (n_key=0; n_key < m_part_info->n_uniques_to_check; n_key++)
> +  {
> +    uchar *cbuf= m_part_info->unique_key_buf[0];
> +    KEY *ckey= m_part_info->uniques_to_check[n_key];
> +    uint n;
> +
> +    for (n=0; n < ckey->user_defined_key_parts; n++)
> +    {

can you use key_copy() here?

> +      const KEY_PART_INFO *cpart= ckey->key_part + n;
> +      uint maybe_null= MY_TEST(cpart->null_bit);
> +      Field *f= cpart->field;
> +      my_ptrdiff_t ofs= buf-table->record[0];
> +
> +      f->move_field_offset(ofs);
> +      if (maybe_null)
> +        cbuf[0]= f->is_null();
> +      f->get_key_image(cbuf+maybe_null, cpart->length, Field::itRAW);
> +      cbuf+= cpart->store_length;
> +      f->move_field_offset(-ofs);
> +    }
> +
> +    if (check_files_for_key(m_part_info->unique_key_buf[0],
> +                            table->key_info - ckey,
> +                            part_begin, part_end, -1, res))
> +    {
> +      *dup_key= ckey;
> +      return 1;
> +    }
> +  }
> +
> +  *res= 0;
> +  return 0;
> +}
> +
> +
>  /****************************************************************************
>                  MODULE change record
>  ****************************************************************************/
> @@ -4307,7 +4398,41 @@ int ha_partition::write_row(uchar * buf)
>    start_part_bulk_insert(thd, part_id);
>  
>    tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
> +
> +  /*
> +     Check unique keys (if there is any) for duplications in
> +     partitions 0 .. inserted partition, then
> +     do the write_row then check the unique in
> +     partitions inserted partition +1 .. m_tot_parts.
> +     We do so to keep the order of locking always same to
> +     avoid deadlocks.
> +  */
> +  if (table->part_info->n_uniques_to_check &&
> +      check_uniques_insert(buf, 0, part_id, &dup_key, &error))
> +  {
> +    goto exit;
> +  }
> +
>    error= m_file[part_id]->ha_write_row(buf);
> +
> +  DEBUG_SYNC(thd, "berfore_part_unique_check");
> +
> +  if (!error && table->part_info->n_uniques_to_check &&
> +      check_uniques_insert(buf, part_id+1, m_tot_parts, &dup_key, &error))
> +  {
> +    /*
> +      Errors here are ignored, as the error already found.
> +      and i don't have a good idea what to do if things go
> +      wrong here.
> +    */
> +    if (!(m_file[part_id]->ha_index_read_idx_map(table->record[1],
> +            dup_key - table->key_info, table->part_info->unique_key_buf[0],
> +            HA_WHOLE_KEY, HA_READ_KEY_EXACT)))

1. why do you need ha_index_read_idx_map here?
   you've just did ha_write_row, isn't it enough?
2. if the table is transational and no IGNORE was used,
   you don't need to delete anything, the statement will be
   rolled back anyway. an easy optimization.

> +    {
> +      (void) m_file[part_id]->ha_delete_row(buf);
> +    }
> +  }
> +
>    if (have_auto_increment && !table->s->next_number_keypart)
>      set_auto_increment_if_higher(table->next_number_field);
>    reenable_binlog(thd);
> @@ -4317,6 +4442,59 @@ int ha_partition::write_row(uchar * buf)
>    table->auto_increment_field_not_null= saved_auto_inc_field_not_null;
>    DBUG_RETURN(error);
>  }
> + 
> + 
> +int ha_partition::check_uniques_update(const uchar *old_data,
> +                                       const uchar *new_data,
> +                                       int new_part_id, int *res)
> +{
> +  uint n_key;
> +
> +  for (n_key=0; n_key < m_part_info->n_uniques_to_check; n_key++)
> +  {
> +    uchar *buf0= m_part_info->unique_key_buf[0];
> +    uchar *buf1= m_part_info->unique_key_buf[1];
> +    KEY *ckey= m_part_info->uniques_to_check[n_key];
> +    uint n;
> +
> +    for (n=0; n < ckey->user_defined_key_parts; n++)
> +    {
> +      const KEY_PART_INFO *cpart= ckey->key_part + n;
> +      uint maybe_null= MY_TEST(cpart->null_bit);
> +      Field *f= cpart->field;
> +      my_ptrdiff_t dif;
> +
> +      dif= old_data - table->record[0];
> +      f->move_field_offset(dif);
> +      if (maybe_null)
> +        buf0[0]= f->is_null();
> +      f->get_key_image(buf0+maybe_null, cpart->length, Field::itRAW);
> +      buf0+= cpart->store_length;
> +      f->move_field_offset(-dif);
> +
> +      dif= new_data - table->record[0];
> +      f->move_field_offset(dif);
> +      if (maybe_null)
> +        buf1[0]= f->is_null();
> +      f->get_key_image(buf1+maybe_null, cpart->length, Field::itRAW);
> +      buf1+= cpart->store_length;
> +      f->move_field_offset(-dif);
> +    }
> +
> +    if (memcmp(m_part_info->unique_key_buf[0], m_part_info->unique_key_buf[1],
> +               buf0 - m_part_info->unique_key_buf[0]) == 0)
> +    {
> +      /* Key did not change. */
> +      continue;
> +    }
> +
> +    if (check_files_for_key(m_part_info->unique_key_buf[1],
> +                    table->key_info - ckey, 0, m_tot_parts, new_part_id, res))

How does that work?

I'd expect you to do like you did for write_row:
- check from 0 to new_part_id, skipping old_part_id,
- insert the row
- check from new_part_id+1 to m_last_part, skipping old_part_id.

> +      return 1;
> +  }
> +
> +  return 0;
> +}
>  
>  
>  /*
> @@ -5754,11 +5935,14 @@ int ha_partition::index_read_idx_map(uchar *buf, uint index,
>      get_partition_set(table, buf, index, &m_start_key, &m_part_spec);
>  
>      /*
> -      We have either found exactly 1 partition
> +      If there is no 'unbound' unique keys where not all keyparts
> +      are partition definition fields,
> +      we have either found exactly 1 partition

I don't understand. It's HA_READ_KEY_EXACT, so it can never
find more than one row in a unique key, and one row can never
be in more than one partition. Or do you mean unique key over
nullable column that can have many null values? Then mention it
in the comment, please.

>        (in which case start_part == end_part)
> -      or no matching partitions (start_part > end_part)
> +      or no matching partitions (start_part > end_part),
>      */
> -    DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part);
> +    DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part ||
> +                m_part_info->n_uniques_to_check);
>      /* The start part is must be marked as used. */
>      DBUG_ASSERT(m_part_spec.start_part > m_part_spec.end_part ||
>                  bitmap_is_set(&(m_part_info->read_partitions),
> @@ -8315,15 +8499,20 @@ int ha_partition::info(uint flag)
>    {
>      handler *file= m_file[m_last_part];
>      DBUG_PRINT("info", ("info: HA_STATUS_ERRKEY"));
> -    /*
> -      This flag is used to get index number of the unique index that
> -      reported duplicate key
> -      We will report the errkey on the last handler used and ignore the rest
> -      Note: all engines does not support HA_STATUS_ERRKEY, so set errkey.
> -    */
> -    file->errkey= errkey;
> -    file->info(HA_STATUS_ERRKEY | no_lock_flag);
> -    errkey= file->errkey;
> +    if ((int) m_cu_errkey >= 0)
> +      errkey= m_cu_errkey;
> +    else
> +    {
> +      /*
> +        This flag is used to get index number of the unique index that
> +        reported duplicate key
> +        We will report the errkey on the last handler used and ignore the rest
> +        Note: all engines does not support HA_STATUS_ERRKEY, so set errkey.

I don't understand what this means (I know it's an old comment, but still)
Is it "not all engines support HA_STATUS_ERRKEY"?
If yes, please change it to say that.

> +      */
> +      file->errkey= errkey;
> +      file->info(HA_STATUS_ERRKEY | no_lock_flag);
> +      errkey= file->errkey;
> +    }
>    }
>    if (flag & HA_STATUS_TIME)
>    {
> diff --git a/sql/partition_info.h b/sql/partition_info.h
> index e00a2c4..eeac0d9 100644
> --- a/sql/partition_info.h
> +++ b/sql/partition_info.h
> @@ -283,6 +283,14 @@ class partition_info : public Sql_alloc
>    bool is_auto_partitioned;
>    bool has_null_value;
>    bool column_list;                          // COLUMNS PARTITIONING, 5.5+
> +  /*
> +    Unique keys that don't have all the partitioning fields in them
> +    need to be checked when INSERT/UPDATE.
> +    So they are collected here.
> +  */
> +  KEY **uniques_to_check;
> +  uint n_uniques_to_check;
> +  uchar *unique_key_buf[2];

Is it safe? Isn't partition_info shared between many handlers
so that every handler can only use it read-only?
it's not quite clear to me. it seems that every TABLE has its own
partition_info. But cloned handlers all share the same partition_info.

>  
>    partition_info()
>    : get_partition_id(NULL), get_part_partition_id(NULL),
> diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc
> index 3133b94..6cd46dd 100644
> --- a/sql/sql_partition.cc
> +++ b/sql/sql_partition.cc
> @@ -988,15 +945,15 @@ static bool check_unique_keys(TABLE *table)
>        check_fields_in_PF(table->part_info->full_part_field_array,
>                           &all_fields, &some_fields);
>        clear_indicator_in_key_fields(table->key_info+i);
> -      if (unlikely(!all_fields))
> +      if (!all_fields)
>        {
> -        my_error(ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF,MYF(0),"UNIQUE INDEX");

you can rename ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF to
ER_UNUSED with the next free number

> -        result= TRUE;
> -        break;
> +        ++keys_found;
> +        if (table->part_info->n_uniques_to_check >= keys_found)
> +          table->part_info->uniques_to_check[keys_found-1]= table->key_info+i;
>        }
>      }
>    }
> -  DBUG_RETURN(result);
> +  DBUG_RETURN(keys_found);
>  }
>  
>  
> @@ -2798,6 +2765,47 @@ bool partition_key_modified(TABLE *table, const MY_BITMAP *fields)
>  }
>  
>  
> + /*
> +  Check if there are unique keys that not entirely 'inside' the partition
> +  key and if any fields of such keys are modified.
> +  If it's so, it usually means we have to use tamporary storage for records

"temporary"

> +  handling the UPDATE command.
> +
> +  SYNOPSIS

please, use doxygen format for new functions

> +    partition_unique_modified
> +    table                TABLE object for which partition fields are set-up
> +    fields               Bitmap representing fields to be modified
> +
> +  RETURN VALUES
> +    TRUE                 Need special handling of UPDATE
> +    FALSE                Normal UPDATE handling is ok
> +*/
> +
> +bool partition_unique_modified(TABLE *table, const MY_BITMAP *fields)
> +{
> +  partition_info *part_info= table->part_info;
> +  DBUG_ENTER("partition_unique_modified");
> +
> +  if (!part_info)
> +    DBUG_RETURN(FALSE);
> +
> +  if (part_info->uniques_to_check == 0)
> +    DBUG_RETURN(FALSE);
> +
> +  for (uint n_key=0; n_key < part_info->n_uniques_to_check; n_key++)
> +  {
> +    KEY *ckey= part_info->uniques_to_check[n_key];
> +    for (uint n_part=0; n_part < ckey->user_defined_key_parts; n_part++)
> +    {
> +      if (bitmap_is_set(fields, ckey->key_part[n_part].field->field_index))
> +        DBUG_RETURN(TRUE);
> +    }
> +  }
> +
> +  DBUG_RETURN(FALSE);
> +}
> +
> +
>  /*
>    A function to handle correct handling of NULL values in partition
>    functions.


Regards,
Sergei
Chief Architect MariaDB
and security@xxxxxxxxxxx