← Back to team overview

maria-developers team mailing list archive

Re: 38a888da0f1: MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT

 

Hi, Aleksey!

Like last time, it's a review of the diff bc04ded2353 22158b3c05a,
so not only commit 38a888da0f1.

The main thing below - I didn't understand a couple of changes and asked
for clarifications.

On May 30, Aleksey Midenkov wrote:
> revision-id: 38a888da0f1 (mariadb-10.5.2-653-g38a888da0f1)
> parent(s): bc04ded2353
> author: Aleksey Midenkov <midenok@xxxxxxxxx>
> committer: Aleksey Midenkov <midenok@xxxxxxxxx>
> timestamp: 2021-04-22 23:35:06 +0300
> message:
> 
> MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT


> diff --git a/mysql-test/suite/versioning/r/delete_history.result b/mysql-test/suite/versioning/r/delete_history.result
> --- a/mysql-test/suite/versioning/r/delete_history.result
> +++ b/mysql-test/suite/versioning/r/delete_history.result
> @@ -154,3 +152,21 @@ select * from t1;
>  a
>  1
>  drop table t1;
> +#
> +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +#
> +# Don't auto-create new partition on DELETE HISTORY:
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t (a int) with system versioning
> +partition by system_time interval 1 hour auto;
> +set timestamp= unix_timestamp('2000-01-01 10:00:00');

perhaps, make it only 02:00:00 to avoid any chance that
a partition won't be created because of max-auto-create limit.
it's possible we put the limit at 10, but 2 is very unlikely.

or does max-auto-create limit always result in a warning?

> +delete history from t;
> +set timestamp= default;
> +show create table t;
> +Table	Create Table
> +t	CREATE TABLE `t` (
> +  `a` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> +PARTITIONS 2
> +drop table t;
> diff --git a/mysql-test/suite/versioning/r/partition.result b/mysql-test/suite/versioning/r/partition.result
> --- a/mysql-test/suite/versioning/r/partition.result
> +++ b/mysql-test/suite/versioning/r/partition.result
> @@ -1236,27 +1270,826 @@ t1	CREATE TABLE `t1` (
>   PARTITION `p5` HISTORY ENGINE = DEFAULT_ENGINE,
>   PARTITION `pn` CURRENT ENGINE = DEFAULT_ENGINE)
>  alter table t1 add partition partitions 8;
> +drop table t1;
> +#
> +# End of 10.5 tests
> +#
> +#
> +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +#
> +create or replace table t1 (x int) with system versioning
> +partition by system_time limit 1 auto;
>  show create table t1;
>  Table	Create Table
>  t1	CREATE TABLE `t1` (
>    `x` int(11) DEFAULT NULL
>  ) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> - PARTITION BY SYSTEM_TIME LIMIT 1
> -(PARTITION `p2` HISTORY ENGINE = DEFAULT_ENGINE,
> - PARTITION `p8` HISTORY ENGINE = DEFAULT_ENGINE,
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 2
> +insert into t1 values (1);
> +create or replace table t2 (y int);
> +insert into t2 values (2);
> +insert into t1 select * from t2;
> +insert into t2 select * from t1;
> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `x` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 2
> +# Too many partitions error
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour auto;
> +set timestamp= unix_timestamp('2001-01-01 00:01:00');
> +update t1 set x= x + 1;
> +ERROR HY000: Too many partitions (including subpartitions) were defined
> +# Partition overflow error and manual fix

seems to be a wrong comment? no manual fix here.

> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour;
> +insert into t1 values (333);
> +set timestamp= unix_timestamp('2000-01-01 01:00:00');
> +update t1 set x= x + 1;
> +Warnings:
> +Warning	4114	Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> +drop table t1;
> +# Partition overflow error and manual fix
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour;
> +insert into t1 values (440);
> +set timestamp= unix_timestamp('2000-01-01 00:10:00');
> +update t1 set x= x + 1;
> +# Check how pruning boundaries work
> +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:58';
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t1	p0,pn	#	NULL	NULL	NULL	NULL	#	#
> +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:59';
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t1	pn	#	NULL	NULL	NULL	NULL	#	#

Why the first select looks in both partitions and the second one - only in pn?

> +explain partitions select * from t1 for system_time as of '2000-01-01 01:00:00';
> +id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
> +1	SIMPLE	t1	pn	#	NULL	NULL	NULL	NULL	#	#
> +select * from t1 for system_time as of '2000-01-01 00:09:59';
> +x
> +440
> +set timestamp= unix_timestamp('2000-01-01 02:00:00');
> +update t1 set x= x + 1;
> +Warnings:
> +Warning	4114	Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> +select * from t1 for system_time as of '2000-01-01 01:00:00';
> +x
> +select * from t1 partition (p0) order by x;
> +x
> +440
> +441
> +alter table t1 add partition partitions 3;
> +select * from t1 for system_time as of '2000-01-01 01:00:00';
> +x
> +441
> +select * from t1 partition (p0) order by x;
> +x
> +440

would be good to have EXPLAINs after the overflow and after the manual fix.
to show how ALTER changes what partition will be pruned

> +drop table t1;
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 3600 second
> +starts '2000-01-01 00:00:00' auto partitions 3;
> +affected rows: 0
> +insert into t1 values (1);
> +affected rows: 1
> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `x` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME INTERVAL 3600 SECOND STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> +PARTITIONS 3
> ...
> diff --git a/mysql-test/suite/versioning/r/rpl_mix.result b/mysql-test/suite/versioning/r/rpl_mix.result
> --- a/mysql-test/suite/versioning/r/rpl_mix.result
> +++ b/mysql-test/suite/versioning/r/rpl_mix.result
> @@ -8,4 +8,59 @@ DELETE HISTORY FROM t1;
>  connection slave;
>  connection master;
>  drop table t1;
> +#
> +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> +#
> +flush binary logs;
> +create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto;
> +insert into t1 values (1);
> +update t1 set a= a + 1;
> +update t1 set a= a + 1;

better do a+2 the second time. otherwise it's a bit confusing, the bug says
"written twice" and you indeed have update a=a+1 written twice :)
with clearly different statements the test will prove that every event is
written once, and not one is skipped and the other is duplicated

> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +include/show_binlog_events.inc
> +Log_name	Pos	Event_type	Server_id	End_log_pos	Info
> +master-bin.000002	#	Binlog_checkpoint	#	#	master-bin.000002
> +master-bin.000002	#	Gtid	#	#	GTID #-#-#
> +master-bin.000002	#	Query	#	#	use `test`; create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	insert into t1 values (1)
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	update t1 set a= a + 1
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Update_rows_v1	#	#	table_id: #
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	update t1 set a= a + 1
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Update_rows_v1	#	#	table_id: #
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +connection slave;
> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +connection master;
> +drop table t1;
>  include/rpl_end.inc
> diff --git a/mysql-test/suite/versioning/r/rpl_row.result b/mysql-test/suite/versioning/r/rpl_row.result
> --- a/mysql-test/suite/versioning/r/rpl_row.result
> +++ b/mysql-test/suite/versioning/r/rpl_row.result
> @@ -11,4 +11,59 @@ connection slave;
>  connection master;
>  drop table t1;
>  set binlog_row_image= @old_row_image;
> +#
> +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> +#
> +flush binary logs;
> +create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto;
> +insert into t1 values (1);
> +update t1 set a= a + 1;
> +update t1 set a= a + 1;

same. as a general rule it's helpful to use slightly different statement,
to be able to map them to binlog events uniquely. a+2, or 1+a, or whatever.

> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +include/show_binlog_events.inc
> +Log_name	Pos	Event_type	Server_id	End_log_pos	Info
> +master-bin.000002	#	Binlog_checkpoint	#	#	master-bin.000002
> +master-bin.000002	#	Gtid	#	#	GTID #-#-#
> +master-bin.000002	#	Query	#	#	use `test`; create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	insert into t1 values (1)
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	update t1 set a= a + 1
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Update_rows_v1	#	#	table_id: #
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +master-bin.000002	#	Gtid	#	#	BEGIN GTID #-#-#
> +master-bin.000002	#	Annotate_rows	#	#	update t1 set a= a + 1
> +master-bin.000002	#	Table_map	#	#	table_id: # (test.t1)
> +master-bin.000002	#	Update_rows_v1	#	#	table_id: #
> +master-bin.000002	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
> +master-bin.000002	#	Query	#	#	COMMIT
> +connection slave;
> +show create table t1;
> +Table	Create Table
> +t1	CREATE TABLE `t1` (
> +  `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +connection master;
> +drop table t1;
>  include/rpl_end.inc
> diff --git a/mysql-test/suite/versioning/t/partition.test b/mysql-test/suite/versioning/t/partition.test
> --- a/mysql-test/suite/versioning/t/partition.test
> +++ b/mysql-test/suite/versioning/t/partition.test
> @@ -1068,13 +1078,456 @@ alter table t1 add partition partitions 2;
>  --replace_result $default_engine DEFAULT_ENGINE
>  show create table t1;
>  alter table t1 add partition partitions 8;
> +drop table t1;
> +
> +--echo #
> +--echo # End of 10.5 tests
> +--echo #
> +
> +--echo #
> +--echo # MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +--echo #
> +create or replace table t1 (x int) with system versioning
> +partition by system_time limit 1 auto;
> +--replace_result $default_engine DEFAULT_ENGINE
> +show create table t1;
> +

in the previous version of this patch you had clarifying comments here, like

--echo # INSERT, INSERT .. SELECT don't auto-create partitions

or

--echo # Number of partitions goes from 3 to 5

I kind of miss them now, they were helpful

> +insert into t1 values (1);
> +
> +create or replace table t2 (y int);
> +insert into t2 values (2);
> +
> +insert into t1 select * from t2;
> +insert into t2 select * from t1;
> +--replace_result $default_engine DEFAULT_ENGINE
> +show create table t1;
> +
> ...
> diff --git a/sql/ha_partition.h b/sql/ha_partition.h
> --- a/sql/ha_partition.h
> +++ b/sql/ha_partition.h
> @@ -1607,7 +1607,7 @@ class ha_partition :public handler
>      for (; part_id < part_id_end; ++part_id)
>      {
>        handler *file= m_file[part_id];
> -      DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
> +      bitmap_set_bit(&(m_part_info->read_partitions), part_id);

would be good to have a comment before the method that it's
only used in vers_set_hist_part(), if (vers_info->limit).

because if it's a generic method to return the number of rows in a partition
(incl. subpartitions) then it's totally not clear why it modifies
read_partitions bitmap.

or may be you'd want to rename the method to make it look more
specific for LIMIT partitions in vers_set_hist_part() ?

>        file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK | HA_STATUS_OPEN);
>        part_recs+= file->stats.records;
>      }
> diff --git a/sql/handler.cc b/sql/handler.cc
> --- a/sql/handler.cc
> +++ b/sql/handler.cc
> @@ -1574,6 +1574,8 @@ int ha_commit_trans(THD *thd, bool all)
>    DBUG_ASSERT(thd->transaction->stmt.ha_list == NULL ||
>                trans == &thd->transaction->stmt);
>  
> +  DBUG_ASSERT(!thd->in_sub_stmt);
>    if (thd->in_sub_stmt)
>    {
>      DBUG_ASSERT(0);

This is a bit redundant, don't you think?
One DBUG_ASSERT would be enough.
(personally, I'd prefer the one you added, not DBUG_ASSERT(0))

> diff --git a/sql/lock.cc b/sql/lock.cc
> --- a/sql/lock.cc
> +++ b/sql/lock.cc
> @@ -662,16 +662,28 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
>    DBUG_PRINT("enter", ("a->lock_count: %u  b->lock_count: %u",
>                         a->lock_count, b->lock_count));
>  
> -  if (!(sql_lock= (MYSQL_LOCK*)
> -	my_malloc(key_memory_MYSQL_LOCK, sizeof(*sql_lock) +
> -		  sizeof(THR_LOCK_DATA*)*((a->lock_count+b->lock_count)*2) +
> -		  sizeof(TABLE*)*(a->table_count+b->table_count),MYF(MY_WME))))
> -    DBUG_RETURN(0);				// Fatal error
> +  const size_t lock_size= sizeof(*sql_lock) +
> +    sizeof(THR_LOCK_DATA *) * ((a->lock_count + b->lock_count) * 2) +
> +    sizeof(TABLE *) * (a->table_count + b->table_count);
> +  if (thd)
> +  {
> +    sql_lock= (MYSQL_LOCK *) thd->alloc(lock_size);
> +    if (!sql_lock)
> +      DBUG_RETURN(0);
> +    sql_lock->flags= GET_LOCK_ON_THD;

why? the commit comment for MDEV-23639 doesn't mention it.

generally, I think it's a good idea and, likely, should be used more than
just in recover_from_failed_open(). But why right now and in this commit?
Does this commit (22158b3c) rely on locks being on THD memroot? How so?

> +  }
> +  else
> +  {
> +    sql_lock= (MYSQL_LOCK *)
> +      my_malloc(key_memory_MYSQL_LOCK, lock_size, MYF(MY_WME));
> +    if (!sql_lock)
> +      DBUG_RETURN(0);
> +    sql_lock->flags= 0;
> +  }
>    sql_lock->lock_count=a->lock_count+b->lock_count;
>    sql_lock->table_count=a->table_count+b->table_count;
>    sql_lock->locks=(THR_LOCK_DATA**) (sql_lock+1);
>    sql_lock->table=(TABLE**) (sql_lock->locks+sql_lock->lock_count*2);
> -  sql_lock->flags= 0;
>    memcpy(sql_lock->locks,a->locks,a->lock_count*sizeof(*a->locks));
>    memcpy(sql_lock->locks+a->lock_count,b->locks,
>  	 b->lock_count*sizeof(*b->locks));
> @@ -705,8 +717,10 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
>                    a->lock_count, b->lock_count);
>  
>    /* Delete old, not needed locks */
> -  my_free(a);
> -  my_free(b);
> +  if (!(a->flags & GET_LOCK_ON_THD))
> +    my_free(a);
> +  if (!(b->flags & GET_LOCK_ON_THD))
> +    my_free(b);

this even looks like a bug, no check that old locks weren't on THD memroot.
(probably impossible in this particular case, but still rather fragile)
thanks!

>    DBUG_RETURN(sql_lock);
>  }
>  
> diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
> --- a/sql/share/errmsg-utf8.txt
> +++ b/sql/share/errmsg-utf8.txt
> @@ -7985,3 +7985,5 @@ ER_JSON_TABLE_MULTIPLE_MATCHES
>          eng "Can't store multiple matches of the path in the column '%s' of JSON_TABLE '%s'."
>  ER_WITH_TIES_NEEDS_ORDER
>          eng "FETCH ... WITH TIES requires ORDER BY clause to be present"
> +ER_VERS_HIST_PART_FAILED
> +        eng "Versioned table %`s.%`s: adding HISTORY partition failed"

Can you come up with a test for this error?

> diff --git a/sql/sql_base.cc b/sql/sql_base.cc
> --- a/sql/sql_base.cc
> +++ b/sql/sql_base.cc
> @@ -931,7 +934,7 @@ void close_thread_table(THD *thd, TABLE **table_ptr)
>    DBUG_PRINT("tcache", ("table: '%s'.'%s' %p", table->s->db.str,
>                          table->s->table_name.str, table));
>    DBUG_ASSERT(!table->file->keyread_enabled());
> -  DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
> +  DBUG_ASSERT(table->file->inited == handler::NONE);

indeed

>  
>    /*
>      The metadata lock must be released after giving back
> @@ -1625,6 +1625,125 @@ bool is_locked_view(THD *thd, TABLE_LIST *t)
>  }
>  
>  
> +#ifdef WITH_PARTITION_STORAGE_ENGINE
> +/**
> +  Switch part_info->hist_part and request partition creation if needed.
> +
> +  @retval true  Error or partition creation was requested.
> +  @retval false No error
> +*/
> +bool TABLE::vers_switch_partition(THD *thd, TABLE_LIST *table_list,
> +                                  Open_table_context *ot_ctx)
> +{
> +  if (!part_info || part_info->part_type != VERSIONING_PARTITION ||
> +      table_list->vers_conditions.delete_history ||
> +      thd->stmt_arena->is_stmt_prepare() ||
> +      table_list->lock_type < TL_WRITE_ALLOW_WRITE ||
> +      table_list->mdl_request.type < MDL_SHARED_WRITE ||
> +      table_list->mdl_request.type == MDL_EXCLUSIVE)
> +  {
> +    return false;
> +  }
> +
> +  switch (thd->lex->sql_command)
> +  {
> +    case SQLCOM_INSERT:
> +      if (thd->lex->duplicates != DUP_UPDATE)
> +        return false;
> +      break;
> +    case SQLCOM_LOAD:
> +      if (thd->lex->duplicates != DUP_REPLACE)
> +        return false;
> +      break;
> +    case SQLCOM_LOCK_TABLES:
> +    case SQLCOM_DELETE:
> +    case SQLCOM_UPDATE:
> +    case SQLCOM_REPLACE:
> +    case SQLCOM_REPLACE_SELECT:
> +    case SQLCOM_DELETE_MULTI:
> +    case SQLCOM_UPDATE_MULTI:
> +      break;
> +    default:
> +      /*
> +        TODO: make row events set thd->lex->sql_command appropriately.
> +
> +        Sergei Golubchik: f.ex. currently row events increment
> +        thd->status_var.com_stat[] each event for its own SQLCOM_xxx, it won't be
> +        needed if they'll just set thd->lex->sql_command.
> +      */
> +      if (thd->rgi_slave && thd->rgi_slave->current_event &&
> +          thd->lex->sql_command == SQLCOM_END)
> +      {
> +        switch (thd->rgi_slave->current_event->get_type_code())
> +        {
> +        case UPDATE_ROWS_EVENT:
> +        case UPDATE_ROWS_EVENT_V1:
> +        case DELETE_ROWS_EVENT:
> +        case DELETE_ROWS_EVENT_V1:
> +          break;
> +        default:;
> +          return false;
> +        }
> +      }
> +      break;
> +  }
> +
> +  TABLE *table= this;
> +
> +  /*
> +      NOTE: The semantics of vers_set_hist_part() is double: even when we

twofold

> +      don't need auto-create, we need to update part_info->hist_part.
> +  */
> +  uint *create_count= table_list->vers_skip_create ?
> +    NULL : &ot_ctx->vers_create_count;
> +  table_list->vers_skip_create= true;
> +  if (table->part_info->vers_set_hist_part(thd, create_count))
> +  {
> +    MYSQL_UNBIND_TABLE(table->file);
> +    tc_release_table(table);
> +    return true;
> +  }
> +  if (ot_ctx->vers_create_count)
> +  {
> +    Open_table_context::enum_open_table_action action;
> +    TABLE_LIST *table_arg;
> +    mysql_mutex_lock(&table->s->LOCK_share);
> +    if (!table->s->vers_skip_auto_create)
> +    {
> +      table->s->vers_skip_auto_create= true;
> +      action= Open_table_context::OT_ADD_HISTORY_PARTITION;
> +      table_arg= table_list;
> +    }
> +    else
> +    {
> +      /*
> +          NOTE: this may repeat multiple times until creating thread acquires
> +          MDL_EXCLUSIVE. Since auto-creation is rare operation this is acceptable.
> +          We could suspend this thread on cond-var but we must first exit
> +          MDL_SHARED_WRITE first and we cannot store cond-var into TABLE_SHARE
> +          because it is already released and there is no guarantee that it will
> +          be same instance if we acquire it again.
> +      */
> +      table_list->vers_skip_create= false;
> +      ot_ctx->vers_create_count= 0;
> +      action= Open_table_context::OT_REOPEN_TABLES;
> +      table_arg= NULL;
> +    }

I'm afraid I don't understand. All this business with vers_skip_create and
vers_skip_auto_create, it wasn't in the previous version of the patch. So, I
believe it was a fix for one of the MDEV bugs reported and fixed meanwhile.

What MDEV was it?
Could you explain what was the issue and what was the fix, please?

> +    mysql_mutex_unlock(&table->s->LOCK_share);
> +    if (!thd->locked_tables_mode)
> +    {
> +      MYSQL_UNBIND_TABLE(table->file);
> +      tc_release_table(table);
> +    }
> +    ot_ctx->request_backoff_action(action, table_arg);
> +    return true;
> +  }
> +
> +  return false;
> +}
> +#endif /* WITH_PARTITION_STORAGE_ENGINE */
> +
> +
>  /**
>    Open a base table.
>  
> @@ -2572,11 +2699,13 @@ void Locked_tables_list::mark_table_for_reopen(THD *thd, TABLE *table)
>         table_list; table_list= table_list->next_global)
>    {
>      if (table_list->table->s == share)
> +    {
>        table_list->table->internal_set_needs_reopen(true);
> +      some_table_marked_for_reopen= 1;
> +    }
>    }
>    /* This is needed in the case where lock tables where not used */
>    table->internal_set_needs_reopen(true);

why ^^^ this doesn't count as "some_table_marked_for_reopen" ?

> -  some_table_marked_for_reopen= 1;
>  }
>  
>  
> @@ -3172,13 +3304,49 @@ Open_table_context::recover_from_failed_open()
>        break;
>      case OT_DISCOVER:
>      case OT_REPAIR:
> -      if ((result= lock_table_names(m_thd, m_thd->lex->create_info,
> -                                    m_failed_table, NULL,
> -                                    get_timeout(), 0)))
> +    case OT_ADD_HISTORY_PARTITION:
> +      if (!m_thd->locked_tables_mode)
> +        result= lock_table_names(m_thd, m_thd->lex->create_info, m_failed_table,
> +                                NULL, get_timeout(), 0);
> +      else
> +      {
> +        DBUG_ASSERT(!result);
> +        DBUG_ASSERT(m_action == OT_ADD_HISTORY_PARTITION);

It seems you've solved recover_from_failed_open under LOCK TABLES,
so it could be possible to make OT_DISCOVER and OT_REPAIR to work there too.
Is that right? It's just a question, I don't imply it should be part of this MDEV.

> +      }
> +      /*
> +         We are now under MDL_EXCLUSIVE mode. Other threads have no table share
> +         acquired: they are blocked either at open_table_get_mdl_lock() in
> +         open_table() or at lock_table_names() here.
> +      */
> +      if (result)
> +      {
> +        if (m_action == OT_ADD_HISTORY_PARTITION)
> +        {
> +          TABLE_SHARE *share= tdc_acquire_share(m_thd, m_failed_table,
> +                                                GTS_TABLE, NULL);
> +          if (share)
> +          {
> +            share->vers_skip_auto_create= false;
> +            tdc_release_share(share);
> +          }
> +          if (m_thd->get_stmt_da()->sql_errno() == ER_LOCK_WAIT_TIMEOUT)
> +          {
> +            // MDEV-23642 Locking timeout caused by auto-creation affects original DML
> +            m_thd->clear_error();
> +            vers_create_count= 0;
> +            result= false;
> +          }
> +        }
>          break;
> +      }
>  
> -      tdc_remove_table(m_thd, m_failed_table->db.str,
> -                       m_failed_table->table_name.str);
> +      /*
> +         We don't need to remove share under OT_ADD_HISTORY_PARTITION.
> +         Moreover fast_alter_partition_table() works with TABLE instance.
> +      */
> +      if (m_action != OT_ADD_HISTORY_PARTITION)
> +        tdc_remove_table(m_thd, m_failed_table->db.str,
> +                        m_failed_table->table_name.str);
>  
>        switch (m_action)
>        {
> @@ -3206,6 +3374,70 @@ Open_table_context::recover_from_failed_open()
>          case OT_REPAIR:
>            result= auto_repair_table(m_thd, m_failed_table);
>            break;
> +        case OT_ADD_HISTORY_PARTITION:
> +        {
> +          result= false;
> +          TABLE *table= open_ltable(m_thd, m_failed_table, TL_WRITE,
> +                    MYSQL_OPEN_HAS_MDL_LOCK | MYSQL_OPEN_IGNORE_LOGGING_FORMAT);
> +          if (table == NULL)
> +          {
> +            m_thd->clear_error();
> +            break;
> +          }
> +
> +          DBUG_ASSERT(vers_create_count);
> +          result= vers_create_partitions(m_thd, m_failed_table, vers_create_count);
> +          vers_create_count= 0;
> +          if (!m_thd->transaction->stmt.is_empty())
> +            trans_commit_stmt(m_thd);
> +          DBUG_ASSERT(!result ||
> +                      !m_thd->locked_tables_mode ||
> +                      m_thd->lock->lock_count);
> +          if (result)
> +            break;
> +          if (!m_thd->locked_tables_mode)
> +          {
> +            /*
> +              alter_partition_lock_handling() does mysql_lock_remove() but
> +              does not clear thd->lock completely.
> +            */
> +            DBUG_ASSERT(m_thd->lock->lock_count == 0);
> +            if (!(m_thd->lock->flags & GET_LOCK_ON_THD))
> +              my_free(m_thd->lock);
> +            m_thd->lock= NULL;
> +          }
> +          else if (m_thd->locked_tables_mode == LTM_PRELOCKED)
> +          {
> +            MYSQL_LOCK *lock;
> +            MYSQL_LOCK *merged_lock;
> +
> +            /*
> +              In LTM_LOCK_TABLES table was reopened via locked_tables_list,
> +              but not in prelocked environment where we have to reopen
> +              the table manually.
> +            */
> +            Open_table_context ot_ctx(m_thd, MYSQL_OPEN_REOPEN);
> +            if (open_table(m_thd, m_failed_table, &ot_ctx))
> +            {
> +              result= true;
> +              break;
> +            }
> +            TABLE *table= m_failed_table->table;
> +            table->reginfo.lock_type= m_thd->update_lock_default;
> +            m_thd->in_lock_tables= 1;
> +            lock= mysql_lock_tables(m_thd, &table, 1,
> +                                    MYSQL_OPEN_REOPEN | MYSQL_LOCK_USE_MALLOC);
> +            m_thd->in_lock_tables= 0;
> +            if (lock == NULL ||
> +                !(merged_lock= mysql_lock_merge(m_thd->lock, lock, m_thd)))
> +            {
> +              result= true;
> +              break;
> +            }
> +            m_thd->lock= merged_lock;

What MDEV does it fix?
I wonder why other cases in recover_from_failed_open doesn't do that

> +          }
> +          break;
> +        }
>          case OT_BACKOFF_AND_RETRY:
>          case OT_REOPEN_TABLES:
>          case OT_NO_ACTION:
> @@ -4370,6 +4611,8 @@ bool open_tables(THD *thd, const DDL_options_st &options,
>          {
>            if (ot_ctx.can_recover_from_failed_open())
>            {
> +            // FIXME: is this really used?
> +            DBUG_ASSERT(0);

I take it, you weren't able to get this assert to fire?

>              close_tables_for_reopen(thd, start,
>                                      ot_ctx.start_of_statement_svp());
>              if (ot_ctx.recover_from_failed_open())

Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx


Follow ups