maria-developers team mailing list archive
-
maria-developers team
-
Mailing list archive
-
Message #12735
Re: 38a888da0f1: MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
Hi Sergei!
I'm sending the reply but still didn't push the changes. Please expect
the changes pushed when I reassign MDEV-17554 to you.
On Sun, May 30, 2021 at 11:41 PM Sergei Golubchik <serg@xxxxxxxxxxx> wrote:
>
> Hi, Aleksey!
>
> Like last time, it's a review of the diff bc04ded2353 22158b3c05a,
> so not only commit 38a888da0f1.
>
> The main thing below - I didn't understand a couple of changes and asked
> for clarifications.
>
> On May 30, Aleksey Midenkov wrote:
> > revision-id: 38a888da0f1 (mariadb-10.5.2-653-g38a888da0f1)
> > parent(s): bc04ded2353
> > author: Aleksey Midenkov <midenok@xxxxxxxxx>
> > committer: Aleksey Midenkov <midenok@xxxxxxxxx>
> > timestamp: 2021-04-22 23:35:06 +0300
> > message:
> >
> > MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
>
>
> > diff --git a/mysql-test/suite/versioning/r/delete_history.result b/mysql-test/suite/versioning/r/delete_history.result
> > --- a/mysql-test/suite/versioning/r/delete_history.result
> > +++ b/mysql-test/suite/versioning/r/delete_history.result
> > @@ -154,3 +152,21 @@ select * from t1;
> > a
> > 1
> > drop table t1;
> > +#
> > +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> > +#
> > +# Don't auto-create new partition on DELETE HISTORY:
> > +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> > +create or replace table t (a int) with system versioning
> > +partition by system_time interval 1 hour auto;
> > +set timestamp= unix_timestamp('2000-01-01 10:00:00');
>
> perhaps, make it only 02:00:00 to avoid any chance that
> a partition won't be created because of max-auto-create limit.
> it's possible we put the limit at 10, but 2 is very unlikely.
>
> or does max-auto-create limit always result in a warning?
>
What is max-auto-create limit and how do we make it 10? Did you mean
that condition:
if (*create_count == MAX_PARTITIONS - 2)
{
my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0));
return true;
}
> > +delete history from t;
> > +set timestamp= default;
> > +show create table t;
> > +Table Create Table
> > +t CREATE TABLE `t` (
> > + `a` int(11) DEFAULT NULL
> > +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> > +PARTITIONS 2
> > +drop table t;
> > diff --git a/mysql-test/suite/versioning/r/partition.result b/mysql-test/suite/versioning/r/partition.result
> > --- a/mysql-test/suite/versioning/r/partition.result
> > +++ b/mysql-test/suite/versioning/r/partition.result
> > @@ -1236,27 +1270,826 @@ t1 CREATE TABLE `t1` (
> > PARTITION `p5` HISTORY ENGINE = DEFAULT_ENGINE,
> > PARTITION `pn` CURRENT ENGINE = DEFAULT_ENGINE)
> > alter table t1 add partition partitions 8;
> > +drop table t1;
> > +#
> > +# End of 10.5 tests
> > +#
> > +#
> > +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> > +#
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time limit 1 auto;
> > show create table t1;
> > Table Create Table
> > t1 CREATE TABLE `t1` (
> > `x` int(11) DEFAULT NULL
> > ) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > - PARTITION BY SYSTEM_TIME LIMIT 1
> > -(PARTITION `p2` HISTORY ENGINE = DEFAULT_ENGINE,
> > - PARTITION `p8` HISTORY ENGINE = DEFAULT_ENGINE,
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 2
> > +insert into t1 values (1);
> > +create or replace table t2 (y int);
> > +insert into t2 values (2);
> > +insert into t1 select * from t2;
> > +insert into t2 select * from t1;
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `x` int(11) DEFAULT NULL
> > +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 2
> > +# Too many partitions error
> > +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time interval 1 hour auto;
> > +set timestamp= unix_timestamp('2001-01-01 00:01:00');
> > +update t1 set x= x + 1;
> > +ERROR HY000: Too many partitions (including subpartitions) were defined
> > +# Partition overflow error and manual fix
>
> seems to be a wrong comment? no manual fix here.
Fixed.
>
> > +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time interval 1 hour;
> > +insert into t1 values (333);
> > +set timestamp= unix_timestamp('2000-01-01 01:00:00');
> > +update t1 set x= x + 1;
> > +Warnings:
> > +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> > +drop table t1;
> > +# Partition overflow error and manual fix
> > +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time interval 1 hour;
> > +insert into t1 values (440);
> > +set timestamp= unix_timestamp('2000-01-01 00:10:00');
> > +update t1 set x= x + 1;
> > +# Check how pruning boundaries work
> > +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:58';
> > +id select_type table partitions type possible_keys key key_len ref rows Extra
> > +1 SIMPLE t1 p0,pn # NULL NULL NULL NULL # #
> > +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:59';
> > +id select_type table partitions type possible_keys key key_len ref rows Extra
> > +1 SIMPLE t1 pn # NULL NULL NULL NULL # #
>
> Why the first select looks in both partitions and the second one - only in pn?
>
Table is partitioned by row_end: AS OF ts1 means select records with
row_end > ts1.
AS OF '2000-01-01 00:59:59' means select records with row_end >
'2000-01-01 00:59:59' and that is partition in next hour interval.
> > +explain partitions select * from t1 for system_time as of '2000-01-01 01:00:00';
> > +id select_type table partitions type possible_keys key key_len ref rows Extra
> > +1 SIMPLE t1 pn # NULL NULL NULL NULL # #
> > +select * from t1 for system_time as of '2000-01-01 00:09:59';
> > +x
> > +440
> > +set timestamp= unix_timestamp('2000-01-01 02:00:00');
> > +update t1 set x= x + 1;
> > +Warnings:
> > +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> > +select * from t1 for system_time as of '2000-01-01 01:00:00';
> > +x
> > +select * from t1 partition (p0) order by x;
> > +x
> > +440
> > +441
> > +alter table t1 add partition partitions 3;
> > +select * from t1 for system_time as of '2000-01-01 01:00:00';
> > +x
> > +441
> > +select * from t1 partition (p0) order by x;
> > +x
> > +440
>
> would be good to have EXPLAINs after the overflow and after the manual fix.
> to show how ALTER changes what partition will be pruned
>
> > +drop table t1;
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time interval 3600 second
> > +starts '2000-01-01 00:00:00' auto partitions 3;
> > +affected rows: 0
> > +insert into t1 values (1);
> > +affected rows: 1
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `x` int(11) DEFAULT NULL
> > +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME INTERVAL 3600 SECOND STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> > +PARTITIONS 3
> > ...
> > diff --git a/mysql-test/suite/versioning/r/rpl_mix.result b/mysql-test/suite/versioning/r/rpl_mix.result
> > --- a/mysql-test/suite/versioning/r/rpl_mix.result
> > +++ b/mysql-test/suite/versioning/r/rpl_mix.result
> > @@ -8,4 +8,59 @@ DELETE HISTORY FROM t1;
> > connection slave;
> > connection master;
> > drop table t1;
> > +#
> > +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> > +#
> > +flush binary logs;
> > +create table t1 (a int) with system versioning
> > +partition by system_time limit 1 auto;
> > +insert into t1 values (1);
> > +update t1 set a= a + 1;
> > +update t1 set a= a + 1;
>
> better do a+2 the second time. otherwise it's a bit confusing, the bug says
> "written twice" and you indeed have update a=a+1 written twice :)
> with clearly different statements the test will prove that every event is
> written once, and not one is skipped and the other is duplicated
Yep, I use this trick. Fixed.
>
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `a` int(11) DEFAULT NULL
> > +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 3
> > +select * from t1;
> > +a
> > +3
> > +include/show_binlog_events.inc
> > +Log_name Pos Event_type Server_id End_log_pos Info
> > +master-bin.000002 # Binlog_checkpoint # # master-bin.000002
> > +master-bin.000002 # Gtid # # GTID #-#-#
> > +master-bin.000002 # Query # # use `test`; create table t1 (a int) with system versioning
> > +partition by system_time limit 1 auto
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # insert into t1 values (1)
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Update_rows_v1 # # table_id: #
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Update_rows_v1 # # table_id: #
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +connection slave;
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `a` int(11) DEFAULT NULL
> > +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 3
> > +select * from t1;
> > +a
> > +3
> > +connection master;
> > +drop table t1;
> > include/rpl_end.inc
> > diff --git a/mysql-test/suite/versioning/r/rpl_row.result b/mysql-test/suite/versioning/r/rpl_row.result
> > --- a/mysql-test/suite/versioning/r/rpl_row.result
> > +++ b/mysql-test/suite/versioning/r/rpl_row.result
> > @@ -11,4 +11,59 @@ connection slave;
> > connection master;
> > drop table t1;
> > set binlog_row_image= @old_row_image;
> > +#
> > +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> > +#
> > +flush binary logs;
> > +create table t1 (a int) with system versioning
> > +partition by system_time limit 1 auto;
> > +insert into t1 values (1);
> > +update t1 set a= a + 1;
> > +update t1 set a= a + 1;
>
> same. as a general rule it's helpful to use slightly different statement,
> to be able to map them to binlog events uniquely. a+2, or 1+a, or whatever.
>
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `a` int(11) DEFAULT NULL
> > +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 3
> > +select * from t1;
> > +a
> > +3
> > +include/show_binlog_events.inc
> > +Log_name Pos Event_type Server_id End_log_pos Info
> > +master-bin.000002 # Binlog_checkpoint # # master-bin.000002
> > +master-bin.000002 # Gtid # # GTID #-#-#
> > +master-bin.000002 # Query # # use `test`; create table t1 (a int) with system versioning
> > +partition by system_time limit 1 auto
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # insert into t1 values (1)
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Update_rows_v1 # # table_id: #
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> > +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> > +master-bin.000002 # Table_map # # table_id: # (test.t1)
> > +master-bin.000002 # Update_rows_v1 # # table_id: #
> > +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> > +master-bin.000002 # Query # # COMMIT
> > +connection slave;
> > +show create table t1;
> > +Table Create Table
> > +t1 CREATE TABLE `t1` (
> > + `a` int(11) DEFAULT NULL
> > +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> > + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> > +PARTITIONS 3
> > +select * from t1;
> > +a
> > +3
> > +connection master;
> > +drop table t1;
> > include/rpl_end.inc
> > diff --git a/mysql-test/suite/versioning/t/partition.test b/mysql-test/suite/versioning/t/partition.test
> > --- a/mysql-test/suite/versioning/t/partition.test
> > +++ b/mysql-test/suite/versioning/t/partition.test
> > @@ -1068,13 +1078,456 @@ alter table t1 add partition partitions 2;
> > --replace_result $default_engine DEFAULT_ENGINE
> > show create table t1;
> > alter table t1 add partition partitions 8;
> > +drop table t1;
> > +
> > +--echo #
> > +--echo # End of 10.5 tests
> > +--echo #
> > +
> > +--echo #
> > +--echo # MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> > +--echo #
> > +create or replace table t1 (x int) with system versioning
> > +partition by system_time limit 1 auto;
> > +--replace_result $default_engine DEFAULT_ENGINE
> > +show create table t1;
> > +
>
> in the previous version of this patch you had clarifying comments here, like
>
> --echo # INSERT, INSERT .. SELECT don't auto-create partitions
>
> or
>
> --echo # Number of partitions goes from 3 to 5
>
> I kind of miss them now, they were helpful
Yes, they were helpful as is. I removed them like I promised in the
previous thread. Though, Sergei, you are free to add your own comments
in the form you like. I'm really sorry, but nobody won't tell me what
words to use unless I am not able to be clear and bear the common
sense.
>
> > +insert into t1 values (1);
> > +
> > +create or replace table t2 (y int);
> > +insert into t2 values (2);
> > +
> > +insert into t1 select * from t2;
> > +insert into t2 select * from t1;
> > +--replace_result $default_engine DEFAULT_ENGINE
> > +show create table t1;
> > +
> > ...
> > diff --git a/sql/ha_partition.h b/sql/ha_partition.h
> > --- a/sql/ha_partition.h
> > +++ b/sql/ha_partition.h
> > @@ -1607,7 +1607,7 @@ class ha_partition :public handler
> > for (; part_id < part_id_end; ++part_id)
> > {
> > handler *file= m_file[part_id];
> > - DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
> > + bitmap_set_bit(&(m_part_info->read_partitions), part_id);
>
> would be good to have a comment before the method that it's
> only used in vers_set_hist_part(), if (vers_info->limit).
>
> because if it's a generic method to return the number of rows in a partition
> (incl. subpartitions) then it's totally not clear why it modifies
> read_partitions bitmap.
>
> or may be you'd want to rename the method to make it look more
> specific for LIMIT partitions in vers_set_hist_part() ?
Added a comment if it will matter to anyone. No, I prefer generic and
easy names instead of long and capacious ones. I believe the latter
ones are non-productive.
>
> > file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK | HA_STATUS_OPEN);
> > part_recs+= file->stats.records;
> > }
> > diff --git a/sql/handler.cc b/sql/handler.cc
> > --- a/sql/handler.cc
> > +++ b/sql/handler.cc
> > @@ -1574,6 +1574,8 @@ int ha_commit_trans(THD *thd, bool all)
> > DBUG_ASSERT(thd->transaction->stmt.ha_list == NULL ||
> > trans == &thd->transaction->stmt);
> >
> > + DBUG_ASSERT(!thd->in_sub_stmt);
> > if (thd->in_sub_stmt)
> > {
> > DBUG_ASSERT(0);
>
> This is a bit redundant, don't you think?
> One DBUG_ASSERT would be enough.
> (personally, I'd prefer the one you added, not DBUG_ASSERT(0))
>
Oh, that's kind of strange I missed that. Okay.
> > diff --git a/sql/lock.cc b/sql/lock.cc
> > --- a/sql/lock.cc
> > +++ b/sql/lock.cc
> > @@ -662,16 +662,28 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
> > DBUG_PRINT("enter", ("a->lock_count: %u b->lock_count: %u",
> > a->lock_count, b->lock_count));
> >
> > - if (!(sql_lock= (MYSQL_LOCK*)
> > - my_malloc(key_memory_MYSQL_LOCK, sizeof(*sql_lock) +
> > - sizeof(THR_LOCK_DATA*)*((a->lock_count+b->lock_count)*2) +
> > - sizeof(TABLE*)*(a->table_count+b->table_count),MYF(MY_WME))))
> > - DBUG_RETURN(0); // Fatal error
> > + const size_t lock_size= sizeof(*sql_lock) +
> > + sizeof(THR_LOCK_DATA *) * ((a->lock_count + b->lock_count) * 2) +
> > + sizeof(TABLE *) * (a->table_count + b->table_count);
> > + if (thd)
> > + {
> > + sql_lock= (MYSQL_LOCK *) thd->alloc(lock_size);
> > + if (!sql_lock)
> > + DBUG_RETURN(0);
> > + sql_lock->flags= GET_LOCK_ON_THD;
>
> why? the commit comment for MDEV-23639 doesn't mention it.
>
> generally, I think it's a good idea and, likely, should be used more than
> just in recover_from_failed_open(). But why right now and in this commit?
> Does this commit (22158b3c) rely on locks being on THD memroot? How so?
Yes, it is possible to have locks on THD at that moment. Sorry, I was
too tired to investigate the exact cause nor to have interest in that.
And I don't really feel that misc refactoring is something important
to mention. All I can say: I am not happy with the two allocation
methods for locks. They should always be allocated from mem_root.
Locked_tables_list has its own dedicated one AFAIR.
>
> > + }
> > + else
> > + {
> > + sql_lock= (MYSQL_LOCK *)
> > + my_malloc(key_memory_MYSQL_LOCK, lock_size, MYF(MY_WME));
> > + if (!sql_lock)
> > + DBUG_RETURN(0);
> > + sql_lock->flags= 0;
> > + }
> > sql_lock->lock_count=a->lock_count+b->lock_count;
> > sql_lock->table_count=a->table_count+b->table_count;
> > sql_lock->locks=(THR_LOCK_DATA**) (sql_lock+1);
> > sql_lock->table=(TABLE**) (sql_lock->locks+sql_lock->lock_count*2);
> > - sql_lock->flags= 0;
> > memcpy(sql_lock->locks,a->locks,a->lock_count*sizeof(*a->locks));
> > memcpy(sql_lock->locks+a->lock_count,b->locks,
> > b->lock_count*sizeof(*b->locks));
> > @@ -705,8 +717,10 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
> > a->lock_count, b->lock_count);
> >
> > /* Delete old, not needed locks */
> > - my_free(a);
> > - my_free(b);
> > + if (!(a->flags & GET_LOCK_ON_THD))
> > + my_free(a);
> > + if (!(b->flags & GET_LOCK_ON_THD))
> > + my_free(b);
>
> this even looks like a bug, no check that old locks weren't on THD memroot.
> (probably impossible in this particular case, but still rather fragile)
> thanks!
>
> > DBUG_RETURN(sql_lock);
> > }
> >
> > diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
> > --- a/sql/share/errmsg-utf8.txt
> > +++ b/sql/share/errmsg-utf8.txt
> > @@ -7985,3 +7985,5 @@ ER_JSON_TABLE_MULTIPLE_MATCHES
> > eng "Can't store multiple matches of the path in the column '%s' of JSON_TABLE '%s'."
> > ER_WITH_TIES_NEEDS_ORDER
> > eng "FETCH ... WITH TIES requires ORDER BY clause to be present"
> > +ER_VERS_HIST_PART_FAILED
> > + eng "Versioned table %`s.%`s: adding HISTORY partition failed"
>
> Can you come up with a test for this error?
Why? There are 5 cases, for which of them? I'll try to explain: the
code in these calls is vanilla and should be already tested. So, why?
>
> > diff --git a/sql/sql_base.cc b/sql/sql_base.cc
> > --- a/sql/sql_base.cc
> > +++ b/sql/sql_base.cc
> > @@ -931,7 +934,7 @@ void close_thread_table(THD *thd, TABLE **table_ptr)
> > DBUG_PRINT("tcache", ("table: '%s'.'%s' %p", table->s->db.str,
> > table->s->table_name.str, table));
> > DBUG_ASSERT(!table->file->keyread_enabled());
> > - DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
> > + DBUG_ASSERT(table->file->inited == handler::NONE);
>
> indeed
>
> >
> > /*
> > The metadata lock must be released after giving back
> > @@ -1625,6 +1625,125 @@ bool is_locked_view(THD *thd, TABLE_LIST *t)
> > }
> >
> >
> > +#ifdef WITH_PARTITION_STORAGE_ENGINE
> > +/**
> > + Switch part_info->hist_part and request partition creation if needed.
> > +
> > + @retval true Error or partition creation was requested.
> > + @retval false No error
> > +*/
> > +bool TABLE::vers_switch_partition(THD *thd, TABLE_LIST *table_list,
> > + Open_table_context *ot_ctx)
> > +{
> > + if (!part_info || part_info->part_type != VERSIONING_PARTITION ||
> > + table_list->vers_conditions.delete_history ||
> > + thd->stmt_arena->is_stmt_prepare() ||
> > + table_list->lock_type < TL_WRITE_ALLOW_WRITE ||
> > + table_list->mdl_request.type < MDL_SHARED_WRITE ||
> > + table_list->mdl_request.type == MDL_EXCLUSIVE)
> > + {
> > + return false;
> > + }
> > +
> > + switch (thd->lex->sql_command)
> > + {
> > + case SQLCOM_INSERT:
> > + if (thd->lex->duplicates != DUP_UPDATE)
> > + return false;
> > + break;
> > + case SQLCOM_LOAD:
> > + if (thd->lex->duplicates != DUP_REPLACE)
> > + return false;
> > + break;
> > + case SQLCOM_LOCK_TABLES:
> > + case SQLCOM_DELETE:
> > + case SQLCOM_UPDATE:
> > + case SQLCOM_REPLACE:
> > + case SQLCOM_REPLACE_SELECT:
> > + case SQLCOM_DELETE_MULTI:
> > + case SQLCOM_UPDATE_MULTI:
> > + break;
> > + default:
> > + /*
> > + TODO: make row events set thd->lex->sql_command appropriately.
> > +
> > + Sergei Golubchik: f.ex. currently row events increment
> > + thd->status_var.com_stat[] each event for its own SQLCOM_xxx, it won't be
> > + needed if they'll just set thd->lex->sql_command.
> > + */
> > + if (thd->rgi_slave && thd->rgi_slave->current_event &&
> > + thd->lex->sql_command == SQLCOM_END)
> > + {
> > + switch (thd->rgi_slave->current_event->get_type_code())
> > + {
> > + case UPDATE_ROWS_EVENT:
> > + case UPDATE_ROWS_EVENT_V1:
> > + case DELETE_ROWS_EVENT:
> > + case DELETE_ROWS_EVENT_V1:
> > + break;
> > + default:;
> > + return false;
> > + }
> > + }
> > + break;
> > + }
> > +
> > + TABLE *table= this;
> > +
> > + /*
> > + NOTE: The semantics of vers_set_hist_part() is double: even when we
>
> twofold
>
Dual, double or twofold. What is the difference between the synonyms?
> > + don't need auto-create, we need to update part_info->hist_part.
> > + */
> > + uint *create_count= table_list->vers_skip_create ?
> > + NULL : &ot_ctx->vers_create_count;
> > + table_list->vers_skip_create= true;
> > + if (table->part_info->vers_set_hist_part(thd, create_count))
> > + {
> > + MYSQL_UNBIND_TABLE(table->file);
> > + tc_release_table(table);
> > + return true;
> > + }
> > + if (ot_ctx->vers_create_count)
> > + {
> > + Open_table_context::enum_open_table_action action;
> > + TABLE_LIST *table_arg;
> > + mysql_mutex_lock(&table->s->LOCK_share);
> > + if (!table->s->vers_skip_auto_create)
> > + {
> > + table->s->vers_skip_auto_create= true;
> > + action= Open_table_context::OT_ADD_HISTORY_PARTITION;
> > + table_arg= table_list;
> > + }
> > + else
> > + {
> > + /*
> > + NOTE: this may repeat multiple times until creating thread acquires
> > + MDL_EXCLUSIVE. Since auto-creation is rare operation this is acceptable.
> > + We could suspend this thread on cond-var but we must first exit
> > + MDL_SHARED_WRITE first and we cannot store cond-var into TABLE_SHARE
> > + because it is already released and there is no guarantee that it will
> > + be same instance if we acquire it again.
> > + */
> > + table_list->vers_skip_create= false;
> > + ot_ctx->vers_create_count= 0;
> > + action= Open_table_context::OT_REOPEN_TABLES;
> > + table_arg= NULL;
> > + }
>
> I'm afraid I don't understand. All this business with vers_skip_create and
> vers_skip_auto_create, it wasn't in the previous version of the patch. So, I
> believe it was a fix for one of the MDEV bugs reported and fixed meanwhile.
>
> What MDEV was it?
> Could you explain what was the issue and what was the fix, please?
No, that was the multi-threaded case which worked good for me, but
suddenly I discovered it fails on some buildbot.
That was this case AFAIR:
--echo # Concurrent DML
set timestamp= unix_timestamp('2000-01-01 00:00:00');
create or replace table t1 (x int) with system versioning
partition by system_time interval 1 hour auto partitions 3;
...
>
> > + mysql_mutex_unlock(&table->s->LOCK_share);
> > + if (!thd->locked_tables_mode)
> > + {
> > + MYSQL_UNBIND_TABLE(table->file);
> > + tc_release_table(table);
> > + }
> > + ot_ctx->request_backoff_action(action, table_arg);
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +#endif /* WITH_PARTITION_STORAGE_ENGINE */
> > +
> > +
> > /**
> > Open a base table.
> >
> > @@ -2572,11 +2699,13 @@ void Locked_tables_list::mark_table_for_reopen(THD *thd, TABLE *table)
> > table_list; table_list= table_list->next_global)
> > {
> > if (table_list->table->s == share)
> > + {
> > table_list->table->internal_set_needs_reopen(true);
> > + some_table_marked_for_reopen= 1;
> > + }
> > }
> > /* This is needed in the case where lock tables where not used */
> > table->internal_set_needs_reopen(true);
>
> why ^^^ this doesn't count as "some_table_marked_for_reopen" ?
Because reopen_tables() works only on table_list member. I tried to
minimize spurious calls for reopen_tables().
>
> > - some_table_marked_for_reopen= 1;
> > }
> >
> >
> > @@ -3172,13 +3304,49 @@ Open_table_context::recover_from_failed_open()
> > break;
> > case OT_DISCOVER:
> > case OT_REPAIR:
> > - if ((result= lock_table_names(m_thd, m_thd->lex->create_info,
> > - m_failed_table, NULL,
> > - get_timeout(), 0)))
> > + case OT_ADD_HISTORY_PARTITION:
> > + if (!m_thd->locked_tables_mode)
> > + result= lock_table_names(m_thd, m_thd->lex->create_info, m_failed_table,
> > + NULL, get_timeout(), 0);
> > + else
> > + {
> > + DBUG_ASSERT(!result);
> > + DBUG_ASSERT(m_action == OT_ADD_HISTORY_PARTITION);
>
> It seems you've solved recover_from_failed_open under LOCK TABLES,
> so it could be possible to make OT_DISCOVER and OT_REPAIR to work there too.
> Is that right? It's just a question, I don't imply it should be part of this MDEV.
The main conditions in my solution are: 1. keep the table share in
memory; 2. let the Locked_tables_list reopen the tables. Are these
conditions applicable to these cases, I don't know.
>
> > + }
> > + /*
> > + We are now under MDL_EXCLUSIVE mode. Other threads have no table share
> > + acquired: they are blocked either at open_table_get_mdl_lock() in
> > + open_table() or at lock_table_names() here.
> > + */
> > + if (result)
> > + {
> > + if (m_action == OT_ADD_HISTORY_PARTITION)
> > + {
> > + TABLE_SHARE *share= tdc_acquire_share(m_thd, m_failed_table,
> > + GTS_TABLE, NULL);
> > + if (share)
> > + {
> > + share->vers_skip_auto_create= false;
> > + tdc_release_share(share);
> > + }
> > + if (m_thd->get_stmt_da()->sql_errno() == ER_LOCK_WAIT_TIMEOUT)
> > + {
> > + // MDEV-23642 Locking timeout caused by auto-creation affects original DML
> > + m_thd->clear_error();
> > + vers_create_count= 0;
> > + result= false;
> > + }
> > + }
> > break;
> > + }
> >
> > - tdc_remove_table(m_thd, m_failed_table->db.str,
> > - m_failed_table->table_name.str);
> > + /*
> > + We don't need to remove share under OT_ADD_HISTORY_PARTITION.
> > + Moreover fast_alter_partition_table() works with TABLE instance.
> > + */
> > + if (m_action != OT_ADD_HISTORY_PARTITION)
> > + tdc_remove_table(m_thd, m_failed_table->db.str,
> > + m_failed_table->table_name.str);
> >
> > switch (m_action)
> > {
> > @@ -3206,6 +3374,70 @@ Open_table_context::recover_from_failed_open()
> > case OT_REPAIR:
> > result= auto_repair_table(m_thd, m_failed_table);
> > break;
> > + case OT_ADD_HISTORY_PARTITION:
> > + {
> > + result= false;
> > + TABLE *table= open_ltable(m_thd, m_failed_table, TL_WRITE,
> > + MYSQL_OPEN_HAS_MDL_LOCK | MYSQL_OPEN_IGNORE_LOGGING_FORMAT);
> > + if (table == NULL)
> > + {
> > + m_thd->clear_error();
> > + break;
> > + }
> > +
> > + DBUG_ASSERT(vers_create_count);
> > + result= vers_create_partitions(m_thd, m_failed_table, vers_create_count);
> > + vers_create_count= 0;
> > + if (!m_thd->transaction->stmt.is_empty())
> > + trans_commit_stmt(m_thd);
> > + DBUG_ASSERT(!result ||
> > + !m_thd->locked_tables_mode ||
> > + m_thd->lock->lock_count);
> > + if (result)
> > + break;
> > + if (!m_thd->locked_tables_mode)
> > + {
> > + /*
> > + alter_partition_lock_handling() does mysql_lock_remove() but
> > + does not clear thd->lock completely.
> > + */
> > + DBUG_ASSERT(m_thd->lock->lock_count == 0);
> > + if (!(m_thd->lock->flags & GET_LOCK_ON_THD))
> > + my_free(m_thd->lock);
> > + m_thd->lock= NULL;
> > + }
> > + else if (m_thd->locked_tables_mode == LTM_PRELOCKED)
> > + {
> > + MYSQL_LOCK *lock;
> > + MYSQL_LOCK *merged_lock;
> > +
> > + /*
> > + In LTM_LOCK_TABLES table was reopened via locked_tables_list,
> > + but not in prelocked environment where we have to reopen
> > + the table manually.
> > + */
> > + Open_table_context ot_ctx(m_thd, MYSQL_OPEN_REOPEN);
> > + if (open_table(m_thd, m_failed_table, &ot_ctx))
> > + {
> > + result= true;
> > + break;
> > + }
> > + TABLE *table= m_failed_table->table;
> > + table->reginfo.lock_type= m_thd->update_lock_default;
> > + m_thd->in_lock_tables= 1;
> > + lock= mysql_lock_tables(m_thd, &table, 1,
> > + MYSQL_OPEN_REOPEN | MYSQL_LOCK_USE_MALLOC);
> > + m_thd->in_lock_tables= 0;
> > + if (lock == NULL ||
> > + !(merged_lock= mysql_lock_merge(m_thd->lock, lock, m_thd)))
> > + {
> > + result= true;
> > + break;
> > + }
> > + m_thd->lock= merged_lock;
>
> What MDEV does it fix?
> I wonder why other cases in recover_from_failed_open doesn't do that
This is LTM_PRELOCKED case (triggers f.ex.) It doesn't use
Locked_tables_list unless it is LTM_PRELOCKED_UNDER_LOCK_TABLES.
>
> > + }
> > + break;
> > + }
> > case OT_BACKOFF_AND_RETRY:
> > case OT_REOPEN_TABLES:
> > case OT_NO_ACTION:
> > @@ -4370,6 +4611,8 @@ bool open_tables(THD *thd, const DDL_options_st &options,
> > {
> > if (ot_ctx.can_recover_from_failed_open())
> > {
> > + // FIXME: is this really used?
> > + DBUG_ASSERT(0);
>
> I take it, you weren't able to get this assert to fire?
None of the tests have failed so far.
>
> > close_tables_for_reopen(thd, start,
> > ot_ctx.start_of_statement_svp());
> > if (ot_ctx.recover_from_failed_open())
>
> Regards,
> Sergei
> VP of MariaDB Server Engineering
> and security@xxxxxxxxxxx
--
All the best,
Aleksey Midenkov
@midenok
Follow ups
References