← Back to team overview

maria-developers team mailing list archive

Re: 8742d176bc2: Added support for more functions when using partitioned S3 tables

 

Hi, Michael!

On Apr 13, Michael Widenius wrote:
> revision-id: 8742d176bc2 (mariadb-10.5.2-127-g8742d176bc2)
> parent(s): bf32018be96
> author: Michael Widenius <monty@xxxxxxxxxxx>
> committer: Michael Widenius <monty@xxxxxxxxxxx>
> timestamp: 2020-04-09 16:41:42 +0300
> message:
> 
> Added support for more functions when using partitioned S3 tables
> 
> MDEV-22088 S3 partitioning support
> 
> All ALTER PARTITION commands should now work on S3 tables except
> 
> REBUILD PARTITION
> TRUNCATE PARTITION
> REORGANIZE PARTITION
> 
> In addition, PARTIONED S3 TABLES can also be replicated.
> This is achived by storing the partition tables .frm and .par file on S3
> for partitioned shared (S3) tables.
> 
> The discovery methods are enchanced by allowing engines that supports
> discovery to also support of the partitioned tables .frm and .par file
> 
> Things in more detail
> 
> - The .frm and .par files of partitioned tables are stored in S3 and kept
>   in sync.
> - Added hton callback create_partitioning_metadata to inform handler
>   that metadata for a partitoned file has changed
> - Added back handler::discover_check_version() to be able to check if
>   a table's or a part table's definition has changed.
> - Added handler::check_if_updates_are_ignored(). Needed for partitioning.
> - Renamed rebind() -> rebind_psi(), as it was before.
> - Changed CHF_xxx hadnler flags to an enum
> - Changed some checks from using table->file->ht to use
>   table->file->partition_ht() to get discovery to work with partitioning.
> - If TABLE_SHARE::init_from_binary_frm_image() fails, ensure that we
>   don't leave any .frm or .par files around.
> - Fixed that writefrm() doesn't leave unusable .frm files around
> - Appended extension to path for writefrm() to be able to reuse to function
>   for creating .par files.
> - Added DBUG_PUSH("") to a a few functions that caused a lot of not
>   critical tracing.
>
> diff --git a/mysql-test/suite/s3/replication_partition.test b/mysql-test/suite/s3/replication_partition.test
> new file mode 100644
> index 00000000000..8a177f8f075
> --- /dev/null
> +++ b/mysql-test/suite/s3/replication_partition.test
> @@ -0,0 +1,170 @@
> +--source include/have_s3.inc
> +--source include/have_partition.inc
> +--source include/master-slave.inc

master-slave should always be included last, after all other have_xxx includes.

> +--source include/have_binlog_format_mixed.inc
> +--source include/have_innodb.inc
> +--source include/have_sequence.inc
> +--source create_database.inc
> +
> +connection slave;
> +let $MYSQLD_DATADIR= `select @@datadir`;
> +--replace_result $database database
> +--eval use $database
> +connection master;
> +
> +--echo #
> +--echo # Check replication of parititioned S3 tables
> +--echo #
> +
> +CREATE TABLE t1 (
> +  c1 INT DEFAULT NULL
> +) ENGINE=Aria
> +  PARTITION BY HASH (c1)
> +  PARTITIONS 3;
> +INSERT INTO t1 VALUE (1), (2), (101), (102), (201), (202);
> +ALTER TABLE t1 ENGINE=S3;
> +ALTER TABLE t1 ADD PARTITION PARTITIONS 6;
> +select sum(c1) from t1;
> +ALTER TABLE t1 ADD COLUMN c INT;
> +select sum(c1) from t1;
> +sync_slave_with_master;
> +show create table t1;
> +select sum(c1) from t1;
> +connection master;
> +drop table t1;
> +
> +--echo #
> +--echo # Checking that the slave is keeping in sync with changed partitions
> +--echo #
> +
> +CREATE TABLE t1 (
> +  c1 int primary key,
> +  c2 int DEFAULT NULL
> +) ENGINE=InnoDB
> +  PARTITION BY RANGE (c1)
> + (PARTITION p1 VALUES LESS THAN (200),
> + PARTITION p2 VALUES LESS THAN (300),
> + PARTITION p3 VALUES LESS THAN (400));
> +insert into t1 select seq*100,seq*100 from seq_1_to_3;
> +alter table t1 engine=S3;
> +show create table t1;
> +
> +sync_slave_with_master;
> +select sum(c1) from t1;
> +--file_exists $MYSQLD_DATADIR/$database/t1.frm
> +--file_exists $MYSQLD_DATADIR/$database/t1.par
> +stop slave;
> +connection master;
> +ALTER TABLE t1 ADD PARTITION (PARTITION p4 VALUES LESS THAN (500));
> +connection slave;
> +show create table t1;
> +select sum(c1) from t1;
> +start slave;
> +connection master;
> +sync_slave_with_master;
> +select sum(c1)+0 from t1;
> +stop slave;
> +
> +# .frm amd .par files should not exists on the salve as it has just seen the
> +# ALTER TABLE which cased the removal of the .frm and .par files. The table
> +# from the above "select sum()" came from table cache and was used as it's
> +# id matches the one in S3
> +--error 1
> +--file_exists $MYSQLD_DATADIR/$database/t1.frm
> +--error 1
> +--file_exists $MYSQLD_DATADIR/$database/t1.par
> +# Flushing the table cache will force the .frm and .par files to be
> +# re-generated
> +flush tables;
> +select sum(c1)+0 from t1;
> +--file_exists $MYSQLD_DATADIR/$database/t1.frm
> +--file_exists $MYSQLD_DATADIR/$database/t1.par
> +
> +connection master;
> +drop table t1;
> +connection slave;
> +--file_exists $MYSQLD_DATADIR/$database/t1.par
> +--replace_result $database database
> +--error ER_NO_SUCH_TABLE
> +select sum(c1) from t1;
> +--error 1
> +--file_exists $MYSQLD_DATADIR/$database/t1.par
> +start slave;
> +connection master;
> +
> +--echo #
> +--echo # Check altering partitioned table to S3 and back
> +--echo # Checks also rename partitoned table and drop partition
> +--echo #
> +
> +CREATE TABLE t2 (
> +  c1 int primary key,
> +  c2 int DEFAULT NULL
> +) ENGINE=InnoDB
> +  PARTITION BY RANGE (c1)
> + (PARTITION p1 VALUES LESS THAN (200),
> + PARTITION p2 VALUES LESS THAN (300),
> + PARTITION p3 VALUES LESS THAN (400));
> +insert into t2 select seq*100,seq*100 from seq_1_to_3;
> +alter table t2 engine=S3;
> +rename table t2 to t1;
> +alter table t1 drop partition p1;
> +sync_slave_with_master;
> +select sum(c1) from t1;
> +connection master;
> +alter table t1 engine=innodb;
> +sync_slave_with_master;
> +select sum(c1) from t1;
> +connection master;
> +drop table t1;
> +
> +--echo #
> +--echo # Check that slaves ignores changes to S3 tables.
> +--echo #
> +
> +connection master;
> +CREATE TABLE t1 (
> +  c1 int primary key,
> +  c2 int DEFAULT NULL
> +) ENGINE=InnoDB
> +  PARTITION BY RANGE (c1)
> + (PARTITION p1 VALUES LESS THAN (200),
> + PARTITION p2 VALUES LESS THAN (300),
> + PARTITION p3 VALUES LESS THAN (400));
> +insert into t1 select seq*100,seq*100 from seq_1_to_3;
> +create table t2 like t1;
> +alter table t2 remove partitioning;
> +insert into t2 values (450,450);
> +sync_slave_with_master;
> +stop slave;
> +connection master;
> +alter table t1 engine=s3;
> +alter table t2 engine=s3;
> +ALTER TABLE t1 ADD PARTITION (PARTITION p4 VALUES LESS THAN (500));
> +alter table t1 exchange partition p4 with table t2;
> +select count(*) from t1;
> +drop table t1,t2;
> +connection slave;
> +start slave;
> +connection master;
> +sync_slave_with_master;
> +--replace_result $database database
> +--error ER_NO_SUCH_TABLE
> +select sum(c1) from t1;
> +connection master;
> +
> +--echo #
> +--echo # Check slave binary log
> +--echo #
> +
> +sync_slave_with_master;
> +--let $binlog_database=$database
> +--source include/show_binlog_events.inc
> +connection master;
> +
> +--echo #
> +--echo # clean up
> +--echo #
> +--source drop_database.inc
> +sync_slave_with_master;
> +--source include/rpl_end.inc
> diff --git a/mysys/my_symlink.c b/mysys/my_symlink.c
> index cbee78a7f5c..323ae69a39c 100644
> --- a/mysys/my_symlink.c
> +++ b/mysys/my_symlink.c
> @@ -154,7 +154,8 @@ int my_realpath(char *to, const char *filename, myf MyFlags)
>        original name but will at least be able to resolve paths that starts
>        with '.'.
>      */
> -    DBUG_PRINT("error",("realpath failed with errno: %d", errno));
> +    if (MyFlags)
> +      DBUG_PRINT("error",("realpath failed with errno: %d", errno));

This is kind of against the concept of dbug. dbug's idea is that one should
not edit the code every time, but put DBUG points one and enable/disable them
at run-time.

In cases like that you can specify precisely what you want to see in the trace
with complicated command-line --debug string. But I usually just let it
log everything and then use dbug/remove_function_from_trace.pl script.

>      my_errno=errno;
>      if (MyFlags & MY_WME)
>        my_error(EE_REALPATH, MYF(0), filename, my_errno);
> diff --git a/sql/discover.cc b/sql/discover.cc
> index e49a2a3b0c0..7f3fe73c155 100644
> --- a/sql/discover.cc
> +++ b/sql/discover.cc
> @@ -99,23 +99,23 @@ int readfrm(const char *name, const uchar **frmdata, size_t *len)
>  
>  
>  /*
> -  Write the content of a frm data pointer 
> -  to a frm file.
> +  Write the content of a frm data pointer to a frm or par file.

really? writefrm() writes to a .par file?
may be rename it to writefile() then?

>  
> -  @param path           path to table-file "db/name"
> -  @param frmdata        frm data
> -  @param len            length of the frmdata
> +  @param path           full path to table-file "db/name.frm" or .par
> +  @param db             Database name. Only used for my_error()
> +  @param table          Table name. Only used for my_error()
> +  @param data           data to write to file
> +  @param len            length of the data
>  
>    @retval
>      0	ok
>    @retval
> -    2    Could not write file
> +    <> 0    Could not write file. In this case the file is not created
>  */
>  
>  int writefrm(const char *path, const char *db, const char *table,
> -             bool tmp_table, const uchar *frmdata, size_t len)
> +             bool tmp_table, const uchar *data, size_t len)
>  {
> -  char	 file_name[FN_REFLEN+1];
>    int error;
>    int create_flags= O_RDWR | O_TRUNC;
>    DBUG_ENTER("writefrm");
> diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
> index 0ec1f2138ab..b8b5085f389 100644
> --- a/sql/ha_partition.cc
> +++ b/sql/ha_partition.cc
> @@ -629,7 +629,8 @@ int ha_partition::rename_table(const char *from, const char *to)
>  
>    SYNOPSIS
>      create_partitioning_metadata()
> -    name                              Full path of table name
> +    path                              Full path of new table name
> +    old_p                             Full path of old table name

This is a rather meaningless phrase, there cannot be a "path" of a "name"

May be, write "A path to the new/old frm file but without the extension"
instead?

>      create_info                       Create info generated for CREATE TABLE
>  
>    RETURN VALUE
> @@ -678,6 +681,19 @@ int ha_partition::create_partitioning_metadata(const char *path,
>        DBUG_RETURN(1);
>      }
>    }
> +
> +  /* m_part_info is only NULL when we failed to create a partition table */

How can m_part_info be NULL here?
create_handler_file() dereferences m_part_info unconditionally, if
m_part_info would've been NULL it'll crash.

> +  if (m_part_info)
> +  {
> +    part= m_part_info->partitions.head();
> +    if ((part->engine_type)->create_partitioning_metadata &&
> +        ((part->engine_type)->create_partitioning_metadata)(path, old_path,
> +                                                            action_flag))
> +    {
> +      my_error(ER_CANT_CREATE_HANDLER_FILE, MYF(0));
> +      DBUG_RETURN(1);
> +    }
> +  }
>    DBUG_RETURN(0);
>  }
>  
> @@ -1604,6 +1620,7 @@ int ha_partition::prepare_new_partition(TABLE *tbl,
>  
>    if (!(file->ht->flags & HTON_CAN_READ_CONNECT_STRING_IN_PARTITION))
>      tbl->s->connect_string= p_elem->connect_string;
> +  create_info->options|= HA_CREATE_TMP_ALTER;

This looks wrong. The partition isn't created as a temporary file.
HA_CREATE_TMP_ALTER is used for autiding and perfschema to distinguish
between normal tables, temporary tables and these alter-tmp-tables.

I see that you've hijacked this flag and check it inside s3, but it wasn't
designed for that, so change s3 instead, not what the flag means.
I don't think you need that if inside s3 at all, if hton flag says
HTON_TEMPORARY_NOT_SUPPORTED, then s3 won't be asked to create a temporary
table and you don't need a check for that in create.

>    if ((error= file->ha_create(part_name, tbl, create_info)))
>    {
>      /*
> @@ -2361,14 +2379,20 @@ uint ha_partition::del_ren_table(const char *from, const char *to)
>    const char *to_path= NULL;
>    uint i;
>    handler **file, **abort_file;
> +  THD *thd= ha_thd();
>    DBUG_ENTER("ha_partition::del_ren_table");
>  
> -  if (get_from_handler_file(from, ha_thd()->mem_root, false))
> -    DBUG_RETURN(TRUE);
> +  if (get_from_handler_file(from, thd->mem_root, false))
> +    DBUG_RETURN(my_errno ? my_errno : ENOENT);
>    DBUG_ASSERT(m_file_buffer);
>    DBUG_PRINT("enter", ("from: (%s) to: (%s)", from, to ? to : "(nil)"));
>    name_buffer_ptr= m_name_buffer_ptr;
> +
>    file= m_file;
> +  /* The command should be logged with IF EXISTS if using a shared table */
> +  if (m_file[0]->ht->flags & HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE)
> +    thd->replication_flags|= OPTION_IF_EXISTS;

I don't think this and the whole thd->replication_flags is needed,
because in the sql_table.cc you can check directly

 table->file->partition_ht()->flags && HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE

> +
>    if (to == NULL)
>    {
>      /*
> @@ -2424,7 +2453,35 @@ uint ha_partition::del_ren_table(const char *from, const char *to)
>        goto rename_error;
>      }
>    }
> +
> +  /* Update .par file in the handlers that supports it */
> +  if ((*m_file)->ht->create_partitioning_metadata)
> +  {
> +    if (to == NULL)
> +      error= (*m_file)->ht->create_partitioning_metadata(NULL, from,
> +                                                         CHF_DELETE_FLAG);
> +    else
> +      error= (*m_file)->ht->create_partitioning_metadata(to, from,
> +                                                         CHF_RENAME_FLAG);

why not just

  error= (*m_file)->ht->create_partitioning_metadata(to, from);

the engine can distingush CREATE from RENAME from DELETE by looking
at whether 'to' or `from` is NULL.

> +    DBUG_EXECUTE_IF("failed_create_partitioning_metadata",
> +                    { my_message_sql(ER_OUT_OF_RESOURCES,"Simulated crash",MYF(0));
> +                      error= 1;
> +                    });
> +    if (error)
> +    {
> +      if (to)
> +      {
> +        (void) handler::rename_table(to, from);
> +        (void) (*m_file)->ht->create_partitioning_metadata(from, to,
> +                                                           CHF_RENAME_FLAG);
> +        goto rename_error;
> +      }
> +      else
> +        save_error=error;
> +    }
> +  }
>    DBUG_RETURN(save_error);
> +
>  rename_error:
>    name_buffer_ptr= m_name_buffer_ptr;
>    for (abort_file= file, file= m_file; file < abort_file; file++)
> @@ -3729,6 +3787,16 @@ int ha_partition::rebind()
>  #endif /* HAVE_M_PSI_PER_PARTITION */
>  
>  
> +/*
> +  Check if the table definition has changed for the part tables
> +  We use the first partition for the check.
> +*/
> +
> +int ha_partition::discover_check_version()
> +{
> +  return m_file[0]->discover_check_version();
> +}

1. generally, you have to do it for every open partition, not just for the
first one.
2. m_file[0] partition is not necessarily open, you need to use the first
open partition here (and everywhere)

> +
>  /**
>    Clone the open and locked partitioning handler.
>  
> @@ -11382,6 +11450,12 @@ int ha_partition::end_bulk_delete()
>  }
>  
>  
> +bool ha_partition::check_if_updates_are_ignored(const char *op) const
> +{
> +  return (handler::check_if_updates_are_ignored(op) ||
> +          ha_check_if_updates_are_ignored(table->in_use, partition_ht(), op));

no you don't need this virtual method at all, simply do

-  if (ha_check_if_updates_are_ignored(ha_thd(), ht, "DROP"))
+  if (ha_check_if_updates_are_ignored(ha_thd(), partition_ht(), "DROP"))

that's why partition_ht() was created in the first place.

> +}
> +
>  /**
>    Perform initialization for a direct update request.
>  
> diff --git a/sql/handler.h b/sql/handler.h
> index 8c45c64bec8..eadbf28229c 100644
> --- a/sql/handler.h
> +++ b/sql/handler.h
> @@ -3183,7 +3200,10 @@ class handler :public Sql_alloc
>  
>  public:
>    virtual void unbind_psi();
> -  virtual int rebind();
> +  virtual void rebind_psi();

this doesn't have to be virtual anymore, because you
moved the check into the virtual discover_check_version()

> +  /* Return error if definition doesn't match for already opened table */
> +  virtual int discover_check_version() { return 0; }
> +
>    /**
>      Put the handler in 'batch' mode when collecting
>      table io instrumented events.
> diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
> index 53ccdb5d4c3..b1756b83056 100644
> --- a/sql/sql_parse.cc
> +++ b/sql/sql_parse.cc
> @@ -5906,6 +5906,8 @@ mysql_execute_command(THD *thd)
>    case SQLCOM_CALL:
>    case SQLCOM_REVOKE:
>    case SQLCOM_GRANT:
> +    if (thd->variables.option_bits & OPTION_IF_EXISTS)
> +      lex->create_info.set(DDL_options_st::OPT_IF_EXISTS);

1. please remove the same (now redundant) code from
   Sql_cmd_alter_table::execute()
2. better put it only for the relevant case. Like this

  case SQLCOM_ALTER_TABLE:
        if (thd->variables.option_bits & OPTION_IF_EXISTS)
          lex->create_info.set(DDL_options_st::OPT_IF_EXISTS);
        /* fall through */
  case SQLCOM_ANALYZE_TABLE:
        ...

>      DBUG_ASSERT(lex->m_sql_cmd != NULL);
>      res= lex->m_sql_cmd->execute(thd);
>      DBUG_PRINT("result", ("res: %d  killed: %d  is_error: %d",
> diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc
> index 7385f9059e1..351464939e2 100644
> --- a/sql/sql_partition.cc
> +++ b/sql/sql_partition.cc
> @@ -7057,6 +7059,10 @@ uint fast_alter_partition_table(THD *thd, TABLE *table,
>    lpt->pack_frm_data= NULL;
>    lpt->pack_frm_len= 0;
>  
> +  /* Add IF EXISTS to binlog if shared table */
> +  if (table->file->partition_ht()->flags & HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE)
> +    thd->variables.option_bits|= OPTION_IF_EXISTS;
> +

No need to, you already have that in mysql_alter_table(). Just fix the check
there, like this:

  if (!if_exists &&
-      (table->s->db_type()->flags & HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE))
+      (table->file->partition_ht()->flags & HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE))

>    if (table->file->alter_table_flags(alter_info->flags) &
>          HA_PARTITION_ONE_PHASE)
>    {
> diff --git a/sql/sql_partition_admin.cc b/sql/sql_partition_admin.cc
> index ed77c0938f3..d29c014bdc2 100644
> --- a/sql/sql_partition_admin.cc
> +++ b/sql/sql_partition_admin.cc
> @@ -529,14 +530,46 @@ bool Sql_cmd_alter_table_exchange_partition::
>    table_list->mdl_request.set_type(MDL_SHARED_NO_WRITE);
>    if (unlikely(open_tables(thd, &table_list, &table_counter, 0,
>                             &alter_prelocking_strategy)))
> +  {
> +    if (thd->lex->if_exists() &&
> +        thd->get_stmt_da()->sql_errno() == ER_NO_SUCH_TABLE)
> +    {
> +      /*
> +        ALTER TABLE IF EXISTS was used on not existing table
> +        We have to log the query on a slave as the table may be a shared one
> +        from the master and we need to ensure that the next slave can see
> +        the statement as this slave may not have the table shared
> +      */
> +      thd->clear_error();
> +      if (thd->slave_thread &&
> +          write_bin_log(thd, true, thd->query(), thd->query_length()))
> +        DBUG_RETURN(true);
> +      my_ok(thd);
> +      DBUG_RETURN(false);
> +    }

I don't like that. There are many Sql_cmd_alter_table* classes and lots of
duplicated code. Okay, you've put your OPTION_IF_EXISTS check into the big
switch in the mysql_execute_command() and thus avoided duplicating that code.

But this if() is also in mysql_alter_table() (and may be more?)
I'm thinking that doing many Sql_cmd_alter_table* classes was a bad idea
which leads to code duplication. How can we fix that?

>      DBUG_RETURN(true);
> +  }
>  
>    part_table= table_list->table;
>    swap_table= swap_table_list->table;
>  
> +  if (part_table->file->check_if_updates_are_ignored("ALTER"))
> +  {
> +    if (thd->slave_thread &&
> +        write_bin_log_with_if_exists(thd, true, false, true))
> +      DBUG_RETURN(true);
> +    my_ok(thd);
> +    DBUG_RETURN(false);
> +  }

another duplicated one

> +
>    if (unlikely(check_exchange_partition(swap_table, part_table)))
>      DBUG_RETURN(TRUE);
>  
> +  /* Add IF EXISTS to binlog if shared table */
> +  if (part_table->file->partition_ht()->flags &
> +      HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE)
> +    force_if_exists= 1;

and again

> +
>    /* set lock pruning on first table */
>    partition_name= alter_info->partition_names.head();
>    if (unlikely(table_list->table->part_info->
> @@ -784,16 +821,51 @@ bool Sql_cmd_alter_table_truncate_partition::execute(THD *thd)
>  #endif /* WITH_WSREP */
>  
>    if (open_tables(thd, &first_table, &table_counter, 0))
> -    DBUG_RETURN(true);
> +  {
> +    if (thd->lex->if_exists() &&
> +        thd->get_stmt_da()->sql_errno() == ER_NO_SUCH_TABLE)
> +    {
> +      /*
> +        ALTER TABLE IF EXISTS was used on not existing table
> +        We have to log the query on a slave as the table may be a shared one
> +        from the master and we need to ensure that the next slave can see
> +        the statement as this slave may not have the table shared
> +      */
> +      thd->clear_error();
> +      if (thd->slave_thread &&
> +          write_bin_log(thd, true, thd->query(), thd->query_length()))
> +        DBUG_RETURN(TRUE);
> +      my_ok(thd);
> +      DBUG_RETURN(FALSE);
> +    }
> +    DBUG_RETURN(TRUE);
> +  }

see?

>  
> -  if (!first_table->table || first_table->view ||
> -      first_table->table->s->db_type() != partition_hton)
> +  if (!first_table->table || first_table->view)
>    {
>      my_error(ER_PARTITION_MGMT_ON_NONPARTITIONED, MYF(0));
>      DBUG_RETURN(TRUE);
>    }
>  
> -  
> +  if (first_table->table->file->check_if_updates_are_ignored("ALTER"))
> +  {
> +    if (thd->slave_thread &&
> +        write_bin_log_with_if_exists(thd, true, false, 1))
> +      DBUG_RETURN(true);
> +    my_ok(thd);
> +    DBUG_RETURN(false);
> +  }
> +
> +  if (first_table->table->s->db_type() != partition_hton)
> +  {
> +    my_error(ER_PARTITION_MGMT_ON_NONPARTITIONED, MYF(0));
> +    DBUG_RETURN(TRUE);
> +  }

why did you split the check for ER_PARTITION_MGMT_ON_NONPARTITIONED?

> +
> +  if (first_table->table->file->partition_ht()->flags &
> +      HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE)
> +    force_if_exists= 1;
> +
>    /*
>      Prune all, but named partitions,
>      to avoid excessive calls to external_lock().
> diff --git a/sql/table.cc b/sql/table.cc
> index fe096835144..c6b42b7ba36 100644
> --- a/sql/table.cc
> +++ b/sql/table.cc
> @@ -25,7 +25,8 @@
>                                                  // primary_key_name
>  #include "sql_parse.h"                          // free_items
>  #include "strfunc.h"                            // unhex_type2
> -#include "sql_partition.h"       // mysql_unpack_partition,
> +#include "ha_partition.h"        // PART_EXT
> +                                 // mysql_unpack_partition,

also included below

>                                   // fix_partition_func, partition_info
>  #include "sql_base.h"
>  #include "create_options.h"
> @@ -42,6 +43,7 @@
>  #include "rpl_filter.h"
>  #include "sql_cte.h"
>  #include "ha_sequence.h"
> +#include "ha_partition.h"

also included above

>  #include "sql_show.h"
>  #include "opt_trace.h"
>  
> @@ -620,6 +622,9 @@ enum open_frm_error open_table_def(THD *thd, TABLE_SHARE *share, uint flags)
>    {
>      DBUG_ASSERT(flags & GTS_TABLE);
>      DBUG_ASSERT(flags & GTS_USE_DISCOVERY);
> +    /* Delete .frm and .par files */
> +    mysql_file_delete_with_symlink(key_file_frm, path, "", MYF(0));
> +    strxmov(path, share->normalized_path.str, PAR_EXT, NullS);
>      mysql_file_delete_with_symlink(key_file_frm, path, "", MYF(0));

no need to strxmov, this third "" argument is the file extension, you can directly do

    mysql_file_delete_with_symlink(key_file_frm, share->normalized_path.str, PAR_EXT, MYF(0));

also it should be key_file_partition_ddl_log not key_file_frm
(it's not a ddl log, but sql_table.cc uses key_file_partition_ddl_log for par files)

>      file= -1;
>    }
> @@ -1679,12 +1687,13 @@ class Field_data_type_info_array
>  
>    42..46 are unused since 5.0 (were for RAID support)
>    Also, there're few unused bytes in forminfo.
> -
>  */
>  
>  int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write,
>                                              const uchar *frm_image,
> -                                            size_t frm_length)
> +                                            size_t frm_length,
> +                                            const uchar *par_image,
> +                                            size_t par_length)

I don't think writing .par file belongs here, why not to do it in the caller?

Or, better, don't send .par file to S3 and don't discover it, it'll make
everything much simpler. .par can be created from the .frm, see sql_table.cc:

  file= mysql_create_frm_image(thd, orig_db, orig_table_name, create_info,
                               alter_info, create_table_mode, key_info,
                               key_count, frm);
  ...
  if (file->ha_create_partitioning_metadata(path, NULL, CHF_CREATE_FLAG))
    goto err;

>  {
>    TABLE_SHARE *share= this;
>    uint new_frm_ver, field_pack_length, new_field_pack_flag;
> @@ -1715,24 +1724,31 @@ int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write,
>    uint len;
>    uint ext_key_parts= 0;
>    plugin_ref se_plugin= 0;
> -  bool vers_can_native= false;
> +  bool vers_can_native= false, frm_created= 0;
>    Field_data_type_info_array field_data_type_info_array;
> -
>    MEM_ROOT *old_root= thd->mem_root;
>    Virtual_column_info **table_check_constraints;
>    extra2_fields extra2;
> -
>    DBUG_ENTER("TABLE_SHARE::init_from_binary_frm_image");
>  
>    keyinfo= &first_keyinfo;
>    thd->mem_root= &share->mem_root;
>  
> -  if (write && write_frm_image(frm_image, frm_length))
> -    goto err;
> -
>    if (frm_length < FRM_HEADER_SIZE + FRM_FORMINFO_SIZE)
>      goto err;
>  
> +  if (write)
> +  {
> +#ifdef WITH_PARTITION_STORAGE_ENGINE
> +    if (par_image)
> +      if (write_par_image(par_image, par_length))
> +        goto err;
> +#endif
> +    frm_created= 1;
> +    if (write_frm_image(frm_image, frm_length))
> +      goto err;
> +  }

why not to create it at the end? then you won't need to delete it on an error.

> +
>    share->frm_version= frm_image[2];
>    /*
>      Check if .frm file created by MySQL 5.0. In this case we want to
> diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
> index ed9c5404a7f..1f99fd9d895 100644
> --- a/storage/maria/ha_maria.cc
> +++ b/storage/maria/ha_maria.cc
> @@ -2724,7 +2724,7 @@ void ha_maria::drop_table(const char *name)
>  {
>    DBUG_ASSERT(file->s->temporary);
>    (void) ha_close();
> -  (void) maria_delete_table_files(name, 1, 0);
> +  (void) maria_delete_table_files(name, 1, MY_WME);

MYF(MY_WME) as always?

>  }
>  
>  


Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx


Follow ups