← Back to team overview

maria-developers team mailing list archive

Re: c80f867b0df: MDEV-23844 Atomic DROP TABLE

 

Hi, Michael!

On May 12, Michael Widenius wrote:
> revision-id: c80f867b0df (mariadb-10.6.0-78-gc80f867b0df)
> parent(s): eebebe8ef75
> author: Michael Widenius <michael.widenius@xxxxxxxxx>
> committer: Michael Widenius <michael.widenius@xxxxxxxxx>
> timestamp: 2021-05-04 21:09:11 +0300
> message:
> 
> MDEV-23844 Atomic DROP TABLE
...
> diff --git a/sql/sql_view.cc b/sql/sql_view.cc
> index e6d726b30d7..ff8aa55adef 100644
> --- a/sql/sql_view.cc
> +++ b/sql/sql_view.cc
> @@ -1861,8 +1868,18 @@ bool mysql_drop_view(THD *thd, TABLE_LIST *views, enum_drop_mode drop_mode)
>          not_exists_count++;
>        continue;
>      }
> +    if (!first_table++)

yuck. so your "first_table" actually means "not_first_view",
because when it's FALSE, it's the first view.

why not to rename it to "not_first_view" ?

> +    {
> +      if (ddl_log_drop_view_init(thd, &ddl_log_state))
> +        DBUG_RETURN(TRUE);

what about a crash here? the first entry is written, the second is not.

> +    }
> +    if (ddl_log_drop_view(thd, &ddl_log_state, &cpath, &view->db,
> +                          &view->table_name))
> +      DBUG_RETURN(TRUE);
> +    debug_crash_here("ddl_log_drop_before_delete_view");
>      if (unlikely(mysql_file_delete(key_file_frm, path, MYF(MY_WME))))
>        delete_error= TRUE;
> +    debug_crash_here("ddl_log_drop_after_delete_view");
>  
>      some_views_deleted= TRUE;
>  
> diff --git a/sql/log.cc b/sql/log.cc
> index 6e0a9706ec8..dba999054e4 100644
> --- a/sql/log.cc
> +++ b/sql/log.cc
> @@ -10471,16 +10483,16 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
>    bool last_gtid_standalone= false;
>    bool last_gtid_valid= false;
>  #endif
> +  DBUG_ENTER("TC_LOG_BINLOG::recover");
>  
>    if (! fdle->is_valid() ||
> -      (do_xa && my_hash_init(key_memory_binlog_recover_exec, &xids,
> -                             &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
> -                             sizeof(my_xid), 0, 0, MYF(0))))
> +      (my_hash_init(key_memory_binlog_recover_exec, &xids,
> +                    &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
> +                    sizeof(my_xid), 0, 0, MYF(0))))
>      goto err1;
>  
> -  if (do_xa)
> -    init_alloc_root(key_memory_binlog_recover_exec, &mem_root,
> -                    TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE, MYF(0));
> +  init_alloc_root(key_memory_binlog_recover_exec, &mem_root,
> +                  TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE, MYF(0));

I don't think these two changes are correct. do_xa is TRUE on crash recovery
and FALSE on gtid state recovery on 5.5->10.0 upgade or if gtid state
file was deleted. There is no point to initialize the xid hash or mem_root,
as it's not a crash recovery and there can be no incomplete transactions
or ddl statements.

Yes, I've seen the commit comment:

> - TC_LOG_BINLOG::recover() was changed to always collect Xid's from the
>   binary log and always call ddl_log_close_binlogged_events(). This was
>   needed to be able to collect DROP TABLE events with embedded Xid's
>   (used by ddl log).

but I cannot understand how it applies here.

>    fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error
>  
> diff --git a/sql/sql_table.cc b/sql/sql_table.cc
> index 7a88ffdfd1b..0a83396ae14 100644
> --- a/sql/sql_table.cc
> +++ b/sql/sql_table.cc
> @@ -1363,6 +1373,17 @@ int mysql_rm_table_no_locks(THD *thd, TABLE_LIST *tables, bool if_exists,
>  
>      thd->replication_flags= 0;
>      was_view= table_type == TABLE_TYPE_VIEW;
> +
> +    if (!first_table++)

first_table -> not_first_table

> +    {
> +      LEX_CSTRING comment= {comment_start, (size_t) comment_len};
> +      if (ddl_log_drop_table_init(thd, &ddl_log_state, &comment))
> +      {
> +        error= 1;
> +        goto err;
> +      }
> +    }
> +
>      if ((table_type == TABLE_TYPE_UNKNOWN) || (was_view && !drop_view) ||
>          (table_type != TABLE_TYPE_SEQUENCE && drop_sequence))
>      {
> @@ -1565,6 +1611,7 @@ int mysql_rm_table_no_locks(THD *thd, TABLE_LIST *tables, bool if_exists,
>                                  table_name.str, (uint)table_name.length);
>        mysql_audit_drop_table(thd, table);
>      }
> +    ddl_log_update_phase(&ddl_log_state, DDL_DROP_PHASE_COLLECT);

better call it DDL_DROP_PHASE_BINLOG. It would've saved me ~10 minutes
reading the code back and forth, as ""DDL_DROP_PHASE_BINLOG" immediately
explains what this phase is for.

>  
>      if (!dont_log_query &&
>          (!error || table_dropped || non_existing_table_error(error)))
> diff --git a/sql/ddl_log.h b/sql/ddl_log.h
> index 316e6708f22..b843404fc3c 100644
> --- a/sql/ddl_log.h
> +++ b/sql/ddl_log.h
> @@ -97,6 +101,14 @@ enum enum_ddl_log_rename_table_phase {
>    DDL_RENAME_PHASE_TABLE,
>  };
>  
> +enum enum_ddl_log_drop_table_phase {
> +  DDL_DROP_PHASE_TABLE=0,
> +  DDL_DROP_PHASE_TRIGGER,

no phase for deleting data from stat tables?

> +  DDL_DROP_PHASE_COLLECT,
> +  DDL_DROP_PHASE_RESET,  /* Reset found list of dropped tables */

this one is unused
(also unused in further patches, I've checked)

> +  DDL_DROP_PHASE_END
> +};
> +
>  /*
>    Setting ddl_log_entry.phase to this has the same effect as setting
>    action_type to DDL_IGNORE_LOG_ENTRY_CODE
> diff --git a/sql/ddl_log.cc b/sql/ddl_log.cc
> index 777dfdddbc7..6145b2f7318 100644
> --- a/sql/ddl_log.cc
> +++ b/sql/ddl_log.cc
> @@ -109,6 +112,7 @@ struct st_global_ddl_log
>  };
>  
>  st_global_ddl_log global_ddl_log;
> +String ddl_drop_query;                      // Used during startup recovery

static

>  
>  mysql_mutex_t LOCK_gdl;
>  
> @@ -1132,6 +1149,122 @@ static int ddl_log_execute_action(THD *thd, MEM_ROOT *mem_root,
>      (void) update_phase(entry_pos, DDL_LOG_FINAL_PHASE);
>    }
>    break;
> +  case DDL_LOG_DROP_TABLE_INIT_ACTION:
> +  {
> +    LEX_CSTRING *comment= &ddl_log_entry->tmp_name;
> +    ddl_drop_query.length(0);
> +    ddl_drop_query.set_charset(system_charset_info);
> +    ddl_drop_query.append(STRING_WITH_LEN("DROP TABLE IF EXISTS "));
> +    if (comment->length)
> +    {
> +      ddl_drop_query.append(comment);
> +      ddl_drop_query.append(' ');
> +    }
> +    /* We don't increment phase as we want to retry this in case of crash */
> +    break;
> +  }
> +  case DDL_LOG_DROP_TABLE_ACTION:
> +  {
> +    LEX_CSTRING db, table, path;
> +    db=     ddl_log_entry->db;
> +    table=  ddl_log_entry->name;
> +    /* Note that path is without .frm extension */
> +    path=   ddl_log_entry->tmp_name;
> +
> +    switch (ddl_log_entry->phase) {
> +    case DDL_DROP_PHASE_TABLE:
> +      if (hton)
> +      {
> +        if ((error= hton->drop_table(hton, path.str)))
> +        {
> +          if (!non_existing_table_error(error))
> +            break;
> +          error= -1;
> +        }
> +      }
> +      else
> +        error= ha_delete_table_force(thd, path.str, &db, &table);
> +      if (error <= 0)
> +      {
> +        /* Not found or already deleted. Delete .frm if it exists */
> +        strxnmov(to_path, sizeof(to_path)-1, path.str, reg_ext, NullS);
> +        mysql_file_delete(key_file_frm, to_path, MYF(MY_WME|MY_IGNORE_ENOENT));

this should rather be inside the previous if().
no need to do it for ha_delete_table_force(), as it doesn't
leave any traces and removes all remnants of a table.

> +      }
> +      if (ddl_log_increment_phase_no_lock(entry_pos))
> +        break;
> +      (void) ddl_log_sync_no_lock();

No need to, ddl_log_increment_phase_no_lock() syncs internally

> +      /* Fall through */
> +    case DDL_DROP_PHASE_TRIGGER:
> +      Table_triggers_list::drop_all_triggers(thd, &db, &table,
> +                                             MYF(MY_WME | MY_IGNORE_ENOENT));
> +      if (ddl_log_increment_phase_no_lock(entry_pos))
> +        break;
> +      (void) ddl_log_sync_no_lock();

same. and in other places too, I'd expect.

> +      /* Fall through */
> +
> +      case DDL_DROP_PHASE_COLLECT:

indentation?

> +      append_identifier(thd, &ddl_drop_query, &db);
> +      ddl_drop_query.append('.');
> +      append_identifier(thd, &ddl_drop_query, &table);
> +      ddl_drop_query.append(',');
> +      /* We don't increment phase as we want to retry this in case of crash */
> +
> +      if (!ddl_log_entry->next_entry && mysql_bin_log.is_open())
> +      {
> +        /* Last drop table. Write query to binlog */
> +        LEX_CSTRING end_comment=
> +          { STRING_WITH_LEN(" /* generated by ddl log */")};

"generated by ddl log" ? a "log" cannot "generate" a statement,
may be "generated during ddl recovery" ?

> +        ddl_drop_query.length(ddl_drop_query.length()-1);
> +        ddl_drop_query.append(&end_comment);
> +
> +        mysql_mutex_unlock(&LOCK_gdl);
> +        (void) thd->binlog_query(THD::STMT_QUERY_TYPE, ddl_drop_query.ptr(),
> +                                 ddl_drop_query.length(), TRUE, FALSE,
> +                                 FALSE, 0);
> +        mysql_mutex_lock(&LOCK_gdl);
> +      }
> +      break;
> +    }
> +    break;
> +  }
> +  case DDL_LOG_DROP_VIEW_INIT_ACTION:
> +  {
> +    ddl_drop_query.length(0);
> +    ddl_drop_query.set_charset(system_charset_info);
> +    ddl_drop_query.append(STRING_WITH_LEN("DROP VIEW IF EXISTS "));
> +    /* We don't increment phase as we want to retry this in case of crash */
> +    break;
> +  }
> +  case DDL_LOG_DROP_VIEW_ACTION:
> +  {
> +    LEX_CSTRING db, table, path;
> +    db=     ddl_log_entry->db;
> +    table=  ddl_log_entry->name;
> +    /* Note that for views path is WITH .frm extension */
> +    path=   ddl_log_entry->tmp_name;
> +
> +    mysql_file_delete(key_file_frm, path.str, MYF(MY_WME|MY_IGNORE_ENOENT));
> +    append_identifier(thd, &ddl_drop_query, &db);
> +    ddl_drop_query.append('.');
> +    append_identifier(thd, &ddl_drop_query, &table);
> +    ddl_drop_query.append(',');
> +
> +    if (!ddl_log_entry->next_entry)
> +    {
> +      /* Last drop view. Write query to binlog */
> +      LEX_CSTRING end_comment=
> +        { STRING_WITH_LEN(" /* generated by ddl log */")};
> +      ddl_drop_query.length(ddl_drop_query.length()-1);
> +      ddl_drop_query.append(&end_comment);
> +
> +      mysql_mutex_unlock(&LOCK_gdl);
> +      (void) thd->binlog_query(THD::STMT_QUERY_TYPE, ddl_drop_query.ptr(),
> +                               ddl_drop_query.length(), TRUE, FALSE,
> +                               FALSE, 0);
> +      mysql_mutex_lock(&LOCK_gdl);
> +    }
> +    break;
> +  }
>    default:
>      DBUG_ASSERT(0);
>      break;
> @@ -1682,6 +1823,7 @@ void ddl_log_release_entries(DDL_LOG_STATE *ddl_log_state)
>      next= log_entry->next_active_log_entry;
>      ddl_log_release_memory_entry(log_entry);
>    }
> +  ddl_log_state->list= 0;

may be you should also clear ddl_log_state->execute_entry below?

>  
>    if (ddl_log_state->execute_entry)
>      ddl_log_release_memory_entry(ddl_log_state->execute_entry);
> @@ -1776,6 +1918,33 @@ bool ddl_log_update_xid(DDL_LOG_STATE *state, ulonglong xid)
>  }
>  
>  
> +/*
> +  Write ddl_log_entry and write or update ddl_execute_entry
> +*/
> +
> +static bool ddl_log_write(DDL_LOG_STATE *ddl_state,
> +                          DDL_LOG_ENTRY *ddl_log_entry)

could you use a more descriptive name, please?

> +{
> +  int error;
> +  DDL_LOG_MEMORY_ENTRY *log_entry;
> +  DBUG_ENTER("ddl_log_write");
> +
> +  mysql_mutex_lock(&LOCK_gdl);
> +  error= ((ddl_log_write_entry(ddl_log_entry, &log_entry)) ||
> +          ddl_log_write_execute_entry(log_entry->entry_pos,
> +                                      &ddl_state->execute_entry));
> +  mysql_mutex_unlock(&LOCK_gdl);
> +  if (error)
> +  {
> +    if (log_entry)
> +      ddl_log_release_memory_entry(log_entry);
> +    DBUG_RETURN(1);
> +  }
> +  add_log_entry(ddl_state, log_entry);
> +  DBUG_RETURN(0);
> +}
> +
> +
>  /**
>     Logging of rename table
>  */

Regards,
Sergei
VP of MariaDB Server Engineering
and security@xxxxxxxxxxx


Follow ups