← Back to team overview

maria-developers team mailing list archive

Re: MDEV-181: XID crash recovery across binlog boundaries

 

Hi, Kristian!

Here's my review.
Only a few comments and questions; see below.

On May 14, Kristian Nielsen wrote:
> At http://bazaar.launchpad.net/~maria-captains/maria/5.5
> 
> ------------------------------------------------------------
> revno: 3372
> revision-id: knielsen@xxxxxxxxxxxxxxx-20120514095140-n685kn0n4vtronn5
> timestamp: Mon 2012-05-14 11:51:40 +0200
> message:
>   MDEV-181: XID crash recovery across binlog boundaries

> === added file 'mysql-test/suite/binlog/t/binlog_xa_recover.test'
> --- mysql-test/suite/binlog/t/binlog_xa_recover.test	1970-01-01 00:00:00 +0000
> +++ mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-05-14 09:51:40 +0000
> @@ -0,0 +1,172 @@
> +--source include/have_innodb.inc
> +--source include/have_debug.inc
> +--source include/have_debug_sync.inc
> +--source include/have_binlog_format_row.inc
> +
> +SET GLOBAL max_binlog_size= 4096;
> +
> +CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
> +CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam;
> +
> +# Transactions are not guaranteed stored durably on disk in the engine until
> +# they are fsync()ed, which normally happens during commit(). But there is no
> +# guarantee that they will _not_ be durable, in particular loosing results
> +# of a write(2) system call normally requires a kernel crash (as opposed to
> +# just mysqld crash), which is inconvenient to do in a test suite.
> +# So instead we do an error insert to prevent commit_ordered() from being
> +# called in the engine - so nothing will be written to disk at all, and crash
> +# recovery is sure to be needed.

Are these assumptions reasonable? You seem to assume that without
commit_ordered the engine will still work normally, but simply won't
write changes to disk. Is that always the case?

> +SET @@global.debug_dbug='+d,skip_commit_ordered';

I don't see where you restore the old value of debug_dbug

> +
> +INSERT INTO t1 VALUES (0, REPEAT("x", 4100));

How come this insert is not affected by skip_commit_ordered?

> +
> +# Now start a bunch of transactions that span multiple binlog
> +# files. Leave then in the state prepared-but-not-committed in the engine
> +# and crash the server. Check that crash recovery is able to recover all
> +# of them.
> +
> +connect(con1,localhost,root,,);
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con1_ready WAIT_FOR _ever";
> +send INSERT INTO t1 VALUES (1, REPEAT("x", 4100));
> +
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
> +INSERT INTO t2 VALUES (1, "force binlog rotation");
> +
> +connect(con2,localhost,root,,);
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con2_ready WAIT_FOR _ever";
> +send INSERT INTO t1 VALUES (2, NULL);
> +
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
> +
> +connect(con3,localhost,root,,);
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con3_ready WAIT_FOR _ever";
> +send INSERT INTO t1 VALUES (3, REPEAT("x", 4100));
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con3_ready";
> +INSERT INTO t2 VALUES (2, "force binlog rotation");
> +# So we won't get warnings about t2 being crashed.
> +FLUSH TABLES t2;
> +
> +# Check that everything is committed in binary log.
> +--source include/show_binary_logs.inc
> +--let $binlog_file= master-bin.000001
> +--let $binlog_start= 4
> +--source include/show_binlog_events.inc
> +--let $binlog_file= master-bin.000002
> +--source include/show_binlog_events.inc
> +--let $binlog_file= master-bin.000003
> +--source include/show_binlog_events.inc
> +--let $binlog_file= master-bin.000004
> +--source include/show_binlog_events.inc
> +
> +# Check that transactions really are not yet committed in engine.
> +# (This works because of debug_dbug='+d,skip_commit_ordered').
> +--echo We should see only one entry here, a=0:
> +SELECT a FROM t1 ORDER BY a;
> +
> +
> +# Check that server will not purge too much.
> +PURGE BINARY LOGS TO "master-bin.000004";
> +--source include/show_binary_logs.inc
> +
> +# Now crash the server with one more transaction in prepared state.
> +system echo wait-binlog_xa_recover.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;

better use --write_file directive

> +SET SESSION debug_dbug="+d,crash_commit_after_log";
> +--error 2006,2013
> +INSERT INTO t1 VALUES (4, NULL);
> +
> +system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;

same

> +
> +connection default;
> +--enable_reconnect
> +--source include/wait_until_connected_again.inc
> +
> +# Check that all transactions are recovered.
> +SELECT a FROM t1 ORDER BY a;
> +
> +
> +--echo *** Test that RESET MASTER waits for pending XIDs to be unlogged.

Ah, good point

> +
> +SET @old_max_binlog_size= @@global.max_binlog_size;
> +SET GLOBAL max_binlog_size= 4096;
> +# con10 will hang with a pending XID, blocking RESET MASTER.
> +connect(con10,localhost,root,,);
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_go";
> +send INSERT INTO t1 VALUES (10, NULL);
> +
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
> +# Let's add a few binlog rotations just for good measure.
> +INSERT INTO t2 VALUES (10, REPEAT("x", 4100));
> +INSERT INTO t2 VALUES (11, REPEAT("x", 4100));
> +--source include/show_binary_logs.inc
> +SET DEBUG_SYNC= "execute_command_after_close_tables SIGNAL reset_master_done";
> +send RESET MASTER;
> +
> +connect(con11,localhost,root,,);
> +--echo This will timeout, as RESET MASTER is blocked
> +SET DEBUG_SYNC= "now WAIT_FOR reset_master_done TIMEOUT 1";
> +# Wake up transaction to allow RESET MASTER to complete.
> +SET DEBUG_SYNC= "now SIGNAL con10_go";
> +
> +connection con10;
> +reap;
> +
> +connection default;
> +reap;
> +--source include/show_binary_logs.inc
> +
> +
> +--echo *** Test that binlog N is active, and last pending trx in (N-1) is
> +--echo unlogged while there is still a pending trx in (N-2).
> +
> +connection con10;
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_continue";
> +send INSERT INTO t1 VALUES (20, REPEAT("x", 4100));
> +
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
> +INSERT INTO t2 VALUES (3, "force binlog rotation");
> +
> +connection con11;
> +SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con11_ready WAIT_FOR con11_continue";
> +send INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
> +
> +connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR con11_ready";
> +INSERT INTO t2 VALUES (4, "force binlog rotation");
> +--source include/show_binary_logs.inc
> +--let $binlog_file= master-bin.000001
> +--source include/show_binlog_events.inc
> +--let $binlog_file= master-bin.000002
> +--source include/show_binlog_events.inc
> +--let $binlog_file= master-bin.000003
> +--source include/show_binlog_events.inc
> +
> +SET DEBUG_SYNC= "now SIGNAL con11_continue";
> +
> +connection con11;
> +reap;
> +
> +connection default;
> +--echo con10 is still pending, no new binlog checkpoint should have been logged.
> +--let $binlog_file= master-bin.000003
> +--source include/show_binlog_events.inc
> +
> +SET DEBUG_SYNC= "now SIGNAL con10_continue";
> +
> +connection con10;
> +reap;
> +
> +connection default;
> +--echo No XIDs are pending, a new binlog checkpoint should have been logged.

Could you also test recovery with multiple checkpoints in a binlog?
E.g. binlog, like the above, then crash, and see that it uses
the *last* checkpoint in the binlog.

> +--let $binlog_file= master-bin.000003
> +--source include/show_binlog_events.inc
> +
> +
> +# Cleanup
> +connection default;
> +DROP TABLE t1, t2;
> +SET GLOBAL max_binlog_size= @old_max_binlog_size;
> 
> === modified file 'sql/log.h'
> --- sql/log.h	2012-01-16 19:16:35 +0000
> +++ sql/log.h	2012-05-14 09:51:40 +0000
> @@ -348,6 +348,15 @@
>    time_t last_time;
>  };
>  
> +/*
> +  We assign each binlog file an internal ID, used to identify them for unlog().
> +  Ids start from BINLOG_COOKIE_START; the value BINLOG_COOKIE_DUMMY is special
> +  meaning "no binlog" (we cannot use zero as that is reserved for error return
> +  from log_and_order).
> +*/
> +#define BINLOG_COOKIE_DUMMY 1
> +#define BINLOG_COOKIE_START 2

Are these values completely internal and can be changed in any minor release without
any compatibility concerns? Or they're written to binlog or something?

> +
>  class binlog_cache_mngr;
>  class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
>  {
> @@ -388,10 +397,40 @@
>      bool all;
>    };
>  
> +  /*
> +    A list of struct xid_count_per_binlog is used to keep track of how many
> +    XIDs are in preared, but not committed, state in each binlog.

"prepared"

> +
> +    When count drops to zero in a binlog after rotation, it means that there
> +    are no more XIDs in prepared state, so that binlog is no longer needed
> +    for XA crash recovery, and we can log a new binlog checkpoint event.
> +
> +    The list is protected against simultaneous access from multiple
> +    threads by LOCK_xid_list.
> +  */
> +  struct xid_count_per_binlog : public ilink {
> +    char *binlog_name;
> +    uint binlog_name_len;
> +    ulong binlog_id;
> +    long xid_count;
> +    xid_count_per_binlog();   /* Give link error if constructor used. */
> +  };
> +  ulong current_binlog_id;
> +  I_List<xid_count_per_binlog> binlog_xid_count_list;

Why I_List?

> +  /*
> +    When this is set, a RESET MASTER is in progress.
> +
> +    Then we should not write any binlog checkpoints into the binlog (that
> +    could result in deadlock on LOCK_log, and we will delete all binlog files
> +    anyway). Instead we should signal COND_prep_xids whenever a new binlog

COND_xid_list?

> +    checkpoint arrives - when all have arrived, RESET MASTER will complete.
> +  */
> +  bool reset_master_pending;
> +
>    /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
>    mysql_mutex_t LOCK_index;
> -  mysql_mutex_t LOCK_prep_xids;
> -  mysql_cond_t  COND_prep_xids;
> +  mysql_mutex_t LOCK_xid_list;
> +  mysql_cond_t  COND_xid_list;
>    mysql_cond_t update_cond;
>    ulonglong bytes_written;
>    IO_CACHE index_file;
> @@ -415,7 +454,6 @@

It'd be great, if your diff would include function names (diff -p).
An easy way of doing it is to use my diff_p plugin, see the last section in
http://kb.askmonty.org/en/how-to-get-more-out-of-bzr-when-working-on-mariadb
it's the most flexible, robust and universal solution that I have, and it
puts function names in all bzr-generated diffs, not only in "bzr diff" command.

>       fix_max_relay_log_size).
>    */
>    ulong max_size;
> -  long prepared_xids; /* for tc log - number of xids to remember */
>    // current file sequence number for load data infile binary logging
>    uint file_id;
>    uint open_count;				// For replication
> === modified file 'sql/log_event.cc'
> --- sql/log_event.cc	2012-05-09 07:35:46 +0000
> +++ sql/log_event.cc	2012-05-14 09:51:40 +0000
> @@ -5842,6 +5848,84 @@
>  
>  
>  /**************************************************************************
> +  Binlog_checkpoint_log_event methods
> +**************************************************************************/
> +
> +#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT)
> +void Binlog_checkpoint_log_event::pack_info(Protocol *protocol)
> +{
> +  protocol->store(binlog_file_name, binlog_file_len, &my_charset_bin);
> +}
> +#endif
> +
> +
> +#ifdef MYSQL_CLIENT
> +void Binlog_checkpoint_log_event::print(FILE *file,
> +                                        PRINT_EVENT_INFO *print_event_info)
> +{
> +  Write_on_release_cache cache(&print_event_info->head_cache, file,
> +                               Write_on_release_cache::FLUSH_F);
> +
> +  if (print_event_info->short_form)
> +    return;
> +  print_header(&cache, print_event_info, FALSE);
> +  my_b_printf(&cache, "\tBinlog checkpoint ");
> +  my_b_write(&cache, (uchar*)binlog_file_name, binlog_file_len);
> +  my_b_printf(&cache, "\n");
> +}
> +#endif  /* MYSQL_CLIENT */
> +
> +
> +#ifdef MYSQL_SERVER
> +Binlog_checkpoint_log_event::Binlog_checkpoint_log_event(
> +        const char *binlog_file_name_arg,
> +        uint binlog_file_len_arg)
> +  :Log_event(),
> +   binlog_file_name(my_strndup(binlog_file_name_arg, binlog_file_len_arg,
> +                               MYF(MY_WME))),
> +   binlog_file_len(binlog_file_len_arg)
> +{
> +  cache_type= EVENT_NO_CACHE;
> +}
> +#endif  /* MYSQL_SERVER */
> +
> +
> +Binlog_checkpoint_log_event::Binlog_checkpoint_log_event(
> +       const char *buf, uint event_len,
> +       const Format_description_log_event *description_event)
> +  :Log_event(buf, description_event), binlog_file_name(0)
> +{
> +  uint8 header_size= description_event->common_header_len;
> +  uint8 post_header_len=
> +    description_event->post_header_len[BINLOG_CHECKPOINT_EVENT-1];
> +  if (event_len < header_size + post_header_len ||
> +      post_header_len < 4)

I'd replace this 4 with BINLOG_CHECKPOINT_HEADER_LEN

> +    return;
> +  buf+= header_size;

and here I'd add

   compile_time_assert(BINLOG_CHECKPOINT_HEADER_LEN == 4); // see uint4korr and int4store below

> +  binlog_file_len= uint4korr(buf);
> +  if (event_len - (header_size + post_header_len) < binlog_file_len)
> +    return;
> +  binlog_file_name= my_strndup(buf + post_header_len, binlog_file_len,
> +                               MYF(MY_WME));
> +  return;
> +}
> +
> +
> +#ifndef MYSQL_CLIENT
> +bool Binlog_checkpoint_log_event::write(IO_CACHE *file)
> +{
> +  uchar buf[BINLOG_CHECKPOINT_HEADER_LEN];
> +  int4store(buf, binlog_file_len);
> +  return write_header(file, BINLOG_CHECKPOINT_HEADER_LEN + binlog_file_len) ||
> +    wrapper_my_b_safe_write(file, buf, BINLOG_CHECKPOINT_HEADER_LEN) ||
> +    wrapper_my_b_safe_write(file, (const uchar *)binlog_file_name,
> +                            binlog_file_len) ||
> +    write_footer(file);
> +}
> +#endif  /* MYSQL_CLIENT */
> +
> +
> +/**************************************************************************
>  	Intvar_log_event methods
>  **************************************************************************/
>  
> === modified file 'sql/log.cc'
> --- sql/log.cc	2012-04-10 06:28:13 +0000
> +++ sql/log.cc	2012-05-14 09:51:40 +0000
> @@ -2912,13 +2929,30 @@
>    DBUG_ENTER("cleanup");
>    if (inited)
>    {
> +    xid_count_per_binlog *b;
> +
>      inited= 0;
>      close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT);
>      delete description_event_for_queue;
>      delete description_event_for_exec;
> +
> +    while ((b= binlog_xid_count_list.get()))

'get' - what a bad name. I checked the implementation and now I know
what it's doing, but absolutely not what one would expect :(
It's not your fault, of course, just ranting...

> +    {
> +      /*
> +        There should be no pending XIDs at shutdown, and only one entry (for
> +        the active binlog file) in the list.
> +      */
> +      DBUG_ASSERT(b->xid_count == 0);
> +      DBUG_ASSERT(!binlog_xid_count_list.head());
> +      my_free(b);
> +    }
> +
>      mysql_mutex_destroy(&LOCK_log);
>      mysql_mutex_destroy(&LOCK_index);
> +    mysql_mutex_destroy(&LOCK_xid_list);
>      mysql_cond_destroy(&update_cond);
> +    mysql_cond_destroy(&COND_queue_busy);
> +    mysql_cond_destroy(&COND_xid_list);
>    }
>    DBUG_VOID_RETURN;
>  }
> @@ -3506,6 +3587,42 @@
>    mysql_mutex_lock(&LOCK_log);
>    mysql_mutex_lock(&LOCK_index);
>  
> +  if (!is_relay_log)
> +  {
> +    /*
> +      We are going to nuke all binary log files.
> +      So first wait until all pending binlog checkpoints have completed.
> +    */
> +    mysql_mutex_lock(&LOCK_xid_list);
> +    xid_count_per_binlog *b;
> +    reset_master_pending= true;
> +    for (;;)
> +    {
> +      I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
> +      while ((b= it++))
> +      {
> +        if (b->xid_count > 0)
> +          break;
> +      }
> +      if (!b)
> +        break;                                  /* No more pending XIDs */
> +      /*
> +        Wait until signalled that one more binlog dropped to zero, then check
> +        again.
> +      */
> +      mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
> +    }

I would've done it simpler, like

while (b= binlog_xid_count_list.head())
{
   while (b->xid_count > 0)
     mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
   my_free(binlog_xid_count_list.get());
}

> +
> +    /*
> +      Now all XIDs are fully flushed to disk, and we are holding LOCK_log so
> +      no new ones will be written. So we can proceed to delete the logs.
> +    */
> +    while ((b= binlog_xid_count_list.get()))
> +      my_free(b);
> +    reset_master_pending= false;
> +    mysql_mutex_unlock(&LOCK_xid_list);
> +  }
> +
>    /*
>      The following mutex is needed to ensure that no threads call
>      'delete thd' as we would then risk missing a 'rollback' from this
> @@ -5761,6 +5877,37 @@
>    DBUG_RETURN(error);
>  }
>  
> +void
> +MYSQL_BIN_LOG::write_binlog_checkpoint_event_already_locked(const char *name,
> +                                                            uint len)
> +{
> +  Binlog_checkpoint_log_event ev(name, len);
> +  /*
> +    Note that we must sync the binlog checkpoint to disk.
> +    Otherwise a subsequent log purge could delete binlogs that XA recovery
> +    thinks are needed (even though they are not really).
> +  */
> +  if (!ev.write(&log_file) && !flush_and_sync(0))
> +  {
> +    bool check_purge= false;
> +    signal_update();
> +    rotate(false, &check_purge);
> +    if (check_purge)
> +      purge();

why "write binlog checkpoint event" implies signal_update(), rotate(), and purge() ?

> +    return;
> +  }
> +
> +  /*
> +    If we fail to write the checkpoint event, something is probably really
> +    bad with the binlog. We complain in the error log.
> +    Note that failure to write binlog checkpoint does not compromise the
> +    ability to do crash recovery - crash recovery will just have to scan a
> +    bit more of the binlog than strictly necessary.
> +  */
> +  sql_print_error("Failed to write binlog checkpoint event to binary log\n");
> +}
> +
> +
>  /**
>    Write a cached log entry to the binary log.
>    - To support transaction over replication, we wrap the transaction
> @@ -6061,11 +6207,11 @@
>        {
>          /*
>            If we fail to rotate, which thread should get the error?
> -          We give the error to the *last* transaction thread; that seems to
> -          make the most sense, as it was the last to write to the log.
> +          We give the error to the leader, as any my_error() thrown inside
> +          rotate() will have been registered for the leader THD.
>          */
> -        last_in_queue->error= ER_ERROR_ON_WRITE;
> -        last_in_queue->commit_errno= errno;
> +        leader->error= ER_ERROR_ON_WRITE;
> +        leader->commit_errno= errno;

Where did that change happen? Is it related to MDEV-181 at all, or it's an unrelated bugfix?

>          check_purge= false;
>        }
>      }
> @@ -6082,9 +6228,6 @@
>    */
>    mysql_mutex_unlock(&LOCK_log);
>  
> -  if (check_purge) 
> -    purge();

Why was it moved down?

> -
>    DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
>    ++num_group_commits;
>  
> @@ -7427,11 +7560,23 @@
>  
>    cache_mngr->using_xa= TRUE;
>    cache_mngr->xa_xid= xid;
> +#ifndef DBUG_OFF
> +  cache_mngr->cookie= 0;
> +#endif
>    err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid);
>  
>    DEBUG_SYNC(thd, "binlog_after_log_and_order");
>  
> -  DBUG_RETURN(!err);
> +  if (err)
> +    DBUG_RETURN(0);
> +  /*
> +    If using explicit user XA, we will not have XID. We must still return a
> +    non-zero cookie (as zero cookie signals error).
> +  */
> +  if (!xid)
> +    DBUG_RETURN(1);

s/1/BINLOG_COOKIE_DUMMY/

> +  DBUG_ASSERT(cache_mngr->cookie != 0);
> +  DBUG_RETURN(cache_mngr->cookie);
>  }
>  
>  /*
> @@ -7446,40 +7591,135 @@
>    binary log.
>  */
>  void
> -TC_LOG_BINLOG::mark_xids_active(uint xid_count)
> +TC_LOG_BINLOG::mark_xids_active(ulong cookie, uint xid_count)
>  {
> +  xid_count_per_binlog *b;
> +
>    DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
> -  DBUG_PRINT("info", ("xid_count=%u", xid_count));
> -  mysql_mutex_lock(&LOCK_prep_xids);
> -  prepared_xids+= xid_count;
> -  mysql_mutex_unlock(&LOCK_prep_xids);
> +  DBUG_PRINT("info", ("cookie=%lu xid_count=%u", cookie, xid_count));
> +  DBUG_ASSERT(cookie != 0 && cookie != BINLOG_COOKIE_DUMMY);
> +
> +  mysql_mutex_lock(&LOCK_xid_list);
> +  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
> +  while ((b= it++))
> +  {
> +    if (b->binlog_id == cookie)
> +    {
> +      b->xid_count += xid_count;
> +      break;
> +    }
> +  }
> +  /*
> +    As we do not delete elements until count reach zero, elements should always
> +    be found.
> +  */
> +  DBUG_ASSERT(b);
> +  mysql_mutex_unlock(&LOCK_xid_list);
>    DBUG_VOID_RETURN;
>  }
>  
>  /*
> -  Once an XID is committed, it is safe to rotate the binary log, as it can no
> -  longer be needed during crash recovery.
> +  Once an XID is committed, it can no longer be needed during crash recovery,
> +  as it has been durably recorded on disk as "committed".
>  
>    This function is called to mark an XID this way. It needs to decrease the
> -  count of pending XIDs, and signal the log rotator thread when it reaches zero.
> +  count of pending XIDs in the corresponding binlog. When the count reaches
> +  zero (for an "old" binlog that is not the active one), that binlog file no
> +  longer need to be scanned during crash recovery, so we can log a new binlog
> +  checkpoint.
>  */
>  void
> -TC_LOG_BINLOG::mark_xid_done()
> +TC_LOG_BINLOG::mark_xid_done(ulong cookie)
>  {
> -  my_bool send_signal;
> +  xid_count_per_binlog *b;
> +  bool first;
> +  ulong current;
>  
>    DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
> -  mysql_mutex_lock(&LOCK_prep_xids);
> -  // prepared_xids can be 0 if the transaction had ignorable errors.
> -  DBUG_ASSERT(prepared_xids >= 0);
> -  if (prepared_xids > 0)
> -    prepared_xids--;
> -  send_signal= (prepared_xids == 0);
> -  mysql_mutex_unlock(&LOCK_prep_xids);
> -  if (send_signal) {
> -    DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
> -    mysql_cond_signal(&COND_prep_xids);
> -  }
> +  if (cookie == BINLOG_COOKIE_DUMMY)
> +    DBUG_VOID_RETURN;                           /* Nothing to do. */
> +
> +  mysql_mutex_lock(&LOCK_xid_list);
> +  current= current_binlog_id;
> +  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
> +  first= true;
> +  while ((b= it++))
> +  {
> +    if (b->binlog_id == cookie)
> +    {
> +      --b->xid_count;
> +      break;
> +    }
> +    first= false;
> +  }
> +  /* Binlog is always found, as we do not remove until count reaches 0 */
> +  DBUG_ASSERT(b);
> +  if (likely(cookie == current && !reset_master_pending) ||
> +      b->xid_count != 0 || !first)
> +  {
> +    /* No new binlog checkpoint reached yet. */
> +    mysql_mutex_unlock(&LOCK_xid_list);
> +    DBUG_VOID_RETURN;
> +  }
> +
> +  /*
> +    Now log a binlog checkpoint for the first binlog file with a non-zero count.
> +
> +    Note that it is possible (though perhaps unlikely) that when count of
> +    binlog (N-2) drops to zero, binlog (N-1) is already at zero. So we may
> +    need to skip several entries before we find the one to log in the binlog
> +    checkpoint event.
> +
> +    We chain the locking of LOCK_xid_list and LOCK_log, so that we ensure that
> +    Binlog_checkpoint_events are logged in order. This simplifies recovery a
> +    bit, as it can just take the last binlog checkpoint in the log, rather
> +    than compare all found against each other to find the one pointing to the
> +    most recent binlog.
> +
> +    Note also that we need to first release LOCK_xid_list, then aquire
> +    LOCK_log, then re-aquire LOCK_xid_list. If we were to take LOCK_log while
> +    holding LOCK_xid_list, we might deadlock with other threads that take the
> +    locks in the opposite order.
> +
> +    If a RESET MASTER is pending, we are about to remove all log files, and
> +    the RESET MASTER thread is waiting for all pending unlog() calls to
> +    complete while holding LOCK_log. In this case we should not log a binlog
> +    checkpoint event (it would be deleted immediately anywat and we would
> +    deadlock on LOCK_log) but just signal the thread.
> +  */
> +  if (!reset_master_pending)
> +  {
> +    mysql_mutex_unlock(&LOCK_xid_list);
> +    mysql_mutex_lock(&LOCK_log);
> +    mysql_mutex_lock(&LOCK_xid_list);
> +  }
> +  for (;;)
> +  {
> +    /* Remove initial element(s) with zero count. */
> +    b= binlog_xid_count_list.get();

Eh. You need to check whether
binlog_xid_count_list.head()->xid_count == 0 before removing it like that.
When you released LOCK_xid_list another thread could've taken over and
deleted all zero count elements. So now you could find that
binlog_xid_count_list.head() has non-zero count.

> +    my_free(b);
> +    b= binlog_xid_count_list.head();
> +    /*
> +      Normally, we must not remove all elements in the list.
> +      Only if a RESET MASTER is in progress may we delete everything - RESET
> +      MASTER has LOCK_log held, and will create a new initial element before
> +      releasing the lock.
> +    */
> +    DBUG_ASSERT(b || reset_master_pending);
> +    if (unlikely(!b) || likely(b->binlog_id == current) || b->xid_count > 0)
> +      break;
> +  }
> +  if (reset_master_pending)
> +  {
> +    mysql_cond_signal(&COND_xid_list);
> +    mysql_mutex_unlock(&LOCK_xid_list);
> +    DBUG_VOID_RETURN;
> +  }
> +
> +  mysql_mutex_unlock(&LOCK_xid_list);
> +  write_binlog_checkpoint_event_already_locked(b->binlog_name,
> +                                               b->binlog_name_len);
> +  mysql_mutex_unlock(&LOCK_log);
>    DBUG_VOID_RETURN;
>  }
>  
> @@ -7507,19 +7755,104 @@
>  
>    fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error
>  
> -  while ((ev= Log_event::read_log_event(log, 0, fdle,
> -                                        opt_master_verify_checksum))
> -         && ev->is_valid())
> +  /*
> +    Scan the binlog for XIDs that need to be committed if still in the
> +    prepared stage.
> +
> +    Start with the latest binlog file, then continue with any other binlog
> +    files if the last found binlog checkpoint indicates it is needed.
> +  */
> +
> +  first_round= true;
> +  for (;;)
>    {
> -    if (ev->get_type_code() == XID_EVENT)
> -    {
> -      Xid_log_event *xev=(Xid_log_event *)ev;
> -      uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid,
> -                                      sizeof(xev->xid));
> -      if (!x || my_hash_insert(&xids, x))
> +    while ((ev= Log_event::read_log_event(first_round ? first_log : &log,
> +                                          0, fdle, opt_master_verify_checksum))
> +           && ev->is_valid())
> +    {
> +      switch (ev->get_type_code())
> +      {
> +      case XID_EVENT:
> +      {
> +        Xid_log_event *xev=(Xid_log_event *)ev;
> +        uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid,
> +                                        sizeof(xev->xid));
> +        if (!x || my_hash_insert(&xids, x))
> +        {
> +          delete ev;
> +          goto err2;
> +        }
> +        break;
> +      }
> +      case BINLOG_CHECKPOINT_EVENT:
> +        if (first_round)
> +        {
> +          Binlog_checkpoint_log_event *cev= (Binlog_checkpoint_log_event *)ev;
> +          if (cev->binlog_file_len >= FN_REFLEN)
> +          {
> +            sql_print_error("Incorrect binlog checkpoint event with too long "
> +                            "file name found. Aborting.");

This is a recovery. After a crash. You can be more tolerant here.
In this particular case there is no need to abort, you can safely ignore
incorrect checkpoints. This only means that the recovery will start from the
previous checkpoint and it'll take longer.

> +            delete ev;
> +            goto err2;
> +          }
> +          binlog_checkpoint_len= cev->binlog_file_len;
> +          memcpy(binlog_checkpoint_name, cev->binlog_file_name,
> +                 binlog_checkpoint_len);
> +          binlog_checkpoint_name[binlog_checkpoint_len]= '\0';
> +          break;
> +        }
> +      default:
> +        /* Nothing. */
> +        break;
> +      }
> +      delete ev;
> +    }
> +
> +    /*
> +      If the last binlog checkpoint event points to an older log, we have to
> +      scan all logs from there also, to get all possible XIDs to recover.
> +
> +      If there was no binlog checkpoint event at all, this means the log was
> +      written by an older version of MariaDB (or MySQL) - these always have an
> +      (implicit) binlog checkpoint event at the start of the last binlog file.
> +    */
> +    if (first_round)
> +    {
> +      if (!binlog_checkpoint_len)
> +        break;
> +      first_round= false;
> +      if (find_log_pos(linfo, binlog_checkpoint_name, 1))
> +      {
> +        sql_print_error("Binlog file '%s' not found in binlog index, needed "
> +                        "for recovery. Aborting.", binlog_checkpoint_name);
>          goto err2;
> -    }
> -    delete ev;
> +      }
> +    }
> +    else
> +    {
> +      end_io_cache(&log);
> +      mysql_file_close(file, MYF(MY_WME));
> +      file= -1;
> +    }
> +
> +    if (0 == strcmp(linfo->log_file_name, last_log_name))
> +      break;                                    // No more files to do
> +    if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
> +    {
> +      sql_print_error("%s", errmsg);
> +      goto err2;
> +    }
> +    /*
> +      We do not need to read the Format_description_log_event of other binlog
> +      files. It is not possible for a binlog checkpoint to span multiple
> +      binlog files written by different versions of the server. So we can use
> +      the first one read for reading from all binlog files.
> +    */
> +    if (find_next_log(linfo, 1))
> +    {
> +      sql_print_error("Error reading binlog files during recovery. Aborting.");
> +      goto err2;
> +    }
>    }
>  
>    if (ha_recover(&xids))
> === modified file 'sql/sql_repl.cc'
> --- sql/sql_repl.cc	2012-04-27 08:20:38 +0000
> +++ sql/sql_repl.cc	2012-05-14 09:51:40 +0000
> @@ -624,6 +624,30 @@
>    }
>  
>    /*
> +    Do not send binlog checkpoint events to a slave that does not understand it.
> +  */
> +  if (unlikely(event_type == BINLOG_CHECKPOINT_EVENT) &&
> +      mariadb_slave_capability < MARIA_SLAVE_CAPABILITY_BINLOG_CHECKPOINT)
> +  {
> +    if (mariadb_slave_capability >= MARIA_SLAVE_CAPABILITY_TOLERATE_HOLES)
> +    {
> +      /* This slave can tolerate events omitted from the binlog stream. */
> +      return NULL;
> +    }
> +    else
> +    {
> +      /*
> +        The slave does not understand BINLOG_CHECKPOINT_EVENT. Send a dummy
> +        event instead, with same length so slave does not get confused about
> +        binlog positions.
> +      */

Hm. Above and below the code that you've added, events are simply skipped,
without checks and dummy events. Why do you bother to replace this one with a dummy?

> +      if (Query_log_event::dummy_event(packet, ev_offset, current_checksum_alg))
> +        return "Failed to replace binlog checkpoint event with dummy: "
> +               "too small event.";
> +    }
> +  }
> +
> +  /*
>      Skip events with the @@skip_replication flag set, if slave requested
>      skipping of such events.
>    */

Regards,
Sergei


Follow ups