← Back to team overview

maria-developers team mailing list archive

Re: [Commits] Rev 3435: MDEV-532: Async InnoDB commit checkpoint. in http://bazaar.launchpad.net/~maria-captains/maria/10.0

 

Hi Serg,

As we discussed under review of MDEV-232, here is a separate patch that makes
InnoDB/XtraDB commit checkpointing be more asynchroneous.

See MDEV-532 for further description of this task.

I hope you will review this, at your convenience.

 - Kristian.

knielsen@xxxxxxxxxxxxxxx writes:

> At http://bazaar.launchpad.net/~maria-captains/maria/10.0
>
> ------------------------------------------------------------
> revno: 3435
> revision-id: knielsen@xxxxxxxxxxxxxxx-20120914124453-zsap6hjclq3vrb6n
> parent: knielsen@xxxxxxxxxxxxxxx-20120913123129-kaujy4cw0jc9o08k
> committer: knielsen@xxxxxxxxxxxxxxx
> branch nick: work-10.0-mdev225-181-232
> timestamp: Fri 2012-09-14 14:44:53 +0200
> message:
>   MDEV-532: Async InnoDB commit checkpoint.
>   
>   Make the commit checkpoint inside InnoDB be asynchroneous.
>   Implement a background thread in binlog to do the writing and flushing of
>   binlog checkpoint events to disk.
> === modified file 'mysql-test/suite/binlog/r/binlog_checkpoint.result'
> --- a/mysql-test/suite/binlog/r/binlog_checkpoint.result	2012-09-13 12:31:29 +0000
> +++ b/mysql-test/suite/binlog/r/binlog_checkpoint.result	2012-09-14 12:44:53 +0000
> @@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003
>  Log_name        Pos     Event_type      Server_id       End_log_pos     Info
>  master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
>  master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
> +SET DEBUG_SYNC= "RESET";
> +SET @old_dbug= @@global.DEBUG_DBUG;
> +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
>  SET DEBUG_SYNC= "now SIGNAL con2_continue";
>  con1 is still pending, no new binlog checkpoint should have been logged.
> +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
> +SET GLOBAL debug_dbug= @old_dbug;
> +SET DEBUG_SYNC= "RESET";
>  show binlog events in 'master-bin.000003' from <binlog_start>;
>  Log_name        Pos     Event_type      Server_id       End_log_pos     Info
>  master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
>
> === modified file 'mysql-test/suite/binlog/r/binlog_xa_recover.result'
> --- a/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-09-13 12:31:29 +0000
> +++ b/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-09-14 12:44:53 +0000
> @@ -118,7 +118,11 @@ master-bin.00000<binlog_start>	#	Table_m
>  master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
>  master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
>  SET DEBUG_SYNC= "now SIGNAL con10_cont";
> +SET @old_dbug= @@global.DEBUG_DBUG;
> +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
>  SET DEBUG_SYNC= "now SIGNAL con12_cont";
> +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
> +SET GLOBAL debug_dbug= @old_dbug;
>  SET DEBUG_SYNC= "now SIGNAL con11_cont";
>  Checking that master-bin.000004 is the last binlog checkpoint
>  show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
>
> === modified file 'mysql-test/suite/binlog/t/binlog_checkpoint.test'
> --- a/mysql-test/suite/binlog/t/binlog_checkpoint.test	2012-09-13 12:31:29 +0000
> +++ b/mysql-test/suite/binlog/t/binlog_checkpoint.test	2012-09-14 12:44:53 +0000
> @@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready
>  --let $binlog_file= master-bin.000003
>  --source include/show_binlog_events.inc
>  
> +# We need to sync the test case with the background processing of the
> +# commit checkpoint, otherwise we get nondeterministic results.
> +SET DEBUG_SYNC= "RESET";
> +SET @old_dbug= @@global.DEBUG_DBUG;
> +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
> +
>  SET DEBUG_SYNC= "now SIGNAL con2_continue";
>  
>  connection con2;
> @@ -78,6 +84,12 @@ reap;
>  
>  connection default;
>  --echo con1 is still pending, no new binlog checkpoint should have been logged.
> +# Make sure commit checkpoint is processed before we check that no checkpoint
> +# event has been binlogged.
> +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
> +SET GLOBAL debug_dbug= @old_dbug;
> +SET DEBUG_SYNC= "RESET";
> +
>  --let $binlog_file= master-bin.000003
>  --source include/show_binlog_events.inc
>  
>
> === modified file 'mysql-test/suite/binlog/t/binlog_xa_recover.test'
> --- a/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-09-13 12:31:29 +0000
> +++ b/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-09-14 12:44:53 +0000
> @@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b ME
>  # Insert some data to force a couple binlog rotations (3), so we get some
>  # normal binlog checkpoints before starting the test.
>  INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
> +# Wait for the master-bin.000002 binlog checkpoint to appear.
> +--let $wait_for_all= 0
> +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000002"
> +--let $field= Info
> +--let $condition= = "master-bin.000002"
> +--source include/wait_show_condition.inc
>  INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
> +--let $wait_for_all= 0
> +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
> +--let $field= Info
> +--let $condition= = "master-bin.000003"
> +--source include/wait_show_condition.inc
>  INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
> +--let $wait_for_all= 0
> +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
> +--let $field= Info
> +--let $condition= = "master-bin.000004"
> +--source include/wait_show_condition.inc
>  
>  # Now start a bunch of transactions that span multiple binlog
>  # files. Leave then in the state prepared-but-not-committed in the engine
> @@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont";
>  connection con10;
>  reap;
>  connection default;
> +
> +# We need to sync the test case with the background processing of the
> +# commit checkpoint, otherwise we get nondeterministic results.
> +SET @old_dbug= @@global.DEBUG_DBUG;
> +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
> +
>  SET DEBUG_SYNC= "now SIGNAL con12_cont";
>  connection con12;
>  reap;
>  connection default;
> +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
> +SET GLOBAL debug_dbug= @old_dbug;
> +
>  SET DEBUG_SYNC= "now SIGNAL con11_cont";
>  connection con11;
>  reap;
> @@ -210,7 +235,20 @@ RESET MASTER;
>  # crash recovery fails due to the error insert used for previous test.
>  INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
>  INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
> +# Wait for the master-bin.000003 binlog checkpoint to appear.
> +--let $wait_for_all= 0
> +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
> +--let $field= Info
> +--let $condition= = "master-bin.000003"
> +--source include/wait_show_condition.inc
>  INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
> +# Wait for the last (master-bin.000004) binlog checkpoint to appear.
> +--let $wait_for_all= 0
> +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
> +--let $field= Info
> +--let $condition= = "master-bin.000004"
> +--source include/wait_show_condition.inc
> +
>  --write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
>  wait-binlog_xa_recover.test
>  EOF
>
> === modified file 'mysql-test/suite/perfschema/r/all_instances.result'
> --- a/mysql-test/suite/perfschema/r/all_instances.result	2012-06-22 09:46:28 +0000
> +++ b/mysql-test/suite/perfschema/r/all_instances.result	2012-09-14 12:44:53 +0000
> @@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lo
>  wait/synch/mutex/sql/Master_info::sleep_lock
>  wait/synch/mutex/sql/MDL_map::mutex
>  wait/synch/mutex/sql/MDL_wait::LOCK_wait_status
> +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list
>  wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index
> @@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_c
>  wait/synch/cond/sql/Master_info::start_cond
>  wait/synch/cond/sql/Master_info::stop_cond
>  wait/synch/cond/sql/MDL_context::COND_wait_status
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end
>  wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy
>  wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list
>  wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond
>
> === modified file 'mysql-test/suite/perfschema/r/relaylog.result'
> --- a/mysql-test/suite/perfschema/r/relaylog.result	2012-06-22 09:46:28 +0000
> +++ b/mysql-test/suite/perfschema/r/relaylog.result	2012-09-14 12:44:53 +0000
> @@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%"
>    and event_name not like "%MYSQL_BIN_LOG::update_cond"
>    order by event_name;
>  EVENT_NAME      COUNT_STAR
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread   NONE
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end       NONE
>  wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy      NONE
>  wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        NONE
> +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread  MANY
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index  MANY
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list       MANY
>  "Expect no slave relay log"
> @@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%"
>    and event_name not like "%MYSQL_BIN_LOG::update_cond"
>    order by event_name;
>  EVENT_NAME      COUNT_STAR
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread   MANY
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end       NONE
>  wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy      NONE
> -wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        NONE
> +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        MANY
> +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread  MANY
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index  MANY
>  wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list       MANY
>  "Expect a slave relay log"
>
> === modified file 'sql/debug_sync.cc'
> --- a/sql/debug_sync.cc	2012-03-28 17:26:00 +0000
> +++ b/sql/debug_sync.cc	2012-09-14 12:44:53 +0000
> @@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD *
>    DBUG_ENTER("debug_sync_eval_action");
>    DBUG_ASSERT(thd);
>    DBUG_ASSERT(action_str);
> +  DBUG_PRINT("debug_sync", ("action_str='%s'", action_str));
>  
>    /*
>      Get debug sync point name. Or a special command.
>
> === modified file 'sql/log.cc'
> --- a/sql/log.cc	2012-09-13 12:31:29 +0000
> +++ b/sql/log.cc	2012-09-14 12:44:53 +0000
> @@ -53,6 +53,7 @@
>  #include "rpl_handler.h"
>  #include "debug_sync.h"
>  #include "sql_show.h"
> +#include "my_pthread.h"
>  
>  /* max size of the log message */
>  #define MAX_LOG_BUFFER_SIZE 1024
> @@ -106,6 +107,14 @@ static SHOW_VAR binlog_status_vars_detai
>    {NullS, NullS, SHOW_LONG}
>  };
>  
> +/* Variables for the binlog background thread. */
> +static bool binlog_thread_started= false;
> +static bool binlog_background_thread_stop= false;
> +static MYSQL_BIN_LOG::xid_count_per_binlog *
> +    binlog_background_thread_queue= NULL;
> +
> +static bool start_binlog_background_thread();
> +
>  
>  /**
>     purge logs, master and slave sides both, related error code
> @@ -2957,12 +2966,27 @@ void MYSQL_BIN_LOG::cleanup()
>        my_free(b);
>      }
>  
> +    /* Wait for the binlog thread to stop. */
> +    if (!is_relay_log && binlog_thread_started)
> +    {
> +      mysql_mutex_lock(&LOCK_binlog_thread);
> +      binlog_background_thread_stop= true;
> +      mysql_cond_signal(&COND_binlog_thread);
> +      while (binlog_background_thread_stop)
> +        mysql_cond_wait(&COND_binlog_thread_end, &LOCK_binlog_thread);
> +      mysql_mutex_unlock(&LOCK_binlog_thread);
> +      binlog_thread_started= false;
> +    }
> +
>      mysql_mutex_destroy(&LOCK_log);
>      mysql_mutex_destroy(&LOCK_index);
>      mysql_mutex_destroy(&LOCK_xid_list);
> +    mysql_mutex_destroy(&LOCK_binlog_thread);
>      mysql_cond_destroy(&update_cond);
>      mysql_cond_destroy(&COND_queue_busy);
>      mysql_cond_destroy(&COND_xid_list);
> +    mysql_cond_destroy(&COND_binlog_thread);
> +    mysql_cond_destroy(&COND_binlog_thread_end);
>    }
>    DBUG_VOID_RETURN;
>  }
> @@ -2988,6 +3012,11 @@ void MYSQL_BIN_LOG::init_pthread_objects
>    mysql_cond_init(m_key_update_cond, &update_cond, 0);
>    mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
>    mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
> +
> +  mysql_mutex_init(key_BINLOG_LOCK_binlog_thread,
> +                   &LOCK_binlog_thread, MY_MUTEX_INIT_FAST);
> +  mysql_cond_init(key_BINLOG_COND_binlog_thread, &COND_binlog_thread, 0);
> +  mysql_cond_init(key_BINLOG_COND_binlog_thread_end, &COND_binlog_thread_end, 0);
>  }
>  
>  
> @@ -3085,6 +3114,10 @@ bool MYSQL_BIN_LOG::open(const char *log
>    DBUG_ENTER("MYSQL_BIN_LOG::open");
>    DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
>  
> +  if (!is_relay_log && !binlog_thread_started &&
> +      start_binlog_background_thread())
> +    DBUG_RETURN(1);
> +
>    if (init_and_set_log_file_name(log_name, new_name, log_type_arg,
>                                   io_cache_type_arg))
>    {
> @@ -5540,11 +5573,7 @@ bool general_log_write(THD *thd, enum en
>  }
>  
>  
> -/*
> -  I would like to make this function static, but this causes compiler warnings
> -  when it is declared as friend function in log.h.
> -*/
> -void
> +static void
>  binlog_checkpoint_callback(void *cookie)
>  {
>    MYSQL_BIN_LOG::xid_count_per_binlog *entry=
> @@ -8116,9 +8145,128 @@ int TC_LOG_BINLOG::unlog(ulong cookie, m
>  void
>  TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
>  {
> -  mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true);
> +  xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
> +  mysql_mutex_lock(&LOCK_binlog_thread);
> +  entry->next_in_queue= binlog_background_thread_queue;
> +  binlog_background_thread_queue= entry;
> +  mysql_cond_signal(&COND_binlog_thread);
> +  mysql_mutex_unlock(&LOCK_binlog_thread);
>  }
>  
> +/*
> +  Binlog service thread.
> +
> +  This thread is used to log binlog checkpoints in the background, rather than
> +  in the context of random storage engine threads that happen to call
> +  commit_checkpoint_notify_ha() and may not like the delays while syncing
> +  binlog to disk or may not be setup with all my_thread_init() and other
> +  necessary stuff.
> +
> +  In the future, this thread could also be used to do log rotation in the
> +  background, which could elimiate all stalls around binlog rotations.
> +*/
> +pthread_handler_t
> +binlog_background_thread(void *arg __attribute__((unused)))
> +{
> +  bool stop;
> +  MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
> +  THD *thd;
> +
> +  my_thread_init();
> +  thd= new THD;
> +  thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
> +  my_pthread_setspecific_ptr(THR_THD, thd);
> +  mysql_mutex_lock(&LOCK_thread_count);
> +  thd->thread_id= thread_id++;
> +  mysql_mutex_unlock(&LOCK_thread_count);
> +
> +  for (;;)
> +  {
> +    /*
> +      Wait until there is something in the queue to process, or we are asked
> +      to shut down.
> +    */
> +    thd_proc_info(thd, "Waiting for background binlog tasks");
> +    mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread);
> +    for (;;)
> +    {
> +      stop= binlog_background_thread_stop;
> +      queue= binlog_background_thread_queue;
> +      if (stop || queue)
> +        break;
> +      mysql_cond_wait(&mysql_bin_log.COND_binlog_thread,
> +                      &mysql_bin_log.LOCK_binlog_thread);
> +    }
> +    /* Grab the queue, if any. */
> +    binlog_background_thread_queue= NULL;
> +    mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread);
> +
> +    /* Process any incoming commit_checkpoint_notify() calls. */
> +    while (queue)
> +    {
> +      thd_proc_info(thd, "Processing binlog checkpoint notification");
> +      /* Grab next pointer first, as mark_xid_done() may free the element. */
> +      next= queue->next_in_queue;
> +      mysql_bin_log.mark_xid_done(queue->binlog_id, true);
> +      queue= next;
> +
> +      DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
> +        DBUG_ASSERT(!debug_sync_set_action(
> +          thd,
> +          STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
> +        );
> +    }
> +
> +    if (stop)
> +      break;
> +  }
> +
> +  thd_proc_info(thd, "Stopping binlog background thread");
> +
> +  mysql_mutex_lock(&LOCK_thread_count);
> +  delete thd;
> +  mysql_mutex_unlock(&LOCK_thread_count);
> +
> +  my_thread_end();
> +
> +  /* Signal that we are (almost) stopped. */
> +  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread);
> +  binlog_background_thread_stop= false;
> +  mysql_cond_signal(&mysql_bin_log.COND_binlog_thread_end);
> +  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread);
> +
> +  return 0;
> +}
> +
> +#ifdef HAVE_PSI_INTERFACE
> +static PSI_thread_key key_thread_binlog;
> +
> +static PSI_thread_info all_binlog_threads[]=
> +{
> +  { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
> +};
> +#endif /* HAVE_PSI_INTERFACE */
> +
> +static bool
> +start_binlog_background_thread()
> +{
> +  pthread_t th;
> +
> +#ifdef HAVE_PSI_INTERFACE
> +  if (PSI_server)
> +    PSI_server->register_thread("sql", all_binlog_threads,
> +                                array_elements(all_binlog_threads));
> +#endif
> +
> +  if (mysql_thread_create(key_thread_binlog, &th, NULL,
> +                          binlog_background_thread, NULL))
> +    return 1;
> +
> +  binlog_thread_started= true;
> +  return 0;
> +}
> +
> +
>  int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
>                             IO_CACHE *first_log,
>                             Format_description_log_event *fdle)
>
> === modified file 'sql/log.h'
> --- a/sql/log.h	2012-09-13 12:31:29 +0000
> +++ b/sql/log.h	2012-09-14 12:44:53 +0000
> @@ -395,8 +395,6 @@ class MYSQL_QUERY_LOG: public MYSQL_LOG
>  #define BINLOG_COOKIE_IS_DUMMY(c) \
>    ( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
>  
> -void binlog_checkpoint_callback(void *cookie);
> -
>  class binlog_cache_mngr;
>  class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
>  {
> @@ -451,27 +449,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
>    };
>  
>    /*
> -    A list of struct xid_count_per_binlog is used to keep track of how many
> -    XIDs are in prepared, but not committed, state in each binlog. And how
> -    many commit_checkpoint_request()'s are pending.
> -
> -    When count drops to zero in a binlog after rotation, it means that there
> -    are no more XIDs in prepared state, so that binlog is no longer needed
> -    for XA crash recovery, and we can log a new binlog checkpoint event.
> -
> -    The list is protected against simultaneous access from multiple
> -    threads by LOCK_xid_list.
> -  */
> -  struct xid_count_per_binlog : public ilink {
> -    char *binlog_name;
> -    uint binlog_name_len;
> -    ulong binlog_id;
> -    /* Total prepared XIDs and pending checkpoint requests in this binlog. */
> -    long xid_count;
> -    xid_count_per_binlog();   /* Give link error if constructor used. */
> -  };
> -  I_List<xid_count_per_binlog> binlog_xid_count_list;
> -  /*
>      When this is set, a RESET MASTER is in progress.
>  
>      Then we should not write any binlog checkpoints into the binlog (that
> @@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
>      checkpoint arrives - when all have arrived, RESET MASTER will complete.
>    */
>    bool reset_master_pending;
> -  friend void binlog_checkpoint_callback(void *cookie);
>  
>    /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
>    mysql_mutex_t LOCK_index;
> @@ -553,10 +529,35 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
>    int write_transaction_or_stmt(group_commit_entry *entry);
>    bool write_transaction_to_binlog_events(group_commit_entry *entry);
>    void trx_group_commit_leader(group_commit_entry *leader);
> -  void mark_xid_done(ulong cookie, bool write_checkpoint);
> -  void mark_xids_active(ulong cookie, uint xid_count);
>  
>  public:
> +  /*
> +    A list of struct xid_count_per_binlog is used to keep track of how many
> +    XIDs are in prepared, but not committed, state in each binlog. And how
> +    many commit_checkpoint_request()'s are pending.
> +
> +    When count drops to zero in a binlog after rotation, it means that there
> +    are no more XIDs in prepared state, so that binlog is no longer needed
> +    for XA crash recovery, and we can log a new binlog checkpoint event.
> +
> +    The list is protected against simultaneous access from multiple
> +    threads by LOCK_xid_list.
> +  */
> +  struct xid_count_per_binlog : public ilink {
> +    char *binlog_name;
> +    uint binlog_name_len;
> +    ulong binlog_id;
> +    /* Total prepared XIDs and pending checkpoint requests in this binlog. */
> +    long xid_count;
> +    /* For linking in requests to the binlog background thread. */
> +    xid_count_per_binlog *next_in_queue;
> +    xid_count_per_binlog();   /* Give link error if constructor used. */
> +  };
> +  I_List<xid_count_per_binlog> binlog_xid_count_list;
> +  mysql_mutex_t LOCK_binlog_thread;
> +  mysql_cond_t COND_binlog_thread;
> +  mysql_cond_t COND_binlog_thread_end;
> +
>    using MYSQL_LOG::generate_name;
>    using MYSQL_LOG::is_open;
>  
> @@ -712,6 +713,8 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
>    bool appendv(const char* buf,uint len,...);
>    bool append(Log_event* ev);
>  
> +  void mark_xids_active(ulong cookie, uint xid_count);
> +  void mark_xid_done(ulong cookie, bool write_checkpoint);
>    void make_log_name(char* buf, const char* log_ident);
>    bool is_active(const char* log_file_name);
>    bool can_purge_log(const char *log_file_name);
>
> === modified file 'sql/mysqld.cc'
> --- a/sql/mysqld.cc	2012-09-13 12:31:29 +0000
> +++ b/sql/mysqld.cc	2012-09-14 12:44:53 +0000
> @@ -724,6 +724,7 @@ PSI_mutex_key key_LOCK_des_key_file;
>  #endif /* HAVE_OPENSSL */
>  
>  PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
> +  key_BINLOG_LOCK_binlog_thread,
>    key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
>    key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
>    key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
> @@ -766,6 +767,7 @@ static PSI_mutex_info all_server_mutexes
>  
>    { &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0},
>    { &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0},
> +  { &key_BINLOG_LOCK_binlog_thread, "MYSQL_BIN_LOG::LOCK_binlog_thread", 0},
>    { &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0},
>    { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
>    { &key_hash_filo_lock, "hash_filo::lock", 0},
> @@ -834,6 +836,7 @@ PSI_cond_key key_PAGE_cond, key_COND_act
>  #endif /* HAVE_MMAP */
>  
>  PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
> +  key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end,
>    key_COND_cache_status_changed, key_COND_manager,
>    key_COND_rpl_status, key_COND_server_started,
>    key_delayed_insert_cond, key_delayed_insert_cond_client,
> @@ -863,6 +866,8 @@ static PSI_cond_info all_server_conds[]=
>  #endif /* HAVE_MMAP */
>    { &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0},
>    { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0},
> +  { &key_BINLOG_COND_binlog_thread, "MYSQL_BIN_LOG::COND_binlog_thread", 0},
> +  { &key_BINLOG_COND_binlog_thread_end, "MYSQL_BIN_LOG::COND_binlog_thread_end", 0},
>    { &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0},
>    { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
>    { &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},
>
> === modified file 'sql/mysqld.h'
> --- a/sql/mysqld.h	2012-09-13 12:31:29 +0000
> +++ b/sql/mysqld.h	2012-09-14 12:44:53 +0000
> @@ -226,6 +226,7 @@ extern PSI_mutex_key key_LOCK_des_key_fi
>  #endif
>  
>  extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
> +  key_BINLOG_LOCK_binlog_thread,
>    key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
>    key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
>    key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
> @@ -257,6 +258,7 @@ extern PSI_cond_key key_PAGE_cond, key_C
>  #endif /* HAVE_MMAP */
>  
>  extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
> +  key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end,
>    key_COND_cache_status_changed, key_COND_manager,
>    key_COND_rpl_status, key_COND_server_started,
>    key_delayed_insert_cond, key_delayed_insert_cond_client,
>
> === modified file 'sql/rpl_rli.cc'
> --- a/sql/rpl_rli.cc	2012-09-13 12:31:29 +0000
> +++ b/sql/rpl_rli.cc	2012-09-14 12:44:53 +0000
> @@ -58,6 +58,7 @@ Relay_log_info::Relay_log_info(bool is_s
>  {
>    DBUG_ENTER("Relay_log_info::Relay_log_info");
>  
> +  relay_log.is_relay_log= TRUE;
>  #ifdef HAVE_PSI_INTERFACE
>    relay_log.set_psi_keys(key_RELAYLOG_LOCK_index,
>                           key_RELAYLOG_update_cond,
> @@ -206,8 +207,6 @@ a file name for --relay-log-index option
>        name_warning_sent= 1;
>      }
>  
> -    rli->relay_log.is_relay_log= TRUE;
> -
>      /*
>        note, that if open() fails, we'll still have index file open
>        but a destructor will take care of that
>
> === modified file 'sql/sql_class.h'
> --- a/sql/sql_class.h	2012-09-13 12:31:29 +0000
> +++ b/sql/sql_class.h	2012-09-14 12:44:53 +0000
> @@ -1244,7 +1244,8 @@ enum enum_thread_type
>    SYSTEM_THREAD_SLAVE_SQL= 4,
>    SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8,
>    SYSTEM_THREAD_EVENT_SCHEDULER= 16,
> -  SYSTEM_THREAD_EVENT_WORKER= 32
> +  SYSTEM_THREAD_EVENT_WORKER= 32,
> +  SYSTEM_THREAD_BINLOG_BACKGROUND= 64
>  };
>  
>  inline char const *
>
> === modified file 'storage/innobase/handler/ha_innodb.cc'
> --- a/storage/innobase/handler/ha_innodb.cc	2012-09-13 12:31:29 +0000
> +++ b/storage/innobase/handler/ha_innodb.cc	2012-09-14 12:44:53 +0000
> @@ -106,6 +106,7 @@ static ulong commit_threads = 0;
>  static mysql_mutex_t commit_threads_m;
>  static mysql_cond_t commit_cond;
>  static mysql_mutex_t commit_cond_m;
> +static mysql_mutex_t pending_checkpoint_mutex;
>  static bool innodb_inited = 0;
>  
>  #define INSIDE_HA_INNOBASE_CC
> @@ -222,11 +223,13 @@ static mysql_pfs_key_t	innobase_share_mu
>  static mysql_pfs_key_t  commit_threads_m_key;
>  static mysql_pfs_key_t  commit_cond_mutex_key;
>  static mysql_pfs_key_t  commit_cond_key;
> +static mysql_pfs_key_t  pending_checkpoint_mutex_key;
>  
>  static PSI_mutex_info   all_pthread_mutexes[] = {
>          {&commit_threads_m_key, "commit_threads_m", 0},
>          {&commit_cond_mutex_key, "commit_cond_mutex", 0},
> -        {&innobase_share_mutex_key, "innobase_share_mutex", 0}
> +        {&innobase_share_mutex_key, "innobase_share_mutex", 0},
> +        {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
>  };
>  
>  static PSI_cond_info    all_innodb_conds[] = {
> @@ -2592,6 +2595,9 @@ innobase_init(
>          mysql_mutex_init(commit_cond_mutex_key,
>                           &commit_cond_m, MY_MUTEX_INIT_FAST);
>          mysql_cond_init(commit_cond_key, &commit_cond, NULL);
> +        mysql_mutex_init(pending_checkpoint_mutex_key,
> +                         &pending_checkpoint_mutex,
> +                         MY_MUTEX_INIT_FAST);
>          innodb_inited= 1;
>  #ifdef MYSQL_DYNAMIC_PLUGIN
>          if (innobase_hton != p) {
> @@ -2639,6 +2645,7 @@ innobase_end(
>                  mysql_mutex_destroy(&commit_threads_m);
>                  mysql_mutex_destroy(&commit_cond_m);
>                  mysql_cond_destroy(&commit_cond);
> +                mysql_mutex_destroy(&pending_checkpoint_mutex);
>          }
>  
>          DBUG_RETURN(err);
> @@ -3008,6 +3015,16 @@ innobase_rollback_trx(
>          DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
>  }
>  
> +
> +struct pending_checkpoint {
> +        struct pending_checkpoint *next;
> +        handlerton *hton;
> +        void *cookie;
> +        ib_uint64_t lsn;
> +};
> +static struct pending_checkpoint *pending_checkpoint_list;
> +static struct pending_checkpoint *pending_checkpoint_list_end;
> +
>  /*****************************************************************//**
>  Handle a commit checkpoint request from server layer.
>  We simply flush the redo log immediately and do the notify call.*/
> @@ -3017,8 +3034,113 @@ innobase_checkpoint_request(
>          handlerton *hton,
>          void *cookie)
>  {
> -        log_buffer_flush_to_disk();
> -        commit_checkpoint_notify_ha(hton, cookie);
> +        ib_uint64_t                     lsn;
> +        ib_uint64_t                     flush_lsn;
> +        struct pending_checkpoint *     entry;
> +
> +        /* Do the allocation outside of lock to reduce contention. The normal
> +        case is that not everything is flushed, so we will need to enqueue. */
> +        entry = static_cast<struct pending_checkpoint *>
> +                (my_malloc(sizeof(*entry), MYF(MY_WME)));
> +        if (!entry) {
> +                sql_print_error("Failed to allocate %u bytes."
> +                                " Commit checkpoint will be skipped.",
> +                                static_cast<unsigned>(sizeof(*entry)));
> +                return;
> +        }
> +
> +        entry->next = NULL;
> +        entry->hton = hton;
> +        entry->cookie = cookie;
> +
> +        mysql_mutex_lock(&pending_checkpoint_mutex);
> +        lsn = log_get_lsn();
> +        flush_lsn = log_get_flush_lsn();
> +        if (lsn > flush_lsn) {
> +                /* Put the request in queue.
> +                When the log gets flushed past the lsn, we will remove the
> +                entry from the queue and notify the upper layer. */
> +                entry->lsn = lsn;
> +                if (pending_checkpoint_list_end) {
> +                        pending_checkpoint_list_end->next = entry;
> +                } else {
> +                        pending_checkpoint_list = entry;
> +                }
> +                pending_checkpoint_list_end = entry;
> +                entry = NULL;
> +        }
> +        mysql_mutex_unlock(&pending_checkpoint_mutex);
> +
> +        if (entry) {
> +                /* We are already flushed. Notify the checkpoint immediately. */
> +                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
> +                my_free(entry);
> +        }
> +}
> +
> +/*****************************************************************//**
> +Log code calls this whenever log has been written and/or flushed up
> +to a new position. We use this to notify upper layer of a new commit
> +checkpoint when necessary.*/
> +extern "C" UNIV_INTERN
> +void
> +innobase_mysql_log_notify(
> +/*===============*/
> +        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
> +        ib_uint64_t     flush_lsn)      /*!< in: LSN flushed to disk */
> +{
> +        struct pending_checkpoint *     pending;
> +        struct pending_checkpoint *     entry;
> +        struct pending_checkpoint *     last_ready;
> +
> +        /* It is safe to do a quick check for NULL first without lock.
> +        Even if we should race, we will at most skip one checkpoint and
> +        take the next one, which is harmless. */
> +        if (!pending_checkpoint_list)
> +                return;
> +
> +        mysql_mutex_lock(&pending_checkpoint_mutex);
> +        pending = pending_checkpoint_list;
> +        if (!pending)
> +        {
> +                mysql_mutex_unlock(&pending_checkpoint_mutex);
> +                return;
> +        }
> +
> +        last_ready = NULL;
> +        for (entry = pending; entry != NULL; entry = entry -> next)
> +        {
> +                if (entry->lsn > flush_lsn)
> +                        break;
> +                last_ready = entry;
> +        }
> +
> +        if (last_ready)
> +        {
> +                /* We found some pending checkpoints that are now flushed to
> +                disk. So remove them from the list. */
> +                pending_checkpoint_list = entry;
> +                if (!entry)
> +                        pending_checkpoint_list_end = NULL;
> +        }
> +
> +        mysql_mutex_unlock(&pending_checkpoint_mutex);
> +
> +        if (!last_ready)
> +                return;
> +
> +        /* Now that we have released the lock, notify upper layer about all
> +        commit checkpoints that have now completed. */
> +        for (;;) {
> +                entry = pending;
> +                pending = pending->next;
> +
> +                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
> +
> +                my_free(entry);
> +                if (entry == last_ready)
> +                        break;
> +        }
>  }
>  
>  /*****************************************************************//**
>
> === modified file 'storage/innobase/include/ha_prototypes.h'
> --- a/storage/innobase/include/ha_prototypes.h	2011-04-26 17:55:52 +0000
> +++ b/storage/innobase/include/ha_prototypes.h	2012-09-14 12:44:53 +0000
> @@ -136,6 +136,17 @@ innobase_mysql_print_thd(
>          uint    max_query_len); /*!< in: max query length to print, or 0 to
>                                     use the default max length */
>  
> +/*****************************************************************//**
> +Log code calls this whenever log has been written and/or flushed up
> +to a new position. We use this to notify upper layer of a new commit
> +checkpoint when necessary.*/
> +UNIV_INTERN
> +void
> +innobase_mysql_log_notify(
> +/*===============*/
> +        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
> +        ib_uint64_t     flush_lsn);     /*!< in: LSN flushed to disk */
> +
>  /**************************************************************//**
>  Converts a MySQL type to an InnoDB type. Note that this function returns
>  the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
>
> === modified file 'storage/innobase/include/log0log.h'
> --- a/storage/innobase/include/log0log.h	2012-06-07 13:44:26 +0000
> +++ b/storage/innobase/include/log0log.h	2012-09-14 12:44:53 +0000
> @@ -151,6 +151,13 @@ UNIV_INLINE
>  ib_uint64_t
>  log_get_lsn(void);
>  /*=============*/
> +/************************************************************//**
> +Gets the last lsn that is fully flushed to disk.
> +@return last flushed lsn */
> +UNIV_INLINE
> +ib_uint64_t
> +log_get_flush_lsn(void);
> +/*=============*/
>  /****************************************************************
>  Gets the log group capacity. It is OK to read the value without
>  holding log_sys->mutex because it is constant.
>
> === modified file 'storage/innobase/include/log0log.ic'
> --- a/storage/innobase/include/log0log.ic	2011-04-05 07:18:43 +0000
> +++ b/storage/innobase/include/log0log.ic	2012-09-14 12:44:53 +0000
> @@ -411,6 +411,25 @@ log_get_lsn(void)
>          return(lsn);
>  }
>  
> +/************************************************************//**
> +Gets the last lsn that is fully flushed to disk.
> +@return last flushed lsn */
> +UNIV_INLINE
> +ib_uint64_t
> +log_get_flush_lsn(void)
> +/*=============*/
> +{
> +        ib_uint64_t     lsn;
> +
> +        mutex_enter(&(log_sys->mutex));
> +
> +        lsn = log_sys->flushed_to_disk_lsn;
> +
> +        mutex_exit(&(log_sys->mutex));
> +
> +        return(lsn);
> +}
> +
>  /****************************************************************
>  Gets the log group capacity. It is OK to read the value without
>  holding log_sys->mutex because it is constant.
>
> === modified file 'storage/innobase/log/log0log.c'
> --- a/storage/innobase/log/log0log.c	2012-03-21 03:48:12 +0000
> +++ b/storage/innobase/log/log0log.c	2012-09-14 12:44:53 +0000
> @@ -1353,6 +1353,8 @@ log_write_up_to(
>          ulint           loop_count      = 0;
>  #endif /* UNIV_DEBUG */
>          ulint           unlock;
> +        ib_uint64_t     write_lsn;
> +        ib_uint64_t     flush_lsn;
>  
>          if (recv_no_ibuf_operations) {
>                  /* Recovery is running and no operations on the log files are
> @@ -1530,8 +1532,13 @@ log_write_up_to(
>  
>          log_flush_do_unlocks(unlock);
>  
> +        write_lsn = log_sys->write_lsn;
> +        flush_lsn = log_sys->flushed_to_disk_lsn;
> +
>          mutex_exit(&(log_sys->mutex));
>  
> +        innobase_mysql_log_notify(write_lsn, flush_lsn);
> +
>          return;
>  
>  do_waits:
>
> === modified file 'storage/xtradb/handler/ha_innodb.cc'
> --- a/storage/xtradb/handler/ha_innodb.cc	2012-09-13 12:31:29 +0000
> +++ b/storage/xtradb/handler/ha_innodb.cc	2012-09-14 12:44:53 +0000
> @@ -120,6 +120,7 @@ static ulong commit_threads = 0;
>  static mysql_mutex_t commit_threads_m;
>  static mysql_cond_t commit_cond;
>  static mysql_mutex_t commit_cond_m;
> +static mysql_mutex_t pending_checkpoint_mutex;
>  static bool innodb_inited = 0;
>  
>  
> @@ -253,11 +254,13 @@ static mysql_pfs_key_t	innobase_share_mu
>  static mysql_pfs_key_t  commit_threads_m_key;
>  static mysql_pfs_key_t  commit_cond_mutex_key;
>  static mysql_pfs_key_t  commit_cond_key;
> +static mysql_pfs_key_t  pending_checkpoint_mutex_key;
>  
>  static PSI_mutex_info   all_pthread_mutexes[] = {
>          {&commit_threads_m_key, "commit_threads_m", 0},
>          {&commit_cond_mutex_key, "commit_cond_mutex", 0},
> -        {&innobase_share_mutex_key, "innobase_share_mutex", 0}
> +        {&innobase_share_mutex_key, "innobase_share_mutex", 0},
> +        {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
>  };
>  
>  static PSI_cond_info    all_innodb_conds[] = {
> @@ -3060,6 +3063,9 @@ innobase_init(
>          mysql_mutex_init(commit_cond_mutex_key,
>                           &commit_cond_m, MY_MUTEX_INIT_FAST);
>          mysql_cond_init(commit_cond_key, &commit_cond, NULL);
> +        mysql_mutex_init(pending_checkpoint_mutex_key,
> +                         &pending_checkpoint_mutex,
> +                         MY_MUTEX_INIT_FAST);
>          innodb_inited= 1;
>  #ifdef MYSQL_DYNAMIC_PLUGIN
>          if (innobase_hton != p) {
> @@ -3107,6 +3113,7 @@ innobase_end(
>                  mysql_mutex_destroy(&commit_threads_m);
>                  mysql_mutex_destroy(&commit_cond_m);
>                  mysql_cond_destroy(&commit_cond);
> +                mysql_mutex_destroy(&pending_checkpoint_mutex);
>          }
>  
>          DBUG_RETURN(err);
> @@ -3500,6 +3507,16 @@ innobase_rollback_trx(
>          DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
>  }
>  
> +
> +struct pending_checkpoint {
> +        struct pending_checkpoint *next;
> +        handlerton *hton;
> +        void *cookie;
> +        ib_uint64_t lsn;
> +};
> +static struct pending_checkpoint *pending_checkpoint_list;
> +static struct pending_checkpoint *pending_checkpoint_list_end;
> +
>  /*****************************************************************//**
>  Handle a commit checkpoint request from server layer.
>  We simply flush the redo log immediately and do the notify call.*/
> @@ -3509,8 +3526,113 @@ innobase_checkpoint_request(
>          handlerton *hton,
>          void *cookie)
>  {
> -        log_buffer_flush_to_disk();
> -        commit_checkpoint_notify_ha(hton, cookie);
> +        ib_uint64_t                     lsn;
> +        ib_uint64_t                     flush_lsn;
> +        struct pending_checkpoint *     entry;
> +
> +        /* Do the allocation outside of lock to reduce contention. The normal
> +        case is that not everything is flushed, so we will need to enqueue. */
> +        entry = static_cast<struct pending_checkpoint *>
> +                (my_malloc(sizeof(*entry), MYF(MY_WME)));
> +        if (!entry) {
> +                sql_print_error("Failed to allocate %u bytes."
> +                                " Commit checkpoint will be skipped.",
> +                                static_cast<unsigned>(sizeof(*entry)));
> +                return;
> +        }
> +
> +        entry->next = NULL;
> +        entry->hton = hton;
> +        entry->cookie = cookie;
> +
> +        mysql_mutex_lock(&pending_checkpoint_mutex);
> +        lsn = log_get_lsn();
> +        flush_lsn = log_get_flush_lsn();
> +        if (lsn > flush_lsn) {
> +                /* Put the request in queue.
> +                When the log gets flushed past the lsn, we will remove the
> +                entry from the queue and notify the upper layer. */
> +                entry->lsn = lsn;
> +                if (pending_checkpoint_list_end) {
> +                        pending_checkpoint_list_end->next = entry;
> +                } else {
> +                        pending_checkpoint_list = entry;
> +                }
> +                pending_checkpoint_list_end = entry;
> +                entry = NULL;
> +        }
> +        mysql_mutex_unlock(&pending_checkpoint_mutex);
> +
> +        if (entry) {
> +                /* We are already flushed. Notify the checkpoint immediately. */
> +                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
> +                my_free(entry);
> +        }
> +}
> +
> +/*****************************************************************//**
> +Log code calls this whenever log has been written and/or flushed up
> +to a new position. We use this to notify upper layer of a new commit
> +checkpoint when necessary.*/
> +extern "C" UNIV_INTERN
> +void
> +innobase_mysql_log_notify(
> +/*===============*/
> +        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
> +        ib_uint64_t     flush_lsn)      /*!< in: LSN flushed to disk */
> +{
> +        struct pending_checkpoint *     pending;
> +        struct pending_checkpoint *     entry;
> +        struct pending_checkpoint *     last_ready;
> +
> +        /* It is safe to do a quick check for NULL first without lock.
> +        Even if we should race, we will at most skip one checkpoint and
> +        take the next one, which is harmless. */
> +        if (!pending_checkpoint_list)
> +                return;
> +
> +        mysql_mutex_lock(&pending_checkpoint_mutex);
> +        pending = pending_checkpoint_list;
> +        if (!pending)
> +        {
> +                mysql_mutex_unlock(&pending_checkpoint_mutex);
> +                return;
> +        }
> +
> +        last_ready = NULL;
> +        for (entry = pending; entry != NULL; entry = entry -> next)
> +        {
> +                if (entry->lsn > flush_lsn)
> +                        break;
> +                last_ready = entry;
> +        }
> +
> +        if (last_ready)
> +        {
> +                /* We found some pending checkpoints that are now flushed to
> +                disk. So remove them from the list. */
> +                pending_checkpoint_list = entry;
> +                if (!entry)
> +                        pending_checkpoint_list_end = NULL;
> +        }
> +
> +        mysql_mutex_unlock(&pending_checkpoint_mutex);
> +
> +        if (!last_ready)
> +                return;
> +
> +        /* Now that we have released the lock, notify upper layer about all
> +        commit checkpoints that have now completed. */
> +        for (;;) {
> +                entry = pending;
> +                pending = pending->next;
> +
> +                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
> +
> +                my_free(entry);
> +                if (entry == last_ready)
> +                        break;
> +        }
>  }
>  
>  /*****************************************************************//**
>
> === modified file 'storage/xtradb/include/ha_prototypes.h'
> --- a/storage/xtradb/include/ha_prototypes.h	2012-02-21 19:51:56 +0000
> +++ b/storage/xtradb/include/ha_prototypes.h	2012-09-14 12:44:53 +0000
> @@ -136,6 +136,17 @@ innobase_mysql_print_thd(
>          uint    max_query_len); /*!< in: max query length to print, or 0 to
>                                     use the default max length */
>  
> +/*****************************************************************//**
> +Log code calls this whenever log has been written and/or flushed up
> +to a new position. We use this to notify upper layer of a new commit
> +checkpoint when necessary.*/
> +UNIV_INTERN
> +void
> +innobase_mysql_log_notify(
> +/*===============*/
> +        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
> +        ib_uint64_t     flush_lsn);     /*!< in: LSN flushed to disk */
> +
>  /**************************************************************//**
>  Converts a MySQL type to an InnoDB type. Note that this function returns
>  the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
>
> === modified file 'storage/xtradb/include/log0log.h'
> --- a/storage/xtradb/include/log0log.h	2012-08-27 16:13:17 +0000
> +++ b/storage/xtradb/include/log0log.h	2012-09-14 12:44:53 +0000
> @@ -151,6 +151,13 @@ UNIV_INLINE
>  ib_uint64_t
>  log_get_lsn(void);
>  /*=============*/
> +/************************************************************//**
> +Gets the last lsn that is fully flushed to disk.
> +@return last flushed lsn */
> +UNIV_INLINE
> +ib_uint64_t
> +log_get_flush_lsn(void);
> +/*=============*/
>  /****************************************************************
>  Gets the log group capacity. It is OK to read the value without
>  holding log_sys->mutex because it is constant.
>
> === modified file 'storage/xtradb/include/log0log.ic'
> --- a/storage/xtradb/include/log0log.ic	2011-07-14 19:22:41 +0000
> +++ b/storage/xtradb/include/log0log.ic	2012-09-14 12:44:53 +0000
> @@ -411,6 +411,25 @@ log_get_lsn(void)
>          return(lsn);
>  }
>  
> +/************************************************************//**
> +Gets the last lsn that is fully flushed to disk.
> +@return last flushed lsn */
> +UNIV_INLINE
> +ib_uint64_t
> +log_get_flush_lsn(void)
> +/*=============*/
> +{
> +        ib_uint64_t     lsn;
> +
> +        mutex_enter(&(log_sys->mutex));
> +
> +        lsn = log_sys->flushed_to_disk_lsn;
> +
> +        mutex_exit(&(log_sys->mutex));
> +
> +        return(lsn);
> +}
> +
>  /****************************************************************
>  Gets the log group capacity. It is OK to read the value without
>  holding log_sys->mutex because it is constant.
>
> === modified file 'storage/xtradb/log/log0log.c'
> --- a/storage/xtradb/log/log0log.c	2012-08-27 16:13:17 +0000
> +++ b/storage/xtradb/log/log0log.c	2012-09-14 12:44:53 +0000
> @@ -1390,6 +1390,8 @@ log_write_up_to(
>          ulint           loop_count      = 0;
>  #endif /* UNIV_DEBUG */
>          ulint           unlock;
> +        ib_uint64_t     write_lsn;
> +        ib_uint64_t     flush_lsn;
>  
>          if (recv_no_ibuf_operations) {
>                  /* Recovery is running and no operations on the log files are
> @@ -1568,8 +1570,13 @@ log_write_up_to(
>  
>          log_flush_do_unlocks(unlock);
>  
> +        write_lsn = log_sys->write_lsn;
> +        flush_lsn = log_sys->flushed_to_disk_lsn;
> +
>          mutex_exit(&(log_sys->mutex));
>  
> +        innobase_mysql_log_notify(write_lsn, flush_lsn);
> +
>          return;
>  
>  do_waits:
>
> _______________________________________________
> commits mailing list
> commits@xxxxxxxxxxx
> https://lists.askmonty.org/cgi-bin/mailman/listinfo/commits


Follow ups