← Back to team overview

maria-developers team mailing list archive

bzr commit into mariadb 5.1, with Maria 2.0:maria/2.0 branch (sanja:2722)

 

#At lp:maria/2.0

 2722 sanja@xxxxxxxxxxxx	2009-03-17
      Group commit patch.
      Transaction log syncs statistic.
      Buffer finish optimisation in case of absence of CRC page protection or sector protetction.
      Fixed bug of waiting for flush pass end.
      
      (put together only Monty review, will be pushed separately)
      modified:
        mysql-test/suite/maria/r/maria3.result
        mysys/my_init.c
        storage/maria/ha_maria.cc
        storage/maria/ma_init.c
        storage/maria/ma_loghandler.c
        storage/maria/ma_loghandler.h
        tests/fork_big2.pl

per-file messages:
  mysql-test/suite/maria/r/maria3.result
    Transaction log syncs statistic.
  mysys/my_init.c
    The initialization should be here but it do niot want to link (works without it because initialization do not do something except checks).
  storage/maria/ha_maria.cc
    Transaction log syncs statistic.
    Group commit control variables and routines.
  storage/maria/ma_init.c
    Finishing of soft group commit thread.
  storage/maria/ma_loghandler.c
    Group commit (soft and hard) added.
    Transaction log syncs statistic.
    Buffer finish optimisation in case of absence of CRC page protection or sector protetction.
    Fixed bug of waiting for flush pass end.
  storage/maria/ma_loghandler.h
    Group commit control variables and routines.
    Transaction log syncs statistic.
  tests/fork_big2.pl
    Fixed test written by windows to get it working under linux (line endings)
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result	2008-10-09 20:03:54 +0000
+++ b/mysql-test/suite/maria/r/maria3.result	2009-03-17 20:32:17 +0000
@@ -264,6 +264,8 @@ Variable_name	Value
 maria_block_size	8192
 maria_checkpoint_interval	30
 maria_force_start_after_recovery_failures	0
+maria_group_commit	none
+maria_group_commit_rate	800
 maria_log_file_size	4294959104
 maria_log_purge_type	immediate
 maria_max_sort_file_size	9223372036854775807

=== modified file 'mysys/my_init.c'
--- a/mysys/my_init.c	2008-12-10 09:02:25 +0000
+++ b/mysys/my_init.c	2009-03-17 20:32:17 +0000
@@ -40,6 +40,7 @@ static void netware_init();
 #else
 #define netware_init()
 #endif
+#include <my_atomic.h>
 
 my_bool my_init_done= 0;
 uint	mysys_usage_id= 0;              /* Incremented for each my_init() */
@@ -82,6 +83,10 @@ my_bool my_init(void)
   if (my_progname)
     my_progname_short= my_progname + dirname_length(my_progname);
 
+/*
+  if (my_atomic_initialize())
+    return 1;
+*/
 #if defined(THREAD) && defined(SAFE_MUTEX)
   safe_mutex_global_init();		/* Must be called early */
 #endif

=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc	2009-01-16 16:18:17 +0000
+++ b/storage/maria/ha_maria.cc	2009-03-17 20:32:17 +0000
@@ -101,22 +101,40 @@ TYPELIB maria_translog_purge_type_typeli
   array_elements(maria_translog_purge_type_names) - 1, "",
   maria_translog_purge_type_names, NULL
 };
+
+/* transactional log directory sync */
 const char *maria_sync_log_dir_names[]=
 {
   "NEVER", "NEWFILE", "ALWAYS", NullS
 };
-
 TYPELIB maria_sync_log_dir_typelib=
 {
   array_elements(maria_sync_log_dir_names) - 1, "",
   maria_sync_log_dir_names, NULL
 };
 
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+  "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+  array_elements(maria_group_commit_names) - 1, "",
+  maria_group_commit_names, NULL
+};
+
 /** Interval between background checkpoints in seconds */
 static ulong checkpoint_interval;
 static void update_checkpoint_interval(MYSQL_THD thd,
                                        struct st_mysql_sys_var *var,
                                        void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+                                      struct st_mysql_sys_var *var,
+                                      void *var_ptr, const void *save);
+static void update_maria_group_commit_rate(MYSQL_THD thd,
+                                           struct st_mysql_sys_var *var,
+                                           void *var_ptr, const void *save);
 /** After that many consecutive recovery failures, remove logs */
 static ulong force_start_after_recovery_failures;
 static void update_log_file_size(MYSQL_THD thd,
@@ -163,6 +181,22 @@ static MYSQL_SYSVAR_ULONG(log_file_size,
        NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
        TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
 
+static MYSQL_SYSVAR_ENUM(group_commit, group_commit,
+       PLUGIN_VAR_RQCMDARG,
+       "Specifies maria group commit mode. "
+       "Possible values are \"none\" (no group commit), "
+       "\"hard\" (with waiting to actual commit), "
+       "\"soft\" (no wait for commit (DANGEROUS!!!))",
+       NULL, update_maria_group_commit,
+       TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_rate, group_commit_rate,
+       PLUGIN_VAR_RQCMDARG,
+       "If group commits switched on commit will happens with about every "
+       "100/maria_group_commit_rate second. 0 is special value which switch "
+       "rate off",
+       NULL, update_maria_group_commit_rate, 800, 0, UINT_MAX, 1);
+
 static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
        PLUGIN_VAR_RQCMDARG,
        "Specifies how maria transactional log will be purged. "
@@ -3247,6 +3281,8 @@ static struct st_mysql_sys_var* system_v
   MYSQL_SYSVAR(block_size),
   MYSQL_SYSVAR(checkpoint_interval),
   MYSQL_SYSVAR(force_start_after_recovery_failures),
+  MYSQL_SYSVAR(group_commit),
+  MYSQL_SYSVAR(group_commit_rate),
   MYSQL_SYSVAR(page_checksum),
   MYSQL_SYSVAR(log_dir_path),
   MYSQL_SYSVAR(log_file_size),
@@ -3277,6 +3313,97 @@ static void update_checkpoint_interval(M
 }
 
 /**
+   @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+                                      struct st_mysql_sys_var *var,
+                                      void *var_ptr, const void *save)
+{
+  ulong value= (ulong)*((long *)var_ptr);
+  DBUG_ENTER("update_maria_group_commit");
+  DBUG_PRINT("enter", ("old value: %lu  new value %lu  rate %lu",
+                       value, (ulong)(*(long *)save), group_commit_rate));
+  /* old value */
+  switch (value)
+  {
+  case TRANSLOG_GCOMMIT_NONE:
+    break;
+  case TRANSLOG_GCOMMIT_HARD:
+    translog_hard_group_commit(FALSE);
+    break;
+  case TRANSLOG_GCOMMIT_SOFT:
+    translog_soft_sync(FALSE);
+    if (group_commit_rate)
+      translog_soft_sync_end();
+    break;
+  default:
+    DBUG_ASSERT(0); /* impossible */
+  }
+  value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+  translog_sync();
+  /* new value */
+  switch (value)
+  {
+  case TRANSLOG_GCOMMIT_NONE:
+    break;
+  case TRANSLOG_GCOMMIT_HARD:
+    translog_hard_group_commit(TRUE);
+    break;
+  case TRANSLOG_GCOMMIT_SOFT:
+    translog_soft_sync(TRUE);
+    /* variable change made under global lock so we can just read it */
+    if (group_commit_rate)
+      translog_soft_sync_start();
+    break;
+  default:
+    DBUG_ASSERT(0); /* impossible */
+  }
+  DBUG_VOID_RETURN;
+}
+
+/**
+   @brief Updates group commit rate
+*/
+
+static void update_maria_group_commit_rate(MYSQL_THD thd,
+                                           struct st_mysql_sys_var *var,
+                                           void *var_ptr, const void *save)
+{
+  ulong new_value= (ulong)*((long *)save);
+  DBUG_ENTER("update_maria_group_commit_rate");
+  DBUG_PRINT("enter", ("old value: %lu  new value %lu  group commit %lu",
+                        *(ulong *)var_ptr, new_value, group_commit));
+  if (new_value &&
+      ((TRANSLOG_RATE_BASE * 1000000000ULL / new_value +
+        TRANSLOG_RATE_BASE / 2) /
+       TRANSLOG_RATE_BASE) == 0)
+    new_value= 0; /* protection against too small value */
+  /* variable change made under global lock so we can just read it */
+  switch (group_commit)
+  {
+    case TRANSLOG_GCOMMIT_NONE:
+      *(ulong *)var_ptr= new_value;
+      translog_set_group_commit_rate(new_value);
+      break;
+    case TRANSLOG_GCOMMIT_HARD:
+      *(ulong *)var_ptr= new_value;
+      translog_set_group_commit_rate(new_value);
+      break;
+    case TRANSLOG_GCOMMIT_SOFT:
+      if (*(ulong *)var_ptr)
+        translog_soft_sync_end();
+      translog_set_group_commit_rate(new_value);
+      if ((*(ulong *)var_ptr= new_value))
+        translog_soft_sync_start();
+      break;
+    default:
+      DBUG_ASSERT(0); /* impossible */
+  }
+  DBUG_VOID_RETURN;
+}
+
+/**
    @brief Updates the transaction log file limit.
 */
 
@@ -3298,6 +3425,7 @@ static SHOW_VAR status_variables[]= {
   {"Maria_pagecache_reads",              (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
   {"Maria_pagecache_write_requests",     (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
   {"Maria_pagecache_writes",             (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+  {"Maria_transaction_log_syncs",        (char*) &translog_syncs, SHOW_LONGLONG},
   {NullS, NullS, SHOW_LONG}
 };
 

=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c	2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c	2009-03-17 20:32:17 +0000
@@ -82,6 +82,11 @@ void maria_end(void)
     maria_inited= maria_multi_threaded= FALSE;
     ft_free_stopwords();
     ma_checkpoint_end();
+    if (translog_status == TRANSLOG_OK)
+    {
+      translog_soft_sync_end();
+      translog_sync();
+    }
     if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
     {
       /*

=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c	2009-01-16 09:38:02 +0000
+++ b/storage/maria/ma_loghandler.c	2009-03-17 20:32:17 +0000
@@ -18,6 +18,7 @@
 #include "ma_blockrec.h" /* for some constants and in-write hooks */
 #include "ma_key_recover.h" /* For some in-write hooks */
 #include "ma_checkpoint.h"
+#include "ma_servicethread.h"
 
 /*
   On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
 #include <m_ctype.h>
 #endif
 
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t  COND_soft_sync;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+  {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
 /* transaction log file descriptor */
 typedef struct st_translog_file
 {
@@ -124,10 +134,20 @@ struct st_translog_buffer
   /* Previous buffer offset to detect it flush finish */
   TRANSLOG_ADDRESS prev_buffer_offset;
   /*
+    If the buffer was forced to close it save value of its horizon
+    otherwise LSN_IMPOSSIBLE
+  */
+  TRANSLOG_ADDRESS pre_force_close_horizon;
+  /*
      How much is written (or will be written when copy_to_buffer_in_progress
      become 0) to this buffer
   */
   translog_size_t size;
+  /*
+    How much data was skipped during moving page from previous buffer
+    to this one (it is optimisation of forcing buffer to finish
+  */
+  uint skipped_data;
   /* File handler for this buffer */
   TRANSLOG_FILE *file;
   /* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +324,7 @@ struct st_translog_descriptor
   */
   pthread_mutex_t log_flush_lock;
   pthread_cond_t log_flush_cond;
+  pthread_cond_t new_goal_cond;
 
   /* Protects changing of headers of finished files (max_lsn) */
   pthread_mutex_t file_header_lock;
@@ -333,6 +354,8 @@ struct st_translog_descriptor
   my_bool is_everything_flushed;
   /* True when flush pass is in progress */
   my_bool flush_in_progress;
+  /* The flush number (used to distinguish two flushes goes one by one) */
+  volatile int flush_no;
   /* Next flush pass variables */
   TRANSLOG_ADDRESS next_pass_max_lsn;
   pthread_t max_lsn_requester;
@@ -343,12 +366,40 @@ static struct st_translog_descriptor log
 ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
 ulong log_file_size= TRANSLOG_FILE_SIZE;
 ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong group_commit_rate= 0;
 
 /* Marker for end of log */
 static uchar end_of_log= 0;
 #define END_OF_LOG &end_of_log
+static my_atomic_rwlock_t soft_sync_rwl;
+static my_atomic_rwlock_t hgroup_commit_rwl;
+/**
+  Switch for "soft" sync (no real sync() but periodical sync by service
+  thread)
+*/
+static volatile uint32 soft_sync= FALSE;
+/**
+  Switch for "hard" group commit mode
+*/
+static uint32 hgroup_commit= FALSE;
+/**
+  File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static my_atomic_rwlock_t group_commit_wait_rwl;
+/**
+  stores interval in nanoseconds/TRANSLOG_RATE_BASE (to
+  fit into uint32)
+*/
+static uint32 group_commit_wait= 0;
 
 enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
 
 /* chunk types */
 #define TRANSLOG_CHUNK_LSN   0x00      /* 0 chunk refer as LSN (head or tail */
@@ -978,12 +1029,17 @@ static TRANSLOG_FILE *get_logfile_by_num
 static TRANSLOG_FILE *get_current_logfile()
 {
   TRANSLOG_FILE *file;
+  DBUG_ENTER("get_current_logfile");
   rw_rdlock(&log_descriptor.open_files_lock);
+  DBUG_PRINT("info", ("max_file: %lu  min_file: %lu  open_files: %lu",
+                      (ulong) log_descriptor.max_file,
+                      (ulong) log_descriptor.min_file,
+                      (ulong) log_descriptor.open_files.elements));
   DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
               log_descriptor.open_files.elements);
   file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
   rw_unlock(&log_descriptor.open_files_lock);
-  return (file);
+  DBUG_RETURN(file);
 }
 
 uchar	NEAR maria_trans_file_magic[]=
@@ -1067,6 +1123,7 @@ static my_bool translog_write_file_heade
 static my_bool translog_max_lsn_to_header(File file, LSN lsn)
 {
   uchar lsn_buff[LSN_STORE_SIZE];
+  my_bool rc;
   DBUG_ENTER("translog_max_lsn_to_header");
   DBUG_PRINT("enter", ("File descriptor: %ld  "
                        "lsn: (%lu,0x%lx)",
@@ -1075,11 +1132,13 @@ static my_bool translog_max_lsn_to_heade
 
   lsn_store(lsn_buff, lsn);
 
-  DBUG_RETURN(my_pwrite(file, lsn_buff,
-                        LSN_STORE_SIZE,
-                        (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
-                        log_write_flags) != 0 ||
-              my_sync(file, MYF(MY_WME)) != 0);
+  if (!(rc= (my_pwrite(file, lsn_buff,
+                       LSN_STORE_SIZE,
+                       (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+                       log_write_flags) != 0 ||
+             my_sync(file, MYF(MY_WME)) != 0)))
+    translog_syncs++;
+   DBUG_RETURN(rc);
 }
 
 
@@ -1421,7 +1480,9 @@ LSN translog_get_file_max_lsn_stored(uin
 static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
 {
   DBUG_ENTER("translog_buffer_init");
-  buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+  buffer->pre_force_close_horizon=
+    buffer->prev_last_lsn= buffer->last_lsn=
+    LSN_IMPOSSIBLE;
   DBUG_PRINT("info", ("last_lsn  and prev_last_lsn set to 0  buffer: 0x%lx",
                       (ulong) buffer));
 
@@ -1433,6 +1494,7 @@ static my_bool translog_buffer_init(stru
   memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
   /* Buffer size */
   buffer->size= 0;
+  buffer->skipped_data= 0;
   /* cond of thread which is waiting for buffer filling */
   if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
     DBUG_RETURN(1);
@@ -1487,7 +1549,10 @@ static my_bool translog_close_log_file(T
     TODO: sync only we have changed the log
   */
   if (!file->is_sync)
+  {
     rc= my_sync(file->handler.file, MYF(MY_WME));
+    translog_syncs++;
+  }
   rc|= my_close(file->handler.file, MYF(MY_WME));
   my_free(file, MYF(0));
   return test(rc);
@@ -2042,7 +2107,8 @@ static void translog_start_buffer(struct
               (ulong) LSN_OFFSET(log_descriptor.horizon),
               (ulong) LSN_OFFSET(log_descriptor.horizon)));
   DBUG_ASSERT(buffer_no == buffer->buffer_no);
-  buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+  buffer->pre_force_close_horizon=
+    buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
   DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0  buffer: 0x%lx",
                       (ulong) buffer));
   buffer->offset= log_descriptor.horizon;
@@ -2050,6 +2116,7 @@ static void translog_start_buffer(struct
   buffer->file= get_current_logfile();
   buffer->overlay= 0;
   buffer->size= 0;
+  buffer->skipped_data= 0;
   translog_cursor_init(cursor, buffer, buffer_no);
   DBUG_PRINT("info", ("file: #%ld (%d)  init cursor #%u: 0x%lx  "
                       "chaser: %d  Size: %lu (%lu)",
@@ -2521,6 +2588,7 @@ static my_bool translog_buffer_flush(str
   TRANSLOG_ADDRESS offset= buffer->offset;
   TRANSLOG_FILE *file= buffer->file;
   uint8 ver= buffer->ver;
+  uint skipped_data;
   DBUG_ENTER("translog_buffer_flush");
   DBUG_PRINT("enter",
              ("Buffer: #%u 0x%lx file: %d  offset: (%lu,0x%lx)  size: %lu",
@@ -2555,6 +2623,8 @@ static my_bool translog_buffer_flush(str
     disk
   */
   file= buffer->file;
+  skipped_data= buffer->skipped_data;
+  DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
   for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
        i < buffer->size;
        i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2571,13 +2641,16 @@ static my_bool translog_buffer_flush(str
     DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
     if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
       DBUG_RETURN(1);
-    if (pagecache_inject(log_descriptor.pagecache,
+    if (pagecache_write_part(log_descriptor.pagecache,
                         &file->handler, pg, 3,
                         buffer->buffer + i,
                         PAGECACHE_PLAIN_PAGE,
                         PAGECACHE_LOCK_LEFT_UNLOCKED,
-                        PAGECACHE_PIN_LEFT_UNPINNED, 0,
-                        LSN_IMPOSSIBLE))
+                        PAGECACHE_PIN_LEFT_UNPINNED,
+                        PAGECACHE_WRITE_DONE, 0,
+                        LSN_IMPOSSIBLE,
+                        skipped_data,
+                        TRANSLOG_PAGE_SIZE - skipped_data))
     {
       DBUG_PRINT("error",
                  ("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2587,10 +2660,12 @@ static my_bool translog_buffer_flush(str
       translog_stop_writing();
       DBUG_RETURN(1);
     }
+    skipped_data= 0;
   }
   file->is_sync= 0;
-  if (my_pwrite(file->handler.file, buffer->buffer,
-                buffer->size, LSN_OFFSET(buffer->offset),
+  if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+                buffer->size - buffer->skipped_data,
+                LSN_OFFSET(buffer->offset) + buffer->skipped_data,
                 log_write_flags))
   {
     DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -3319,11 +3394,17 @@ static my_bool translog_truncate_log(TRA
       ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
         (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
                                 log_write_flags)) ||
-        my_sync(fd, MYF(MY_WME))) |
+        my_sync(fd, MYF(MY_WME))) ||
        my_close(fd, MYF(MY_WME))) ||
       (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
        sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
     DBUG_RETURN(1);
+  /*
+    if above 'if' failed all translog initialization failed so it does not
+    matter if sync will be done but translog_syncs will not be increased in
+    this case
+  */
+  translog_syncs+= 2; /*sunc() log and sync() dir */
 
   /* fix the horizon */
   log_descriptor.horizon= addr;
@@ -3485,6 +3566,8 @@ my_bool translog_init_with_table(const c
   id_to_share= NULL;
   log_descriptor.directory_fd= -1;
   log_descriptor.is_everything_flushed= 1;
+  log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no= 0;
   log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
 
   (*init_table_func)();
@@ -3495,6 +3578,8 @@ my_bool translog_init_with_table(const c
     log_descriptor.open_flags= O_BINARY | O_RDONLY;
   else
     log_descriptor.open_flags= O_BINARY | O_RDWR;
+  my_atomic_rwlock_init(&soft_sync_rwl);
+  my_atomic_rwlock_init(&group_commit_wait_rwl);
   if (pthread_mutex_init(&log_descriptor.sent_to_disk_lock,
                          MY_MUTEX_INIT_FAST) ||
       pthread_mutex_init(&log_descriptor.file_header_lock,
@@ -3508,6 +3593,7 @@ my_bool translog_init_with_table(const c
       pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
                          MY_MUTEX_INIT_FAST) ||
       pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+      pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
       my_rwlock_init(&log_descriptor.open_files_lock,
                      NULL) ||
       my_init_dynamic_array(&log_descriptor.open_files,
@@ -3909,7 +3995,6 @@ my_bool translog_init_with_table(const c
     log_descriptor.flushed= log_descriptor.horizon;
   log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
   log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
-  log_descriptor.previous_flush_horizon= log_descriptor.horizon;
   /*
     Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
     address of the next LSN and we want indicate that all LSNs that are
@@ -3992,6 +4077,10 @@ my_bool translog_init_with_table(const c
             It is beginning of the log => there is no LSNs in the log =>
             There is no harm in leaving it "as-is".
           */
+          log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+          DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+                              LSN_IN_PARTS(log_descriptor.
+                                           previous_flush_horizon)));
           DBUG_RETURN(0);
         }
         file_no--;
@@ -4067,6 +4156,9 @@ my_bool translog_init_with_table(const c
       translog_free_record_header(&rec);
     }
   }
+  log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+  DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
   DBUG_RETURN(0);
 err:
   ma_message_no_user(0, "log initialization failed");
@@ -4154,6 +4246,9 @@ void translog_destroy()
   pthread_mutex_destroy(&log_descriptor.log_flush_lock);
   pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
   pthread_cond_destroy(&log_descriptor.log_flush_cond);
+  pthread_cond_destroy(&log_descriptor.new_goal_cond);
+  my_atomic_rwlock_destroy(&soft_sync_rwl);
+  my_atomic_rwlock_destroy(&group_commit_wait_rwl);
   rwlock_destroy(&log_descriptor.open_files_lock);
   delete_dynamic(&log_descriptor.open_files);
   delete_dynamic(&log_descriptor.unfinished_files);
@@ -7383,18 +7478,20 @@ static void translog_force_current_buffe
     */
     DBUG_PRINT("info", ("left: %u", (uint) left));
 
+    old_buffer->pre_force_close_horizon=
+      old_buffer->offset + old_buffer->size;
     /* decrease offset */
     new_buff_beginning-= log_descriptor.bc.current_page_fill;
     current_page_fill= log_descriptor.bc.current_page_fill;
 
     memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
-    log_descriptor.bc.buffer->size+= left;
+    old_buffer->size+= left;
     DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx  "
                         "Size: %lu",
-                        (uint) log_descriptor.bc.buffer->buffer_no,
-                        (ulong) log_descriptor.bc.buffer,
-                        (ulong) log_descriptor.bc.buffer->size));
-    DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+                        (uint) old_buffer->buffer_no,
+                        (ulong) old_buffer,
+                        (ulong) old_buffer->size));
+    DBUG_ASSERT(old_buffer->buffer_no ==
                 log_descriptor.bc.buffer_no);
   }
   else
@@ -7425,6 +7522,10 @@ static void translog_force_current_buffe
   log_descriptor.bc.buffer->offset= new_buff_beginning;
   log_descriptor.bc.write_counter= write_counter;
   log_descriptor.bc.previous_offset= previous_offset;
+  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
+  DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
+                      (ulong) new_buffer));
 
   /*
     Advances this log pointer, increases writers and let other threads to
@@ -7501,11 +7602,14 @@ static void translog_force_current_buffe
 
   if (left)
   {
-    /*
-      TODO: do not copy beginning of the page if we have no CRC or sector
-      checks on
-    */
-    memcpy(new_buffer->buffer, data, current_page_fill);
+    if (log_descriptor.flags &
+        (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+      memcpy(new_buffer->buffer, data, current_page_fill);
+    else
+    {
+      new_buffer->skipped_data= current_page_fill;
+      DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+    }
   }
   old_buffer->next_buffer_offset= new_buffer->offset;
   translog_buffer_lock(new_buffer);
@@ -7545,6 +7649,7 @@ void  translog_flush_wait_for_end(LSN ls
 
 void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn)
 {
+  int flush_no= log_descriptor.flush_no;
   DBUG_ENTER("translog_flush_set_new_goal_and_wait");
   DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
   safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
@@ -7552,8 +7657,9 @@ void translog_flush_set_new_goal_and_wai
   {
     log_descriptor.next_pass_max_lsn= lsn;
     log_descriptor.max_lsn_requester= pthread_self();
+    pthread_cond_broadcast(&log_descriptor.new_goal_cond);
   }
-  while (log_descriptor.flush_in_progress)
+  while (flush_no == log_descriptor.flush_no)
   {
     pthread_cond_wait(&log_descriptor.log_flush_cond,
                       &log_descriptor.log_flush_lock);
@@ -7563,67 +7669,71 @@ void translog_flush_set_new_goal_and_wai
 
 
 /**
-  @brief Flush the log up to given LSN (included)
-
-  @param  lsn            log record serial number up to which (inclusive)
-                         the log has to be flushed
-
-  @return Operation status
-    @retval 0      OK
-    @retval 1      Error
+  @brief sync() range of files (inclusive) and directory (by request)
 
+  @param min             from file
+  @param max             to file
+  @param sync_dir        need sync directory
 */
 
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+static my_bool translog_sync_files(uint32 min, uint32 max,
+                                   my_bool sync_dir)
 {
-  LSN sent_to_disk= LSN_IMPOSSIBLE;
-  TRANSLOG_ADDRESS flush_horizon;
-  uint fn, i;
-  dirty_buffer_mask_t dirty_buffer_mask;
-  uint8 last_buffer_no, start_buffer_no;
+  uint fn;
   my_bool rc= 0;
-  DBUG_ENTER("translog_flush");
-  DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
-  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
-              translog_status == TRANSLOG_READONLY);
-  LINT_INIT(sent_to_disk);
+  DBUG_ENTER("translog_sync_files");
+  DBUG_PRINT("info", ("min: %lu  max: %lu  sync dir: %d",
+                      (ulong) min, (ulong) max, (int) sync_dir));
+  DBUG_ASSERT(min <= max);
 
-  pthread_mutex_lock(&log_descriptor.log_flush_lock);
-  DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
-                      LSN_IN_PARTS(log_descriptor.flushed)));
-  if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
-  {
-    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-    DBUG_RETURN(0);
-  }
-  if (log_descriptor.flush_in_progress)
+  flush_start= my_getsystime();
+  for (fn= min; fn <= max; fn++)
   {
-    translog_flush_set_new_goal_and_wait(lsn);
-    if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+    TRANSLOG_FILE *file= get_logfile_by_number(fn);
+    DBUG_ASSERT(file != NULL);
+    if (!file->is_sync)
     {
-      /* fix lsn if it was horizon */
-      if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
-          lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
-      translog_flush_wait_for_end(lsn);
-      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-      DBUG_RETURN(0);
+      if (my_sync(file->handler.file, MYF(MY_WME)))
+      {
+        rc= 1;
+        translog_stop_writing();
+        DBUG_RETURN(rc);
+      }
+      translog_syncs++;
+      file->is_sync= 1;
     }
-    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
   }
-  log_descriptor.flush_in_progress= 1;
-  flush_horizon= log_descriptor.previous_flush_horizon;
-  DBUG_PRINT("info", ("flush_in_progress is set"));
-  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
 
-  translog_lock();
-  if (log_descriptor.is_everything_flushed)
+  if (sync_dir)
   {
-    DBUG_PRINT("info", ("everything is flushed"));
-    rc= (translog_status == TRANSLOG_READONLY);
-    translog_unlock();
-    goto out;
+    if (!(rc|= sync_dir(log_descriptor.directory_fd,
+                        MYF(MY_WME | MY_IGNORE_BADFD))))
+      translog_syncs++;
   }
 
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  @brief Flushes buffers up to lsn
+
+  @param lsn             LSN to which we should flush
+  @parem sent_to_disk    returns 'sent to disk' position
+  @param flush_horizon   returns horizon of the flush
+
+  @note See comment to translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+                               TRANSLOG_ADDRESS *sent_to_disk,
+                               TRANSLOG_ADDRESS *flush_horizon)
+{
+  dirty_buffer_mask_t dirty_buffer_mask;
+  uint i;
+  uint8 last_buffer_no, start_buffer_no;
+  DBUG_ENTER("translog_flush_buffers");
+
   /*
     We will recheck information when will lock buffers one by
     one so we can use unprotected read here (this is just for
@@ -7647,15 +7757,15 @@ my_bool translog_flush(TRANSLOG_ADDRESS 
   /*
     if LSN up to which we have to flush bigger then maximum LSN of previous
     buffer and at least one LSN was saved in the current buffer (last_lsn !=
-    LSN_IMPOSSIBLE) then we better finish the current buffer.
+    LSN_IMPOSSIBLE) then we have to close the current buffer.
   */
-  if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+  if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
       log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
   {
     struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
-    lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+    *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
     DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
-               LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+                        LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
     last_buffer_no= log_descriptor.bc.buffer_no;
     log_descriptor.is_everything_flushed= 1;
     translog_force_current_buffer_to_finish();
@@ -7667,8 +7777,10 @@ my_bool translog_flush(TRANSLOG_ADDRESS 
                      TRANSLOG_BUFFERS_NO);
     translog_unlock();
   }
-  sent_to_disk= translog_get_sent_to_disk();
-  if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+  /* flush buffers */
+  *sent_to_disk= translog_get_sent_to_disk();
+  if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
   {
 
     DBUG_PRINT("info", ("Start buffer #: %u  last buffer #: %u",
@@ -7688,52 +7800,224 @@ my_bool translog_flush(TRANSLOG_ADDRESS 
                           LSN_IN_PARTS(buffer->last_lsn),
                           (buffer->file ?
                            "dirty" : "closed")));
-      if (buffer->prev_last_lsn <= lsn &&
+      if (buffer->prev_last_lsn <= *lsn &&
           buffer->file != NULL)
       {
-        DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
-        flush_horizon= buffer->offset + buffer->size;
+        DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+        *flush_horizon= (buffer->pre_force_close_horizon ?
+                         buffer->pre_force_close_horizon :
+                         buffer->offset + buffer->size);
+        DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+                            LSN_IN_PARTS(*flush_horizon)));
+        DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
         translog_buffer_flush(buffer);
       }
       translog_buffer_unlock(buffer);
       i= (i + 1) % TRANSLOG_BUFFERS_NO;
     } while (i != last_buffer_no);
-    sent_to_disk= translog_get_sent_to_disk();
+    *sent_to_disk= translog_get_sent_to_disk();
   }
 
-  /* sync files from previous flush till current one */
-  for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
+  DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Flush the log up to given LSN (included)
+
+  @param  lsn            log record serial number up to which (inclusive)
+                         the log has to be flushed
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+
+  @note
+  - Non group commit logic: Commits made in passes. Thread which started
+  flush first is performing actual flush, other threads sets new goal (LSN)
+  of the next pass (if it is maximum) and waits for the pass end or just
+  wait for the pass end.
+  - If hard group commit switched on and rate set to zero the first thread
+  repeat pass if finds at the end of pass new goals (just take them as its
+  new goal). The process can not loop forever because we have limited number
+  of threads.
+  - If hard group commit switched ON and rate is not zero then at the end of
+  the pass first tread waits for new goals but not more then need to have
+  actual rate close to set value. If time exceed it stop pass and let other
+  thread start new pass if time is not over take new goal and repeats the
+  pass.
+  - If soft group commit switched on everything work as without group commit
+  but this procedure do not perform real sync(). If rate is not zero real
+  sync() will be performed by service thread with the rate and if there was
+  new LSN appeared.
+
+  @note Terminology:
+  'sent to disk' means written to disk but not sync()ed,
+  'flushed' mean sent to disk and synced().
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+  struct timespec abstime;
+  ulonglong flush_interval;
+  ulonglong time_spent;
+  LSN sent_to_disk= LSN_IMPOSSIBLE;
+  TRANSLOG_ADDRESS flush_horizon;
+  my_bool rc= 0;
+  my_bool hgroup_commit_at_start;
+
+  DBUG_ENTER("translog_flush");
+  DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  LINT_INIT(sent_to_disk);
+
+  pthread_mutex_lock(&log_descriptor.log_flush_lock);
+  DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.flushed)));
+  if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
   {
-    TRANSLOG_FILE *file= get_logfile_by_number(fn);
-    DBUG_ASSERT(file != NULL);
-    if (!file->is_sync)
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+    DBUG_RETURN(0);
+  }
+  if (log_descriptor.flush_in_progress)
+  {
+    translog_lock();
+    /* fix lsn if it was horizon */
+    if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+      lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+    translog_unlock();
+    translog_flush_set_new_goal_and_wait(lsn);
+    if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
     {
-      if (my_sync(file->handler.file, MYF(MY_WME)))
+      translog_flush_wait_for_end(lsn);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_RETURN(0);
+    }
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+  }
+  log_descriptor.flush_in_progress= 1;
+  flush_horizon= log_descriptor.previous_flush_horizon;
+  DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(flush_horizon)));
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+  my_atomic_rwlock_wrlock(&hgroup_commit_rwl);
+  hgroup_commit_at_start= my_atomic_load32(&hgroup_commit);
+  my_atomic_rwlock_wrunlock(&hgroup_commit_rwl);
+  if (hgroup_commit_at_start)
+  {
+    my_atomic_rwlock_rdlock(&group_commit_wait_rwl);
+    flush_interval= my_atomic_load32(&group_commit_wait) * TRANSLOG_RATE_BASE;
+    my_atomic_rwlock_rdunlock(&group_commit_wait_rwl);
+  }
+
+  translog_lock();
+  if (log_descriptor.is_everything_flushed)
+  {
+    DBUG_PRINT("info", ("everything is flushed"));
+    rc= (translog_status == TRANSLOG_READONLY);
+    translog_unlock();
+    goto out;
+  }
+
+  for (;;)
+  {
+    translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+    if (!hgroup_commit_at_start)
+      break;  /* flush pass is ended */
+
+retest:
+    if (flush_interval != 0 &&
+        (my_getsystime() - flush_start) >= flush_interval)
+      break;  /* flush pass is ended */
+
+    pthread_mutex_lock(&log_descriptor.log_flush_lock);
+    if (log_descriptor.next_pass_max_lsn != LSN_IMPOSSIBLE)
+    {
+      /* take next goal */
+      lsn= log_descriptor.next_pass_max_lsn;
+      log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+      /* prevent other thread from continue */
+      log_descriptor.max_lsn_requester= pthread_self();
+      DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+                          LSN_IN_PARTS(lsn)));
+    }
+    else
+    {
+      if (flush_interval == 0 ||
+          (time_spent= (my_getsystime() - flush_start)) >= flush_interval)
       {
-        rc= 1;
-        translog_stop_writing();
-        sent_to_disk= LSN_IMPOSSIBLE;
-        goto out;
+        pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+        break;
       }
-      file->is_sync= 1;
+      DBUG_PRINT("info", ("flush waits: %llu  interval: %llu  spent: %llu",
+                          flush_interval - time_spent,
+                          flush_interval, time_spent));
+      /* wait time or next goal */
+      set_timespec_nsec(abstime, flush_interval - time_spent);
+      pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+                             &log_descriptor.log_flush_lock,
+                             &abstime);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_PRINT("info", ("retest conditions"));
+      goto retest;
     }
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+    /* next flush pass */
+    DBUG_PRINT("info", ("next flush pass"));
+    translog_lock();
   }
 
-  if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
-      (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
-       LSN_FILE_NO(flush_horizon) ||
-       ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
-        TRANSLOG_PAGE_SIZE) !=
-       ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
-    rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+  /*
+    sync() files from previous flush till current one
+
+    We read soft_sync unprotected because it is actually 1 bit
+    value and we do not care much about "a bit old value" speed
+    of test is much more valuable here.
+  */
+  if (!soft_sync || hgroup_commit_at_start)
+  {
+    if ((rc=
+         translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+                             LSN_FILE_NO(lsn),
+                             sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+                             (LSN_FILE_NO(log_descriptor.
+                                          previous_flush_horizon) !=
+                              LSN_FILE_NO(flush_horizon) ||
+                              ((LSN_OFFSET(log_descriptor.
+                                           previous_flush_horizon) - 1) /
+                               TRANSLOG_PAGE_SIZE) !=
+                              ((LSN_OFFSET(flush_horizon) - 1) /
+                               TRANSLOG_PAGE_SIZE)))))
+    {
+      sent_to_disk= LSN_IMPOSSIBLE;
+      goto out;
+    }
+    /* keep values for soft sync() and forced sync() actual */
+    my_atomic_rwlock_wrlock(&soft_sync_rwl);
+    my_atomic_store32(&soft_sync_min, LSN_FILE_NO(lsn));
+    my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+    my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+  }
+  else
+  {
+    my_atomic_rwlock_wrlock(&soft_sync_rwl);
+    my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+    my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+  }
+
+  DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
   log_descriptor.previous_flush_horizon= flush_horizon;
 out:
   pthread_mutex_lock(&log_descriptor.log_flush_lock);
   if (sent_to_disk != LSN_IMPOSSIBLE)
     log_descriptor.flushed= sent_to_disk;
   log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no++;
   DBUG_PRINT("info", ("flush_in_progress is dropped"));
-  pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
   pthread_cond_broadcast(&log_descriptor.log_flush_cond);
   DBUG_RETURN(rc);
 }
@@ -8103,6 +8387,7 @@ LSN translog_first_theoretical_lsn()
 my_bool translog_purge(TRANSLOG_ADDRESS low)
 {
   uint32 last_need_file= LSN_FILE_NO(low);
+  uint32 min_unsync;
   TRANSLOG_ADDRESS horizon= translog_get_horizon();
   int rc= 0;
   DBUG_ENTER("translog_purge");
@@ -8110,6 +8395,12 @@ my_bool translog_purge(TRANSLOG_ADDRESS 
   DBUG_ASSERT(translog_status == TRANSLOG_OK ||
               translog_status == TRANSLOG_READONLY);
 
+  my_atomic_rwlock_wrlock(&soft_sync_rwl);
+  min_unsync= my_atomic_load32(&soft_sync_min);
+  my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+  if (min_unsync < last_need_file)
+    last_need_file= min_unsync;
+
   pthread_mutex_lock(&log_descriptor.purger_lock);
   if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
   {
@@ -8346,6 +8637,160 @@ my_bool translog_log_debug_info(TRN *trn
 }
 
 
+
+/**
+  Sets soft sync mode
+
+  @param mode            TRUE if we need switch soft sync on else off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+  my_atomic_rwlock_wrlock(&soft_sync_rwl);
+  my_atomic_store32(&soft_sync, (uint32) mode);
+  my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+}
+
+
+/**
+  Sets hard group commit
+
+  @param mode            TRUE if we need switch hard group commit on else off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+  my_atomic_rwlock_wrlock(&hgroup_commit_rwl);
+  my_atomic_store32(&hgroup_commit, (uint32) mode);
+  my_atomic_rwlock_wrunlock(&hgroup_commit_rwl);
+}
+
+
+/**
+  @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+  uint32 max= get_current_logfile()->number;
+  uint32 min;
+  DBUG_ENTER("ma_translog_sync");
+
+  my_atomic_rwlock_rdlock(&soft_sync_rwl);
+  min= my_atomic_load32(&soft_sync_min);
+  my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+  if (!min)
+    min= max;
+
+  translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief set rate for group commit
+
+  @param rate            rate to set.
+
+  @note internally it stores interval in nanoseconds/TRANSLOG_RATE_BASE (to
+  fit into uint32)
+*/
+
+void translog_set_group_commit_rate(uint32 rate)
+{
+  DBUG_ENTER("translog_set_group_commit_rate");
+  ulonglong wait_time;
+  if (rate)
+  {
+    wait_time= ((TRANSLOG_RATE_BASE * 1000000000ULL / rate +
+                 TRANSLOG_RATE_BASE / 2) /
+                TRANSLOG_RATE_BASE);
+    if (wait_time == 0)
+      wait_time= 1; /* protection from getting special value */
+  }
+  else
+    wait_time= 0;
+  my_atomic_rwlock_wrlock(&group_commit_wait_rwl);
+  my_atomic_store32(&group_commit_wait, (uint32)wait_time);
+  my_atomic_rwlock_wrunlock(&group_commit_wait_rwl);
+  DBUG_PRINT("info", ("rate: %lu  wait: %llu",
+                      (ulong)rate, (ulonglong)wait_time));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+  my_thread_init();
+  DBUG_ENTER("ma_soft_sync_background");
+  for(;;)
+  {
+    ulonglong prev_loop= my_getsystime();
+    ulonglong time, sleep;
+    uint32 min, max;
+    my_atomic_rwlock_rdlock(&soft_sync_rwl);
+    min= my_atomic_load32(&soft_sync_min);
+    max= my_atomic_load32(&soft_sync_max);
+    my_atomic_store32(&soft_sync_min, max);
+    my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+    my_atomic_rwlock_rdlock(&group_commit_wait_rwl);
+    sleep= my_atomic_load32(&group_commit_wait) * TRANSLOG_RATE_BASE;
+    my_atomic_rwlock_rdunlock(&group_commit_wait_rwl);
+    translog_sync_files(min, max, FALSE);
+    time= my_getsystime() - prev_loop;
+    if (time > sleep)
+      sleep= 0;
+    else
+      sleep-= time;
+    if (my_service_thread_sleep(&soft_sync_control, sleep))
+      break;
+  }
+  my_service_thread_signal_end(&soft_sync_control);
+  my_thread_end();
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+  pthread_t th;
+  int res= 0;
+  DBUG_ENTER("translog_soft_sync_start");
+  if (ma_service_thread_control_init(&soft_sync_control))
+    res= 1;
+  else if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+      soft_sync_control.status= THREAD_RUNNING;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Stops syncing thread
+*/
+
+void  translog_soft_sync_end(void)
+{
+  DBUG_ENTER("translog_soft_sync_end");
+  if (soft_sync_control.inited)
+  {
+    ma_service_thread_control_end(&soft_sync_control);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
 #ifdef MARIA_DUMP_LOG
 #include <my_getopt.h>
 extern void translog_example_table_init();

=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h	2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h	2009-03-17 20:32:17 +0000
@@ -342,6 +342,14 @@ enum enum_translog_status
   TRANSLOG_SHUTDOWN  /* going to shutdown the loghandler */
 };
 extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void  translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_rate(uint32 rate);
 
 /*
   all the rest added because of recovery; should we make
@@ -441,6 +449,18 @@ extern LOG_DESC log_record_type_descript
 
 typedef enum
 {
+  TRANSLOG_GCOMMIT_NONE,
+  TRANSLOG_GCOMMIT_HARD,
+  TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong group_commit;
+extern ulong group_commit_rate;
+/**
+  group commit interval is TRANSLOG_RATE_BASE/<rate> seconds
+*/
+#define TRANSLOG_RATE_BASE 100
+typedef enum
+{
   TRANSLOG_PURGE_IMMIDIATE,
   TRANSLOG_PURGE_EXTERNAL,
   TRANSLOG_PURGE_ONDEMAND

=== modified file 'tests/fork_big2.pl'
--- a/tests/fork_big2.pl	2006-02-12 21:26:30 +0000
+++ b/tests/fork_big2.pl	2009-03-17 20:32:17 +0000
@@ -16,21 +16,21 @@ package main;
 
 $opt_skip_create=$opt_skip_in=$opt_verbose=$opt_fast_insert=
 $opt_lock_tables=$opt_debug=$opt_skip_drop=$opt_fast=$opt_force=0;
-$opt_thread_factor=1;
-$opt_insert=1;
-$opt_select=6;$opt_join=4;
-$opt_select_count=$opt_join_count=0;
-$opt_update=1;$opt_delete=0;
-$opt_flush=$opt_check=$opt_repair=$opt_alter=0;
-$opt_join_range=100;
+$opt_thread_factor=1;
+$opt_insert=1;
+$opt_select=6;$opt_join=4;
+$opt_select_count=$opt_join_count=0;
+$opt_update=1;$opt_delete=0;
+$opt_flush=$opt_check=$opt_repair=$opt_alter=0;
+$opt_join_range=100;
 $opt_resize_interval=0;
 $opt_time=0;
 $opt_host=$opt_user=$opt_password=""; $opt_db="test";
 $opt_verbose=$opt_debug=$opt_lock_tables=$opt_fast_insert=$opt_fast=$opt_skip_in=$opt_force=undef;  # Ignore warnings from these
 
-GetOptions("host=s","db=s","user=s","password=s","loop-count=i","skip-create","skip-in","skip-drop",
-           "verbose","fast-insert","lock-tables","debug","fast","force","thread-factor=i",
-		   "insert=i", "select=i", "join=i", "select-count=i", "join-count=i", "update=i", "delete=i",
+GetOptions("host=s","db=s","user=s","password=s","loop-count=i","skip-create","skip-in","skip-drop",
+           "verbose","fast-insert","lock-tables","debug","fast","force","thread-factor=i",
+		   "insert=i", "select=i", "join=i", "select-count=i", "join-count=i", "update=i", "delete=i",
 		   "flush=i", "check=i", "repair=i", "alter=i", "resize-interval=i", "max-join_range=i", "time=i") || die "Aborted";
 
 print "Test of multiple connections that test the following things:\n";
@@ -48,20 +48,20 @@ srand 100;			# Make random numbers repea
 
 ####
 ####  Start timeing and start test
-####
-
+####
+
 $opt_insert*=$opt_thread_factor;
-$opt_select*=$opt_thread_factor;
-$opt_join*=$opt_thread_factor;
-$opt_select_count*=$opt_thread_factor;
-$opt_join_count*=$opt_thread_factor;
-$opt_update*=$opt_thread_factor;
-$opt_delete*=$opt_thread_factor;
-
-if ($opt_time == 0 && $opt_insert == 0)
-{
-  $opt_insert=1;
-}
+$opt_select*=$opt_thread_factor;
+$opt_join*=$opt_thread_factor;
+$opt_select_count*=$opt_thread_factor;
+$opt_join_count*=$opt_thread_factor;
+$opt_update*=$opt_thread_factor;
+$opt_delete*=$opt_thread_factor;
+
+if ($opt_time == 0 && $opt_insert == 0)
+{
+  $opt_insert=1;
+}
 
 $start_time=new Benchmark;
 $dbh = DBI->connect("DBI:mysql:$opt_db:$opt_host",
@@ -100,71 +100,71 @@ $|= 1;				# Autoflush
 ####
 #### Start the tests
 ####
-if ($opt_time != 0)
-{
-	test_abort() if (($pid=fork()) == 0); $work{$pid}="abort";
+if ($opt_time != 0)
+{
+	test_abort() if (($pid=fork()) == 0); $work{$pid}="abort";
 }
 for ($i=0 ; $i < $opt_insert ; $i ++)
 {
   test_insert() if (($pid=fork()) == 0); $work{$pid}="insert";
-}
+}
 $threads=$i;
-for ($i=0 ; $i < $opt_select ; $i ++)
-{
-  test_select() if (($pid=fork()) == 0); $work{$pid}="select";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_join ; $i ++)
-{
-  test_join() if (($pid=fork()) == 0); $work{$pid}="join";
-}
-$threads+=$i;
+for ($i=0 ; $i < $opt_select ; $i ++)
+{
+  test_select() if (($pid=fork()) == 0); $work{$pid}="select";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_join ; $i ++)
+{
+  test_join() if (($pid=fork()) == 0); $work{$pid}="join";
+}
+$threads+=$i;
 for ($i=0 ; $i < $opt_select_count ; $i ++)
 {
   test_select_count() if (($pid=fork()) == 0); $work{$pid}="select_count";
 }
-$threads+=$i;
-for ($i=0 ; $i < $opt_join_count ; $i ++)
-{
-  test_join_count() if (($pid=fork()) == 0); $work{$pid}="join_count";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_update ; $i ++)
-{
-  test_update() if (($pid=fork()) == 0); $work{$pid}="update";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_delete ; $i ++)
-{
-  test_delete() if (($pid=fork()) == 0); $work{$pid}="delete";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_flush ; $i ++)
-{
-  test_flush() if (($pid=fork()) == 0); $work{$pid}="flush";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_check ; $i ++)
-{
-  test_check() if (($pid=fork()) == 0); $work{$pid}="check";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_repair ; $i ++)
-{
-  test_repair() if (($pid=fork()) == 0); $work{$pid}="repair";
-}
-$threads+=$i;
-for ($i=0 ; $i < $opt_alter ; $i ++)
-{
-  test_alter() if (($pid=fork()) == 0); $work{$pid}="alter";
-}
-$threads+=$i;
+$threads+=$i;
+for ($i=0 ; $i < $opt_join_count ; $i ++)
+{
+  test_join_count() if (($pid=fork()) == 0); $work{$pid}="join_count";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_update ; $i ++)
+{
+  test_update() if (($pid=fork()) == 0); $work{$pid}="update";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_delete ; $i ++)
+{
+  test_delete() if (($pid=fork()) == 0); $work{$pid}="delete";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_flush ; $i ++)
+{
+  test_flush() if (($pid=fork()) == 0); $work{$pid}="flush";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_check ; $i ++)
+{
+  test_check() if (($pid=fork()) == 0); $work{$pid}="check";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_repair ; $i ++)
+{
+  test_repair() if (($pid=fork()) == 0); $work{$pid}="repair";
+}
+$threads+=$i;
+for ($i=0 ; $i < $opt_alter ; $i ++)
+{
+  test_alter() if (($pid=fork()) == 0); $work{$pid}="alter";
+}
+$threads+=$i;
 if ($opt_resize_interval != 0)
 {
   test_resize() if (($pid=fork()) == 0); $work{$pid}="resize";
   $threads+=1;
 }
-
+
 print "Started $threads threads\n";
 
 $errors=0;
@@ -172,17 +172,17 @@ $running_insert_threads=$opt_insert;
 while (($pid=wait()) != -1)
 {
   $ret=$?/256;
-  print "thread '" . $work{$pid} . "' finished with exit code $ret\n";
-  if ($opt_time == 0)
+  print "thread '" . $work{$pid} . "' finished with exit code $ret\n";
+  if ($opt_time == 0)
   {
     if ($work{$pid} =~ /^insert/)
     {
       if (!--$running_insert_threads)
-      {
-
+      {
+
         # Time to stop other threads
         signal_abort();
-      }
+      }
 	}
   }
   $errors++ if ($ret != 0);
@@ -214,17 +214,17 @@ print "Total time: " .
 
 exit(0);
 
-#
-# Sleep and then abort other threads
-#
-
-sub test_abort
-{
-  sleep($opt_time);
-  signal_abort();
-  exit(0);
-}
-
+#
+# Sleep and then abort other threads
+#
+
+sub test_abort
+{
+  sleep($opt_time);
+  signal_abort();
+  exit(0);
+}
+
 
 #
 # Insert records in the table
@@ -363,58 +363,58 @@ sub test_join
   $dbh->disconnect; $dbh=0;
   print "Test_join: Executed $count joins\n";
   exit(0);
-}
-
-#
-# select records
-# Do continously joins between the first and second for range and count selected rows
-#
-
-sub test_join_count
-{
-  my ($dbh, $i, $j, $count, $loop);
-
-  $dbh = DBI->connect("DBI:mysql:$opt_db:$opt_host",
-		      $opt_user, $opt_password,
-		    { PrintError => 0}) || die $DBI::errstr;
-
-  $count_query=make_count_query($numtables);
-  $count=0;
-  $loop=9999;
-  $sum=0;
-
-  srand();
-
-  $i=0;
-  while (($i++ % 10) || !test_if_abort($dbh))
-  {
-    if ($loop++ >= 10)
-    {
-      $loop=0;
-      $row_counts=simple_query($dbh, $count_query);
-    }
-    for ($j=0 ; $j < $numtables-1 ; $j++)
-    {
-      my ($id1)= int rand $row_counts->[$j];
-      my ($id2)= int rand $row_counts->[$j];
-	  if ($id1 > $id2)
-	  {
-	    my $id0=$id1; $id1=$id2; $id2=$id0;
-		if ($id2-$id1 > $opt_join_range)
-		{
-		  $id2=$id1+$opt_join_range;
-		}
-	  }
-      my ($t1,$t2)= ($testtables[$j]->[0],$testtables[$j+1]->[0]);
-      $row=simple_query($dbh, "select count(*) from $t1, $t2 where $t1.id=$t2.id and $t1.id between $id1 and $id2");
-      $sum+=$row->[0];
-      $count++;
-    }
-  }
-  $dbh->disconnect; $dbh=0;
-  print "Test_join_count: Executed $count joins: total $sum rows\n";
-  exit(0);
-}
+}
+
+#
+# select records
+# Do continously joins between the first and second for range and count selected rows
+#
+
+sub test_join_count
+{
+  my ($dbh, $i, $j, $count, $loop);
+
+  $dbh = DBI->connect("DBI:mysql:$opt_db:$opt_host",
+		      $opt_user, $opt_password,
+		    { PrintError => 0}) || die $DBI::errstr;
+
+  $count_query=make_count_query($numtables);
+  $count=0;
+  $loop=9999;
+  $sum=0;
+
+  srand();
+
+  $i=0;
+  while (($i++ % 10) || !test_if_abort($dbh))
+  {
+    if ($loop++ >= 10)
+    {
+      $loop=0;
+      $row_counts=simple_query($dbh, $count_query);
+    }
+    for ($j=0 ; $j < $numtables-1 ; $j++)
+    {
+      my ($id1)= int rand $row_counts->[$j];
+      my ($id2)= int rand $row_counts->[$j];
+	  if ($id1 > $id2)
+	  {
+	    my $id0=$id1; $id1=$id2; $id2=$id0;
+		if ($id2-$id1 > $opt_join_range)
+		{
+		  $id2=$id1+$opt_join_range;
+		}
+	  }
+      my ($t1,$t2)= ($testtables[$j]->[0],$testtables[$j+1]->[0]);
+      $row=simple_query($dbh, "select count(*) from $t1, $t2 where $t1.id=$t2.id and $t1.id between $id1 and $id2");
+      $sum+=$row->[0];
+      $count++;
+    }
+  }
+  $dbh->disconnect; $dbh=0;
+  print "Test_join_count: Executed $count joins: total $sum rows\n";
+  exit(0);
+}
 
 
 #