LTP GCOV extension - code coverage report
Current view: directory - storage/maria - ma_checkpoint.c
Test: maria-mtr.html
Date: 2009-03-04 Instrumented lines: 334
Code covered: 53.3 % Executed lines: 178

       1                 : /* Copyright (C) 2006,2007 MySQL AB
       2                 : 
       3                 :    This program is free software; you can redistribute it and/or modify
       4                 :    it under the terms of the GNU General Public License as published by
       5                 :    the Free Software Foundation; version 2 of the License.
       6                 : 
       7                 :    This program is distributed in the hope that it will be useful,
       8                 :    but WITHOUT ANY WARRANTY; without even the implied warranty of
       9                 :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      10                 :    GNU General Public License for more details.
      11                 : 
      12                 :    You should have received a copy of the GNU General Public License
      13                 :    along with this program; if not, write to the Free Software
      14                 :    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
      15                 : 
      16                 : /*
      17                 :   WL#3071 Maria checkpoint
      18                 :   First version written by Guilhem Bichot on 2006-04-27.
      19                 : */
      20                 : 
      21                 : /* Here is the implementation of this module */
      22                 : 
      23                 : /** @todo RECOVERY BUG this is unreviewed code */
      24                 : /*
      25                 :   Summary:
      26                 :   checkpoints are done either by a background thread (checkpoint every Nth
      27                 :   second) or by a client.
      28                 :   In ha_maria, it's not made available to clients, and will soon be done by a
      29                 :   background thread (periodically taking checkpoints and flushing dirty
      30                 :   pages).
      31                 : */
      32                 : 
      33                 : #include "maria_def.h"
      34                 : #include "ma_pagecache.h"
      35                 : #include "ma_blockrec.h"
      36                 : #include "ma_checkpoint.h"
      37                 : #include "ma_loghandler_lsn.h"
      38                 : 
      39                 : 
      40                 : /** @brief level of the checkpoint currently running; CHECKPOINT_NONE when idle */
      41                 : static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
      42                 : /** @brief protects checkpoint_in_progress; also held while ma_checkpoint_end() sets checkpoint_thread_die and while the counters below are updated */
      43                 : static pthread_mutex_t LOCK_checkpoint;
      44                 : /** @brief broadcast after each checkpoint attempt; also used for killing the background checkpoint thread */
      45                 : static pthread_cond_t  COND_checkpoint;
      46                 : /** @brief if checkpoint module was inited or not (set by ma_checkpoint_init(), cleared by ma_checkpoint_end()) */
      47                 : static my_bool checkpoint_inited= FALSE;
      48                 : /** @brief 'kill' flag for the background checkpoint thread: 0 = alive, 1 = asked to die, 2 = dead or never started */
      49                 : static int checkpoint_thread_die;
      50                 : /* is ulong like pagecache->blocks_changed; set at the end of each checkpoint attempt (0 if it failed) */
      51                 : static ulong pages_to_flush_before_next_checkpoint;
      52                 : static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
      53                 :   *dfiles_end; /**< list of data files ends here */
      54                 : static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
      55                 :   *kfiles_end; /**< list of index files ends here */
      56                 : /* the two statistics below could serve in SHOW GLOBAL STATUS */
      57                 : static uint checkpoints_total= 0, /**< checkpoints actually run (skipped requests are not counted) */
      58                 :   checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
      59                 : 
      60                 : struct st_filter_param
      61                 : {
      62                 :   LSN up_to_lsn; /**< only pages with rec_lsn <= this LSN (the filters test cmp_translog_addr() <= 0) */
      63                 :   uint max_pages; /**< stop after flushing this number of pages; counted down by filter_flush_file_evenly() */
      64                 : }; /**< information to determine which dirty pages should be flushed; passed as 'arg' to the filters */
      65                 : 
      66                 : static enum pagecache_flush_filter_result
      67                 : filter_flush_file_medium(enum pagecache_page_type type,
      68                 :                          pgcache_page_no_t page,
      69                 :                          LSN rec_lsn, void *arg); /* filter for MEDIUM checkpoints */
      70                 : static enum pagecache_flush_filter_result
      71                 : filter_flush_file_full(enum pagecache_page_type type,
      72                 :                        pgcache_page_no_t page,
      73                 :                        LSN rec_lsn, void *arg); /* filter for FULL checkpoints */
      74                 : static enum pagecache_flush_filter_result
      75                 : filter_flush_file_evenly(enum pagecache_page_type type,
      76                 :                          pgcache_page_no_t pageno,
      77                 :                          LSN rec_lsn, void *arg); /* filter for the background flushing thread */
      78                 : static int really_execute_checkpoint(void); /* does the work for ma_checkpoint_execute() */
      79                 : pthread_handler_t ma_checkpoint_background(void *arg); /* start routine given to pthread_create() in ma_checkpoint_init() */
      80                 : static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon); /* STEP 3 of really_execute_checkpoint() */
      81                 : 
      82                 : /**
      83                 :    @brief Does a checkpoint, serializing with any checkpoint already running
      84                 : 
      85                 :    @param  level               what level of checkpoint to do
      86                 :    @param  no_wait             if another checkpoint of same or stronger level
      87                 :                                is already running, consider our job done
      88                 : 
      89                 :    @note In ha_maria, there can never be two threads trying a checkpoint at
      90                 :    the same time.
      91                 : 
      92                 :    @return Operation status
      93                 :     @retval 0 ok (also if the module is not inited, or if no_wait made us skip)
      94                 :     @retval !=0 error
      95                 : */
      96                 : 
      97                 : int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
      98               4 : {
      99               4 :   int result= 0;
     100               4 :   DBUG_ENTER("ma_checkpoint_execute");
     101                 : 
     102               4 :   if (!checkpoint_inited)
     103                 :   {
     104                 :     /*
     105                 :       If ha_maria failed to start, maria_panic_hton is called, we come here.
     106                 :     */
     107               0 :     DBUG_RETURN(0);
     108                 :   }
     109               4 :   DBUG_ASSERT(level > CHECKPOINT_NONE);
     110                 : 
     111                 :   /* look for already running checkpoints */
     112               4 :   pthread_mutex_lock(&LOCK_checkpoint);
     113               8 :   while (checkpoint_in_progress != CHECKPOINT_NONE) /* re-checked after every wakeup */
     114                 :   {
     115               0 :     if (no_wait && (checkpoint_in_progress >= level))
     116                 :     {
     117                 :       /*
     118                 :         If we are the checkpoint background thread, we don't wait (it's
     119                 :         smarter to flush pages instead of waiting here while the other thread
     120                 :         finishes its checkpoint).
     121                 :       */
     122               0 :       pthread_mutex_unlock(&LOCK_checkpoint);
     123               0 :       goto end; /* result is still 0 here */
     124                 :     }
     125               0 :     pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
     126                 :   }
     127                 : 
     128               4 :   checkpoint_in_progress= level; /* also how really_execute_checkpoint() learns the level */
     129               4 :   pthread_mutex_unlock(&LOCK_checkpoint);
     130                 :   /* from then on, we are sure to be and stay the only checkpointer */
     131                 : 
     132               4 :   result= really_execute_checkpoint();
     133               4 :   pthread_cond_broadcast(&COND_checkpoint); /* wake threads blocked in the wait loop above */
     134               4 : end:
     135               4 :   DBUG_RETURN(result);
     136                 : }
     137                 : 
     138                 : 
     139                 : /**
     140                 :    @brief Does a checkpoint, really; expects no other checkpoints
     141                 :    running.
     142                 : 
     143                 :    Checkpoint level requested is read from checkpoint_in_progress.
     144                 :    On exit it frees the record pieces, resets checkpoint_in_progress and updates the counters.
     145                 :    @return Operation status
     146                 :     @retval 0   ok
     147                 :     @retval !=0 error
     148                 : */
     149                 : 
     150                 : static int really_execute_checkpoint(void)
     151               4 : {
     152               4 :   uint i, error= 0;
     153                 :   /* ptr is used after a successful checkpoint to read the start of record_pieces[3] */
     154                 :   char *ptr;
     155                 :   LEX_STRING record_pieces[4]; /**< only malloc-ed pieces: [0],[1] transactions, [2] tables, [3] dirty pages */
     156                 :   LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
     157                 :   TRANSLOG_ADDRESS checkpoint_start_log_horizon;
     158                 :   char checkpoint_start_log_horizon_char[LSN_STORE_SIZE]; /* checkpoint_start_log_horizon will be stored there */
     159               4 :   DBUG_ENTER("really_execute_checkpoint");
     160               4 :   DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
     161               4 :   bzero(&record_pieces, sizeof(record_pieces)); /* so the my_free() loop at 'end' is safe after an early error */
     162                 : 
     163                 :   /*
     164                 :     STEP 1: record current end-of-log position using log's lock. It is
     165                 :     critical for the correctness of Checkpoint (related to memory visibility
     166                 :     rules, the log's lock is a mutex).
     167                 :     "Horizon" is a lower bound of the LSN of the next log record.
     168                 :   */
     169               4 :   checkpoint_start_log_horizon= translog_get_horizon();
     170               4 :   DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
     171                 :                      LSN_IN_PARTS(checkpoint_start_log_horizon)));
     172               4 :   lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
     173                 : 
     174                 :   /*
     175                 :     STEP 2: fetch information about transactions.
     176                 :     We must fetch transactions before dirty pages. Indeed, a transaction
     177                 :     first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
     178                 :     to 0. If we fetched pages first, we may see no dirty page yet, then we
     179                 :     fetch transactions but the transaction has already reset its rec_lsn to 0
     180                 :     so we miss rec_lsn again.
     181                 :     For a similar reason (over-allocated bitmap pages) we have to fetch
     182                 :     transactions before flushing bitmap pages.
     183                 : 
     184                 :     min_trn_rec_lsn will serve to lower the starting point of the REDO phase
     185                 :     (down from checkpoint_start_log_horizon).
     186                 :  */
     187               4 :   if (unlikely(trnman_collect_transactions(&record_pieces[0],
     188                 :                                            &record_pieces[1],
     189                 :                                            &min_trn_rec_lsn,
     190                 :                                            &min_first_undo_lsn)))
     191               4 :     goto err;
     192                 : 
     193                 : 
     194                 :   /* STEP 3: fetch information about table files */
     195               4 :   if (unlikely(collect_tables(&record_pieces[2],
     196                 :                               checkpoint_start_log_horizon)))
     197               4 :     goto err;
     198                 : 
     199                 : 
     200                 :   /* STEP 4: fetch information about dirty pages */
     201                 :   /*
     202                 :     It's better to do it _after_ having flushed some data pages (which
     203                 :     collect_tables() may have done), because those are now non-dirty and so we
     204                 :     have a more up-to-date dirty pages list to put into the checkpoint record,
     205                 :     and thus we will have less work at Recovery.
     206                 :   */
     207                 :   /* Using default pagecache for now */
     208               4 :   if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
     209                 :                                                          &record_pieces[3],
     210                 :                                                          &min_page_rec_lsn)))
     211               4 :     goto err;
     212                 : 
     213                 : 
     214                 :   /* LAST STEP: now write the checkpoint log record */
     215                 :   {
     216                 :     LSN lsn;
     217                 :     translog_size_t total_rec_length;
     218                 :     /*
     219                 :       the log handler is allowed to modify "str" and "length" (but not "*str")
     220                 :       of its argument, so we must not pass it record_pieces directly,
     221                 :       otherwise we would later not know what memory pieces to my_free().
     222                 :     */
     223                 :     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5]; /* 5 = stored horizon + the 4 record pieces */
     224               4 :     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
     225                 :       (uchar*) checkpoint_start_log_horizon_char;
     226               4 :     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
     227                 :       sizeof(checkpoint_start_log_horizon_char);
     228              20 :     for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
     229                 :     {
     230              16 :       log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]=
     231                 :         *(LEX_CUSTRING *)&record_pieces[i];
     232              16 :       total_rec_length+= (translog_size_t) record_pieces[i].length;
     233                 :     }
     234               4 :     if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
     235                 :                                        &dummy_transaction_object, NULL,
     236                 :                                        total_rec_length,
     237                 :                                        sizeof(log_array)/sizeof(log_array[0]),
     238                 :                                        log_array, NULL, NULL) ||
     239                 :                  translog_flush(lsn)))
     240               4 :       goto err;
     241               4 :     translog_lock();
     242                 :     /*
     243                 :       This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
     244                 :       such hook would be called before translog_flush (and we must be sure
     245                 :       that log was flushed before we write to the control file).
     246                 :     */
     247               4 :     if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
     248                 :                                                  max_trid_in_control_file,
     249                 :                                                  recovery_failures)))
     250                 :     {
     251               0 :       translog_unlock();
     252               0 :       goto err;
     253                 :     }
     254               4 :     translog_unlock();
     255                 :   }
     256                 : 
     257                 :   /*
     258                 :     Note that we should not alter memory structures until we have successfully
     259                 :     written the checkpoint record and control file.
     260                 :   */
     261                 :   /* checkpoint succeeded */
     262               4 :   ptr= record_pieces[3].str; /* dirty pages piece; presumably starts with the page count (TODO confirm) */
     263               4 :   pages_to_flush_before_next_checkpoint= uint4korr(ptr);
     264               4 :   DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
     265                 :                            (uint)pages_to_flush_before_next_checkpoint));
     266                 : 
     267                 :   /* compute log's low-water mark: the smallest of the four LSNs below */
     268                 :   {
     269               4 :     TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
     270               4 :     set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
     271               4 :     set_if_smaller(log_low_water_mark, min_first_undo_lsn);
     272               4 :     set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
     273                 :     /**
     274                 :        Now purge unneeded logs.
     275                 :        As some systems have an unreliable fsync (drive lying), we could try to
     276                 :        be robust against that: remember a few previous checkpoints in the
     277                 :        control file, and not purge logs immediately... Think about it.
     278                 :     */
     279               4 :     if (translog_purge(log_low_water_mark))
     280               0 :       ma_message_no_user(0, "log purging failed"); /* reported only; the checkpoint still counts as ok */
     281                 :   }
     282                 : 
     283                 :   goto end; /* success: skip the error path */
     284                 : 
     285               0 : err:
     286               0 :   error= 1;
     287               0 :   ma_message_no_user(0, "checkpoint failed");
     288                 :   /* we were possibly not able to determine what pages to flush */
     289               0 :   pages_to_flush_before_next_checkpoint= 0;
     290                 : 
     291               4 : end:
     292              20 :   for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
     293              16 :     my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
     294               4 :   pthread_mutex_lock(&LOCK_checkpoint);
     295               4 :   checkpoint_in_progress= CHECKPOINT_NONE; /* lets another checkpoint start; the caller broadcasts COND_checkpoint */
     296               4 :   checkpoints_total++;
     297               4 :   checkpoints_ok_total+= !error; /* +1 only on success */
     298               4 :   pthread_mutex_unlock(&LOCK_checkpoint);
     299               4 :   DBUG_RETURN(error);
     300                 : }
     301                 : 
     302                 : 
     303                 : /**
     304                 :    @brief Initializes the checkpoint module
     305                 : 
     306                 :    @param  interval           If one wants the module to create a
     307                 :                               thread which will periodically do
     308                 :                               checkpoints, and flush dirty pages, in the
     309                 :                               background, it should specify a non-zero
     310                 :                               interval in seconds. The thread will then be
     311                 :                               created and will take checkpoints separated by
     312                 :                               approximately 'interval' second.
     313                 : 
     314                 :    @note A checkpoint is taken only if there has been some significant
     315                 :    activity since the previous checkpoint. Between checkpoint N and N+1 the
     316                 :    thread flushes all dirty pages which were already dirty at the time of
     317                 :    checkpoint N.
     318                 : 
     319                 :    @return Operation status
     320                 :     @retval 0   ok
     321                 :     @retval !=0 error (1 if mutex/condition init failed, else the pthread_create() result)
     322                 : */
     323                 : 
     324                 : int ma_checkpoint_init(ulong interval)
     325              10 : {
     326                 :   pthread_t th; /* not joined or detached in this function */
     327              10 :   int res= 0;
     328              10 :   DBUG_ENTER("ma_checkpoint_init");
     329              10 :   checkpoint_inited= TRUE; /* NOTE(review): stays TRUE even if the mutex/condition init below fails, so ma_checkpoint_end() would lock and destroy them anyway */
     330              10 :   checkpoint_thread_die= 2; /* not yet born == dead */
     331              10 :   if (pthread_mutex_init(&LOCK_checkpoint, MY_MUTEX_INIT_SLOW) ||
     332                 :       pthread_cond_init(&COND_checkpoint, 0))
     333               0 :     res= 1;
     334              10 :   else if (interval > 0)
     335                 :   {
     336                 :     compile_time_assert(sizeof(void *) >= sizeof(ulong)); /* interval travels inside the void* thread argument */
     337               5 :     if (!(res= pthread_create(&th, NULL, ma_checkpoint_background,
     338                 :                               (void *)interval)))
     339               5 :       checkpoint_thread_die= 0; /* thread lives, will have to be killed */
     340                 :   }
     341              10 :   DBUG_RETURN(res);
     342                 : }
     343                 : 
     344                 : 
     345                 : #ifndef DBUG_OFF
     346                 : /**
     347                 :    Function used to test recovery: flush some pieces of every open table
     348                 :    which has now_transactional set (under THR_LOCK_maria); then caller crashes.
     349                 : 
     350                 :    @param  what_to_flush   0: current bitmap and all data pages (MARIA_FLUSH_INDEX is passed too)
     351                 :                            1: state
     352                 :                            2: all bitmap pages; any other value flushes nothing
     353                 : */
     354                 : static void flush_all_tables(int what_to_flush)
     355               0 : {
     356               0 :   int res= 0;
     357                 :   LIST *pos; /**< to iterate over open tables */
     358               0 :   pthread_mutex_lock(&THR_LOCK_maria);
     359               0 :   for (pos= maria_open_list; pos; pos= pos->next)
     360                 :   {
     361               0 :     MARIA_HA *info= (MARIA_HA*)pos->data;
     362               0 :     if (info->s->now_transactional)
     363                 :     {
     364               0 :       switch (what_to_flush) /* no default case: unknown values leave res untouched */
     365                 :       {
     366                 :       case 0:
     367               0 :         res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
     368                 :                                    FLUSH_KEEP, FLUSH_KEEP);
     369               0 :         break;
     370                 :       case 1:
     371               0 :         res= _ma_state_info_write(info->s,
     372                 :                                   MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
     373                 :                                   MA_STATE_INFO_WRITE_LOCK);
     374               0 :         DBUG_PRINT("maria_flush_states",
     375                 :                    ("is_of_horizon: LSN (%lu,0x%lx)",
     376                 :                     LSN_IN_PARTS(info->s->state.is_of_horizon)));
     377               0 :         break;
     378                 :       case 2:
     379               0 :         res= _ma_bitmap_flush_all(info->s);
     380                 :         break;
     381                 :       }
     382                 :     }
     383               0 :     DBUG_ASSERT(res == 0); /* res is the latest flush result; it is kept when this table was skipped */
     384                 :   }
     385               0 :   pthread_mutex_unlock(&THR_LOCK_maria);
     386                 : }
     387                 : #endif
     388                 : 
     389                 : 
     390                 : /**
     391                 :    @brief Destroys the checkpoint module: kills the background thread if one was
     392                 :    started, frees dfiles/kfiles and destroys the mutex and condition. */
     393                 : 
     394                 : void ma_checkpoint_end(void)
     395             121 : {
     396             121 :   DBUG_ENTER("ma_checkpoint_end");
     397                 :   /*
     398                 :     Some intentional crash methods, usually triggered by
     399                 :     SET MARIA_CHECKPOINT_INTERVAL=X. They are placed before the checkpoint_inited test below.
     400                 :   */
     401             121 :   DBUG_EXECUTE_IF("maria_flush_bitmap",
     402                 :                   {
     403                 :                     DBUG_PRINT("maria_flush_bitmap", ("now"));
     404                 :                     flush_all_tables(2);
     405                 :                   });
     406             121 :   DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
     407                 :                   {
     408                 :                     DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
     409                 :                     flush_all_tables(0);
     410                 :                   });
     411             121 :   DBUG_EXECUTE_IF("maria_flush_whole_log",
     412                 :                   {
     413                 :                     DBUG_PRINT("maria_flush_whole_log", ("now"));
     414                 :                     translog_flush(translog_get_horizon());
     415                 :                   });
     416                 :   /*
     417                 :     Note that for WAL reasons, maria_flush_states requires
     418                 :     maria_flush_whole_log.
     419                 :   */
     420             121 :   DBUG_EXECUTE_IF("maria_flush_states",
     421                 :                   {
     422                 :                     DBUG_PRINT("maria_flush_states", ("now"));
     423                 :                     flush_all_tables(1);
     424                 :                   });
     425             121 :   DBUG_EXECUTE_IF("maria_crash",
     426                 :                   { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
     427                 : 
     428             121 :   if (checkpoint_inited) /* nothing to destroy otherwise */
     429                 :   {
     430               9 :     pthread_mutex_lock(&LOCK_checkpoint);
     431               9 :     if (checkpoint_thread_die != 2) /* thread was started ok */
     432                 :     {
     433               4 :       DBUG_PRINT("info",("killing Maria background checkpoint thread"));
     434               4 :       checkpoint_thread_die= 1; /* kill it */
     435                 :       do /* and wait for it to be dead */
     436                 :       {
     437                 :         /* wake it up if it was in a sleep */
     438               4 :         pthread_cond_broadcast(&COND_checkpoint);
     439               4 :         DBUG_PRINT("info",("waiting for Maria background checkpoint thread"
     440                 :                            " to die"));
     441               4 :         pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint); /* expects the thread to set the flag to 2 and signal (thread code not shown here) */
     442                 :       }
     443               4 :       while (checkpoint_thread_die != 2);
     444                 :     }
     445               9 :     pthread_mutex_unlock(&LOCK_checkpoint);
     446               9 :     my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
     447               9 :     my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
     448               9 :     dfiles= kfiles= NULL; /* dfiles_end and kfiles_end are left as they are */
     449               9 :     pthread_mutex_destroy(&LOCK_checkpoint);
     450               9 :     pthread_cond_destroy(&COND_checkpoint);
     451               9 :     checkpoint_inited= FALSE;
     452                 :   }
     453             121 :   DBUG_VOID_RETURN;
     454                 : }
     455                 : 
     456                 : 
     457                 : /**
     458                 :    @brief dirty-page filtering criteria for MEDIUM checkpoint.
     459                 : 
     460                 :    We flush data/index pages which have been dirty since the previous
     461                 :    checkpoint (this is the two-checkpoint rule: the REDO phase will not have
     462                 :    to start from earlier than the next-to-last checkpoint).
     463                 :    Bitmap pages are handled by _ma_bitmap_flush_all().
     464                 : 
     465                 :    @param  type                Page's type
     466                 :    @param  pageno              Page's number (unused)
     467                 :    @param  rec_lsn             Page's rec_lsn
     468                 :    @param  arg                 filter_param (struct st_filter_param *); only up_to_lsn is read
     469                 :    @return 1 if type is PAGECACHE_LSN_PAGE and rec_lsn <= up_to_lsn, else 0 */
     470                 : 
     471                 : static enum pagecache_flush_filter_result
     472                 : filter_flush_file_medium(enum pagecache_page_type type,
     473                 :                          pgcache_page_no_t pageno __attribute__ ((unused)),
     474                 :                          LSN rec_lsn, void *arg)
     475               0 : {
     476               0 :   struct st_filter_param *param= (struct st_filter_param *)arg;
     477               0 :   return (type == PAGECACHE_LSN_PAGE) && /* NOTE(review): boolean returned as the enum; presumably FLUSH_FILTER_OK is 1 and FLUSH_FILTER_SKIP_TRY_NEXT is 0 - confirm */
     478                 :     (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
     479                 : }
     480                 : 
     481                 : 
     482                 : /**
     483                 :    @brief dirty-page filtering criteria for FULL checkpoint.
     484                 : 
     485                 :    We flush all dirty data/index pages.
     486                 :    Bitmap pages are handled by _ma_bitmap_flush_all().
     487                 : 
     488                 :    @param  type                Page's type
     489                 :    @param  pageno              Page's number (unused)
     490                 :    @param  rec_lsn             Page's rec_lsn (unused)
     491                 :    @param  arg                 filter_param (unused)
     492                 :    @return 1 if type is PAGECACHE_LSN_PAGE, else 0 */
     493                 : 
     494                 : static enum pagecache_flush_filter_result
     495                 : filter_flush_file_full(enum pagecache_page_type type,
     496                 :                        pgcache_page_no_t pageno __attribute__ ((unused)),
     497                 :                        LSN rec_lsn __attribute__ ((unused)),
     498                 :                        void *arg __attribute__ ((unused)))
     499               0 : {
     500               0 :   return (type == PAGECACHE_LSN_PAGE); /* NOTE(review): boolean returned as the enum; presumably FLUSH_FILTER_OK is 1 and FLUSH_FILTER_SKIP_TRY_NEXT is 0 - confirm */
     501                 : }
     502                 : 
     503                 : 
     504                 : /**
     505                 :    @brief dirty-page filtering criteria for background flushing thread.
     506                 : 
     507                 :    We flush data/index pages which have been dirty since the previous
     508                 :    checkpoint (this is the two-checkpoint rule: the REDO phase will not have
     509                 :    to start from earlier than the next-to-last checkpoint), and no
     510                 :    bitmap pages. But we flush no more than a certain number of pages (to have
     511                 :    an even flushing, no write burst).
     512                 :    The reason to not flush bitmap pages is that they may not be in a flushable
     513                 :    state at this moment and we don't want to wait for them.
     514                 : 
     515                 :    @param  type                Page's type
     516                 :    @param  pageno              Page's number (unused)
     517                 :    @param  rec_lsn             Page's rec_lsn
     518                 :    @param  arg                 filter_param (struct st_filter_param *); max_pages is decremented here
     519                 :    @return FLUSH_FILTER_SKIP_ALL if max_pages is 0, FLUSH_FILTER_OK for an accepted page, else FLUSH_FILTER_SKIP_TRY_NEXT */
     520                 : 
     521                 : static enum pagecache_flush_filter_result
     522                 : filter_flush_file_evenly(enum pagecache_page_type type,
     523                 :                          pgcache_page_no_t pageno __attribute__ ((unused)),
     524                 :                          LSN rec_lsn, void *arg)
     525               0 : {
     526               0 :   struct st_filter_param *param= (struct st_filter_param *)arg;
     527               0 :   if (unlikely(param->max_pages == 0)) /* all flushed already */
     528               0 :     return FLUSH_FILTER_SKIP_ALL;
     529               0 :   if ((type == PAGECACHE_LSN_PAGE) &&
     530                 :       (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
     531                 :   {
     532               0 :     param->max_pages--; /* one less page allowed after this one */
     533               0 :     return FLUSH_FILTER_OK;
     534                 :   }
     535               0 :   return FLUSH_FILTER_SKIP_TRY_NEXT; /* page not eligible, keep scanning */
     536                 : }
     537                 : 
     538                 : 
     539                 : /**
     540                 :    @brief Background thread which does checkpoints and flushes periodically.
     541                 :    @param  arg  interval between checkpoints in seconds, an integer cast to void* (must be > 0)
     542                 :    Takes a checkpoint. After this, all pages dirty at the time of that
     543                 :    checkpoint are flushed evenly until it is time to take another checkpoint.
     544                 :    This ensures that the REDO phase starts at earliest (in LSN time) at the
     545                 :    next-to-last checkpoint record ("two-checkpoint rule").
     546                 :    @return always 0, after a final checkpoint and after setting checkpoint_thread_die to 2
     547                 :    @note MikaelR questioned why the same thread does two different jobs, the
     548                 :    risk could be that while a checkpoint happens no LRD flushing happens.
     549                 : */
     550                 : 
     551                 : pthread_handler_t ma_checkpoint_background(void *arg)
     552               5 : {
     553                 :   /** @brief Minimum of log/page bytes written since last checkpoint to take one */
     554               5 :   const uint checkpoint_min_activity= 2*1024*1024;
     555                 :   /*
     556                 :     If the interval could be changed by the user while we are in this thread,
     557                 :     it could be annoying: for example it could cause "case 2" to be executed
     558                 :     right after "case 0", thus having 'dfile' unset. So the thread cares only
     559                 :     about the interval's value when it started.
     560                 :   */
     561               5 :   const ulong interval= (ulong)arg;
     562                 :   uint sleeps, sleep_time; /* accumulated sleep durations; duration of the next sleep (seconds) */
     563                 :   TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
     564               5 :     translog_get_horizon();
     565                 :   ulonglong pagecache_flushes_at_last_checkpoint=
     566               5 :     maria_pagecache->global_cache_write;
     567                 :   uint pages_bunch_size; /* max number of pages flushed in one flushing step */
     568                 :   struct st_filter_param filter_param;
     569                 :   PAGECACHE_FILE *dfile; /**< data file currently being flushed */
     570                 :   PAGECACHE_FILE *kfile; /**< index file currently being flushed */
     571               5 :   LINT_INIT(kfile);
     572               5 :   LINT_INIT(dfile);
     573               5 :   LINT_INIT(pages_bunch_size);
     574                 : 
     575               5 :   my_thread_init();
     576               5 :   DBUG_PRINT("info",("Maria background checkpoint thread starts"));
     577               5 :   DBUG_ASSERT(interval > 0);
     578                 : 
     579                 :   /*
     580                 :     Recovery ended with all tables closed and a checkpoint: no need to take
     581                 :     one immediately.
     582                 :   */
     583               5 :   sleeps= 1; /* 1 % interval != 0 when interval > 1, so the first iteration skips "case 0" */
     584               5 :   pages_to_flush_before_next_checkpoint= 0;
     585                 : 
     586                 :   for(;;) /* iterations of checkpoints and dirty page flushing */
     587                 :   {
     588                 : #if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
     589                 :     sleeps=0;
     590                 : #endif
     591                 :     struct timespec abstime;
     592               8 :     switch (sleeps % interval) /* position inside the current checkpoint interval */
     593                 :     {
     594                 :     case 0: /* moment of a checkpoint */
     595                 :       /*
     596                 :         With background flushing evenly distributed over the time
     597                 :         between two checkpoints, we should have only little flushing to do
     598                 :         in the checkpoint.
     599                 :       */
     600                 :       /*
     601                 :         No checkpoint if little work of interest for recovery was done
     602                 :         since last checkpoint. Such work includes log writing (lengthens
     603                 :         recovery, checkpoint would shorten it), page flushing (checkpoint
     604                 :         would decrease the amount of read pages in recovery).
     605                 :         In case of one short statement per minute (very low load), we don't
     606                 :         want to checkpoint every minute, hence the positive
     607                 :         checkpoint_min_activity.
     608                 :       */
     609               3 :       if (((translog_get_horizon() - log_horizon_at_last_checkpoint) +
     610                 :            (maria_pagecache->global_cache_write -
     611                 :             pagecache_flushes_at_last_checkpoint) *
     612                 :            maria_pagecache->block_size) < checkpoint_min_activity)
     613                 :       {
     614                 :         /* don't take checkpoint, so don't know what to flush */
     615               3 :         pages_to_flush_before_next_checkpoint= 0;
     616               3 :         sleep_time= interval; /* sleeps % interval stays 0: next iteration is "case 0" again */
     617               3 :         break;
     618                 :       }
     619               0 :       sleep_time= 1; /* next iteration will be "case 1" (when interval > 1) */
     620               0 :       ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
     621                 :       /*
     622                 :         Snapshot this kind of "state" of the engine. Note that the value below
     623                 :         is possibly greater than last_checkpoint_lsn.
     624                 :       */
     625               0 :       log_horizon_at_last_checkpoint= translog_get_horizon();
     626               0 :       pagecache_flushes_at_last_checkpoint=
     627                 :         maria_pagecache->global_cache_write;
     628                 :       /*
     629                 :         If the checkpoint above succeeded it has set d|kfiles and
     630                 :         d|kfiles_end. If it has failed, it has set
     631                 :         pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
     632                 :         and sleep until the next checkpoint.
     633                 :       */
     634               0 :       break;
     635                 :     case 1: /* first step after the checkpoint moment */
     636                 :       /* set up parameters for background page flushing */
     637               5 :       filter_param.up_to_lsn= last_checkpoint_lsn;
     638               5 :       pages_bunch_size= pages_to_flush_before_next_checkpoint / interval; /* spread over the interval */
     639               5 :       dfile= dfiles;
     640               5 :       kfile= kfiles;
     641                 :       /* fall through */
     642                 :     default: /* flush one bunch of pages, or sleep until the next checkpoint moment */
     643               5 :       if (pages_bunch_size > 0)
     644                 :       {
     645               0 :         DBUG_PRINT("checkpoint",
     646                 :                    ("Maria background checkpoint thread: %u pages",
     647                 :                     pages_bunch_size));
     648                 :         /* flush a bunch of dirty pages */
     649               0 :         filter_param.max_pages= pages_bunch_size; /* budget shared by the two loops below */
     650               0 :         while (dfile != dfiles_end)
     651                 :         {
     652                 :           /*
     653                 :             We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
     654                 :             smarter to move to the next file than wait for this one to be
     655                 :             completely flushed, which may take long.
     656                 :             StaleFilePointersInFlush: notice how below we use "dfile" which
     657                 :             is an OS file descriptor plus some function and MARIA_SHARE
     658                 :             pointers; this data dates from a previous checkpoint; since then,
     659                 :             the table may have been closed (so MARIA_SHARE* became stale), and
     660                 :             the file descriptor reassigned to another table which does not
     661                 :             have the same CRC-read-set callbacks: it is thus important that
     662                 :             flush_pagecache_blocks_with_filter() does not use the pointers,
     663                 :             only the OS file descriptor.
     664                 :           */
     665                 :           int res=
     666                 :             flush_pagecache_blocks_with_filter(maria_pagecache,
     667                 :                                                dfile, FLUSH_KEEP_LAZY,
     668                 :                                                filter_flush_file_evenly,
     669               0 :                                                &filter_param);
     670               0 :           if (unlikely(res & PCFLUSH_ERROR)) /* the error is only reported, flushing goes on */
     671               0 :             ma_message_no_user(0, "background data page flush failed");
     672               0 :           if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
     673               0 :             break; /* and we will continue with the same file */
     674               0 :           dfile++; /* otherwise all this file is flushed, move to next file */
     675                 :           /*
     676                 :             MikaelR noted that he observed that Linux's file cache may never
     677                 :             fsync to disk until this cache is full, at which point it decides
     678                 :             to empty the cache, making the machine very slow. A solution was
     679                 :             to fsync after writing 2 MB. So we might want to fsync() here if
     680                 :             we wrote enough pages.
     681                 :           */
     682                 :         }
     683               0 :         while (kfile != kfiles_end) /* same as above, for index files, with what remains of the budget */
     684                 :         {
     685                 :           int res=
     686                 :             flush_pagecache_blocks_with_filter(maria_pagecache,
     687                 :                                                kfile, FLUSH_KEEP_LAZY,
     688                 :                                                filter_flush_file_evenly,
     689               0 :                                                &filter_param);
     690               0 :           if (unlikely(res & PCFLUSH_ERROR)) /* the error is only reported, flushing goes on */
     691               0 :             ma_message_no_user(0, "background index page flush failed");
     692               0 :           if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
     693               0 :             break; /* and we will continue with the same file */
     694               0 :           kfile++; /* otherwise all this file is flushed, move to next file */
     695                 :         }
     696               0 :         sleep_time= 1; /* one flushing step per second */
     697                 :       }
     698                 :       else
     699                 :       {
     700                 :         /* Can directly sleep until the next checkpoint moment */
     701               5 :         sleep_time= interval - (sleeps % interval);
     702                 :       }
     703                 :     }
     704               8 :     pthread_mutex_lock(&LOCK_checkpoint);
     705               8 :     if (checkpoint_thread_die == 1)
     706               8 :       break; /* leave the loop with LOCK_checkpoint held; it is released after the loop */
     707                 : #if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
     708                 :     pthread_mutex_unlock(&LOCK_checkpoint);
     709                 :     my_sleep(100000); /* a tenth of a second */
     710                 :     pthread_mutex_lock(&LOCK_checkpoint);
     711                 : #else
     712                 :     /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
     713               8 :     DBUG_PRINT("info", ("sleeping %u seconds", sleep_time));
     714               8 :     set_timespec(abstime, sleep_time);
     715               8 :     pthread_cond_timedwait(&COND_checkpoint, &LOCK_checkpoint, &abstime);
     716                 : #endif
     717               7 :     if (checkpoint_thread_die == 1)
     718               3 :       break; /* leave the loop with LOCK_checkpoint held; it is released after the loop */
     719               3 :     pthread_mutex_unlock(&LOCK_checkpoint);
     720               3 :     sleeps+= sleep_time; /* the wait's result is ignored: an early wake-up counts as a full sleep */
     721               3 :   }
     722               4 :   pthread_mutex_unlock(&LOCK_checkpoint);
     723               4 :   DBUG_PRINT("info",("Maria background checkpoint thread ends"));
     724                 :   {
     725               4 :     CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
     726                 :     /*
     727                 :       That's the final one, which guarantees that a clean shutdown always ends
     728                 :       with a checkpoint.
     729                 :     */
     730               4 :     DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
     731               4 :     ma_checkpoint_execute(level, FALSE);
     732                 :   }
     733               4 :   pthread_mutex_lock(&LOCK_checkpoint);
     734               4 :   checkpoint_thread_die= 2; /* indicate that we are dead */
     735                 :   /* wake up ma_checkpoint_end() which may be waiting for our death */
     736               4 :   pthread_cond_broadcast(&COND_checkpoint);
     737                 :   /* the broadcast is done before the unlock because ma_checkpoint_end() destroys the mutex */
     738               4 :   pthread_mutex_unlock(&LOCK_checkpoint);
     739               4 :   my_thread_end();
     740               4 :   return 0;
     741                 : }
     742                 : 
     743                 : 
     744                 : /**
     745                 :    @brief Allocates buffer and stores in it some info about open tables,
     746                 :    does some flushing on those.
     747                 : 
     748                 :    Does the allocation because the caller cannot know the size itself.
     749                 :    Memory freeing is to be done by the caller (if the "str" member of the
     750                 :    LEX_STRING is not NULL).
     751                 :    The caller is taking a checkpoint.
     752                 : 
     753                 :    @param[out]  str        pointer to where the allocated buffer,
     754                 :                            and its size, will be put; buffer will be filled
     755                 :                            with info about open tables
     756                 :    @param       checkpoint_start_log_horizon  Of the in-progress checkpoint
     757                 :                                               record.
     758                 : 
     759                 :    @return Operation status
     760                 :      @retval 0      OK
     761                 :      @retval 1      Error
     762                 : */
     763                 : 
     764                 : static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
     765               4 : {
     766               4 :   MARIA_SHARE **distinct_shares= NULL;
     767                 :   char *ptr;
     768               4 :   uint error= 1, sync_error= 0, nb, nb_stored, i;
     769               4 :   my_bool unmark_tables= TRUE;
     770                 :   uint total_names_length;
     771                 :   LIST *pos; /**< to iterate over open tables */
     772                 :   struct st_state_copy {
     773                 :     uint index;
     774                 :     MARIA_STATE_INFO state;
     775                 :   };
     776               4 :   struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
     777                 :     *state_copies_end, /**< cache ends here */
     778                 :     *state_copy; /**< iterator in cache */
     779                 :   TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
     780                 :   struct st_filter_param filter_param;
     781                 :   PAGECACHE_FLUSH_FILTER filter;
     782               4 :   DBUG_ENTER("collect_tables");
     783                 : 
     784               4 :   LINT_INIT(state_copies_horizon);
     785                 :   /* let's make a list of distinct shares */
     786               4 :   pthread_mutex_lock(&THR_LOCK_maria);
     787               4 :   for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
     788                 :   {
     789               0 :     MARIA_HA *info= (MARIA_HA*)pos->data;
     790               0 :     MARIA_SHARE *share= info->s;
     791                 :     /* the first three variables below can never change */
     792               0 :     if (share->base.born_transactional && !share->temporary &&
     793                 :         share->mode != O_RDONLY &&
     794                 :         !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
     795                 :     {
     796                 :       /*
     797                 :         Apart from us, only maria_close() reads/sets in_checkpoint but cannot
     798                 :         run now as we hold THR_LOCK_maria.
     799                 :       */
     800                 :       /*
     801                 :         This table is relevant for checkpoint and not already seen. Mark it,
     802                 :         so that it is not seen again in the loop.
     803                 :       */
     804               0 :       nb++;
     805               0 :       DBUG_ASSERT(share->in_checkpoint == 0);
     806                 :       /* This flag ensures that we count only _distinct_ shares. */
     807               0 :       share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
     808                 :     }
     809                 :   }
     810               4 :   if (unlikely((distinct_shares=
     811                 :                 (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
     812                 :                                           MYF(MY_WME))) == NULL))
     813               4 :     goto err;
     814               4 :   for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
     815                 :   {
     816               0 :     MARIA_HA *info= (MARIA_HA*)pos->data;
     817               0 :     MARIA_SHARE *share= info->s;
     818               0 :     if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
     819                 :     {
     820               0 :       distinct_shares[i++]= share;
     821                 :       /*
     822                 :         With this we prevent the share from going away while we later flush
     823                 :         and force it without holding THR_LOCK_maria. For example if the share
     824                 :         could be my_free()d by maria_close() we would have a problem when we
     825                 :         access it to flush the table. We "pin" the share pointer.
     826                 :         And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
     827                 :         not seen again in the loop.
     828                 :       */
     829               0 :       share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
     830                 :       /** @todo avoid strlen() */
     831               0 :       total_names_length+= share->open_file_name.length;
     832                 :     }
     833                 :   }
     834                 : 
     835               4 :   DBUG_ASSERT(i == nb);
     836               4 :   pthread_mutex_unlock(&THR_LOCK_maria);
     837               4 :   DBUG_PRINT("info",("found %u table shares", nb));
     838                 : 
     839               4 :   str->length=
     840                 :     4 +               /* number of tables */
     841                 :     (2 +              /* short id */
     842                 :      LSN_STORE_SIZE + /* first_log_write_at_lsn */
     843                 :      1                /* end-of-name 0 */
     844                 :      ) * nb + total_names_length;
     845               4 :   if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
     846               4 :     goto err;
     847                 : 
     848               4 :   ptr= str->str;
     849               4 :   ptr+= 4; /* real number of stored tables is not yet known */
     850                 : 
     851                 :   /* only possible checkpointer, so can do the read below without mutex */
     852               4 :   filter_param.up_to_lsn= last_checkpoint_lsn;
     853               4 :   switch(checkpoint_in_progress)
     854                 :   {
     855                 :   case CHECKPOINT_MEDIUM:
     856               0 :     filter= &filter_flush_file_medium;
     857               0 :     break;
     858                 :   case CHECKPOINT_FULL:
     859               4 :     filter= &filter_flush_file_full;
     860               4 :     break;
     861                 :   case CHECKPOINT_INDIRECT:
     862               0 :     filter= NULL;
     863               0 :     break;
     864                 :   default:
     865               0 :     DBUG_ASSERT(0);
     866                 :     goto err;
     867                 :   }
     868                 : 
     869                 :   /*
     870                 :     The principle of reading/writing the state below is explained in
     871                 :     ma_recovery.c, look for "Recovery of the state".
     872                 :   */
     873                 : #define STATE_COPIES 1024
     874               4 :   state_copies= (struct st_state_copy *)
     875                 :     my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
     876               4 :   dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
     877                 :                                        /* avoid size of 0 for my_realloc */
     878                 :                                        max(1, nb) * sizeof(PAGECACHE_FILE),
     879                 :                                        MYF(MY_WME | MY_ALLOW_ZERO_PTR));
     880               4 :   kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
     881                 :                                        /* avoid size of 0 for my_realloc */
     882                 :                                        max(1, nb) * sizeof(PAGECACHE_FILE),
     883                 :                                        MYF(MY_WME | MY_ALLOW_ZERO_PTR));
     884               4 :   if (unlikely((state_copies == NULL) ||
     885                 :                (dfiles == NULL) || (kfiles == NULL)))
     886               4 :     goto err;
     887               4 :   state_copy= state_copies_end= NULL;
     888               4 :   dfiles_end= dfiles;
     889               4 :   kfiles_end= kfiles;
     890                 : 
     891               4 :   for (nb_stored= 0, i= 0; i < nb; i++)
     892                 :   {
     893               0 :     MARIA_SHARE *share= distinct_shares[i];
     894                 :     PAGECACHE_FILE kfile, dfile;
     895                 :     my_bool ignore_share;
     896               0 :     if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
     897                 :     {
     898                 :       /*
     899                 :         No need for a mutex to read the above, only us can write *this* bit of
     900                 :         the in_checkpoint bitmap
     901                 :       */
     902               0 :       continue;
     903                 :     }
     904                 :     /**
     905                 :        @todo We should not look at tables which didn't change since last
     906                 :        checkpoint.
     907                 :     */
     908               0 :     DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
     909               0 :     if (state_copy == state_copies_end) /* we have no more cached states */
     910                 :     {
     911                 :       /*
     912                 :         Collect and cache a bunch of states. We do this for many states at a
     913                 :         time, to not lock/unlock the log's lock too often.
     914                 :       */
     915               0 :       uint j, bound= min(nb, i + STATE_COPIES);
     916               0 :       state_copy= state_copies;
     917                 :       /* part of the state is protected by log's lock */
     918               0 :       translog_lock();
     919               0 :       state_copies_horizon= translog_get_horizon_no_lock();
     920               0 :       for (j= i; j < bound; j++)
     921                 :       {
     922               0 :         MARIA_SHARE *share2= distinct_shares[j];
     923               0 :         if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
     924               0 :           continue;
     925               0 :         state_copy->index= j;
     926               0 :         state_copy->state= share2->state; /* we copy the state */
     927               0 :         state_copy++;
     928                 :         /*
     929                 :           data_file_length is not updated under log's lock by the bitmap
     930                 :           code, but writing a wrong data_file_length is ok: a next
     931                 :           maria_close() will correct it; if we crash before, Recovery will
     932                 :           set it to the true physical size.
     933                 :         */
     934                 :       }
     935               0 :       translog_unlock();
     936                 :       /**
     937                 :          We are going to flush these states.
     938                 :          Before, all records describing how to undo such state must be
     939                 :          in the log (WAL). Usually this means UNDOs. In the special case of
     940                 :          data|key_file_length, recovery just needs to open the table to fix the
     941                 :          length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
     942                 :          understand it must open a table, is enough; so as long as
     943                 :          data|key_file_length is updated after writing any log record it's ok:
     944                 :          if we copied new value above, it means the record was before
     945                 :          state_copies_horizon and we flush such record below.
     946                 :          Apart from data|key_file_length which are easily recoverable from the
     947                 :          real file's size, all other state members must be updated only when
     948                 :          writing the UNDO; otherwise, if updated before, if their new value is
     949                 :          flushed by a checkpoint and there is a crash before UNDO is written,
     950                 :          their REDO group will be missing or at least incomplete and skipped
     951                 :          by recovery, so bad state value will stay. For example, setting
     952                 :          key_root before writing the UNDO: the table would have old index
     953                 :          pages (they were pinned at time of crash) and a new, thus wrong,
     954                 :          key_root.
     955                 :          @todo RECOVERY BUG check that all code honours that.
     956                 :       */
     957               0 :       if (translog_flush(state_copies_horizon))
     958               0 :         goto err;
     959                 :       /* now we have cached states and they are WAL-safe*/
     960               0 :       state_copies_end= state_copy;
     961               0 :       state_copy= state_copies;
     962                 :     }
     963                 : 
     964                 :     /* locate our state among these cached ones */
     965               0 :     for ( ; state_copy->index != i; state_copy++)
     966               0 :       DBUG_ASSERT(state_copy < state_copies_end);
     967                 : 
     968                 :     /* OS file descriptors are ints which we stored in 4 bytes */
     969                 :     compile_time_assert(sizeof(int) <= 4);
     970                 :     /*
     971                 :       Protect against maria_close() (which does some memory freeing in
     972                 :       MARIA_FILE_BITMAP) with close_lock. intern_lock is not
     973                 :       sufficient as we, as well as maria_close(), are going to unlock
     974                 :       intern_lock in the middle of manipulating the table. Serializing us and
     975                 :       maria_close() should help avoid problems.
     976                 :     */
     977               0 :     pthread_mutex_lock(&share->close_lock);
     978               0 :     pthread_mutex_lock(&share->intern_lock);
     979                 :     /*
     980                 :       Tables in a normal state have their two file descriptors open.
     981                 :       In some rare cases like REPAIR, some descriptor may be closed or even
     982                 :       -1. If that happened, the _ma_state_info_write() may fail. This is
     983                 :       prevented by enclosing all places which close/change kfile.file with
     984                 :       intern_lock.
     985                 :     */
     986               0 :     kfile= share->kfile;
     987               0 :     dfile= share->bitmap.file;
     988                 :     /*
     989                 :       Ignore table which has no logged writes (all its future log records will
     990                 :       be found naturally by Recovery). Ignore obsolete shares (_before_
     991                 :       setting themselves to last_version=0 they already did all flush and
     992                 :       sync; if we flush their state now we may be flushing an obsolete state
     993                 :       onto a newer one (assuming the table has been reopened with a different
     994                 :       share but of course same physical index file)).
     995                 :     */
     996               0 :     ignore_share= (share->id == 0) | (share->last_version == 0);
     997               0 :     DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
     998               0 :     if (!ignore_share)
     999                 :     {
    1000               0 :       uint open_file_name_len= share->open_file_name.length + 1;
    1001                 :       /* remember the descriptors for background flush */
    1002               0 :       *(dfiles_end++)= dfile;
    1003               0 :       *(kfiles_end++)= kfile;
    1004                 :       /* we will store this table in the record */
    1005               0 :       nb_stored++;
    1006               0 :       int2store(ptr, share->id);
    1007               0 :       ptr+= 2;
    1008               0 :       lsn_store(ptr, share->lsn_of_file_id);
    1009               0 :       ptr+= LSN_STORE_SIZE;
    1010                 :       /*
    1011                 :         first_bitmap_with_space is not updated under log's lock, and is
    1012                 :         important. We would need the bitmap's lock to get it right. Recovery
    1013                 :         of this is not clear, so we just play safe: write it out as
    1014                 :         unknown: if crash, _ma_bitmap_init() at next open (for example in
    1015                 :         Recovery) will convert it to 0 and thus the first insertion will
    1016                 :         search for free space from the file's first bitmap (0) -
    1017                 :         under-optimal but safe.
    1018                 :         If no crash, maria_close() will write the exact value.
    1019                 :       */
    1020               0 :       state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
    1021               0 :       memcpy(ptr, share->open_file_name.str, open_file_name_len);
    1022               0 :       ptr+= open_file_name_len;
    1023               0 :       if (cmp_translog_addr(share->state.is_of_horizon,
    1024                 :                             checkpoint_start_log_horizon) >= 0)
    1025                 :       {
    1026                 :         /*
    1027                 :           State was flushed recently, it does not hold down the log's
    1028                 :           low-water mark and will not give avoidable work to Recovery. So we
    1029                 :           needn't flush it. Also, it is possible that while we copied the
    1030                 :           state above (under log's lock, without intern_lock) it was being
    1031                 :           modified in memory or flushed to disk (without log's lock, under
    1032                 :           intern_lock, like in maria_extra()), so our copy may be incorrect
    1033                 :           and we should not flush it.
    1034                 :           It may also be a share which got last_version==0 since we checked
    1035                 :           last_version; in this case, it flushed its state and the LSN test
    1036                 :           above will catch it.
    1037                 :         */
    1038                 :       }
    1039                 :       else
    1040                 :       {
    1041                 :         /*
    1042                 :           We could do the state flush only if share->changed, but it's
    1043                 :           tricky.
    1044                 :           Consider a maria_write() which has written REDO,UNDO, and before it
    1045                 :           calls _ma_writeinfo() (setting share->changed=1), checkpoint
    1046                 :           happens and sees share->changed=0, does not flush state. It is
    1047                 :           possible that Recovery does not start from before the REDO and thus
    1048                 :           the state is not recovered. A solution may be to set
    1049                 :           share->changed=1 under log mutex when writing log records.
    1050                 :           But as anyway we have another problem below, this optimization would
    1051                 :           be of little use.
    1052                 :         */
    1053                 :         /** @todo flush state only if changed since last checkpoint */
    1054               0 :         DBUG_ASSERT(share->last_version != 0);
    1055               0 :         state_copy->state.is_of_horizon= share->state.is_of_horizon=
    1056                 :           state_copies_horizon;
    1057               0 :         if (kfile.file >= 0)
    1058               0 :           sync_error|=
    1059                 :             _ma_state_info_write_sub(kfile.file, &state_copy->state,
    1060                 :                                      MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
    1061                 :         /*
    1062                 :           We don't set share->changed=0 because it may interfere with a
    1063                 :           concurrent _ma_writeinfo() doing share->changed=1 (cancel its
    1064                 :           effect). The sad consequence is that we will flush the same state at
    1065                 :           each checkpoint if the table was once written and then not anymore.
    1066                 :         */
    1067                 :       }
    1068                 :     }
    1069                 :     /*
    1070                 :       _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
    1071                 :       otherwise this would deadlock with allocate_and_write_block_record()
    1072                 :       calling _ma_set_share_data_file_length()
    1073                 :     */
    1074               0 :     pthread_mutex_unlock(&share->intern_lock);
    1075                 :     
    1076               0 :     if (!ignore_share)
    1077                 :     {
    1078                 :       /*
    1079                 :         share->bitmap is valid because it's destroyed under close_lock which
    1080                 :         we hold.
    1081                 :       */
    1082               0 :       if (_ma_bitmap_flush_all(share))
    1083                 :       {
    1084               0 :         sync_error= 1;
    1085                 :         /** @todo all write failures should mark table corrupted */
    1086               0 :         ma_message_no_user(0, "checkpoint bitmap page flush failed");
    1087                 :       }
    1088               0 :       DBUG_ASSERT(share->pagecache == maria_pagecache);
    1089                 :     }
    1090                 :     /*
    1091                 :       Clean up any unused states.
    1092                 :       TODO: Only do this call if there has been # (10?) ended transactions
    1093                 :       since last call.
    1094                 :       We had to release intern_lock to respect lock order with LOCK_trn_list.
    1095                 :     */
    1096               0 :     _ma_remove_not_visible_states_with_lock(share, FALSE);
    1097                 : 
    1098               0 :     if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    1099                 :     {
    1100                 :       /*
    1101                 :         maria_close() left it to us to free the share. When it ran it set
    1102                 :         share->id to 0. As it ran before we locked close_lock, we should have
    1103                 :         seen this and so this assertion should be true:
    1104                 :       */
    1105               0 :       DBUG_ASSERT(ignore_share);
    1106               0 :       pthread_mutex_destroy(&share->intern_lock);
    1107               0 :       pthread_mutex_unlock(&share->close_lock);
    1108               0 :       pthread_mutex_destroy(&share->close_lock);
    1109               0 :       my_free((uchar *)share, MYF(0));
    1110                 :     }
    1111                 :     else
    1112                 :     {
    1113                 :       /* share goes back to normal state */
    1114               0 :       share->in_checkpoint= 0;
    1115               0 :       pthread_mutex_unlock(&share->close_lock);
    1116                 :     }
    1117                 : 
    1118                 :     /*
    1119                 :       We do the big disk writes out of intern_lock to not block other
    1120                 :       users of this table (intern_lock is taken at the start and end of
    1121                 :       every statement). This means that file descriptors may be invalid
    1122                 :       (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
    1123                 :       under Windows, or REPAIR). This should not be a problem as we use
    1124                 :       MY_IGNORE_BADFD. Descriptors may even point to other files but then
    1125                 :       the old blocks (of before the close) must have been flushed for sure,
    1126                 :       so our flush will flush new blocks (of after the latest open) and that
    1127                 :       should do no harm.
    1128                 :     */
    1129                 :     /*
    1130                 :       If CHECKPOINT_MEDIUM, this big flush below may result in a
    1131                 :       serious write burst. Realize that all pages dirtied between the
    1132                 :       last checkpoint and the one we are doing now, will be flushed at
    1133                 :       next checkpoint, except those evicted by LRU eviction (depending on
    1134                 :       the size of the page cache compared to the size of the working data
    1135                 :       set, eviction may be rare or frequent).
    1136                 :       We avoid that burst by anticipating: those pages are flushed
    1137                 :       in bunches spanned regularly over the time interval between now and
    1138                 :       the next checkpoint, by a background thread. Thus the next checkpoint
    1139                 :       will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
    1140                 :       only a little slower than CHECKPOINT_INDIRECT).
    1141                 :     */
    1142                 : 
    1143                 :     /*
    1144                 :       PageCacheFlushConcurrencyBugs
    1145                 :       Inside the page cache, calls to flush_pagecache_blocks_int() on the same
    1146                 :       file are serialized. Examples of concurrency bugs which happened when we
    1147                 :       didn't have this serialization:
    1148                 :       - maria_chk_size() (via CHECK TABLE) happens concurrently with
    1149                 :       Checkpoint: Checkpoint is flushing a page: it pins the page and is
    1150                 :       pre-empted, maria_chk_size() wants to flush this page too so gets an
    1151                 :       error because Checkpoint pinned this page. Such error makes
    1152                 :       maria_chk_size() mark the table as corrupted.
    1153                 :       - maria_close() happens concurrently with Checkpoint:
    1154                 :       Checkpoint is flushing a page: it registers a request on the page, is
    1155                 :       pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
    1156                 :       FLUSH_RELEASE will cause a free_block() which assumes the page is in the
    1157                 :       LRU, but it is not (as Checkpoint registered a request). Crash.
    1158                 :       - one thread is evicting a page of the file out of the LRU: it marks it
    1159                 :       PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
    1160                 :       of the same file concurrently (like above). Then one flusher sees the
    1161                 :       page is in switch, removes it from changed_blocks[] and puts it in its
    1162                 :       first_in_switch, so the other flusher will not see the page at all and
    1163                 :       return too early. If it's maria_close() which returns too early, then
    1164                 :       maria_close() may close the file descriptor, and the other flusher, and
    1165                 :       the evicter will fail to write their page: corruption.
    1166                 :     */
    1167                 : 
    1168               0 :     if (!ignore_share)
    1169                 :     {
    1170               0 :       if (filter != NULL)
    1171                 :       {
    1172               0 :         if ((flush_pagecache_blocks_with_filter(maria_pagecache,
    1173                 :                                                 &dfile, FLUSH_KEEP_LAZY,
    1174                 :                                                 filter, &filter_param) &
    1175                 :              PCFLUSH_ERROR))
    1176               0 :           ma_message_no_user(0, "checkpoint data page flush failed");
    1177               0 :         if ((flush_pagecache_blocks_with_filter(maria_pagecache,
    1178                 :                                                 &kfile, FLUSH_KEEP_LAZY,
    1179                 :                                                 filter, &filter_param) &
    1180                 :              PCFLUSH_ERROR))
    1181               0 :           ma_message_no_user(0, "checkpoint index page flush failed");
    1182                 :       }
    1183                 :       /*
    1184                 :         fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
    1185                 :         per second, so if you have touched 1000 files it's 7 seconds).
    1186                 :       */
    1187               0 :       sync_error|=
    1188                 :         my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
    1189                 :         my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
    1190                 :       /*
    1191                 :         in case of error, we continue because writing other tables to disk is
    1192                 :         still useful.
    1193                 :       */
    1194                 :     }
    1195                 :   }
    1196                 : 
    1197               4 :   if (sync_error)
    1198               4 :     goto err;
    1199                 :   /* We may have over-estimated (due to share->id==0 or last_version==0) */
    1200               4 :   DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
    1201               4 :   str->length= (uint)(ptr - str->str);
    1202                 :   /*
    1203                 :     As we support max 65k tables open at a time (2-byte short id), we
    1204                 :     assume uint is enough for the cumulated length of table names; and
    1205                 :     LEX_STRING::length is uint.
    1206                 :   */
    1207               4 :   int4store(str->str, nb_stored);
    1208               4 :   error= unmark_tables= 0;
    1209                 : 
    1210               4 : err:
    1211               4 :   if (unlikely(unmark_tables))
    1212                 :   {
    1213                 :     /* maria_close() uses THR_LOCK_maria from start to end */
    1214               0 :     pthread_mutex_lock(&THR_LOCK_maria);
    1215               0 :     for (i= 0; i < nb; i++)
    1216                 :     {
    1217               0 :       MARIA_SHARE *share= distinct_shares[i];
    1218               0 :       if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    1219                 :       {
    1220                 :         /* maria_close() left us to free the share */
    1221               0 :         pthread_mutex_destroy(&share->intern_lock);
    1222               0 :         my_free((uchar *)share, MYF(0));
    1223                 :       }
    1224                 :       else
    1225                 :       {
    1226                 :         /* share goes back to normal state */
    1227               0 :         share->in_checkpoint= 0;
    1228                 :       }
    1229                 :     }
    1230               0 :     pthread_mutex_unlock(&THR_LOCK_maria);
    1231                 :   }
    1232               4 :   my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
    1233               4 :   my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
    1234               4 :   DBUG_RETURN(error);
    1235                 : }

Generated by: LTP GCOV extension version 1.4