1 : /* Copyright (C) 2006,2007 MySQL AB
2 :
3 : This program is free software; you can redistribute it and/or modify
4 : it under the terms of the GNU General Public License as published by
5 : the Free Software Foundation; version 2 of the License.
6 :
7 : This program is distributed in the hope that it will be useful,
8 : but WITHOUT ANY WARRANTY; without even the implied warranty of
9 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 : GNU General Public License for more details.
11 :
12 : You should have received a copy of the GNU General Public License
13 : along with this program; if not, write to the Free Software
14 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
15 :
16 : /*
17 : WL#3071 Maria checkpoint
18 : First version written by Guilhem Bichot on 2006-04-27.
19 : */
20 :
21 : /* Here is the implementation of this module */
22 :
23 : /** @todo RECOVERY BUG this is unreviewed code */
24 : /*
25 : Summary:
26 : checkpoints are done either by a background thread (checkpoint every Nth
27 : second) or by a client.
28 :   In ha_maria, checkpointing is not made available to clients; it will soon
29 :   be done by a background thread (periodically taking checkpoints and
30 :   flushing dirty pages).
31 : */
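/*
  Illustrative lifecycle sketch (hedged; the exact call sites live in ha_maria
  and are not shown in this file, the interval value is arbitrary):
    ma_checkpoint_init(30);                           // start module + background
                                                      // thread, ~30s between runs
    ...
    ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE);  // explicit checkpoint
    ...
    ma_checkpoint_end();                              // kill thread, free resources
*/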
32 :
33 : #include "maria_def.h"
34 : #include "ma_pagecache.h"
35 : #include "ma_blockrec.h"
36 : #include "ma_checkpoint.h"
37 : #include "ma_loghandler_lsn.h"
38 :
39 :
40 : /** @brief type of checkpoint currently running */
41 : static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
42 : /** @brief protects checkpoint_in_progress */
43 : static pthread_mutex_t LOCK_checkpoint;
44 : /** @brief signals checkpoint completion; also used for killing the background checkpoint thread */
45 : static pthread_cond_t COND_checkpoint;
46 : /** @brief if checkpoint module was inited or not */
47 : static my_bool checkpoint_inited= FALSE;
48 : /** @brief 'kill' flag for the background checkpoint thread */
49 : static int checkpoint_thread_die;
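/*
  checkpoint_thread_die values (as used below): 0 = thread alive,
  1 = ma_checkpoint_end() asked it to die, 2 = thread not yet born or dead.
*/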
50 : /* a ulong, like pagecache->blocks_changed */
51 : static ulong pages_to_flush_before_next_checkpoint;
52 : static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
53 : *dfiles_end; /**< list of data files ends here */
54 : static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
55 : *kfiles_end; /**< list of index files ends here */
56 : /* those two statistics below could serve in SHOW GLOBAL STATUS */
57 : static uint checkpoints_total= 0, /**< all checkpoint requests made */
58 : checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
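/* both counters are updated under LOCK_checkpoint in really_execute_checkpoint() */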
59 :
60 : struct st_filter_param
61 : {
62 :   LSN up_to_lsn; /**< only pages with rec_lsn <= this LSN */
63 :   uint max_pages; /**< stop after flushing this many pages */
64 : }; /**< information to determine which dirty pages should be flushed */
65 :
66 : static enum pagecache_flush_filter_result
67 : filter_flush_file_medium(enum pagecache_page_type type,
68 : pgcache_page_no_t page,
69 : LSN rec_lsn, void *arg);
70 : static enum pagecache_flush_filter_result
71 : filter_flush_file_full(enum pagecache_page_type type,
72 : pgcache_page_no_t page,
73 : LSN rec_lsn, void *arg);
74 : static enum pagecache_flush_filter_result
75 : filter_flush_file_evenly(enum pagecache_page_type type,
76 : pgcache_page_no_t pageno,
77 : LSN rec_lsn, void *arg);
78 : static int really_execute_checkpoint(void);
79 : pthread_handler_t ma_checkpoint_background(void *arg);
80 : static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
81 :
82 : /**
83 : @brief Does a checkpoint
84 :
85 : @param level what level of checkpoint to do
86 : @param no_wait if another checkpoint of same or stronger level
87 : is already running, consider our job done
88 :
89 : @note In ha_maria, there can never be two threads trying a checkpoint at
90 : the same time.
91 :
92 : @return Operation status
93 : @retval 0 ok
94 : @retval !=0 error
95 : */
96 :
97 : int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
98 4 : {
99 4 : int result= 0;
100 4 : DBUG_ENTER("ma_checkpoint_execute");
101 :
102 4 : if (!checkpoint_inited)
103 : {
104 : /*
105 :       If ha_maria failed to start, maria_panic_hton is called and we come here.
106 : */
107 0 : DBUG_RETURN(0);
108 : }
109 4 : DBUG_ASSERT(level > CHECKPOINT_NONE);
110 :
111 : /* look for already running checkpoints */
112 4 : pthread_mutex_lock(&LOCK_checkpoint);
113 8 : while (checkpoint_in_progress != CHECKPOINT_NONE)
114 : {
115 0 : if (no_wait && (checkpoint_in_progress >= level))
116 : {
117 : /*
118 : If we are the checkpoint background thread, we don't wait (it's
119 : smarter to flush pages instead of waiting here while the other thread
120 : finishes its checkpoint).
121 : */
122 0 : pthread_mutex_unlock(&LOCK_checkpoint);
123 0 : goto end;
124 : }
125 0 : pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
126 : }
127 :
128 4 : checkpoint_in_progress= level;
129 4 : pthread_mutex_unlock(&LOCK_checkpoint);
130 : /* from then on, we are sure to be and stay the only checkpointer */
131 :
132 4 : result= really_execute_checkpoint();
133 4 : pthread_cond_broadcast(&COND_checkpoint);
134 4 : end:
135 4 : DBUG_RETURN(result);
136 : }
137 :
138 :
139 : /**
140 : @brief Does a checkpoint, really; expects no other checkpoints
141 : running.
142 :
143 : Checkpoint level requested is read from checkpoint_in_progress.
144 :
145 : @return Operation status
146 : @retval 0 ok
147 : @retval !=0 error
148 : */
149 :
150 : static int really_execute_checkpoint(void)
151 4 : {
152 4 : uint i, error= 0;
153 : /** @brief checkpoint_start_log_horizon will be stored there */
154 : char *ptr;
155 : LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
156 : LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
157 : TRANSLOG_ADDRESS checkpoint_start_log_horizon;
158 : char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
159 4 : DBUG_ENTER("really_execute_checkpoint");
160 4 : DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
161 4 : bzero(&record_pieces, sizeof(record_pieces));
162 :
163 : /*
164 : STEP 1: record current end-of-log position using log's lock. It is
165 : critical for the correctness of Checkpoint (related to memory visibility
166 : rules, the log's lock is a mutex).
167 : "Horizon" is a lower bound of the LSN of the next log record.
168 : */
169 4 : checkpoint_start_log_horizon= translog_get_horizon();
170 4 : DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
171 : LSN_IN_PARTS(checkpoint_start_log_horizon)));
172 4 : lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
173 :
174 : /*
175 : STEP 2: fetch information about transactions.
176 :     We must fetch transactions before dirty pages. Indeed, a transaction
177 :     first sets its rec_lsn, then sets the page's rec_lsn, then resets its own
178 :     rec_lsn to 0. If we fetched pages first, we might see no dirty page yet;
179 :     by the time we fetch transactions, the transaction may have already reset
180 :     its rec_lsn to 0, so we would miss that rec_lsn entirely.
181 : For a similar reason (over-allocated bitmap pages) we have to fetch
182 : transactions before flushing bitmap pages.
183 :
184 : min_trn_rec_lsn will serve to lower the starting point of the REDO phase
185 : (down from checkpoint_start_log_horizon).
186 : */
187 4 : if (unlikely(trnman_collect_transactions(&record_pieces[0],
188 : &record_pieces[1],
189 : &min_trn_rec_lsn,
190 : &min_first_undo_lsn)))
191 4 : goto err;
192 :
193 :
194 : /* STEP 3: fetch information about table files */
195 4 : if (unlikely(collect_tables(&record_pieces[2],
196 : checkpoint_start_log_horizon)))
197 4 : goto err;
198 :
199 :
200 : /* STEP 4: fetch information about dirty pages */
201 : /*
202 : It's better to do it _after_ having flushed some data pages (which
203 : collect_tables() may have done), because those are now non-dirty and so we
204 : have a more up-to-date dirty pages list to put into the checkpoint record,
205 : and thus we will have less work at Recovery.
206 : */
207 : /* Using default pagecache for now */
208 4 : if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
209 : &record_pieces[3],
210 : &min_page_rec_lsn)))
211 4 : goto err;
212 :
213 :
214 : /* LAST STEP: now write the checkpoint log record */
215 : {
216 : LSN lsn;
217 : translog_size_t total_rec_length;
218 : /*
219 : the log handler is allowed to modify "str" and "length" (but not "*str")
220 : of its argument, so we must not pass it record_pieces directly,
221 : otherwise we would later not know what memory pieces to my_free().
222 : */
223 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
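    /*
      Parts of the checkpoint record: part 0 is the checkpoint start horizon,
      parts 1-4 are the malloc-ed pieces collected above (two pieces of
      transaction info, the open-tables piece, the dirty-pages piece).
    */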
224 4 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
225 : (uchar*) checkpoint_start_log_horizon_char;
226 4 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
227 : sizeof(checkpoint_start_log_horizon_char);
228 20 : for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
229 : {
230 16 : log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]=
231 : *(LEX_CUSTRING *)&record_pieces[i];
232 16 : total_rec_length+= (translog_size_t) record_pieces[i].length;
233 : }
234 4 : if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
235 : &dummy_transaction_object, NULL,
236 : total_rec_length,
237 : sizeof(log_array)/sizeof(log_array[0]),
238 : log_array, NULL, NULL) ||
239 : translog_flush(lsn)))
240 4 : goto err;
241 4 : translog_lock();
242 : /*
243 :       This cannot be done as an inwrite_rec_hook of LOGREC_CHECKPOINT, because
244 : such hook would be called before translog_flush (and we must be sure
245 : that log was flushed before we write to the control file).
246 : */
247 4 : if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
248 : max_trid_in_control_file,
249 : recovery_failures)))
250 : {
251 0 : translog_unlock();
252 0 : goto err;
253 : }
254 4 : translog_unlock();
255 : }
256 :
257 : /*
258 : Note that we should not alter memory structures until we have successfully
259 : written the checkpoint record and control file.
260 : */
261 : /* checkpoint succeeded */
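  /*
    The dirty-pages piece (record_pieces[3]) starts with a 4-byte count of
    pages; the background thread divides it by the interval to size its
    per-second flush bunches.
  */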
262 4 : ptr= record_pieces[3].str;
263 4 : pages_to_flush_before_next_checkpoint= uint4korr(ptr);
264 4 : DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
265 : (uint)pages_to_flush_before_next_checkpoint));
266 :
267 : /* compute log's low-water mark */
268 : {
269 4 : TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
270 4 : set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
271 4 : set_if_smaller(log_low_water_mark, min_first_undo_lsn);
272 4 : set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
273 : /**
274 : Now purge unneeded logs.
275 : As some systems have an unreliable fsync (drive lying), we could try to
276 : be robust against that: remember a few previous checkpoints in the
277 : control file, and not purge logs immediately... Think about it.
278 : */
279 4 : if (translog_purge(log_low_water_mark))
280 0 : ma_message_no_user(0, "log purging failed");
281 : }
282 :
283 : goto end;
284 :
285 0 : err:
286 0 : error= 1;
287 0 : ma_message_no_user(0, "checkpoint failed");
288 : /* we were possibly not able to determine what pages to flush */
289 0 : pages_to_flush_before_next_checkpoint= 0;
290 :
291 4 : end:
292 20 : for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
293 16 : my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
294 4 : pthread_mutex_lock(&LOCK_checkpoint);
295 4 : checkpoint_in_progress= CHECKPOINT_NONE;
296 4 : checkpoints_total++;
297 4 : checkpoints_ok_total+= !error;
298 4 : pthread_mutex_unlock(&LOCK_checkpoint);
299 4 : DBUG_RETURN(error);
300 : }
301 :
302 :
303 : /**
304 : @brief Initializes the checkpoint module
305 :
306 : @param interval If one wants the module to create a
307 : thread which will periodically do
308 : checkpoints, and flush dirty pages, in the
309 : background, it should specify a non-zero
310 : interval in seconds. The thread will then be
311 : created and will take checkpoints separated by
312 :                               approximately 'interval' seconds.
313 :
314 : @note A checkpoint is taken only if there has been some significant
315 : activity since the previous checkpoint. Between checkpoint N and N+1 the
316 : thread flushes all dirty pages which were already dirty at the time of
317 : checkpoint N.
318 :
319 : @return Operation status
320 : @retval 0 ok
321 : @retval !=0 error
322 : */
323 :
324 : int ma_checkpoint_init(ulong interval)
325 10 : {
326 : pthread_t th;
327 10 : int res= 0;
328 10 : DBUG_ENTER("ma_checkpoint_init");
329 10 : checkpoint_inited= TRUE;
330 10 : checkpoint_thread_die= 2; /* not yet born == dead */
331 10 : if (pthread_mutex_init(&LOCK_checkpoint, MY_MUTEX_INIT_SLOW) ||
332 : pthread_cond_init(&COND_checkpoint, 0))
333 0 : res= 1;
334 10 : else if (interval > 0)
335 : {
336 : compile_time_assert(sizeof(void *) >= sizeof(ulong));
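    /*
      The interval is passed to the thread by casting it into the void*
      argument (cast back in ma_checkpoint_background()); the assert above
      guarantees the cast loses no bits.
    */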
337 5 : if (!(res= pthread_create(&th, NULL, ma_checkpoint_background,
338 : (void *)interval)))
339 5 : checkpoint_thread_die= 0; /* thread lives, will have to be killed */
340 : }
341 10 : DBUG_RETURN(res);
342 : }
343 :
344 :
345 : #ifndef DBUG_OFF
346 : /**
347 :    Function used to test recovery: flush some table pieces and then the
348 :    caller crashes.
349 :
350 : @param what_to_flush 0: current bitmap and all data pages
351 : 1: state
352 : 2: all bitmap pages
353 : */
354 : static void flush_all_tables(int what_to_flush)
355 0 : {
356 0 : int res= 0;
357 : LIST *pos; /**< to iterate over open tables */
358 0 : pthread_mutex_lock(&THR_LOCK_maria);
359 0 : for (pos= maria_open_list; pos; pos= pos->next)
360 : {
361 0 : MARIA_HA *info= (MARIA_HA*)pos->data;
362 0 : if (info->s->now_transactional)
363 : {
364 0 : switch (what_to_flush)
365 : {
366 : case 0:
367 0 : res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
368 : FLUSH_KEEP, FLUSH_KEEP);
369 0 : break;
370 : case 1:
371 0 : res= _ma_state_info_write(info->s,
372 : MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
373 : MA_STATE_INFO_WRITE_LOCK);
374 0 : DBUG_PRINT("maria_flush_states",
375 : ("is_of_horizon: LSN (%lu,0x%lx)",
376 : LSN_IN_PARTS(info->s->state.is_of_horizon)));
377 0 : break;
378 : case 2:
379 0 : res= _ma_bitmap_flush_all(info->s);
380 : break;
381 : }
382 : }
383 0 : DBUG_ASSERT(res == 0);
384 : }
385 0 : pthread_mutex_unlock(&THR_LOCK_maria);
386 : }
387 : #endif
388 :
389 :
390 : /**
391 : @brief Destroys the checkpoint module
392 : */
393 :
394 : void ma_checkpoint_end(void)
395 121 : {
396 121 : DBUG_ENTER("ma_checkpoint_end");
397 : /*
398 : Some intentional crash methods, usually triggered by
399 : SET MARIA_CHECKPOINT_INTERVAL=X
400 : */
401 121 : DBUG_EXECUTE_IF("maria_flush_bitmap",
402 : {
403 : DBUG_PRINT("maria_flush_bitmap", ("now"));
404 : flush_all_tables(2);
405 : });
406 121 : DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
407 : {
408 : DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
409 : flush_all_tables(0);
410 : });
411 121 : DBUG_EXECUTE_IF("maria_flush_whole_log",
412 : {
413 : DBUG_PRINT("maria_flush_whole_log", ("now"));
414 : translog_flush(translog_get_horizon());
415 : });
416 : /*
417 : Note that for WAL reasons, maria_flush_states requires
418 : maria_flush_whole_log.
419 : */
420 121 : DBUG_EXECUTE_IF("maria_flush_states",
421 : {
422 : DBUG_PRINT("maria_flush_states", ("now"));
423 : flush_all_tables(1);
424 : });
425 121 : DBUG_EXECUTE_IF("maria_crash",
426 : { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
427 :
428 121 : if (checkpoint_inited)
429 : {
430 9 : pthread_mutex_lock(&LOCK_checkpoint);
431 9 : if (checkpoint_thread_die != 2) /* thread was started ok */
432 : {
433 4 : DBUG_PRINT("info",("killing Maria background checkpoint thread"));
434 4 : checkpoint_thread_die= 1; /* kill it */
435 : do /* and wait for it to be dead */
436 : {
437 : /* wake it up if it was in a sleep */
438 4 : pthread_cond_broadcast(&COND_checkpoint);
439 4 : DBUG_PRINT("info",("waiting for Maria background checkpoint thread"
440 : " to die"));
441 4 : pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
442 : }
443 4 : while (checkpoint_thread_die != 2);
444 : }
445 9 : pthread_mutex_unlock(&LOCK_checkpoint);
446 9 : my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
447 9 : my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
448 9 : dfiles= kfiles= NULL;
449 9 : pthread_mutex_destroy(&LOCK_checkpoint);
450 9 : pthread_cond_destroy(&COND_checkpoint);
451 9 : checkpoint_inited= FALSE;
452 : }
453 121 : DBUG_VOID_RETURN;
454 : }
455 :
456 :
457 : /**
458 : @brief dirty-page filtering criteria for MEDIUM checkpoint.
459 :
460 : We flush data/index pages which have been dirty since the previous
461 : checkpoint (this is the two-checkpoint rule: the REDO phase will not have
462 : to start from earlier than the next-to-last checkpoint).
463 : Bitmap pages are handled by _ma_bitmap_flush_all().
464 :
465 : @param type Page's type
466 : @param pageno Page's number
467 : @param rec_lsn Page's rec_lsn
468 : @param arg filter_param
469 : */
470 :
471 : static enum pagecache_flush_filter_result
472 : filter_flush_file_medium(enum pagecache_page_type type,
473 : pgcache_page_no_t pageno __attribute__ ((unused)),
474 : LSN rec_lsn, void *arg)
475 0 : {
476 0 : struct st_filter_param *param= (struct st_filter_param *)arg;
477 0 : return (type == PAGECACHE_LSN_PAGE) &&
478 : (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
479 : }
480 :
481 :
482 : /**
483 : @brief dirty-page filtering criteria for FULL checkpoint.
484 :
485 : We flush all dirty data/index pages.
486 : Bitmap pages are handled by _ma_bitmap_flush_all().
487 :
488 : @param type Page's type
489 : @param pageno Page's number
490 : @param rec_lsn Page's rec_lsn
491 : @param arg filter_param
492 : */
493 :
494 : static enum pagecache_flush_filter_result
495 : filter_flush_file_full(enum pagecache_page_type type,
496 : pgcache_page_no_t pageno __attribute__ ((unused)),
497 : LSN rec_lsn __attribute__ ((unused)),
498 : void *arg __attribute__ ((unused)))
499 0 : {
500 0 : return (type == PAGECACHE_LSN_PAGE);
501 : }
502 :
503 :
504 : /**
505 : @brief dirty-page filtering criteria for background flushing thread.
506 :
507 : We flush data/index pages which have been dirty since the previous
508 : checkpoint (this is the two-checkpoint rule: the REDO phase will not have
509 : to start from earlier than the next-to-last checkpoint), and no
510 : bitmap pages. But we flush no more than a certain number of pages (to have
511 : an even flushing, no write burst).
512 : The reason to not flush bitmap pages is that they may not be in a flushable
513 : state at this moment and we don't want to wait for them.
514 :
515 : @param type Page's type
516 : @param pageno Page's number
517 : @param rec_lsn Page's rec_lsn
518 : @param arg filter_param
519 : */
520 :
521 : static enum pagecache_flush_filter_result
522 : filter_flush_file_evenly(enum pagecache_page_type type,
523 : pgcache_page_no_t pageno __attribute__ ((unused)),
524 : LSN rec_lsn, void *arg)
525 0 : {
526 0 : struct st_filter_param *param= (struct st_filter_param *)arg;
527 0 : if (unlikely(param->max_pages == 0)) /* all flushed already */
528 0 : return FLUSH_FILTER_SKIP_ALL;
529 0 : if ((type == PAGECACHE_LSN_PAGE) &&
530 : (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
531 : {
532 0 : param->max_pages--;
533 0 : return FLUSH_FILTER_OK;
534 : }
535 0 : return FLUSH_FILTER_SKIP_TRY_NEXT;
536 : }
537 :
538 :
539 : /**
540 : @brief Background thread which does checkpoints and flushes periodically.
541 :
542 : Takes a checkpoint. After this, all pages dirty at the time of that
543 : checkpoint are flushed evenly until it is time to take another checkpoint.
544 : This ensures that the REDO phase starts at earliest (in LSN time) at the
545 : next-to-last checkpoint record ("two-checkpoint rule").
546 :
547 :   @note MikaelR questioned why the same thread does two different jobs; the
548 :   risk is that while a checkpoint happens, no LRD flushing happens.
549 : */
550 :
551 : pthread_handler_t ma_checkpoint_background(void *arg)
552 5 : {
553 :   /** @brief At least this many log/page bytes must be written between checkpoints */
554 5 : const uint checkpoint_min_activity= 2*1024*1024;
555 : /*
556 : If the interval could be changed by the user while we are in this thread,
557 : it could be annoying: for example it could cause "case 2" to be executed
558 : right after "case 0", thus having 'dfile' unset. So the thread cares only
559 : about the interval's value when it started.
560 : */
561 5 : const ulong interval= (ulong)arg;
562 : uint sleeps, sleep_time;
563 : TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
564 5 : translog_get_horizon();
565 : ulonglong pagecache_flushes_at_last_checkpoint=
566 5 : maria_pagecache->global_cache_write;
567 : uint pages_bunch_size;
568 : struct st_filter_param filter_param;
569 : PAGECACHE_FILE *dfile; /**< data file currently being flushed */
570 : PAGECACHE_FILE *kfile; /**< index file currently being flushed */
571 5 : LINT_INIT(kfile);
572 5 : LINT_INIT(dfile);
573 5 : LINT_INIT(pages_bunch_size);
574 :
575 5 : my_thread_init();
576 5 : DBUG_PRINT("info",("Maria background checkpoint thread starts"));
577 5 : DBUG_ASSERT(interval > 0);
578 :
579 : /*
580 : Recovery ended with all tables closed and a checkpoint: no need to take
581 : one immediately.
582 : */
583 5 : sleeps= 1;
584 5 : pages_to_flush_before_next_checkpoint= 0;
585 :
586 : for(;;) /* iterations of checkpoints and dirty page flushing */
587 : {
588 : #if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
589 : sleeps=0;
590 : #endif
591 : struct timespec abstime;
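    /*
      Schedule driven by (sleeps % interval): slot 0 takes a checkpoint (if
      there was enough activity), slot 1 sets up background flushing of the
      pages listed by that checkpoint and falls through, and the remaining
      slots each flush one bunch of pages.
    */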
592 8 : switch (sleeps % interval)
593 : {
594 : case 0:
595 : /*
596 : With background flushing evenly distributed over the time
597 : between two checkpoints, we should have only little flushing to do
598 : in the checkpoint.
599 : */
600 : /*
601 : No checkpoint if little work of interest for recovery was done
602 : since last checkpoint. Such work includes log writing (lengthens
603 : recovery, checkpoint would shorten it), page flushing (checkpoint
604 : would decrease the amount of read pages in recovery).
605 : In case of one short statement per minute (very low load), we don't
606 : want to checkpoint every minute, hence the positive
607 : checkpoint_min_activity.
608 : */
609 3 : if (((translog_get_horizon() - log_horizon_at_last_checkpoint) +
610 : (maria_pagecache->global_cache_write -
611 : pagecache_flushes_at_last_checkpoint) *
612 : maria_pagecache->block_size) < checkpoint_min_activity)
613 : {
614 : /* don't take checkpoint, so don't know what to flush */
615 3 : pages_to_flush_before_next_checkpoint= 0;
616 3 : sleep_time= interval;
617 3 : break;
618 : }
619 0 : sleep_time= 1;
620 0 : ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
621 : /*
622 : Snapshot this kind of "state" of the engine. Note that the value below
623 : is possibly greater than last_checkpoint_lsn.
624 : */
625 0 : log_horizon_at_last_checkpoint= translog_get_horizon();
626 0 : pagecache_flushes_at_last_checkpoint=
627 : maria_pagecache->global_cache_write;
628 : /*
629 : If the checkpoint above succeeded it has set d|kfiles and
630 :         d|kfiles_end. If it has failed, it has set
631 : pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
632 : and sleep until the next checkpoint.
633 : */
634 0 : break;
635 : case 1:
636 : /* set up parameters for background page flushing */
637 5 : filter_param.up_to_lsn= last_checkpoint_lsn;
638 5 : pages_bunch_size= pages_to_flush_before_next_checkpoint / interval;
639 5 : dfile= dfiles;
640 5 : kfile= kfiles;
641 : /* fall through */
642 : default:
643 5 : if (pages_bunch_size > 0)
644 : {
645 0 : DBUG_PRINT("checkpoint",
646 : ("Maria background checkpoint thread: %u pages",
647 : pages_bunch_size));
648 : /* flush a bunch of dirty pages */
649 0 : filter_param.max_pages= pages_bunch_size;
650 0 : while (dfile != dfiles_end)
651 : {
652 : /*
653 : We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
654 : smarter to move to the next file than wait for this one to be
655 : completely flushed, which may take long.
656 : StaleFilePointersInFlush: notice how below we use "dfile" which
657 : is an OS file descriptor plus some function and MARIA_SHARE
658 : pointers; this data dates from a previous checkpoint; since then,
659 : the table may have been closed (so MARIA_SHARE* became stale), and
660 : the file descriptor reassigned to another table which does not
661 : have the same CRC-read-set callbacks: it is thus important that
662 : flush_pagecache_blocks_with_filter() does not use the pointers,
663 : only the OS file descriptor.
664 : */
665 : int res=
666 : flush_pagecache_blocks_with_filter(maria_pagecache,
667 : dfile, FLUSH_KEEP_LAZY,
668 : filter_flush_file_evenly,
669 0 : &filter_param);
670 0 : if (unlikely(res & PCFLUSH_ERROR))
671 0 : ma_message_no_user(0, "background data page flush failed");
672 0 : if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
673 0 : break; /* and we will continue with the same file */
674 0 : dfile++; /* otherwise all this file is flushed, move to next file */
675 : /*
676 :             MikaelR observed that Linux's file cache may never
677 : fsync to disk until this cache is full, at which point it decides
678 : to empty the cache, making the machine very slow. A solution was
679 : to fsync after writing 2 MB. So we might want to fsync() here if
680 : we wrote enough pages.
681 : */
682 : }
683 0 : while (kfile != kfiles_end)
684 : {
685 : int res=
686 : flush_pagecache_blocks_with_filter(maria_pagecache,
687 : kfile, FLUSH_KEEP_LAZY,
688 : filter_flush_file_evenly,
689 0 : &filter_param);
690 0 : if (unlikely(res & PCFLUSH_ERROR))
691 0 : ma_message_no_user(0, "background index page flush failed");
692 0 : if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
693 0 : break; /* and we will continue with the same file */
694 0 : kfile++; /* otherwise all this file is flushed, move to next file */
695 : }
696 0 : sleep_time= 1;
697 : }
698 : else
699 : {
700 : /* Can directly sleep until the next checkpoint moment */
701 5 : sleep_time= interval - (sleeps % interval);
702 : }
703 : }
704 8 : pthread_mutex_lock(&LOCK_checkpoint);
705 8 : if (checkpoint_thread_die == 1)
706 8 : break;
707 : #if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
708 : pthread_mutex_unlock(&LOCK_checkpoint);
709 : my_sleep(100000); /* a tenth of a second */
710 : pthread_mutex_lock(&LOCK_checkpoint);
711 : #else
712 : /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
713 8 : DBUG_PRINT("info", ("sleeping %u seconds", sleep_time));
714 8 : set_timespec(abstime, sleep_time);
715 8 : pthread_cond_timedwait(&COND_checkpoint, &LOCK_checkpoint, &abstime);
716 : #endif
717 7 : if (checkpoint_thread_die == 1)
718 3 : break;
719 3 : pthread_mutex_unlock(&LOCK_checkpoint);
720 3 : sleeps+= sleep_time;
721 3 : }
722 4 : pthread_mutex_unlock(&LOCK_checkpoint);
723 4 : DBUG_PRINT("info",("Maria background checkpoint thread ends"));
724 : {
725 4 : CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
726 : /*
727 : That's the final one, which guarantees that a clean shutdown always ends
728 : with a checkpoint.
729 : */
730 4 : DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
731 4 : ma_checkpoint_execute(level, FALSE);
732 : }
733 4 : pthread_mutex_lock(&LOCK_checkpoint);
734 4 : checkpoint_thread_die= 2; /* indicate that we are dead */
735 : /* wake up ma_checkpoint_end() which may be waiting for our death */
736 4 : pthread_cond_broadcast(&COND_checkpoint);
737 :   /* broadcast under the mutex (before unlock) as ma_checkpoint_end() destroys the mutex */
738 4 : pthread_mutex_unlock(&LOCK_checkpoint);
739 4 : my_thread_end();
740 4 : return 0;
741 : }
742 :
743 :
744 : /**
745 : @brief Allocates buffer and stores in it some info about open tables,
746 : does some flushing on those.
747 :
748 : Does the allocation because the caller cannot know the size itself.
749 : Memory freeing is to be done by the caller (if the "str" member of the
750 : LEX_STRING is not NULL).
751 : The caller is taking a checkpoint.
752 :
753 : @param[out] str pointer to where the allocated buffer,
754 : and its size, will be put; buffer will be filled
755 : with info about open tables
756 : @param checkpoint_start_log_horizon Of the in-progress checkpoint
757 : record.
758 :
759 : @return Operation status
760 : @retval 0 OK
761 : @retval 1 Error
762 : */
763 :
764 : static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
765 4 : {
766 4 : MARIA_SHARE **distinct_shares= NULL;
767 : char *ptr;
768 4 : uint error= 1, sync_error= 0, nb, nb_stored, i;
769 4 : my_bool unmark_tables= TRUE;
770 : uint total_names_length;
771 : LIST *pos; /**< to iterate over open tables */
772 : struct st_state_copy {
773 : uint index;
774 : MARIA_STATE_INFO state;
775 : };
776 4 : struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
777 : *state_copies_end, /**< cache ends here */
778 : *state_copy; /**< iterator in cache */
779 : TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
780 : struct st_filter_param filter_param;
781 : PAGECACHE_FLUSH_FILTER filter;
782 4 : DBUG_ENTER("collect_tables");
783 :
784 4 : LINT_INIT(state_copies_horizon);
785 : /* let's make a list of distinct shares */
786 4 : pthread_mutex_lock(&THR_LOCK_maria);
787 4 : for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
788 : {
789 0 : MARIA_HA *info= (MARIA_HA*)pos->data;
790 0 : MARIA_SHARE *share= info->s;
791 : /* the first three variables below can never change */
792 0 : if (share->base.born_transactional && !share->temporary &&
793 : share->mode != O_RDONLY &&
794 : !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
795 : {
796 : /*
797 : Apart from us, only maria_close() reads/sets in_checkpoint but cannot
798 : run now as we hold THR_LOCK_maria.
799 : */
800 : /*
801 : This table is relevant for checkpoint and not already seen. Mark it,
802 : so that it is not seen again in the loop.
803 : */
804 0 : nb++;
805 0 : DBUG_ASSERT(share->in_checkpoint == 0);
806 : /* This flag ensures that we count only _distinct_ shares. */
807 0 : share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
808 : }
809 : }
810 4 : if (unlikely((distinct_shares=
811 : (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
812 : MYF(MY_WME))) == NULL))
813 4 : goto err;
814 4 : for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
815 : {
816 0 : MARIA_HA *info= (MARIA_HA*)pos->data;
817 0 : MARIA_SHARE *share= info->s;
818 0 : if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
819 : {
820 0 : distinct_shares[i++]= share;
821 : /*
822 : With this we prevent the share from going away while we later flush
823 : and force it without holding THR_LOCK_maria. For example if the share
824 : could be my_free()d by maria_close() we would have a problem when we
825 : access it to flush the table. We "pin" the share pointer.
826 : And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
827 : not seen again in the loop.
828 : */
829 0 : share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
830 : /** @todo avoid strlen() */
831 0 : total_names_length+= share->open_file_name.length;
832 : }
833 : }
834 :
835 4 : DBUG_ASSERT(i == nb);
836 4 : pthread_mutex_unlock(&THR_LOCK_maria);
837 4 : DBUG_PRINT("info",("found %u table shares", nb));
838 :
839 4 : str->length=
840 : 4 + /* number of tables */
841 : (2 + /* short id */
842 : LSN_STORE_SIZE + /* first_log_write_at_lsn */
843 : 1 /* end-of-name 0 */
844 : ) * nb + total_names_length;
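  /*
    Layout written below: a 4-byte count of stored tables (filled in at the
    end), then for each table a 2-byte share id, the LSN_STORE_SIZE-byte
    lsn_of_file_id, and the zero-terminated open_file_name.
  */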
845 4 : if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
846 4 : goto err;
847 :
848 4 : ptr= str->str;
849 4 :   ptr+= 4; /* real number of stored tables is not yet known */
850 :
851 : /* only possible checkpointer, so can do the read below without mutex */
852 4 : filter_param.up_to_lsn= last_checkpoint_lsn;
853 4 : switch(checkpoint_in_progress)
854 : {
855 : case CHECKPOINT_MEDIUM:
856 0 : filter= &filter_flush_file_medium;
857 0 : break;
858 : case CHECKPOINT_FULL:
859 4 : filter= &filter_flush_file_full;
860 4 : break;
861 : case CHECKPOINT_INDIRECT:
862 0 : filter= NULL;
863 0 : break;
864 : default:
865 0 : DBUG_ASSERT(0);
866 : goto err;
867 : }
868 :
869 : /*
870 : The principle of reading/writing the state below is explained in
871 : ma_recovery.c, look for "Recovery of the state".
872 : */
873 : #define STATE_COPIES 1024
874 4 : state_copies= (struct st_state_copy *)
875 : my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
876 4 : dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
877 : /* avoid size of 0 for my_realloc */
878 : max(1, nb) * sizeof(PAGECACHE_FILE),
879 : MYF(MY_WME | MY_ALLOW_ZERO_PTR));
880 4 : kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
881 : /* avoid size of 0 for my_realloc */
882 : max(1, nb) * sizeof(PAGECACHE_FILE),
883 : MYF(MY_WME | MY_ALLOW_ZERO_PTR));
884 4 : if (unlikely((state_copies == NULL) ||
885 : (dfiles == NULL) || (kfiles == NULL)))
886 4 : goto err;
887 4 : state_copy= state_copies_end= NULL;
888 4 : dfiles_end= dfiles;
889 4 : kfiles_end= kfiles;
890 :
891 4 : for (nb_stored= 0, i= 0; i < nb; i++)
892 : {
893 0 : MARIA_SHARE *share= distinct_shares[i];
894 : PAGECACHE_FILE kfile, dfile;
895 : my_bool ignore_share;
896 0 : if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
897 : {
898 : /*
899 :         No need for a mutex to read the above; only we can write *this* bit of
900 :         the in_checkpoint bitmap.
901 : */
902 0 : continue;
903 : }
904 : /**
905 : @todo We should not look at tables which didn't change since last
906 : checkpoint.
907 : */
908 0 : DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
909 0 : if (state_copy == state_copies_end) /* we have no more cached states */
910 : {
911 : /*
912 : Collect and cache a bunch of states. We do this for many states at a
913 : time, to not lock/unlock the log's lock too often.
914 : */
915 0 : uint j, bound= min(nb, i + STATE_COPIES);
916 0 : state_copy= state_copies;
917 : /* part of the state is protected by log's lock */
918 0 : translog_lock();
919 0 : state_copies_horizon= translog_get_horizon_no_lock();
920 0 : for (j= i; j < bound; j++)
921 : {
922 0 : MARIA_SHARE *share2= distinct_shares[j];
923 0 : if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
924 0 : continue;
925 0 : state_copy->index= j;
926 0 : state_copy->state= share2->state; /* we copy the state */
927 0 : state_copy++;
928 : /*
929 : data_file_length is not updated under log's lock by the bitmap
930 : code, but writing a wrong data_file_length is ok: a next
931 : maria_close() will correct it; if we crash before, Recovery will
932 : set it to the true physical size.
933 : */
934 : }
935 0 : translog_unlock();
936 : /**
937 : We are going to flush these states.
938 : Before, all records describing how to undo such state must be
939 : in the log (WAL). Usually this means UNDOs. In the special case of
940 : data|key_file_length, recovery just needs to open the table to fix the
941 :       length, so any LOGREC_FILE_ID/REDO/UNDO that lets recovery know it
942 :       must open the table is enough; so as long as
943 :       data|key_file_length is updated after writing any log record it's ok:
944 :       if we copied the new value above, it means the record was written
945 :       before state_copies_horizon and we flush such records below.
946 : Apart from data|key_file_length which are easily recoverable from the
947 : real file's size, all other state members must be updated only when
948 : writing the UNDO; otherwise, if updated before, if their new value is
949 : flushed by a checkpoint and there is a crash before UNDO is written,
950 : their REDO group will be missing or at least incomplete and skipped
951 : by recovery, so bad state value will stay. For example, setting
952 : key_root before writing the UNDO: the table would have old index
953 : pages (they were pinned at time of crash) and a new, thus wrong,
954 : key_root.
955 : @todo RECOVERY BUG check that all code honours that.
956 : */
957 0 : if (translog_flush(state_copies_horizon))
958 0 : goto err;
959 :       /* now we have cached states and they are WAL-safe */
960 0 : state_copies_end= state_copy;
961 0 : state_copy= state_copies;
962 : }
963 :
964 : /* locate our state among these cached ones */
965 0 : for ( ; state_copy->index != i; state_copy++)
966 0 : DBUG_ASSERT(state_copy < state_copies_end);
967 :
968 : /* OS file descriptors are ints which we stored in 4 bytes */
969 : compile_time_assert(sizeof(int) <= 4);
970 : /*
971 : Protect against maria_close() (which does some memory freeing in
972 : MARIA_FILE_BITMAP) with close_lock. intern_lock is not
973 : sufficient as we, as well as maria_close(), are going to unlock
974 : intern_lock in the middle of manipulating the table. Serializing us and
975 : maria_close() should help avoid problems.
976 : */
977 0 : pthread_mutex_lock(&share->close_lock);
978 0 : pthread_mutex_lock(&share->intern_lock);
979 : /*
980 : Tables in a normal state have their two file descriptors open.
981 : In some rare cases like REPAIR, some descriptor may be closed or even
982 :       -1. If that happened, _ma_state_info_write() may fail. This is
983 :       prevented by enclosing all places which close/change kfile.file with
984 : intern_lock.
985 : */
986 0 : kfile= share->kfile;
987 0 : dfile= share->bitmap.file;
988 : /*
989 : Ignore table which has no logged writes (all its future log records will
990 : be found naturally by Recovery). Ignore obsolete shares (_before_
991 : setting themselves to last_version=0 they already did all flush and
992 : sync; if we flush their state now we may be flushing an obsolete state
993 : onto a newer one (assuming the table has been reopened with a different
994 :       share but of course the same physical index file)).
995 : */
996 0 : ignore_share= (share->id == 0) | (share->last_version == 0);
997 0 : DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
998 0 : if (!ignore_share)
999 : {
1000 0 : uint open_file_name_len= share->open_file_name.length + 1;
1001 : /* remember the descriptors for background flush */
1002 0 : *(dfiles_end++)= dfile;
1003 0 : *(kfiles_end++)= kfile;
1004 : /* we will store this table in the record */
1005 0 : nb_stored++;
1006 0 : int2store(ptr, share->id);
1007 0 : ptr+= 2;
1008 0 : lsn_store(ptr, share->lsn_of_file_id);
1009 0 : ptr+= LSN_STORE_SIZE;
1010 : /*
1011 : first_bitmap_with_space is not updated under log's lock, and is
1012 : important. We would need the bitmap's lock to get it right. Recovery
1013 : of this is not clear, so we just play safe: write it out as
1014 : unknown: if crash, _ma_bitmap_init() at next open (for example in
1015 : Recovery) will convert it to 0 and thus the first insertion will
1016 : search for free space from the file's first bitmap (0) -
1017 : under-optimal but safe.
1018 : If no crash, maria_close() will write the exact value.
1019 : */
1020 0 : state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
1021 0 : memcpy(ptr, share->open_file_name.str, open_file_name_len);
1022 0 : ptr+= open_file_name_len;
1023 0 : if (cmp_translog_addr(share->state.is_of_horizon,
1024 : checkpoint_start_log_horizon) >= 0)
1025 : {
1026 : /*
1027 : State was flushed recently, it does not hold down the log's
1028 : low-water mark and will not give avoidable work to Recovery. So we
1029 : needn't flush it. Also, it is possible that while we copied the
1030 : state above (under log's lock, without intern_lock) it was being
1031 : modified in memory or flushed to disk (without log's lock, under
1032 : intern_lock, like in maria_extra()), so our copy may be incorrect
1033 : and we should not flush it.
1034 : It may also be a share which got last_version==0 since we checked
1035 : last_version; in this case, it flushed its state and the LSN test
1036 : above will catch it.
1037 : */
1038 : }
1039 : else
1040 : {
1041 : /*
1042 : We could do the state flush only if share->changed, but it's
1043 : tricky.
1044 : Consider a maria_write() which has written REDO,UNDO, and before it
1045 : calls _ma_writeinfo() (setting share->changed=1), checkpoint
1046 : happens and sees share->changed=0, does not flush state. It is
1047 : possible that Recovery does not start from before the REDO and thus
1048 : the state is not recovered. A solution may be to set
1049 : share->changed=1 under log mutex when writing log records.
1050 : But as anyway we have another problem below, this optimization would
1051 : be of little use.
1052 : */
1053 : /** @todo flush state only if changed since last checkpoint */
1054 0 : DBUG_ASSERT(share->last_version != 0);
1055 0 : state_copy->state.is_of_horizon= share->state.is_of_horizon=
1056 : state_copies_horizon;
1057 0 : if (kfile.file >= 0)
1058 0 : sync_error|=
1059 : _ma_state_info_write_sub(kfile.file, &state_copy->state,
1060 : MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
1061 : /*
1062 : We don't set share->changed=0 because it may interfere with a
1063 : concurrent _ma_writeinfo() doing share->changed=1 (cancel its
1064 : effect). The sad consequence is that we will flush the same state at
1065 : each checkpoint if the table was once written and then not anymore.
1066 : */
1067 : }
1068 : }
1069 : /*
1070 : _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
1071 : otherwise this would deadlock with allocate_and_write_block_record()
1072 : calling _ma_set_share_data_file_length()
1073 : */
1074 0 : pthread_mutex_unlock(&share->intern_lock);
1075 :
1076 0 : if (!ignore_share)
1077 : {
1078 : /*
1079 : share->bitmap is valid because it's destroyed under close_lock which
1080 : we hold.
1081 : */
1082 0 : if (_ma_bitmap_flush_all(share))
1083 : {
1084 0 : sync_error= 1;
1085 : /** @todo all write failures should mark table corrupted */
1086 0 : ma_message_no_user(0, "checkpoint bitmap page flush failed");
1087 : }
1088 0 : DBUG_ASSERT(share->pagecache == maria_pagecache);
1089 : }
1090 : /*
1091 : Clean up any unused states.
1092 : TODO: Only do this call if there has been # (10?) ended transactions
1093 : since last call.
1094 : We had to release intern_lock to respect lock order with LOCK_trn_list.
1095 : */
1096 0 : _ma_remove_not_visible_states_with_lock(share, FALSE);
1097 :
1098 0 : if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
1099 : {
1100 : /*
1101 :         maria_close() left us to free the share. When it ran it set share->id
1102 :         to 0. As it ran before we locked close_lock, we should have seen this
1103 :         and so this assertion should be true:
1104 : */
1105 0 : DBUG_ASSERT(ignore_share);
1106 0 : pthread_mutex_destroy(&share->intern_lock);
1107 0 : pthread_mutex_unlock(&share->close_lock);
1108 0 : pthread_mutex_destroy(&share->close_lock);
1109 0 : my_free((uchar *)share, MYF(0));
1110 : }
1111 : else
1112 : {
1113 : /* share goes back to normal state */
1114 0 : share->in_checkpoint= 0;
1115 0 : pthread_mutex_unlock(&share->close_lock);
1116 : }
1117 :
1118 : /*
1119 : We do the big disk writes out of intern_lock to not block other
1120 : users of this table (intern_lock is taken at the start and end of
1121 : every statement). This means that file descriptors may be invalid
1122 : (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
1123 : under Windows, or REPAIR). This should not be a problem as we use
1124 : MY_IGNORE_BADFD. Descriptors may even point to other files but then
1125 : the old blocks (of before the close) must have been flushed for sure,
1126 : so our flush will flush new blocks (of after the latest open) and that
1127 : should do no harm.
1128 : */
1129 : /*
1130 : If CHECKPOINT_MEDIUM, this big flush below may result in a
1131 : serious write burst. Realize that all pages dirtied between the
1132 : last checkpoint and the one we are doing now, will be flushed at
1133 : next checkpoint, except those evicted by LRU eviction (depending on
1134 : the size of the page cache compared to the size of the working data
1135 : set, eviction may be rare or frequent).
1136 : We avoid that burst by anticipating: those pages are flushed
1137 : in bunches spanned regularly over the time interval between now and
1138 : the next checkpoint, by a background thread. Thus the next checkpoint
1139 : will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
1140 : only a little slower than CHECKPOINT_INDIRECT).
1141 : */
1142 :
1143 : /*
1144 : PageCacheFlushConcurrencyBugs
1145 : Inside the page cache, calls to flush_pagecache_blocks_int() on the same
1146 : file are serialized. Examples of concurrency bugs which happened when we
1147 : didn't have this serialization:
1148 : - maria_chk_size() (via CHECK TABLE) happens concurrently with
1149 : Checkpoint: Checkpoint is flushing a page: it pins the page and is
1150 : pre-empted, maria_chk_size() wants to flush this page too so gets an
1151 : error because Checkpoint pinned this page. Such error makes
1152 : maria_chk_size() mark the table as corrupted.
1153 : - maria_close() happens concurrently with Checkpoint:
1154 : Checkpoint is flushing a page: it registers a request on the page, is
1155 : pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
1156 : FLUSH_RELEASE will cause a free_block() which assumes the page is in the
1157 : LRU, but it is not (as Checkpoint registered a request). Crash.
1158 : - one thread is evicting a page of the file out of the LRU: it marks it
1159 :     it PCBLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
1160 : of the same file concurrently (like above). Then one flusher sees the
1161 : page is in switch, removes it from changed_blocks[] and puts it in its
1162 : first_in_switch, so the other flusher will not see the page at all and
1163 : return too early. If it's maria_close() which returns too early, then
1164 : maria_close() may close the file descriptor, and the other flusher, and
1165 : the evicter will fail to write their page: corruption.
1166 : */
1167 :
1168 0 : if (!ignore_share)
1169 : {
1170 0 : if (filter != NULL)
1171 : {
1172 0 : if ((flush_pagecache_blocks_with_filter(maria_pagecache,
1173 : &dfile, FLUSH_KEEP_LAZY,
1174 : filter, &filter_param) &
1175 : PCFLUSH_ERROR))
1176 0 : ma_message_no_user(0, "checkpoint data page flush failed");
1177 0 : if ((flush_pagecache_blocks_with_filter(maria_pagecache,
1178 : &kfile, FLUSH_KEEP_LAZY,
1179 : filter, &filter_param) &
1180 : PCFLUSH_ERROR))
1181 0 : ma_message_no_user(0, "checkpoint index page flush failed");
1182 : }
1183 : /*
1184 : fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
1185 : per second, so if you have touched 1000 files it's 7 seconds).
1186 : */
1187 0 : sync_error|=
1188 : my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
1189 : my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
1190 : /*
1191 : in case of error, we continue because writing other tables to disk is
1192 : still useful.
1193 : */
1194 : }
1195 : }
1196 :
1197 4 : if (sync_error)
1198 4 : goto err;
1199 :   /* We may have over-estimated (due to share->id==0 or last_version==0) */
1200 4 : DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
1201 4 : str->length= (uint)(ptr - str->str);
1202 : /*
1203 : As we support max 65k tables open at a time (2-byte short id), we
1204 : assume uint is enough for the cumulated length of table names; and
1205 : LEX_STRING::length is uint.
1206 : */
1207 4 : int4store(str->str, nb_stored);
1208 4 : error= unmark_tables= 0;
1209 :
1210 4 : err:
1211 4 : if (unlikely(unmark_tables))
1212 : {
1213 : /* maria_close() uses THR_LOCK_maria from start to end */
1214 0 : pthread_mutex_lock(&THR_LOCK_maria);
1215 0 : for (i= 0; i < nb; i++)
1216 : {
1217 0 : MARIA_SHARE *share= distinct_shares[i];
1218 0 : if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
1219 : {
1220 : /* maria_close() left us to free the share */
1221 0 : pthread_mutex_destroy(&share->intern_lock);
1222 0 : my_free((uchar *)share, MYF(0));
1223 : }
1224 : else
1225 : {
1226 : /* share goes back to normal state */
1227 0 : share->in_checkpoint= 0;
1228 : }
1229 : }
1230 0 : pthread_mutex_unlock(&THR_LOCK_maria);
1231 : }
1232 4 : my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
1233 4 : my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
1234 4 : DBUG_RETURN(error);
1235 : }