1 : /* Copyright (C) 2007-2008 Michael Widenius
2 :
3 : This program is free software; you can redistribute it and/or modify
4 : it under the terms of the GNU General Public License as published by
5 : the Free Software Foundation; version 2 of the License.
6 :
7 : This program is distributed in the hope that it will be useful,
8 : but WITHOUT ANY WARRANTY; without even the implied warranty of
9 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 : GNU General Public License for more details.
11 :
12 : You should have received a copy of the GNU General Public License
13 : along with this program; if not, write to the Free Software
14 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
15 :
16 : /*
17 : Storage of records in block
18 :
19 : Some clarifications about the abbreviations used:
20 :
21 : NULL fields -> Fields that may contain a NULL value.
22 : Not null fields -> Fields that may not contain a NULL value.
23 : Critical fields -> Fields that can't be null and can't be dropped without
24 : causing a table reorganization.
25 :
26 :
27 : Maria will have an LSN at the start of each page (excluding the bitmap pages)
28 :
29 : The different page types that are in a data file are:
30 :
31 : Bitmap pages Map of free pages in the next extent (an 8192-byte page size
32 : gives us 256M of mapped pages per bitmap)
33 : Head page The start of each row is stored on this page.
34 : A rowid always points to a head page
35 : Blob page This page is totally filled with data from one blob or by
36 : a set of long VARCHAR/CHAR fields
37 : Tail page This contains the last part from different rows, blobs
38 : or varchar fields.
39 :
40 : The data file starts with a bitmap page, followed by as many data
41 : pages as the bitmap can cover. After this there is a new bitmap page
42 : and more data pages etc.
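   Because of this layout, bitmap pages occur at fixed intervals. As a
   sketch (assuming 'pages_covered' is the number of pages one bitmap page
   governs, including itself, as computed in ma_bitmap.c):

     my_bool is_bitmap_page= (page % pages_covered) == 0;
     pgcache_page_no_t bitmap_page= page - (page % pages_covered);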
43 :
44 : For information about the bitmap page, see ma_bitmap.c
45 :
46 : Structure of data and tail page:
47 :
48 : The page has a row directory at the end of the page to allow us to do deletes
49 : without having to reorganize the page. It also allows us to later store
50 : some more bytes after each row to allow them to grow without having to move
51 : around other rows.
52 :
53 : Page header:
54 :
55 : LSN 7 bytes Log position for last page change
56 : PAGE_TYPE 1 uchar 1 for head / 2 for tail / 3 for blob
57 : DIR_COUNT 1 uchar Number of row/tail entries on page
58 : FREE_DIR_LINK 1 uchar Pointer to first free directory entry, or 255 if none
59 : EMPTY_SPACE 2 bytes Empty space on page
60 :
61 : The most significant bit in PAGE_TYPE is set to 1 if the data on the page
62 : can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
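   As an illustration, the head/tail page header can be read with the offset
   constants used later in this file (a sketch; PAGE_TYPE_OFFSET and friends
   come from ma_blockrec.h, and DIR_FREE_OFFSET holds END_OF_DIR_FREE_LIST
   when the free list is empty):

     uchar page_type=   buff[PAGE_TYPE_OFFSET] & ~(uchar) PAGE_CAN_BE_COMPACTED;
     uint  dir_entries= (uint) buff[DIR_COUNT_OFFSET];
     uint  free_dir=    (uint) buff[DIR_FREE_OFFSET];
     uint  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);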
63 :
64 : Row data
65 :
66 : Row directory of DIR_COUNT entries, each consisting of the following for
67 : each row (in reverse order; i.e., the first record's entry is stored last):
68 :
69 : Position 2 bytes Position of row on page
70 : Length 2 bytes Length of entry
71 :
72 : For Position and Length, the most significant bit of the position and
73 : the most significant bit of the length could be used for some states of
74 : the row (in other words, we should try to keep these bits reserved).
75 :
76 : Position is 0 if the entry is not used. In this case length[0] points
77 : to a previous free entry (255 if no previous entry) and length[1]
78 : to the next free entry (or 255 if last free entry). This works because
79 : the directory entry 255 can never be marked free (if the first directory
80 : entry is freed, the directory is shrunk).
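   For illustration, the directory entry for row 'rownr' on a page of
   'block_size' bytes sits at the end of the page and decodes as below
   (this is what the dir_entry_pos() helper used in the code computes;
   PAGE_SUFFIX_SIZE and DIR_ENTRY_SIZE come from ma_blockrec.h):

     uchar *dir=    buff + block_size - PAGE_SUFFIX_SIZE -
                    DIR_ENTRY_SIZE * (rownr + 1);
     uint   offset= uint2korr(dir);
     uint   length= uint2korr(dir + 2);

   An offset of 0 means the entry is on the free list described above.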
81 :
82 : checksum 4 bytes Reserved for full page read testing and live backup.
83 :
84 : ----------------
85 :
86 : Structure of blob pages:
87 :
88 : LSN 7 bytes Log position for last page change
89 : PAGE_TYPE 1 uchar 3
90 :
91 : data
92 :
93 : -----------------
94 :
95 : Row data structure:
96 :
97 : Flag 1 uchar Marker of which header fields exist
98 : TRANSID 6 bytes TRANSID of changing transaction
99 : (optional, added on insert and first
100 : update/delete)
101 : VER_PTR 7 bytes Pointer to older version in log
102 : (undo record)
103 : (optional, added after first
104 : update/delete)
105 : DELETE_TRANSID 6 bytes (optional). TRANSID of original row.
106 : Added on delete.
107 : Nulls_extended 1 uchar To allow us to add new DEFAULT NULL
108 : fields (optional, added after first
109 : change of row after alter table)
110 : Number of ROW_EXTENT's 1-3 uchar Length encoded, optional
111 : This is the number of extents the
112 : row is split into
113 : First row_extent 7 uchar Pointer to first row extent (optional)
114 :
115 : Total length of length array 1-3 uchar Only used if we have
116 : char/varchar/blob fields.
117 : Row checksum 1 uchar Only if table created with checksums
118 : Null_bits .. One bit for each NULL field (a field that may
119 : have the value NULL)
120 : Empty_bits .. One bit for each field that may be 'empty'.
121 : (Both for null and not null fields).
122 : This bit is 1 if the value for the field is
123 : 0 or empty string.
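   For example, the code below tests and sets these bits per column with
   expressions like the following (cf. calc_record_size() further down;
   'is_null' is just an illustrative local):

     my_bool is_null= (record[column->null_pos] & column->null_bit) != 0;
     row->empty_bits[column->empty_pos]|= column->empty_bit;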
124 :
125 : field_offsets 2 byte/offset
126 : For every 32nd field, there is one offset
127 : that points to where the field information
128 : starts in the block. This is to provide
129 : fast access to later fields in the row
130 : when we only need to return a small
131 : set of fields.
132 : TODO: Implement this.
133 :
134 : Things marked above as 'optional' will only be present if the
135 : corresponding bit is set in 'Flag' field. Flag gives us a way to
136 : get more space on a page when doing page compaction as we don't need
137 : to store TRANSIDs that committed before the smallest running
138 : transaction we have in memory.
139 :
140 : Data in the following order:
141 : (Field order is precalculated when table is created)
142 :
143 : Critical fixed length, not null, fields. (Note, these can't be dropped)
144 : Fixed length, null fields
145 :
146 : Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
147 : The number of bytes used in the length array per entry depends on the max
148 : length of the field.
149 :
150 : ROW_EXTENT's
151 : CHAR data (space stripped)
152 : VARCHAR data
153 : BLOB data
154 :
155 : Fields marked in null_bits or empty_bits are not stored in data part or
156 : length array.
157 :
158 : If the row doesn't fit into the given block, then the first EXTENT will be
159 : stored last in the row. This is done so that we don't break any field
160 : data in the middle.
161 :
162 : We first try to store the full row into one block. If that's not possible
163 : we move each big blob out into its own extent. If this is not enough we
164 : move a concatenation of all varchars out into their own extent.
165 :
166 : Each blob and the concatenated char/varchar fields are stored the following
167 : way (see the sketch after this list):
168 : - Store the parts in as many full contiguous pages as possible.
169 : - The last part, which doesn't fill a full page, is stored on a tail page.
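   As a sketch, the split of one such part of 'length' bytes is (using the
   FULL_PAGE_SIZE() macro that appears elsewhere in this file):

     ulong full_page_size= FULL_PAGE_SIZE(share->block_size);
     ulong full_pages=     length / full_page_size;
     ulong tail_length=    length % full_page_size;

   full_pages complete pages are written out, and tail_length bytes, if any,
   go to a tail page.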
170 :
171 : When doing an insert of a new row, we don't have to store
172 : VER_PTR in the row. This lets rows that are never changed be stored
173 : efficiently. On update and delete we add TRANSID (if it was an old
174 : committed row) and VER_PTR to
175 : the row. On row page compaction we can easily detect rows where
176 : TRANSID was committed before the longest running transaction
177 : started and we can then delete TRANSID and VER_PTR from the row to
178 : gain more space.
179 :
180 : If a row is deleted in Maria, we change TRANSID to the deleting
181 : transaction's id, change VER_PTR to point to the undo record for the delete,
182 : and add DELETE_TRANSID (the id of the transaction which last
183 : inserted/updated the row before its deletion). DELETE_TRANSID allows an old
184 : transaction to avoid reading the log to know if it can see the last version
185 : before delete (in other words it reduces the probability of having to follow
186 : VER_PTR). TODO: depending on a compilation option, evaluate the performance
187 : impact of not storing DELETE_TRANSID (which would make the row smaller).
188 :
189 : Description of the different parts:
190 :
191 : Flag is coded as:
192 :
193 : Description bit
194 : TRANS_ID_exists 0
195 : VER_PTR_exists 1
196 : Row is deleted 2 (Means that DELETE_TRANSID exists)
197 : Nulls_extended_exists 3
198 : Row is split 7 This means that 'Number_of_row_extents' exists
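   A sketch of testing these bits on the first byte of a row (only
   ROW_FLAG_TRANSID is used by name in this file; the other masks simply
   spell out the bit numbers in the table above):

     uchar   flag=           row_start[0];
     my_bool has_transid=    (flag & ROW_FLAG_TRANSID) != 0;
     my_bool has_ver_ptr=    (flag & (1 << 1)) != 0;
     my_bool is_deleted=     (flag & (1 << 2)) != 0;
     my_bool nulls_extended= (flag & (1 << 3)) != 0;
     my_bool is_split=       (flag & (1 << 7)) != 0;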
199 :
200 : Nulls_extended is the number of new DEFAULT NULL fields in the row
201 : compared to the number of DEFAULT NULL fields when the first version
202 : of the table was created. If Nulls_extended doesn't exist in the row,
203 : we know it's 0 as this must be one of the original rows from when the
204 : table was first created. This coding allows us to add 255*8 =
205 : 2040 new fields without requiring a full ALTER TABLE.
206 :
207 : Empty_bits is used to allow us to store 0, 0.0, empty string, empty
208 : varstring and empty blob efficiently. (This is very good for data
209 : warehousing where NULL's are often regarded as evil). Having this
210 : bitmap also allows us to drop the information for a field during a future
211 : delete if the field was deleted with ALTER TABLE DROP COLUMN. To be able
212 : to handle DROP COLUMN, we must store in the index header the fields
213 : that have been dropped. When unpacking a row we will ignore dropped
214 : fields. When storing a row, we will mark a dropped field either with a
215 : null in the null bit map or in the empty_bits and not store any data
216 : for it.
217 : TODO: Add code for handling dropped fields.
218 :
219 :
220 : A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as:
221 :
222 : START_PAGE 5 bytes
223 : PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page.
224 : Bit 15 is set if this is the start of a new
225 : blob extent.
226 :
227 : With 8K pages, we can cover 256M in one extent. This coding gives us a
228 : maximum file size of 2^40 * 8192 = 8192 terabytes.
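   A sketch of decoding one such 7-byte extent (ROW_EXTENT_SIZE), reading
   "bit 16" as the 0x8000 mask and "bit 15" as 0x4000 in the 2-byte count
   word; for a tail extent the low bits hold the row number on the tail
   page (compare the 'rowid 0' line in the example below):

     pgcache_page_no_t start_page= uint5korr(extent);
     uint              count_word= uint2korr(extent + 5);
     my_bool           is_tail=    (count_word & 0x8000) != 0;
     my_bool           blob_start= (count_word & 0x4000) != 0;
     uint              page_count= count_word & 0x3fff;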
229 :
230 : As an example of ROW_EXTENT handling, assume a row with one integer
231 : field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
232 : big BLOB fields that we have updated.
233 :
234 : The record format for storing this into an empty file would be:
235 :
236 : Page 1:
237 :
238 : 00 00 00 00 00 00 00 LSN
239 : 01 Only one row in page
240 : FF No free dir entry
241 : xx xx Empty space on page
242 :
243 : 10 Flag: row split, VER_PTR exists
244 : 01 00 00 00 00 00 TRANSID 1
245 : 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1
246 : 5 Number of row extents
247 : 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4
248 : 0 No null fields
249 : 0 No empty fields
250 : 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0
251 : 06 00 00 00 00 80 00 First blob, stored at page 6-133
252 : 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5
253 : 86 00 00 00 00 80 00 Second blob, stored at page 134-262
254 : 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5
255 : 05 00 5 integer
256 : FA Length of first varchar field (size 250)
257 : 00 60 Length of second varchar field (size 8192*3)
258 : 00 60 10 First medium BLOB, 1M
259 : 01 00 10 00 Second BLOB, 1M
260 : xx xx xx xx xx xx Varchars are stored here until end of page
261 :
262 : ..... until end of page
263 :
264 : 09 00 F4 1F Start position 9, length 8180
265 : xx xx xx xx Checksum
266 :
267 : A data page is allowed to have a wrong CRC and header as long as it is
268 : marked empty in the bitmap and its directory's count is 0.
269 : */
270 :
271 : #include "maria_def.h"
272 : #include "ma_blockrec.h"
273 : #include "trnman.h"
274 : #include "ma_key_recover.h"
275 : #include "ma_recovery_util.h"
276 : #include <lf.h>
277 :
278 : /*
279 : Struct for having a cursor over a set of extents.
280 : This is used to loop over all extents for a row when reading
281 : the row data. It's also used to store the tail positions for
282 : a read row to be used by a later update/delete command.
283 : */
284 :
285 : typedef struct st_maria_extent_cursor
286 : {
287 : /*
288 : Pointer to packed uchar array of extents for the row.
289 : Format is described above in the header
290 : */
291 : uchar *extent;
292 : /* Where data starts on page; Only for debugging */
293 : uchar *data_start;
294 : /* Position to all tails in the row. Updated when reading a row */
295 : MARIA_RECORD_POS *tail_positions;
296 : /* Current page */
297 : pgcache_page_no_t page;
298 : /* How many pages in the page region */
299 : uint page_count;
300 : /* What kind of lock to use for tail pages */
301 : enum pagecache_page_lock lock_for_tail_pages;
302 : /* Total number of extents (i.e., entries in the 'extent' slot) */
303 : uint extent_count;
304 : /* <> 0 if current extent is a tail page; Set while using cursor */
305 : uint tail;
306 : /* Position for tail on tail page */
307 : uint tail_row_nr;
308 : /*
309 : == 1 if we are working on the first extent (i.e., the one that is stored in
310 : the row header, not an extent that is stored as part of the row data).
311 : */
312 : my_bool first_extent;
313 : } MARIA_EXTENT_CURSOR;
314 :
315 :
316 : /**
317 : @brief Structure for passing down info to write_hook_for_clr_end().
318 : This hook needs to know the variation of the live checksum caused by the
319 : current operation to update state.checksum under log's mutex,
320 : needs to know the transaction's previous undo_lsn to set
321 : trn->undo_lsn under log mutex, and needs to know the type of UNDO being
322 : undone now to modify state.records under log mutex.
323 : */
324 :
325 : /** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */
326 : #define store_checksum_in_rec(S,D,E,P,L) do \
327 : { \
328 : D= 0; \
329 : if ((S)->calc_checksum != NULL) \
330 : { \
331 : D= (E); \
332 : ha_checksum_store(P, D); \
333 : L+= HA_CHECKSUM_STORE_SIZE; \
334 : } \
335 : } while (0)
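/*
  A sketch of how the macro above is meant to be used when building a log
  record (hypothetical local names; the expression passed as E is only
  evaluated when the table has a live checksum):

    ha_checksum checksum_delta;
    store_checksum_in_rec(share, checksum_delta,
                          (*share->calc_checksum)(info, record),
                          log_pos, log_length);

  Afterwards checksum_delta holds the row checksum (or 0), the checksum has
  been stored at log_pos, and log_length has grown by HA_CHECKSUM_STORE_SIZE.
*/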
336 :
337 : static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
338 : static my_bool delete_head_or_tail(MARIA_HA *info,
339 : pgcache_page_no_t page, uint record_number,
340 : my_bool head, my_bool from_update);
341 : #ifndef DBUG_OFF
342 : static void _ma_print_directory(uchar *buff, uint block_size);
343 : #endif
344 : static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
345 : uint block_size, ulong length,
346 : uint *tot_ranges);
347 : static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
348 : LEX_CUSTRING *log_parts,
349 : uint *log_parts_count);
350 : static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
351 : const uchar *newrec,
352 : LEX_CUSTRING *log_parts,
353 : uint *log_parts_count);
354 :
355 : /****************************************************************************
356 : Initialization
357 : ****************************************************************************/
358 :
359 : /*
360 : Initialize data needed for block structures
361 : */
362 :
363 :
364 : /* Size of the different header elements for a row */
365 :
366 : static uchar header_sizes[]=
367 : {
368 : TRANSID_SIZE,
369 : VERPTR_SIZE,
370 : TRANSID_SIZE, /* Delete transid */
371 : 1 /* Null extends */
372 : };
373 :
374 : /*
375 : Calculate array of all used headers
376 :
377 : Used to speed up:
378 :
379 : size= 1;
380 : if (flag & 1)
381 : size+= TRANSID_SIZE;
382 : if (flag & 2)
383 : size+= VERPTR_SIZE;
384 : if (flag & 4)
385 : size+= TRANSID_SIZE
386 : if (flag & 8)
387 : size+= 1;
388 :
389 : NOTES
390 : This is called only once at startup of Maria
391 : */
392 :
393 : static uchar total_header_size[1 << array_elements(header_sizes)];
394 : #define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
395 :
396 : void _ma_init_block_record_data(void)
397 2553 : {
398 : uint i;
399 2553 : bzero(total_header_size, sizeof(total_header_size));
400 2553 : total_header_size[0]= FLAG_SIZE; /* Flag uchar */
401 40848 : for (i= 1; i < array_elements(total_header_size); i++)
402 : {
403 38295 : uint size= FLAG_SIZE, j, bit;
404 163392 : for (j= 0; (bit= (1 << j)) <= i; j++)
405 : {
406 125097 : if (i & bit)
407 81696 : size+= header_sizes[j];
408 : }
409 38295 : total_header_size[i]= size;
410 : }
411 : }
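/*
  Example use of the precalculated table (a sketch): given the flag byte of
  a row, the combined size of the flag byte and the optional
  TRANSID/VER_PTR/DELETE_TRANSID/nulls_extended parts is a single lookup
  instead of the bit tests shown above:

    uint row_header_size= total_header_size[flag & PRECALC_HEADER_BITMASK];
*/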
412 :
413 :
414 : my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
415 2781 : {
416 :
417 2781 : share->base.max_data_file_length=
418 : (((ulonglong) 1 << ((share->base.rec_reflength-1)*8))-1) *
419 : share->block_size;
420 : #if SIZEOF_OFF_T == 4
421 : set_if_smaller(share->base.max_data_file_length, INT_MAX32);
422 : #endif
423 2781 : return _ma_bitmap_init(share, data_file);
424 : }
425 :
426 :
427 : my_bool _ma_once_end_block_record(MARIA_SHARE *share)
428 2682 : {
429 2682 : int res= _ma_bitmap_end(share);
430 2682 : if (share->bitmap.file.file >= 0)
431 : {
432 2667 : if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
433 : share->temporary ? FLUSH_IGNORE_CHANGED :
434 : FLUSH_RELEASE))
435 0 : res= 1;
436 : /*
437 : File must be synced as it is going out of the maria_open_list and so
438 : becoming unknown to Checkpoint.
439 : */
440 2667 : if (share->now_transactional &&
441 : my_sync(share->bitmap.file.file, MYF(MY_WME)))
442 0 : res= 1;
443 2667 : if (my_close(share->bitmap.file.file, MYF(MY_WME)))
444 0 : res= 1;
445 : /*
446 : Trivial assignment to guard against multiple invocations
447 : (May happen if files are closed but we want to keep the maria object
448 : around a bit longer)
449 : */
450 2667 : share->bitmap.file.file= -1;
451 : }
452 2682 : if (share->id != 0)
453 : {
454 : /*
455 : We de-assign the id even though the index has not been flushed; this is ok
456 : as close_lock serializes us with a Checkpoint looking at our share.
457 : */
458 600 : translog_deassign_id_from_share(share);
459 : }
460 2682 : return res;
461 : }
462 :
463 :
464 : /* Init info->cur_row structure */
465 :
466 : my_bool _ma_init_block_record(MARIA_HA *info)
467 2781 : {
468 2781 : MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
469 2781 : MARIA_SHARE *share= info->s;
470 : uint default_extents;
471 2781 : DBUG_ENTER("_ma_init_block_record");
472 :
473 2781 : if (!my_multi_malloc(MY_WME,
474 : &row->empty_bits, share->base.pack_bytes,
475 : &row->field_lengths,
476 : share->base.max_field_lengths + 2,
477 : &row->blob_lengths, sizeof(ulong) * share->base.blobs,
478 : &row->null_field_lengths, (sizeof(uint) *
479 : (share->base.fields -
480 : share->base.blobs +
481 : EXTRA_LENGTH_FIELDS)),
482 : &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
483 : (share->base.blobs + 2)),
484 : &new_row->empty_bits, share->base.pack_bytes,
485 : &new_row->field_lengths,
486 : share->base.max_field_lengths + 2,
487 : &new_row->blob_lengths,
488 : sizeof(ulong) * share->base.blobs,
489 : &new_row->null_field_lengths, (sizeof(uint) *
490 : (share->base.fields -
491 : share->base.blobs +
492 : EXTRA_LENGTH_FIELDS)),
493 : &info->log_row_parts,
494 : sizeof(*info->log_row_parts) *
495 : (TRANSLOG_INTERNAL_PARTS + 3 +
496 : share->base.fields + 3),
497 : &info->update_field_data,
498 : (share->base.fields * 4 +
499 : share->base.max_field_lengths + 1 + 4),
500 : NullS, 0))
501 0 : DBUG_RETURN(1);
502 : /* Skip over bytes used to store length of field length for logging */
503 2781 : row->field_lengths+= 2;
504 2781 : new_row->field_lengths+= 2;
505 :
506 : /* Reserve some initial space to avoid mallocs during execution */
507 2781 : default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
508 : (AVERAGE_BLOB_SIZE /
509 : FULL_PAGE_SIZE(share->block_size) /
510 : BLOB_SEGMENT_MIN_SIZE));
511 :
512 2781 : if (my_init_dynamic_array(&info->bitmap_blocks,
513 : sizeof(MARIA_BITMAP_BLOCK), default_extents,
514 : 64))
515 2781 : goto err;
516 2781 : info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
517 2781 : if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
518 : MYF(MY_WME))))
519 2781 : goto err;
520 :
521 2781 : info->row_base_length= share->base_length;
522 2781 : info->row_flag= share->base.default_row_flag;
523 :
524 : /*
525 : We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
526 : null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
527 : */
528 2781 : row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
529 2781 : new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
530 :
531 2781 : DBUG_RETURN(0);
532 :
533 0 : err:
534 0 : _ma_end_block_record(info);
535 0 : DBUG_RETURN(1);
536 : }
537 :
538 :
539 : void _ma_end_block_record(MARIA_HA *info)
540 2682 : {
541 2682 : DBUG_ENTER("_ma_end_block_record");
542 2682 : my_free(info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR));
543 2682 : delete_dynamic(&info->bitmap_blocks);
544 2682 : my_free(info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR));
545 2682 : my_free(info->blob_buff, MYF(MY_ALLOW_ZERO_PTR));
546 : /*
547 : The data file is closed, when needed, in ma_once_end_block_record().
548 : The following protects us from doing an extra, not allowed, close
549 : in maria_close()
550 : */
551 2682 : info->dfile.file= -1;
552 2682 : DBUG_VOID_RETURN;
553 : }
554 :
555 :
556 : /****************************************************************************
557 : Helper functions
558 : ****************************************************************************/
559 :
560 : /*
561 : Return the next unused position on the page after a directory entry.
562 :
563 : SYNOPSIS
564 : start_of_next_entry()
565 : dir Directory entry to be used. This cannot be
566 : the last entry on the page!
567 :
568 : RETURN
569 : # Position in page where next entry starts.
570 : Everything between the '*dir' and this are free to be used.
571 : */
572 :
573 : static inline uint start_of_next_entry(uchar *dir)
574 60124 : {
575 : uchar *prev;
576 : /*
577 : Find previous used entry. (There is always a previous entry as
578 : the directory never starts with a deleted entry)
579 : */
580 60124 : for (prev= dir - DIR_ENTRY_SIZE ;
581 129322 : prev[0] == 0 && prev[1] == 0 ;
582 9074 : prev-= DIR_ENTRY_SIZE)
583 : {}
584 60124 : return (uint) uint2korr(prev);
585 : }
586 :
587 :
588 : /*
589 : Return the offset where the previous entry ends (earlier on the page)
590 :
591 : SYNOPSIS
592 : end_of_previous_entry()
593 : dir Address for current directory entry
594 : end Address to last directory entry
595 :
596 : RETURN
597 : # Position where previous entry ends (smallest address on page)
598 : Everything between # and current entry are free to be used.
599 : */
600 :
601 :
602 : static inline uint end_of_previous_entry(uchar *dir, uchar *end)
603 38881 : {
604 : uchar *pos;
605 1016872 : for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
606 : {
607 : uint offset;
608 992229 : if ((offset= uint2korr(pos)))
609 14238 : return offset + uint2korr(pos+2);
610 : }
611 24643 : return PAGE_HEADER_SIZE;
612 : }
613 :
614 :
615 : #ifndef DBUG_OFF
616 :
617 : static void _ma_print_directory(uchar *buff, uint block_size)
618 0 : {
619 0 : uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
620 0 : uint end_of_prev_row= PAGE_HEADER_SIZE;
621 : uchar *dir, *end;
622 :
623 0 : dir= dir_entry_pos(buff, block_size, max_entry-1);
624 0 : end= dir_entry_pos(buff, block_size, 0);
625 :
626 0 : DBUG_LOCK_FILE;
627 0 : fprintf(DBUG_FILE,"Directory dump (pos:length):\n");
628 :
629 0 : for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
630 : {
631 0 : uint offset= uint2korr(end);
632 0 : uint length= uint2korr(end+2);
633 0 : fprintf(DBUG_FILE, " %4u:%4u", offset, offset ? length : 0);
634 0 : if (!(row % (80/12)))
635 0 : fputc('\n', DBUG_FILE);
636 0 : if (offset)
637 : {
638 0 : DBUG_ASSERT(offset >= end_of_prev_row);
639 0 : end_of_prev_row= offset + length;
640 : }
641 : }
642 0 : fputc('\n', DBUG_FILE);
643 0 : fflush(DBUG_FILE);
644 0 : DBUG_UNLOCK_FILE;
645 : }
646 :
647 :
648 : static void check_directory(uchar *buff, uint block_size, uint min_row_length)
649 1021085 : {
650 : uchar *dir, *end;
651 1021085 : uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
652 : uint start_of_dir, deleted;
653 : uchar free_entry, prev_free_entry;
654 1021085 : uint end_of_prev_row= PAGE_HEADER_SIZE;
655 :
656 1021085 : dir= dir_entry_pos(buff, block_size, max_entry-1);
657 1021085 : start_of_dir= (uint) (dir - buff);
658 1021085 : end= dir_entry_pos(buff, block_size, 0);
659 1021085 : deleted= 0;
660 :
661 : /* Ensure that all rows are in increasing order and no overlaps */
662 61919570 : for (; dir <= end ; end-= DIR_ENTRY_SIZE)
663 : {
664 60898485 : uint offset= uint2korr(end);
665 60898485 : uint length= uint2korr(end+2);
666 60898485 : if (offset)
667 : {
668 48289326 : DBUG_ASSERT(offset >= end_of_prev_row);
669 48289326 : DBUG_ASSERT(!length || length >= min_row_length);
670 48289326 : end_of_prev_row= offset + length;
671 : }
672 : else
673 12609159 : deleted++;
674 : }
675 1021085 : DBUG_ASSERT(end_of_prev_row <= start_of_dir);
676 :
677 : /* check free links */
678 1021085 : free_entry= buff[DIR_FREE_OFFSET];
679 1021085 : prev_free_entry= END_OF_DIR_FREE_LIST;
680 14651329 : while (free_entry != END_OF_DIR_FREE_LIST)
681 : {
682 12609159 : uchar *dir= dir_entry_pos(buff, block_size, free_entry);
683 12609159 : DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);
684 12609159 : DBUG_ASSERT(dir[2] == prev_free_entry);
685 12609159 : prev_free_entry= free_entry;
686 12609159 : free_entry= dir[3];
687 12609159 : deleted--;
688 : }
689 1021085 : DBUG_ASSERT(deleted == 0);
690 : }
691 : #else
692 : #define check_directory(A,B,C)
693 : #endif /* DBUG_OFF */
694 :
695 :
696 : /**
697 : @brief Calculate if there are enough free entries on the page
698 : */
699 :
700 : my_bool enough_free_entries(uchar *buff, uint block_size, uint wanted_entries)
701 6261 : {
702 6261 : uint entries= (uint) buff[DIR_COUNT_OFFSET];
703 : uint needed_free_entries, free_entry;
704 :
705 6261 : if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
706 6261 : return 1;
707 :
708 : /* Check if enough free entries in free list */
709 0 : needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
710 :
711 0 : free_entry= (uint) buff[DIR_FREE_OFFSET];
712 0 : while (free_entry != END_OF_DIR_FREE_LIST)
713 : {
714 : uchar *dir;
715 0 : if (!--needed_free_entries)
716 0 : return 1;
717 0 : dir= dir_entry_pos(buff, block_size, free_entry);
718 0 : free_entry= dir[3];
719 : }
720 0 : return 0; /* Not enough entries */
721 : }
722 :
723 :
724 : /**
725 : @brief Extend a record area to fit a given size block
726 :
727 : @fn extend_area_on_page()
728 : @param info Handler if head page and 0 if tail page
729 : @param buff Page buffer
730 : @param dir Pointer to dir entry in buffer
731 : @param rownr Row number we are working on
732 : @param block_size Block size of buffer
733 : @param request_length How much data we want to put at [dir]
734 : @param empty_space Total empty space in buffer
735 : This is updated with length after dir
736 : is allocated and current block freed
737 :
738 : @implementation
739 : The logic is as follows (same as in _ma_update_block_record())
740 : - If new data fits in old block, use old block.
741 : - Extend block with empty space before block. If enough, use it.
742 : - Extend block with empty space after block. If enough, use it.
743 : - Use _ma_compact_block_page() to get all empty space at dir.
744 :
745 : @note
746 : The given directory entry is set to rec length.
747 : empty_space doesn't include the new directory entry
748 :
749 :
750 : @return
751 : @retval 0 ok
752 : @retval ret_offset Pointer to store offset to found area
753 : @retval ret_length Pointer to store length of found area
754 : @retval [dir] rec_offset is stored here too
755 :
756 : @retval 1 error (wrong info in block)
757 : */
758 :
759 : static my_bool extend_area_on_page(MARIA_HA *info,
760 : uchar *buff, uchar *dir,
761 : uint rownr, uint block_size,
762 : uint request_length,
763 : uint *empty_space, uint *ret_offset,
764 : uint *ret_length)
765 89344 : {
766 : uint rec_offset, length;
767 89344 : uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
768 89344 : DBUG_ENTER("extend_area_on_page");
769 :
770 89344 : rec_offset= uint2korr(dir);
771 89344 : if (rec_offset)
772 : {
773 : /* Extending old row; Mark current space as 'free' */
774 61800 : length= uint2korr(dir + 2);
775 61800 : DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u "
776 : "empty_space: %u",
777 : rec_offset, length, request_length, *empty_space));
778 :
779 61800 : *empty_space+= length;
780 : }
781 : else
782 : {
783 : /* Reusing free directory entry; Free it from the directory list */
784 27544 : if (dir[2] == END_OF_DIR_FREE_LIST)
785 5992 : buff[DIR_FREE_OFFSET]= dir[3];
786 : else
787 : {
788 21552 : uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
789 21552 : DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
790 21552 : prev_dir[3]= dir[3];
791 : }
792 27544 : if (dir[3] != END_OF_DIR_FREE_LIST)
793 : {
794 25796 : uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
795 25796 : DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
796 25796 : next_dir[2]= dir[2];
797 : }
798 27544 : rec_offset= start_of_next_entry(dir);
799 27544 : length= 0;
800 : }
801 89344 : if (length < request_length)
802 : {
803 : uint old_rec_offset;
804 : /*
805 : New data did not fit in old position.
806 : Find first possible position where to put new data.
807 : */
808 38821 : old_rec_offset= rec_offset;
809 38821 : rec_offset= end_of_previous_entry(dir, buff + block_size -
810 : PAGE_SUFFIX_SIZE);
811 38821 : length+= (uint) (old_rec_offset - rec_offset);
812 38821 : DBUG_ASSERT(old_rec_offset);
813 : /*
814 : 'length' is 0 if we are doing an insert into a not allocated block.
815 : This can only happen during "REDO of INSERT" or "UNDO of DELETE."
816 : */
817 38821 : if (length < request_length)
818 : {
819 : /*
820 : Did not fit in current block + empty space. Extend with
821 : empty space after block.
822 : */
823 32686 : if (rownr == max_entry - 1)
824 : {
825 : /* Last entry; Everything is free between this and directory */
826 166 : length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
827 : rec_offset);
828 : }
829 : else
830 32520 : length= start_of_next_entry(dir) - rec_offset;
831 32686 : DBUG_ASSERT((int) length >= 0);
832 32686 : if (length < request_length)
833 : {
834 : /* Not enough continuous space, compact page to get more */
835 32156 : int2store(dir, rec_offset);
836 : /* Reset length, as this may be a deleted block */
837 32156 : int2store(dir+2, 0);
838 32156 : _ma_compact_block_page(buff, block_size, rownr, 1,
839 : info ? info->trn->min_read_from: 0,
840 : info ? info->s->base.min_block_length : 0);
841 32156 : rec_offset= uint2korr(dir);
842 32156 : length= uint2korr(dir+2);
843 32156 : if (length < request_length)
844 : {
845 0 : DBUG_PRINT("error", ("Not enough space: "
846 : "length: %u request_length: %u",
847 : length, request_length));
848 0 : my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
849 0 : DBUG_RETURN(1); /* Error in block */
850 : }
851 32156 : *empty_space= length; /* All space is here */
852 : }
853 : }
854 : }
855 89344 : int2store(dir, rec_offset);
856 89344 : int2store(dir + 2, length);
857 89344 : *ret_offset= rec_offset;
858 89344 : *ret_length= length;
859 89344 : check_directory(buff, block_size, info ? info->s->base.min_block_length : 0);
860 89344 : DBUG_RETURN(0);
861 : }
862 :
863 :
864 : /**
865 : @brief Copy not changed fields from 'from' to 'to'
866 :
867 : @notes
868 : Assumption is that most fields are not changed!
869 : (Which is why we don't test if all bits are set for some bytes in bitmap)
870 : */
871 :
872 : void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
873 : uchar *to, uchar *from)
874 1671 : {
875 : MARIA_COLUMNDEF *column, *end_column;
876 1671 : uchar *bitmap= (uchar*) changed_fields->bitmap;
877 1671 : MARIA_SHARE *share= info->s;
878 1671 : uint bit= 1;
879 :
880 1671 : for (column= share->columndef, end_column= column+ share->base.fields;
881 11064 : column < end_column; column++)
882 : {
883 9393 : if (!(*bitmap & bit))
884 : {
885 857 : uint field_length= column->length;
886 857 : if (column->type == FIELD_VARCHAR)
887 : {
888 0 : if (column->fill_length == 1)
889 0 : field_length= (uint) from[column->offset] + 1;
890 : else
891 0 : field_length= uint2korr(from + column->offset) + 2;
892 : }
893 857 : memcpy(to + column->offset, from + column->offset, field_length);
894 : }
895 9393 : if ((bit= (bit << 1)) == 256)
896 : {
897 0 : bitmap++;
898 0 : bit= 1;
899 : }
900 : }
901 : }
902 :
903 : #ifdef NOT_YET_NEEDED
904 : /* Calculate empty space on a page */
905 :
906 : static uint empty_space_on_page(uchar *buff, uint block_size)
907 : {
908 : enum en_page_type page_type;
909 : page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
910 : ~(uchar) PAGE_CAN_BE_COMPACTED);
911 : if (page_type == UNALLOCATED_PAGE)
912 : return block_size;
913 : if ((uint) page_type <= TAIL_PAGE)
914 : return uint2korr(buff+EMPTY_SPACE_OFFSET);
915 : return 0; /* Blob page */
916 : }
917 : #endif
918 :
919 :
920 : /*
921 : @brief Ensure we have space for new directory entries
922 :
923 : @fn make_space_for_directory()
924 : @param buff Page buffer
925 : @param block_size Block size for pages
926 : @param max_entry Number of current entries in directory
927 : @param count Number of new entries to be added to directory
928 : @param first_dir First directory entry on page
929 : @param empty_space Total empty space in buffer. It's updated
930 : to reflect the new empty space
931 : @param first_pos Store position to last data byte on page here
932 :
933 : @note
934 : This function is inline as the argument passing is the biggest
935 : part of the function
936 :
937 : @return
938 : @retval 0 ok
939 : @retval 1 error (No data on page, fatal error)
940 : */
941 :
942 : static inline my_bool
943 : make_space_for_directory(MARIA_HA *info,
944 : uchar *buff, uint block_size, uint max_entry,
945 : uint count, uchar *first_dir, uint *empty_space,
946 : uint *first_pos)
947 227327 : {
948 227327 : uint length_needed= DIR_ENTRY_SIZE * count;
949 :
950 : /*
951 : The following is only false in the case where UNDO is used to reinsert
952 : a row on a previously unused page
953 : */
954 227327 : if (likely(max_entry))
955 : {
956 : /* Check if there is place for the directory entry on the page */
957 226022 : *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);
958 :
959 226022 : if ((uint) (first_dir - buff) < *first_pos + length_needed)
960 : {
961 : /* Create place for directory */
962 66 : _ma_compact_block_page(buff, block_size, max_entry - 1, 0,
963 : info ? info->trn->min_read_from : 0,
964 : info ? info->s->base.min_block_length : 0);
965 66 : *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
966 66 : *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
967 66 : if (*empty_space < length_needed)
968 : {
969 : /*
970 : We should always have space, as we only come here for
971 : UNDO of DELETE (in which case we know the row was on the
972 : page before) or if the bitmap told us there was space on page
973 : */
974 0 : DBUG_ASSERT(0);
975 : return(1);
976 : }
977 : }
978 : }
979 : else
980 1305 : *first_pos= PAGE_HEADER_SIZE;
981 :
982 : /* Reduce directory entry size from free space size */
983 227327 : (*empty_space)-= length_needed;
984 227327 : buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
985 227327 : return(0);
986 : }
987 :
988 :
989 : /*
990 : Find free position in directory
991 :
992 : SYNOPSIS
993 : find_free_position()
994 : info Handler if head page and 0 otherwise
995 : buff Page
996 : block_size Size of page
997 : res_rownr Store index to free position here
998 : res_length Store length of found segment here
999 : empty_space Store length of empty space on disk here. This is
1000 : all empty space, including the found block.
1001 :
1002 : NOTES
1003 : If there is a free directory entry (entry with position == 0),
1004 : then use it and change it to be the size of the empty block
1005 : after the previous entry. This guarantees that all row entries
1006 : are stored on disk in inverse directory order, which makes life easier for
1007 : '_ma_compact_block_page()' and to know if there is free space after any
1008 : block.
1009 :
1010 : If there is no free entry (entry with position == 0), then we create
1011 : a new one. If there is not space for the directory entry (because
1012 : the last block overlaps with the directory), we compact the page.
1013 :
1014 : We will update the offset and the length of the found dir entry to
1015 : match the position and empty space found.
1016 :
1017 : buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
1018 :
1019 : See start of file for description of how free directory entries are linked
1020 :
1021 : RETURN
1022 : 0 Error (directory full or last block goes over directory)
1023 : # Pointer to directory entry on page
1024 : */
1025 :
1026 : static uchar *find_free_position(MARIA_HA *info,
1027 : uchar *buff, uint block_size, uint *res_rownr,
1028 : uint *res_length, uint *empty_space)
1029 183649 : {
1030 : uint max_entry, free_entry;
1031 : uint length, first_pos;
1032 : uchar *dir, *first_dir;
1033 183649 : DBUG_ENTER("find_free_position");
1034 :
1035 183649 : max_entry= (uint) buff[DIR_COUNT_OFFSET];
1036 183649 : free_entry= (uint) buff[DIR_FREE_OFFSET];
1037 183649 : *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
1038 :
1039 183649 : DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry));
1040 :
1041 183649 : first_dir= dir_entry_pos(buff, block_size, max_entry - 1);
1042 :
1043 : /* Search for the first free position */
1044 183649 : if (free_entry != END_OF_DIR_FREE_LIST)
1045 : {
1046 60 : if (free_entry >= max_entry)
1047 0 : DBUG_RETURN(0); /* Consistency error */
1048 60 : dir= dir_entry_pos(buff, block_size, free_entry);
1049 60 : DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
1050 : /* Relink free list */
1051 60 : if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
1052 : {
1053 0 : uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
1054 0 : DBUG_ASSERT((uint) next_entry[2] == free_entry &&
1055 : uint2korr(next_entry) == 0);
1056 0 : next_entry[2]= END_OF_DIR_FREE_LIST; /* Backlink */
1057 : }
1058 :
1059 60 : first_pos= end_of_previous_entry(dir, buff + block_size -
1060 : PAGE_SUFFIX_SIZE);
1061 60 : length= start_of_next_entry(dir) - first_pos;
1062 60 : int2store(dir, first_pos); /* Update dir entry */
1063 60 : int2store(dir + 2, 0);
1064 60 : *res_rownr= free_entry;
1065 60 : *res_length= length;
1066 :
1067 60 : check_directory(buff, block_size,
1068 : info ? info->s->base.min_block_length : 0);
1069 60 : DBUG_RETURN(dir);
1070 : }
1071 : /* No free places in dir; create a new one */
1072 :
1073 : /* Check if there is place for the directory entry */
1074 183589 : if (max_entry == MAX_ROWS_PER_PAGE)
1075 0 : DBUG_RETURN(0);
1076 :
1077 183589 : if (make_space_for_directory(info, buff, block_size, max_entry, 1,
1078 : first_dir, empty_space, &first_pos))
1079 0 : DBUG_RETURN(0);
1080 :
1081 183589 : dir= first_dir - DIR_ENTRY_SIZE;
1082 183589 : length= (uint) (dir - buff - first_pos);
1083 183589 : DBUG_ASSERT(length <= *empty_space);
1084 183589 : int2store(dir, first_pos);
1085 183589 : int2store(dir + 2, 0); /* Max length of region */
1086 183589 : *res_rownr= max_entry;
1087 183589 : *res_length= length;
1088 :
1089 183589 : check_directory(buff, block_size, info ? info->s->base.min_block_length : 0);
1090 183589 : DBUG_RETURN(dir);
1091 : }
1092 :
1093 :
1094 : /**
1095 : @brief Enlarge page directory to hold more entries
1096 :
1097 : @fn extend_directory()
1098 : @param info Handler if head page and 0 otherwise
1099 : @param buff Page buffer
1100 : @param block_size Block size
1101 : @param max_entry Number of directory entries on page
1102 : @param new_entry Position for new entry
1103 : @param empty_space Total empty space in buffer. It's updated
1104 : to reflect the new empty space
1105 :
1106 : @note
1107 : This is only called on UNDO when we want to expand the directory
1108 : to be able to re-insert a row at a given position
1109 :
1110 : The new directory entry will be set to cover the maximum possible space
1111 :
1112 : @return
1113 : @retval 0 ok
1114 : @retval 1 error (No data on page, fatal error)
1115 : */
1116 :
1117 : static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
1118 : uint max_entry, uint new_entry,
1119 : uint *empty_space)
1120 43738 : {
1121 : uint length, first_pos;
1122 : uchar *dir, *first_dir;
1123 43738 : DBUG_ENTER("extend_directory");
1124 :
1125 : /*
1126 : Note that if max_entry is 0, then first_dir will point to
1127 : an illegal directory entry. This is ok, as in this case we will
1128 : not access anything through first_dir.
1129 : */
1130 43738 : first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;
1131 :
1132 43738 : if (make_space_for_directory(info, buff, block_size, max_entry,
1133 : new_entry - max_entry + 1,
1134 : first_dir, empty_space, &first_pos))
1135 0 : DBUG_RETURN(1);
1136 :
1137 : /* Set the new directory entry to cover the max possible length */
1138 43738 : dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
1139 43738 : length= (uint) (dir - buff - first_pos);
1140 43738 : int2store(dir, first_pos);
1141 43738 : int2store(dir+2, length);
1142 43738 : *empty_space-= length;
1143 :
1144 43738 : if (new_entry-- > max_entry)
1145 : {
1146 : /* Link all row entries between new_entry and max_entry into free list */
1147 612 : uint free_entry= (uint) buff[DIR_FREE_OFFSET];
1148 612 : uint prev_entry= END_OF_DIR_FREE_LIST;
1149 612 : buff[DIR_FREE_OFFSET]= new_entry;
1150 : do
1151 : {
1152 26620 : dir+= DIR_ENTRY_SIZE;
1153 26620 : dir[0]= dir[1]= 0;
1154 26620 : dir[2]= (uchar) prev_entry;
1155 26620 : dir[3]= (uchar) new_entry-1;
1156 26620 : prev_entry= new_entry;
1157 26620 : } while (new_entry-- > max_entry);
1158 612 : if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
1159 : {
1160 : /* Relink next entry to point to newly freed entry */
1161 0 : uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
1162 0 : DBUG_ASSERT(uint2korr(next_entry) == 0 &&
1163 : next_entry[2] == END_OF_DIR_FREE_LIST);
1164 0 : next_entry[2]= max_entry;
1165 : }
1166 : }
1167 :
1168 43738 : check_directory(buff, block_size,
1169 : info ? min(info->s->base.min_block_length, length) : 0);
1170 43738 : DBUG_RETURN(0);
1171 : }
1172 :
1173 :
1174 : /****************************************************************************
1175 : Updating records
1176 : ****************************************************************************/
1177 :
1178 : /*
1179 : Calculate length of all the different field parts
1180 :
1181 : SYNOPSIS
1182 : calc_record_size()
1183 : info Maria handler
1184 : record Row to store
1185 : row Store statistics about row here
1186 :
1187 : NOTES
1188 : The statistics are used to find out how much space a row will need
1189 : and also where we can split a row when we need to split it into several
1190 : extents.
1191 : */
1192 :
1193 : static void calc_record_size(MARIA_HA *info, const uchar *record,
1194 : MARIA_ROW *row)
1195 199849 : {
1196 199849 : MARIA_SHARE *share= info->s;
1197 : uchar *field_length_data;
1198 : MARIA_COLUMNDEF *column, *end_column;
1199 199849 : uint *null_field_lengths= row->null_field_lengths;
1200 199849 : ulong *blob_lengths= row->blob_lengths;
1201 199849 : DBUG_ENTER("calc_record_size");
1202 :
1203 199849 : row->normal_length= row->char_length= row->varchar_length=
1204 : row->blob_length= row->extents_count= 0;
1205 :
1206 : /* Create empty bitmap and calculate length of each varlength/char field */
1207 199849 : bzero(row->empty_bits, share->base.pack_bytes);
1208 199849 : field_length_data= row->field_lengths;
1209 : for (column= share->columndef + share->base.fixed_not_null_fields,
1210 199849 : end_column= share->columndef + share->base.fields;
1211 622990 : column < end_column; column++, null_field_lengths++)
1212 : {
1213 423141 : if ((record[column->null_pos] & column->null_bit))
1214 : {
1215 665 : if (column->type != FIELD_BLOB)
1216 400 : *null_field_lengths= 0;
1217 : else
1218 265 : *blob_lengths++= 0;
1219 : continue;
1220 : }
1221 422476 : switch (column->type) {
1222 : case FIELD_CHECK:
1223 : case FIELD_NORMAL: /* Fixed length field */
1224 : case FIELD_ZERO:
1225 150 : DBUG_ASSERT(column->empty_bit == 0);
1226 : /* fall through */
1227 : case FIELD_SKIP_PRESPACE: /* Not packed */
1228 150 : row->normal_length+= column->length;
1229 150 : *null_field_lengths= column->length;
1230 150 : break;
1231 : case FIELD_SKIP_ZERO: /* Fixed length field */
1232 182743 : if (memcmp(record+ column->offset, maria_zero_string,
1233 : column->length) == 0)
1234 : {
1235 177 : row->empty_bits[column->empty_pos] |= column->empty_bit;
1236 177 : *null_field_lengths= 0;
1237 : }
1238 : else
1239 : {
1240 182566 : row->normal_length+= column->length;
1241 182566 : *null_field_lengths= column->length;
1242 : }
1243 : break;
1244 : case FIELD_SKIP_ENDSPACE: /* CHAR */
1245 : {
1246 : const uchar *pos, *end;
1247 190278 : for (pos= record + column->offset, end= pos + column->length;
1248 190278 : end > pos && end[-1] == ' '; end--)
1249 : ;
1250 190278 : if (pos == end) /* If empty string */
1251 : {
1252 2725 : row->empty_bits[column->empty_pos]|= column->empty_bit;
1253 2725 : *null_field_lengths= 0;
1254 : }
1255 : else
1256 : {
1257 187553 : uint length= (uint) (end - pos);
1258 187553 : if (column->length <= 255)
1259 187553 : *field_length_data++= (uchar) length;
1260 : else
1261 : {
1262 0 : int2store(field_length_data, length);
1263 0 : field_length_data+= 2;
1264 : }
1265 187553 : row->char_length+= length;
1266 187553 : *null_field_lengths= length;
1267 : }
1268 : break;
1269 : }
1270 : case FIELD_VARCHAR:
1271 : {
1272 : uint length, field_length_data_length;
1273 1233 : const uchar *field_pos= record + column->offset;
1274 :
1275 : /* 256 is correct as this includes the length uchar */
1276 1233 : field_length_data[0]= field_pos[0];
1277 1233 : if (column->length <= 256)
1278 : {
1279 915 : length= (uint) (uchar) *field_pos;
1280 915 : field_length_data_length= 1;
1281 : }
1282 : else
1283 : {
1284 318 : length= uint2korr(field_pos);
1285 318 : field_length_data[1]= field_pos[1];
1286 318 : field_length_data_length= 2;
1287 : }
1288 1233 : *null_field_lengths= length;
1289 1233 : if (!length)
1290 : {
1291 0 : row->empty_bits[column->empty_pos]|= column->empty_bit;
1292 0 : break;
1293 : }
1294 1233 : row->varchar_length+= length;
1295 1233 : *null_field_lengths= length;
1296 1233 : field_length_data+= field_length_data_length;
1297 1233 : break;
1298 : }
1299 : case FIELD_BLOB:
1300 : {
1301 48072 : const uchar *field_pos= record + column->offset;
1302 48072 : uint size_length= column->length - portable_sizeof_char_ptr;
1303 48072 : ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
1304 :
1305 48072 : *blob_lengths++= blob_length;
1306 48072 : if (!blob_length)
1307 37235 : row->empty_bits[column->empty_pos]|= column->empty_bit;
1308 : else
1309 : {
1310 10837 : row->blob_length+= blob_length;
1311 10837 : memcpy(field_length_data, field_pos, size_length);
1312 10837 : field_length_data+= size_length;
1313 : }
1314 : break;
1315 : }
1316 : default:
1317 0 : DBUG_ASSERT(0);
1318 : }
1319 : }
1320 199849 : row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
1321 : /*
1322 : - info->row_base_length is base information we must have on a page in first
1323 : extent:
1324 : - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
1325 : table_checksum (0 | 1)
1326 : - row->min_length is minimum amount of data we must store on
1327 : a page. The bitmap code will ensure we get at least this much +
1328 : total number of extents and one extent information
1329 : - fixed_not_null_fields_length is length of fixed length fields that can't
1330 : be compacted
1331 : - head_length is the amount of data for the head page
1332 : (ie, all fields except blobs)
1333 : */
1334 199849 : row->min_length= (info->row_base_length +
1335 : (share->base.max_field_lengths ?
1336 : size_to_store_key_length(row->field_lengths_length) :
1337 : 0));
1338 199849 : row->head_length= (row->min_length +
1339 : share->base.fixed_not_null_fields_length +
1340 : row->field_lengths_length +
1341 : row->normal_length +
1342 : row->char_length + row->varchar_length);
1343 199849 : row->total_length= (row->head_length + row->blob_length);
1344 199849 : if (row->total_length < share->base.min_block_length)
1345 3567 : row->total_length= share->base.min_block_length;
1346 199849 : DBUG_PRINT("exit", ("head_length: %lu total_length: %lu",
1347 : (ulong) row->head_length, (ulong) row->total_length));
1348 199849 : DBUG_VOID_RETURN;
1349 : }
1350 :
1351 :
1352 : /**
1353 : Compact page by removing all space between rows
1354 :
1355 : Moves up all rows to start of page. Moves blocks that are directly after
1356 : each other with one memmove.
1357 :
1358 : @note if rownr is the last row in the page, and extend_block is false,
1359 : caller has to make sure to update bitmap page afterwards to reflect freed
1360 : space.
1361 :
1362 : @param buff Page to compact
1363 : @param block_size Size of page
1364 : @param rownr Put empty data after this row
1365 : @param extend_block If 1, extend the block at 'rownr' to cover the
1366 : whole block.
1367 : @param min_read_from If <> 0, remove all trid's that are less than this
1368 : */
1369 :
1370 : void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr,
1371 : my_bool extend_block, TrID min_read_from,
1372 : uint min_row_length)
1373 32784 : {
1374 32784 : uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
1375 : uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
1376 32784 : uint freed_size= 0;
1377 : uchar *dir, *end;
1378 32784 : DBUG_ENTER("_ma_compact_block_page");
1379 32784 : DBUG_PRINT("enter", ("rownr: %u", rownr));
1380 32784 : DBUG_ASSERT(max_entry > 0 &&
1381 : max_entry < (block_size - PAGE_HEADER_SIZE -
1382 : PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
1383 :
1384 : /* Move all entries before and including rownr up to start of page */
1385 32784 : dir= dir_entry_pos(buff, block_size, rownr);
1386 32784 : end= dir_entry_pos(buff, block_size, 0);
1387 32784 : page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE;
1388 32784 : diff= 0;
1389 1410161 : for (; dir <= end ; end-= DIR_ENTRY_SIZE)
1390 : {
1391 1377377 : uint offset= uint2korr(end);
1392 :
1393 1377377 : if (offset)
1394 : {
1395 358419 : uint row_length= uint2korr(end + 2);
1396 358419 : DBUG_ASSERT(offset >= page_pos);
1397 358419 : DBUG_ASSERT(buff + offset + row_length <= dir);
1398 358419 : DBUG_ASSERT(row_length >= min_row_length || row_length == 0);
1399 :
1400 : /* Row length can be zero if row is to be deleted */
1401 358419 : if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
1402 : {
1403 8760 : TrID transid= transid_korr(buff+offset+1);
1404 8760 : if (transid < min_read_from)
1405 : {
1406 : /* Remove transid from row by moving the start point of the row up */
1407 8760 : buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1408 8760 : offset+= TRANSID_SIZE;
1409 8760 : freed_size+= TRANSID_SIZE;
1410 8760 : row_length-= TRANSID_SIZE;
1411 8760 : int2store(end+2, row_length);
1412 : }
1413 : }
1414 :
1415 358419 : if (offset != next_free_pos)
1416 : {
1417 18045 : uint length= (next_free_pos - start_of_found_block);
1418 : /*
1419 : There was empty space between the previous block and this one.
1420 : Check if we have to move the previous block up to the page start.
1421 : */
1422 18045 : if (page_pos != start_of_found_block)
1423 : {
1424 : /* move up previous block */
1425 12913 : memmove(buff + page_pos, buff + start_of_found_block, length);
1426 : }
1427 18045 : page_pos+= length;
1428 : /* next continuous block starts here */
1429 18045 : start_of_found_block= offset;
1430 18045 : diff= offset - page_pos;
1431 : }
1432 358419 : int2store(end, offset - diff); /* correct current pos */
1433 358419 : next_free_pos= offset + row_length;
1434 :
1435 358419 : if (unlikely(row_length < min_row_length) && row_length)
1436 : {
1437 : /*
1438 : This can only happen in the case where we compacted the transid and
1439 : the row became 'too short'
1440 :
1441 : Move the current row down to its right place and extend it
1442 : with 0.
1443 : */
1444 40 : uint row_diff= min_row_length - row_length;
1445 40 : uint length= (next_free_pos - start_of_found_block);
1446 :
1447 40 : DBUG_ASSERT(page_pos != start_of_found_block);
1448 40 : bmove(buff + page_pos, buff + start_of_found_block, length);
1449 40 : bzero(buff+ page_pos + length, row_diff);
1450 40 : page_pos+= min_row_length;
1451 40 : int2store(end+2, min_row_length);
1452 40 : freed_size-= row_diff;
1453 40 : next_free_pos= start_of_found_block= page_pos;
1454 40 : diff= 0;
1455 : }
1456 : }
1457 : }
1458 32784 : if (page_pos != start_of_found_block)
1459 : {
1460 5092 : uint length= (next_free_pos - start_of_found_block);
1461 5092 : memmove(buff + page_pos, buff + start_of_found_block, length);
1462 : }
1463 32784 : start_of_found_block= uint2korr(dir);
1464 :
1465 32784 : if (rownr != max_entry - 1)
1466 : {
1467 : /* Move all entries after rownr to end of page */
1468 : uint rownr_length;
1469 32393 : next_free_pos= end_of_found_block= page_pos=
1470 : block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
1471 32393 : diff= 0;
1472 : /* End points to entry before 'rownr' */
1473 1396093 : for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
1474 : {
1475 1363700 : uint offset= uint2korr(dir);
1476 : uint row_length;
1477 : uint row_end;
1478 1363700 : if (!offset)
1479 1216707 : continue;
1480 1216707 : row_length= uint2korr(dir + 2);
1481 1216707 : row_end= offset + row_length;
1482 1216707 : DBUG_ASSERT(offset >= start_of_found_block &&
1483 : row_end <= next_free_pos && row_length >= min_row_length);
1484 :
1485 1216707 : if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
1486 : {
1487 10317 : TrID transid= transid_korr(buff + offset+1);
1488 10317 : if (transid < min_read_from)
1489 : {
1490 : /* Remove transid from row */
1491 10317 : buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1492 10317 : offset+= TRANSID_SIZE;
1493 10317 : row_length-= TRANSID_SIZE;
1494 10317 : int2store(dir+2, row_length);
1495 : }
1496 10317 : if (unlikely(row_length < min_row_length))
1497 : {
1498 : /*
1499 : This can only happen in the case where we compacted the transid and
1500 : the row became 'too short'
1501 : */
1502 925 : uint row_diff= min_row_length - row_length;
1503 925 : if (next_free_pos < row_end + row_diff)
1504 : {
1505 : /*
1506 : Not enough space for extending the next block with enough
1507 : trailing 0's. Move current data down to make room for them
1508 : */
1509 0 : uint move_down= row_diff - (next_free_pos - row_end);
1510 0 : bmove(buff + offset - move_down, buff + offset, row_length);
1511 0 : offset-= move_down;
1512 : }
1513 : /*
1514 : Extend the next block with 0, which will be part of current
1515 : row when the blocks are joined together later
1516 : */
1517 925 : bzero(buff + next_free_pos - row_diff, row_diff);
1518 925 : next_free_pos-= row_diff;
1519 925 : int2store(dir+2, min_row_length);
1520 : }
1521 10317 : row_end= offset + row_length;
1522 : }
1523 :
1524 1216707 : if (row_end != next_free_pos)
1525 : {
1526 43348 : uint length= (end_of_found_block - next_free_pos);
1527 43348 : if (page_pos != end_of_found_block)
1528 : {
1529 : /* move next block down */
1530 14144 : memmove(buff + page_pos - length, buff + next_free_pos, length);
1531 : }
1532 43348 : page_pos-= length;
1533 : /* next continuous block starts here */
1534 43348 : end_of_found_block= row_end;
1535 43348 : diff= page_pos - row_end;
1536 : }
1537 1216707 : int2store(dir, offset + diff); /* correct current pos */
1538 1216707 : next_free_pos= offset;
1539 : }
1540 :
1541 32393 : if (page_pos != end_of_found_block)
1542 : {
1543 29204 : uint length= (end_of_found_block - next_free_pos);
1544 29204 : memmove(buff + page_pos - length, buff + next_free_pos, length);
1545 29204 : next_free_pos= page_pos- length;
1546 : }
1547 : /* Extend rownr block to cover hole */
1548 32393 : rownr_length= next_free_pos - start_of_found_block;
1549 32393 : int2store(dir+2, rownr_length);
1550 32393 : DBUG_ASSERT(rownr_length >= min_row_length);
1551 : }
1552 : else
1553 : {
1554 391 : if (extend_block)
1555 : {
1556 : /* Extend last block to cover whole page */
1557 67 : uint length= ((uint) (dir - buff) - start_of_found_block);
1558 67 : int2store(dir+2, length);
1559 67 : DBUG_ASSERT(length >= min_row_length);
1560 : }
1561 : else
1562 : {
1563 : /* Add length gained from freed transaction id's to this page */
1564 324 : uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
1565 324 : int2store(buff + EMPTY_SPACE_OFFSET, length);
1566 : }
1567 391 : buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
1568 : }
1569 32784 : check_directory(buff, block_size, min_row_length);
1570 32784 : DBUG_EXECUTE("directory", _ma_print_directory(buff, block_size););
1571 32784 : DBUG_VOID_RETURN;
1572 : }
1573 :
1574 :
1575 : /*
1576 : Create an empty tail or head page
1577 :
1578 : SYNOPSIS
1579 : make_empty_page()
1580 : buff Page buffer
1581 : block_size Block size
1582 : page_type HEAD_PAGE or TAIL_PAGE
1583 :     create_dir_entry TRUE if we should create a directory entry
1584 :
1585 : NOTES
1586 : EMPTY_SPACE is not updated
1587 : */
1588 :
1589 : static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
1590 : my_bool create_dir_entry)
1591 11366 : {
1592 11366 : uint block_size= info->s->block_size;
1593 11366 : DBUG_ENTER("make_empty_page");
1594 :
1595 11366 : bzero(buff, PAGE_HEADER_SIZE);
1596 :
1597 : #if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_purify)
1598 : /*
1599 :     We zero the rest of the block to avoid old memory contents ending up
1600 :     on disk and to allow the file to be compressed better if archived.
1601 : The code does not assume the block is zeroed.
1602 : */
1603 11366 : if (page_type != BLOB_PAGE)
1604 7252 : bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE);
1605 : #endif
1606 11366 : buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
1607 11366 : buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
1608 11366 : buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;
1609 11366 : if (create_dir_entry)
1610 : {
1611 : /* Create directory entry to point to start of page with size 0 */
1612 5947 : buff+= block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
1613 5947 : int2store(buff, PAGE_HEADER_SIZE);
1614 5947 : int2store(buff+2, 0);
1615 : }
1616 11366 : DBUG_VOID_RETURN;
1617 : }
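
/*
  Illustrative sketch (added for clarity; not part of the original source):
  what a page created by make_empty_page(..., create_dir_entry=TRUE) looks
  like. The checks only restate the assignments made above, using the same
  offsets; the function name and the #ifdef guard are hypothetical.
*/
#ifdef MARIA_BLOCKREC_EXAMPLES
static void example_check_new_page(const uchar *buff, uint block_size,
                                   uint page_type)
{
  /* The first (and only) directory entry sits at the very end of the page */
  const uchar *dir= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;

  DBUG_ASSERT(buff[PAGE_TYPE_OFFSET] == (uchar) page_type);
  DBUG_ASSERT(buff[DIR_COUNT_OFFSET] == 1);         /* create_dir_entry was 1 */
  DBUG_ASSERT(buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
  DBUG_ASSERT(uint2korr(dir) == PAGE_HEADER_SIZE);  /* points at start of data */
  DBUG_ASSERT(uint2korr(dir + 2) == 0);             /* ... with length 0 */
}
#endif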
1618 :
1619 :
1620 : /*
1621 : Read or initialize new head or tail page
1622 :
1623 : SYNOPSIS
1624 : get_head_or_tail_page()
1625 : info Maria handler
1626 : block Block to read
1627 : buff Suggest this buffer to key cache
1628 : length Minimum space needed
1629 : page_type HEAD_PAGE || TAIL_PAGE
1630 : res Store result position here
1631 :
1632 : NOTES
1633 :     We don't decrement buff[EMPTY_SPACE_OFFSET] by the allocated data
1634 : as we don't know how much data the caller will actually use.
1635 :
1636 : res->empty_space is set to length of empty space
1637 :
1638 : RETURN
1639 : 0 ok All slots in 'res' are updated
1640 : 1 error my_errno is set
1641 : */
1642 :
1643 : struct st_row_pos_info
1644 : {
1645 : uchar *buff; /* page buffer */
1646 : uchar *data; /* Place for data */
1647 : uchar *dir; /* Directory */
1648 : uint length; /* Length for data */
1649 : uint rownr; /* Offset in directory */
1650 : uint empty_space; /* Space left on page */
1651 : };
1652 :
1653 :
1654 : static my_bool get_head_or_tail_page(MARIA_HA *info,
1655 : MARIA_BITMAP_BLOCK *block,
1656 : uchar *buff, uint length, uint page_type,
1657 : enum pagecache_page_lock lock,
1658 : struct st_row_pos_info *res)
1659 188086 : {
1660 : uint block_size;
1661 : MARIA_PINNED_PAGE page_link;
1662 188086 : MARIA_SHARE *share= info->s;
1663 188086 : DBUG_ENTER("get_head_or_tail_page");
1664 188086 : DBUG_PRINT("enter", ("length: %u", length));
1665 :
1666 188086 : block_size= share->block_size;
1667 188086 : if (block->org_bitmap_value == 0) /* Empty block */
1668 : {
1669 : /* New page */
1670 4437 : make_empty_page(info, buff, page_type, 1);
1671 4437 : res->buff= buff;
1672 4437 : res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE);
1673 4437 : res->data= (buff + PAGE_HEADER_SIZE);
1674 4437 : res->dir= res->data + res->length;
1675 4437 : res->rownr= 0;
1676 4437 : DBUG_ASSERT(length <= res->length);
1677 : }
1678 : else
1679 : {
1680 : uchar *dir;
1681 : /* Read old page */
1682 183649 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
1683 183649 : res->buff= pagecache_read(share->pagecache, &info->dfile,
1684 : block->page, 0, 0, share->page_type,
1685 : lock, &page_link.link);
1686 183649 : page_link.changed= res->buff != 0;
1687 183649 : push_dynamic(&info->pinned_pages, (void*) &page_link);
1688 183649 : if (!page_link.changed)
1689 183649 : goto crashed;
1690 :
1691 183649 : DBUG_ASSERT((res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type);
1692 183649 : if (!(dir= find_free_position(page_type == HEAD_PAGE ? info : 0,
1693 : res->buff, block_size, &res->rownr,
1694 : &res->length, &res->empty_space)))
1695 183649 : goto crashed;
1696 :
1697 183649 : if (res->length < length)
1698 : {
1699 8 : if (res->empty_space + res->length >= length)
1700 : {
1701 8 : _ma_compact_block_page(res->buff, block_size, res->rownr, 1,
1702 : (page_type == HEAD_PAGE ?
1703 : info->trn->min_read_from : 0),
1704 : (page_type == HEAD_PAGE ?
1705 : share->base.min_block_length :
1706 : 0));
1707 :         /* All empty space is now after the current position */
1708 8 : dir= dir_entry_pos(res->buff, block_size, res->rownr);
1709 8 : res->length= res->empty_space= uint2korr(dir+2);
1710 : }
1711 8 : if (res->length < length)
1712 : {
1713 0 : DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u",
1714 : length, res->length, res->empty_space));
1715 0 : goto crashed; /* Wrong bitmap information */
1716 : }
1717 : }
1718 183649 : res->dir= dir;
1719 183649 : res->data= res->buff + uint2korr(dir);
1720 : }
1721 188086 : DBUG_RETURN(0);
1722 :
1723 0 : crashed:
1724 0 : my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
1725 0 : DBUG_RETURN(1);
1726 : }
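
/*
  Condensed usage sketch (not part of the original source): the calling
  pattern for get_head_or_tail_page() as used by write_tail() below. The
  function name and the #ifdef guard are hypothetical.
*/
#ifdef MARIA_BLOCKREC_EXAMPLES
static my_bool example_store_part(MARIA_HA *info, MARIA_BITMAP_BLOCK *block,
                                  const uchar *data, uint length)
{
  struct st_row_pos_info row_pos;

  if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                            TAIL_PAGE, PAGECACHE_LOCK_WRITE, &row_pos))
    return 1;                                   /* my_errno is set */

  /* Copy the data into the reserved slot ... */
  memcpy(row_pos.data, data, length);
  /* ... record its real length in the directory entry ... */
  int2store(row_pos.dir + 2, length);
  /* ... and account for the space actually consumed on the page */
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, row_pos.empty_space - length);
  return 0;
}
#endif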
1727 :
1728 :
1729 : /*
1730 : @brief Create room for a head or tail row on a given page at given position
1731 :
1732 : @fn get_rowpos_in_head_or_tail_page()
1733 : @param info Maria handler
1734 : @param block Block to read
1735 : @param buff Suggest this buffer to key cache
1736 : @param length Minimum space needed
1737 : @param page_type HEAD_PAGE || TAIL_PAGE
1738 : @param rownr Rownr to use
1739 : @param res Store result position here
1740 :
1741 : @note
1742 :     This is essentially the same as get_head_or_tail_page, with the difference
1743 :     that the caller specifies at what position the row should be put.
1744 :     This is used when restoring a row to its original position as
1745 : part of UNDO DELETE or UNDO UPDATE
1746 :
1747 : @return
1748 : @retval 0 ok All slots in 'res' are updated
1749 : @retval 1 error my_errno is set
1750 : */
1751 :
1752 : static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
1753 : MARIA_BITMAP_BLOCK *block,
1754 : uchar *buff, uint length,
1755 : uint page_type,
1756 : enum pagecache_page_lock lock,
1757 : uint rownr,
1758 : struct st_row_pos_info *res)
1759 14496 : {
1760 : MARIA_PINNED_PAGE page_link;
1761 14496 : MARIA_SHARE *share= info->s;
1762 : uchar *dir;
1763 14496 : uint block_size= share->block_size;
1764 : uint max_entry, max_length, rec_offset;
1765 14496 : DBUG_ENTER("get_rowpos_in_head_or_tail_page");
1766 :
1767 14496 : if (block->org_bitmap_value == 0) /* Empty block */
1768 : {
1769 : /* New page */
1770 547 : make_empty_page(info, buff, page_type, 0);
1771 547 : res->empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
1772 : }
1773 : else
1774 : {
1775 13949 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
1776 13949 : buff= pagecache_read(share->pagecache, &info->dfile,
1777 : block->page, 0, 0, share->page_type,
1778 : lock, &page_link.link);
1779 13949 : page_link.changed= buff != 0;
1780 13949 : push_dynamic(&info->pinned_pages, (void*) &page_link);
1781 13949 : if (!page_link.changed) /* Read error */
1782 13949 : goto err;
1783 13949 : DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
1784 : (uchar) page_type);
1785 13949 : if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
1786 13949 : goto err;
1787 13949 : res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
1788 : }
1789 :
1790 14496 : max_entry= (uint) buff[DIR_COUNT_OFFSET];
1791 14496 : if (max_entry <= rownr)
1792 : {
1793 736 : if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff, block_size,
1794 : max_entry, rownr, &res->empty_space))
1795 14496 : goto err;
1796 : }
1797 :
1798 14496 : dir= dir_entry_pos(buff, block_size, rownr);
1799 : #ifdef SANITY_CHECKS
1800 :   /* Tails should always be unused */
1801 14496 : if (page_type == TAIL_PAGE && max_entry > rownr &&
1802 : (dir[0] != 0 || dir[1] != 0))
1803 : {
1804 0 : DBUG_ASSERT(0);
1805 : goto err;
1806 : }
1807 : #endif
1808 :
1809 14496 : if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff, dir,
1810 : rownr, block_size, length,
1811 : &res->empty_space, &rec_offset, &max_length))
1812 14496 : goto err;
1813 :
1814 14496 : res->buff= buff;
1815 14496 : res->rownr= rownr;
1816 14496 : res->dir= dir;
1817 14496 : res->data= buff + rec_offset;
1818 14496 : res->length= length;
1819 14496 : DBUG_RETURN(0);
1820 :
1821 0 : err:
1822 0 : my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
1823 0 : DBUG_RETURN(1);
1824 : }
1825 :
1826 :
1827 : /*
1828 : Write tail for head data or blob
1829 :
1830 : SYNOPSIS
1831 : write_tail()
1832 : info Maria handler
1833 : block Block to tail page
1834 : row_part Data to write to page
1835 : length Length of data
1836 :
1837 : NOTES
1838 : block->page_count is updated to the directory offset for the tail
1839 : so that we can store the position in the row extent information
1840 :
1841 : RETURN
1842 : 0 ok
1843 :        block->page_count is set to (dir entry + TAIL_BIT)
1844 :
1845 : 1 error; In this case my_errno is set to the error
1846 : */
1847 :
1848 : static my_bool write_tail(MARIA_HA *info,
1849 : MARIA_BITMAP_BLOCK *block,
1850 : uchar *row_part, uint org_length)
1851 3298 : {
1852 3298 : MARIA_SHARE *share= info->s;
1853 : MARIA_PINNED_PAGE page_link;
1854 3298 : uint block_size= share->block_size, empty_space, length= org_length;
1855 : struct st_row_pos_info row_pos;
1856 : my_off_t position;
1857 : my_bool res, block_is_read;
1858 3298 : DBUG_ENTER("write_tail");
1859 3298 : DBUG_PRINT("enter", ("page: %lu length: %u",
1860 : (ulong) block->page, length));
1861 :
1862 3298 : info->keyread_buff_used= 1;
1863 : /*
1864 :     Don't allocate a smaller block than MIN_TAIL_SIZE (we want to give rows
1865 :     some room to grow in the future)
1866 : */
1867 3298 : if (length < MIN_TAIL_SIZE)
1868 0 : length= MIN_TAIL_SIZE;
1869 :
1870 3298 : if (block->page_count == TAIL_PAGE_COUNT_MARKER)
1871 : {
1872 : /*
1873 : Create new tail
1874 : page will be pinned & locked by get_head_or_tail_page
1875 : */
1876 2716 : if (get_head_or_tail_page(info, block, info->keyread_buff, length,
1877 : TAIL_PAGE, PAGECACHE_LOCK_WRITE,
1878 : &row_pos))
1879 0 : DBUG_RETURN(1);
1880 : }
1881 : else
1882 : {
1883 : /* Write tail on predefined row position */
1884 582 : if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
1885 : length, TAIL_PAGE,
1886 : PAGECACHE_LOCK_WRITE,
1887 : block->page_count & ~TAIL_BIT,
1888 : &row_pos))
1889 0 : DBUG_RETURN(1);
1890 : }
1891 3298 : DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
1892 : (ulong) ma_recordpos(block->page, row_pos.rownr),
1893 : (ulong) block->page, row_pos.rownr));
1894 :
1895 3298 : block_is_read= block->org_bitmap_value != 0;
1896 :
1897 3298 : memcpy(row_pos.data, row_part, org_length);
1898 :
1899 3298 : if (share->now_transactional)
1900 : {
1901 : /* Log changes in tail block */
1902 : uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
1903 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
1904 : LSN lsn;
1905 :
1906 : /*
1907 : Log REDO changes of tail page
1908 : Note that we have to log length, not org_length, to be sure that
1909 : REDO, which doesn't use write_tail, also creates a block of at least
1910 : MIN_TAIL_SIZE
1911 : */
1912 2682 : page_store(log_data + FILEID_STORE_SIZE, block->page);
1913 2682 : dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
1914 : row_pos.rownr);
1915 2682 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
1916 2682 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
1917 2682 : log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data;
1918 2682 : log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
1919 2682 : if (translog_write_record(&lsn,
1920 : (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
1921 : LOGREC_REDO_NEW_ROW_TAIL),
1922 : info->trn, info,
1923 : (translog_size_t) (sizeof(log_data) + length),
1924 : TRANSLOG_INTERNAL_PARTS + 2, log_array,
1925 : log_data, NULL))
1926 0 : DBUG_RETURN(1);
1927 : }
1928 :
1929 3298 : int2store(row_pos.dir + 2, length);
1930 3298 : empty_space= row_pos.empty_space - length;
1931 3298 : int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
1932 3298 : block->page_count= row_pos.rownr + TAIL_BIT;
1933 : /*
1934 :     If there are fewer free directory entries than the number of possible
1935 :     tails we can write for a row, we mark the page full to ensure that
1936 :     _ma_bitmap_find_place() doesn't allocate more entries on the tail page
1937 :     than it can hold
1938 : */
1939 3298 : block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
1940 : 1 + share->base.blobs) ?
1941 : empty_space : 0);
1942 3298 : block->used= BLOCKUSED_USED | BLOCKUSED_TAIL;
1943 :
1944 : /* Increase data file size, if extended */
1945 3298 : position= (my_off_t) block->page * block_size;
1946 3298 : if (share->state.state.data_file_length <= position)
1947 : {
1948 : /*
1949 : We are modifying a state member before writing the UNDO; this is a WAL
1950 : violation. But for data_file_length this is ok, as long as we change
1951 : data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
1952 : collect_tables()).
1953 : */
1954 1241 : _ma_set_share_data_file_length(share, position + block_size);
1955 : }
1956 :
1957 3298 : if (block_is_read)
1958 : {
1959 : /* Current page link is last element in pinned_pages */
1960 : MARIA_PINNED_PAGE *page_link;
1961 1576 : page_link= dynamic_element(&info->pinned_pages,
1962 : info->pinned_pages.elements-1,
1963 : MARIA_PINNED_PAGE*);
1964 1576 : pagecache_unlock_by_link(share->pagecache, page_link->link,
1965 : PAGECACHE_LOCK_WRITE_TO_READ,
1966 : PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
1967 : LSN_IMPOSSIBLE, 1, FALSE);
1968 1576 : DBUG_ASSERT(page_link->changed);
1969 1576 : page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
1970 1576 : res= 0;
1971 : }
1972 1722 : else if (!(res= pagecache_write(share->pagecache,
1973 : &info->dfile, block->page, 0,
1974 : row_pos.buff,share->page_type,
1975 : PAGECACHE_LOCK_READ,
1976 : PAGECACHE_PIN,
1977 : PAGECACHE_WRITE_DELAY, &page_link.link,
1978 : LSN_IMPOSSIBLE)))
1979 : {
1980 1722 : page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
1981 1722 : page_link.changed= 1;
1982 1722 : push_dynamic(&info->pinned_pages, (void*) &page_link);
1983 : }
1984 3298 : DBUG_RETURN(res);
1985 : }
1986 :
1987 :
1988 : /*
1989 : Write full pages
1990 :
1991 : SYNOPSIS
1992 : write_full_pages()
1993 : info Maria handler
1994 : lsn LSN for the undo record
1995 : block Where to write data
1996 : data Data to write
1997 : length Length of data
1998 :
1999 : NOTES
2000 :     Logging of the changes to the full pages is done in the caller
2001 : write_block_record().
2002 :
2003 : RETURN
2004 : 0 ok
2005 : 1 error on write
2006 : */
2007 :
2008 : static my_bool write_full_pages(MARIA_HA *info,
2009 : LSN lsn,
2010 : MARIA_BITMAP_BLOCK *block,
2011 : uchar *data, ulong length)
2012 4076 : {
2013 : pgcache_page_no_t page;
2014 4076 : MARIA_SHARE *share= info->s;
2015 4076 : uint block_size= share->block_size;
2016 4076 : uint data_size= FULL_PAGE_SIZE(block_size);
2017 4076 : uchar *buff= info->keyread_buff;
2018 : uint page_count, sub_blocks;
2019 : my_off_t position;
2020 4076 : DBUG_ENTER("write_full_pages");
2021 4076 : DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu",
2022 : (ulong) length, (ulong) block->page,
2023 : (ulong) block->page_count));
2024 4076 : DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);
2025 :
2026 4076 : info->keyread_buff_used= 1;
2027 4076 : page= block->page;
2028 4076 : page_count= block->page_count;
2029 4076 : sub_blocks= block->sub_blocks;
2030 :
2031 4076 : position= (my_off_t) (page + page_count) * block_size;
2032 4076 : if (share->state.state.data_file_length < position)
2033 1991 : _ma_set_share_data_file_length(share, position);
2034 :
2035 : /* Increase data file size, if extended */
2036 :
2037 11834 : for (; length; data+= data_size)
2038 : {
2039 : uint copy_length;
2040 11834 : if (!page_count--)
2041 : {
2042 0 : if (!--sub_blocks)
2043 : {
2044 0 : DBUG_ASSERT(0); /* Wrong in bitmap or UNDO */
2045 : my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
2046 : DBUG_RETURN(1);
2047 : }
2048 :
2049 0 : block++;
2050 0 : page= block->page;
2051 0 : page_count= block->page_count - 1;
2052 0 : DBUG_PRINT("info", ("page: %lu page_count: %lu",
2053 : (ulong) block->page, (ulong) block->page_count));
2054 :
2055 0 : position= (page + page_count + 1) * block_size;
2056 0 : if (share->state.state.data_file_length < position)
2057 0 : _ma_set_share_data_file_length(share, position);
2058 : }
2059 11834 : lsn_store(buff, lsn);
2060 11834 : buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
2061 11834 : copy_length= min(data_size, length);
2062 11834 : memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length);
2063 11834 : length-= copy_length;
2064 :
2065 : /*
2066 : Zero out old information from the block. This removes possible
2067 : sensitive information from the block and also makes the file
2068 : easier to compress and easier to compare after recovery.
2069 : */
2070 11834 : if (copy_length != data_size)
2071 807 : bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
2072 : (data_size - copy_length) + PAGE_SUFFIX_SIZE);
2073 :
2074 11834 : if (pagecache_write(share->pagecache,
2075 : &info->dfile, page, 0,
2076 : buff, share->page_type,
2077 : PAGECACHE_LOCK_LEFT_UNLOCKED,
2078 : PAGECACHE_PIN_LEFT_UNPINNED,
2079 : PAGECACHE_WRITE_DELAY,
2080 : 0, info->trn->rec_lsn))
2081 0 : DBUG_RETURN(1);
2082 11834 : page++;
2083 11834 : DBUG_ASSERT(block->used & BLOCKUSED_USED);
2084 : }
2085 4076 : DBUG_RETURN(0);
2086 : }
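
/*
  Worked sketch (not part of the original source): how many blob pages
  write_full_pages() above touches for a given length, and how many filler
  bytes remain on the last page. FULL_PAGE_SIZE(block_size) is the per-page
  payload used above; the helper name and the #ifdef guard are hypothetical.
*/
#ifdef MARIA_BLOCKREC_EXAMPLES
static ulong example_blob_page_count(uint block_size, ulong length,
                                     uint *filler_bytes)
{
  uint data_size= FULL_PAGE_SIZE(block_size);          /* payload per page */
  ulong pages= (length + data_size - 1) / data_size;   /* round up */

  /*
    Unused bytes at the end of the last page; store_page_range() below
    records this count in its filler field
  */
  *filler_bytes= (uint) (pages * data_size - length);
  return pages;
}
#endif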
2087 :
2088 :
2089 : /*
2090 : Store ranges of full pages in compact format for logging
2091 :
2092 : SYNOPSIS
2093 : store_page_range()
2094 : to Store data here
2095 : block Where pages are to be written
2096 : block_size block size
2097 : length Length of data to be written
2098 : Normally this is full pages, except for the last
2099 :                         tail block, which may only partly fill the last page.
2100 : tot_ranges Add here the number of ranges used
2101 :
2102 : NOTES
2103 : The format of one entry is:
2104 :
2105 : Ranges SUB_RANGE_SIZE
2106 :    Empty bytes at end of last page      BLOCK_FILLER_SIZE
2107 : For each range
2108 : Page number PAGE_STORE_SIZE
2109 : Number of pages PAGERANGE_STORE_SIZE
2110 :
2111 : RETURN
2112 : # end position for 'to'
2113 : */
2114 :
2115 : static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
2116 : uint block_size, ulong length,
2117 : uint *tot_ranges)
2118 3301 : {
2119 3301 : uint data_size= FULL_PAGE_SIZE(block_size);
2120 3301 : ulong pages_left= (length + data_size -1) / data_size;
2121 : uint page_count, ranges, empty_space;
2122 : uchar *to_start;
2123 3301 : DBUG_ENTER("store_page_range");
2124 :
2125 3301 : to_start= to;
2126 3301 : to+= SUB_RANGE_SIZE;
2127 :
2128 : /* Store number of unused bytes at last page */
2129 3301 : empty_space= (uint) (pages_left * data_size - length);
2130 3301 : int2store(to, empty_space);
2131 3301 : to+= BLOCK_FILLER_SIZE;
2132 :
2133 3301 : ranges= 0;
2134 : do
2135 : {
2136 : pgcache_page_no_t page;
2137 3301 : page= block->page;
2138 3301 : page_count= block->page_count;
2139 3301 : block++;
2140 3301 : if (page_count > pages_left)
2141 0 : page_count= pages_left;
2142 :
2143 3301 : page_store(to, page);
2144 3301 : to+= PAGE_STORE_SIZE;
2145 3301 : pagerange_store(to, page_count);
2146 3301 : to+= PAGERANGE_STORE_SIZE;
2147 3301 : ranges++;
2148 3301 : } while ((pages_left-= page_count));
2149 : /* Store number of ranges for this block */
2150 3301 : int2store(to_start, ranges);
2151 3301 : (*tot_ranges)+= ranges;
2152 :
2153 3301 : DBUG_RETURN(to);
2154 : }
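
/*
  Decoding sketch (not part of the original source): walking one entry
  written by store_page_range() above, following the layout in its SYNOPSIS.
  The function name and the #ifdef guard are hypothetical; the macros are
  the same ones the encoder uses.
*/
#ifdef MARIA_BLOCKREC_EXAMPLES
static const uchar *example_read_page_range(const uchar *from,
                                            ulong *tot_pages)
{
  uint i;
  uint ranges= uint2korr(from);                     /* number of sub ranges */
  uint filler= uint2korr(from + SUB_RANGE_SIZE);    /* unused bytes on last page */
  const uchar *pos= from + SUB_RANGE_SIZE + BLOCK_FILLER_SIZE;

  (void) filler;
  *tot_pages= 0;
  for (i= 0; i < ranges; i++)
  {
    pgcache_page_no_t page= page_korr(pos);            /* first page of range */
    uint page_count= uint2korr(pos + PAGE_STORE_SIZE); /* pages in the range */
    (void) page;
    *tot_pages+= page_count;
    pos+= PAGE_STORE_SIZE + PAGERANGE_STORE_SIZE;
  }
  return pos;                                       /* start of next entry */
}
#endif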
2155 :
2156 :
2157 : /*
2158 : Store packed extent data
2159 :
2160 : SYNOPSIS
2161 : store_extent_info()
2162 : to Store first packed data here
2163 : row_extents_second_part Store rest here
2164 : first_block First block to store
2165 : count Number of blocks
2166 :
2167 : NOTES
2168 : We don't have to store the position for the head block
2169 :
2170 : We have to set the START_EXTENT_BIT for every extent where the
2171 :    blob will be stored on a page of its own. We need this in the
2172 : UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
2173 : undo-update.
2174 : */
2175 :
2176 : static void store_extent_info(uchar *to,
2177 : uchar *row_extents_second_part,
2178 : MARIA_BITMAP_BLOCK *first_block,
2179 : uint count)
2180 4105 : {
2181 : MARIA_BITMAP_BLOCK *block, *end_block;
2182 : uint copy_length;
2183 4105 : my_bool first_found= 0;
2184 :
2185 4105 : for (block= first_block, end_block= first_block+count ;
2186 11479 : block < end_block; block++)
2187 : {
2188 : /* The following is only false for marker blocks */
2189 7374 : if (likely(block->used & BLOCKUSED_USED))
2190 : {
2191 7374 : uint page_count= block->page_count;
2192 7374 : DBUG_ASSERT(page_count != 0);
2193 7374 : page_store(to, block->page);
2194 7374 : if (block->sub_blocks)
2195 : {
2196 : /*
2197 : Set a bit so that we later know that this was the first block
2198 : for a blob
2199 : */
2200 4105 : page_count|= START_EXTENT_BIT;
2201 : }
2202 7374 : pagerange_store(to + PAGE_STORE_SIZE, page_count);
2203 7374 : to+= ROW_EXTENT_SIZE;
2204 7374 : if (!first_found)
2205 : {
2206 4105 : first_found= 1;
2207 4105 : to= row_extents_second_part;
2208 : }
2209 : }
2210 : }
2211 4105 : copy_length= (count - 1) * ROW_EXTENT_SIZE;
2212 : /*
2213 :     In some unlikely cases we have allocated too many blocks. Clear this
2214 : data.
2215 : */
2216 4105 : bzero(to, (size_t) (row_extents_second_part + copy_length - to));
2217 : }
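
/*
  Decoding sketch (not part of the original source): reading back one packed
  extent written by store_extent_info() above. extent_to_bitmap_blocks()
  below does this for real; this only isolates the flag handling. The
  function name and the #ifdef guard are hypothetical.
*/
#ifdef MARIA_BLOCKREC_EXAMPLES
static void example_read_extent(const uchar *extent_info,
                                pgcache_page_no_t *page, uint *count,
                                my_bool *start_of_blob, my_bool *is_tail)
{
  uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);

  *page= page_korr(extent_info);
  *start_of_blob= (page_count & START_EXTENT_BIT) != 0;
  *is_tail=       (page_count & TAIL_BIT) != 0;
  /*
    With the flag bits stripped, 'count' is the number of full pages in the
    extent or, for a tail extent, the directory entry of the tail row
  */
  *count= page_count & ~(START_EXTENT_BIT | TAIL_BIT);
}
#endif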
2218 :
2219 :
2220 : /**
2221 : @brief
2222 : Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
2223 : for write_block_record
2224 :
2225 : @note
2226 : In case of blobs, this function marks all the blob pages in the bitmap
2227 : as full pages. The bitmap bits for other pages will be marked
2228 : when write_block_record() calls _ma_bitmap_release_unused().
2229 :
2230 :   This function will be removed in Maria 2.0 when, instead of deleting rows,
2231 :   we mark them as deleted and only remove them after commit.
2232 :
2233 : @return
2234 : @retval 0 ok
2235 : @retval 1 Error (out of memory or disk error changing bitmap)
2236 : */
2237 :
2238 : static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
2239 : MARIA_BITMAP_BLOCKS *blocks,
2240 : pgcache_page_no_t head_page,
2241 : uint extent_count,
2242 : const uchar *extent_info)
2243 15585 : {
2244 : MARIA_BITMAP_BLOCK *block, *start_block;
2245 15585 : MARIA_SHARE *share= info->s;
2246 : uint i;
2247 15585 : DBUG_ENTER("extent_to_bitmap_blocks");
2248 :
2249 15585 : if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
2250 0 : DBUG_RETURN(1);
2251 15585 : block= blocks->block= dynamic_element(&info->bitmap_blocks, 0,
2252 : MARIA_BITMAP_BLOCK*);
2253 15585 : blocks->count= extent_count + 1;
2254 15585 : blocks->tail_page_skipped= blocks->page_skipped= 0;
2255 15585 : block->page= head_page;
2256 15585 : block->page_count= 1;
2257 15585 : block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
2258 : /* Impossible value, will force storage of real value */
2259 15585 : block->org_bitmap_value= 255;
2260 :
2261 15585 : start_block= block++;
2262 15585 : for (i=0 ;
2263 32449 : i++ < extent_count ;
2264 1279 : block++, extent_info+= ROW_EXTENT_SIZE)
2265 : {
2266 1279 : uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
2267 1279 : if (page_count & START_EXTENT_BIT)
2268 : {
2269 705 : page_count&= ~START_EXTENT_BIT;
2270 705 : start_block->sub_blocks= (uint) (block - start_block);
2271 705 : start_block= block;
2272 :
2273 : }
2274 1279 : block->page= page_korr(extent_info);
2275 1279 : block->page_count= page_count;
2276 1279 : block->sub_blocks= 0;
2277 :
2278 1279 : if (page_count & TAIL_BIT)
2279 : {
2280 582 : block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
2281 : block->page);
2282 582 : block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
2283 : BLOCKUSED_USE_ORG_BITMAP);
2284 : }
2285 : else
2286 : {
2287 : my_bool res;
2288 697 : pthread_mutex_lock(&share->bitmap.bitmap_lock);
2289 697 : res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
2290 : block->page, block->page_count);
2291 697 : pthread_mutex_unlock(&share->bitmap.bitmap_lock);
2292 697 : if (res)
2293 0 : DBUG_RETURN(1);
2294 697 : block->used= BLOCKUSED_USED;
2295 : }
2296 : }
2297 15585 : start_block->sub_blocks= (uint) (block - start_block);
2298 15585 : DBUG_RETURN(0);
2299 : }
2300 :
2301 :
2302 : /*
2303 : Free regions of pages with logging
2304 :
2305 : NOTES
2306 : We are removing filler events and tail page events from
2307 :     row->extents to get a smaller log.
2308 :
2309 : RETURN
2310 : 0 ok
2311 : 1 error
2312 : */
2313 :
2314 : static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
2315 3505 : {
2316 : uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
2317 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
2318 : LSN lsn;
2319 : size_t extents_length;
2320 3505 : uchar *extents= row->extents;
2321 3505 : DBUG_ENTER("free_full_pages");
2322 :
2323 3505 : if (info->s->now_transactional)
2324 : {
2325 : /* Compact events by removing filler and tail events */
2326 2920 : uchar *new_block= 0;
2327 : uchar *end, *to, *compact_extent_info;
2328 : my_bool res;
2329 : uint extents_count;
2330 :
2331 2920 : if (!(compact_extent_info= my_alloca(row->extents_count *
2332 : ROW_EXTENT_SIZE)))
2333 0 : DBUG_RETURN(1);
2334 :
2335 2920 : to= compact_extent_info;
2336 2920 : for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
2337 11117 : extents < end ;
2338 5277 : extents+= ROW_EXTENT_SIZE)
2339 : {
2340 5277 : uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
2341 5277 : page_count&= ~START_EXTENT_BIT;
2342 5277 : if (! (page_count & TAIL_BIT) && page_count != 0)
2343 : {
2344 : /* Found correct extent */
2345 2899 : if (!new_block)
2346 2899 : new_block= extents; /* First extent in range */
2347 : continue;
2348 : }
2349 : /* Found extent to remove, copy everything found so far */
2350 2378 : if (new_block)
2351 : {
2352 2357 : size_t length= (size_t) (extents - new_block);
2353 2357 : memcpy(to, new_block, length);
2354 2357 : to+= length;
2355 2357 : new_block= 0;
2356 : }
2357 : }
2358 2920 : if (new_block)
2359 : {
2360 542 : size_t length= (size_t) (extents - new_block);
2361 542 : memcpy(to, new_block, length);
2362 542 : to+= length;
2363 : }
2364 :
2365 2920 : if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
2366 : {
2367 : /*
2368 :         No ranges. This happens in the rare case when we have allocated
2369 :         space for a blob on a tail page but the blob fit into the main page.
2370 : */
2371 : my_afree(compact_extent_info);
2372 21 : DBUG_RETURN(0);
2373 : }
2374 2899 : extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
2375 2899 : pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
2376 2899 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
2377 2899 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2378 2899 : log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info;
2379 2899 : log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
2380 2899 : res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
2381 : info,
2382 : (translog_size_t) (sizeof(log_data) +
2383 : extents_length),
2384 : TRANSLOG_INTERNAL_PARTS + 2, log_array,
2385 : log_data, NULL);
2386 : my_afree(compact_extent_info);
2387 2899 : if (res)
2388 0 : DBUG_RETURN(1);
2389 : }
2390 :
2391 3484 : DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
2392 : row->extents_count));
2393 : }
2394 :
2395 :
2396 : /*
2397 : Free one page range
2398 :
2399 : NOTES
2400 : This is very similar to free_full_pages()
2401 :
2402 : RETURN
2403 : 0 ok
2404 : 1 error
2405 : */
2406 :
2407 : static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
2408 : uint count)
2409 181 : {
2410 181 : my_bool res= 0;
2411 : uint delete_count;
2412 181 : MARIA_SHARE *share= info->s;
2413 181 : DBUG_ENTER("free_full_page_range");
2414 :
2415 181 : delete_count= count;
2416 181 : if (share->state.state.data_file_length ==
2417 : (page + count) * share->block_size)
2418 : {
2419 : /*
2420 : Don't delete last page from pagecache as this will make the file
2421 : shorter than expected if the last operation extended the file
2422 : */
2423 128 : delete_count--;
2424 : }
2425 181 : if (delete_count &&
2426 : pagecache_delete_pages(share->pagecache, &info->dfile,
2427 : page, delete_count, PAGECACHE_LOCK_WRITE, 0))
2428 0 : res= 1;
2429 :
2430 181 : if (share->now_transactional)
2431 : {
2432 : LSN lsn;
2433 : /** @todo unify log_data's shape with delete_head_or_tail() */
2434 : uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2435 : ROW_EXTENT_SIZE];
2436 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
2437 139 : DBUG_ASSERT(info->trn->rec_lsn);
2438 139 : pagerange_store(log_data + FILEID_STORE_SIZE, 1);
2439 139 : page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
2440 : page);
2441 139 : int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2442 : PAGE_STORE_SIZE, count);
2443 139 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
2444 139 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2445 :
2446 139 : if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
2447 : info->trn, info,
2448 : (translog_size_t) sizeof(log_data),
2449 : TRANSLOG_INTERNAL_PARTS + 1, log_array,
2450 : log_data, NULL))
2451 0 : res= 1;
2452 : }
2453 181 : pthread_mutex_lock(&share->bitmap.bitmap_lock);
2454 181 : if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
2455 0 : res= 1;
2456 181 : pthread_mutex_unlock(&share->bitmap.bitmap_lock);
2457 181 : DBUG_RETURN(res);
2458 : }
2459 :
2460 :
2461 : /**
2462 : @brief Write a record to a (set of) pages
2463 :
2464 : @fn write_block_record()
2465 : @param info Maria handler
2466 : @param old_record Original record in case of update; NULL in case of
2467 : insert
2468 : @param record Record we should write
2469 : @param row Statistics about record (calculated by
2470 : calc_record_size())
2471 : @param map_blocks On which pages the record should be stored
2472 : @param row_pos Position on head page where to put head part of
2473 : record
2474 : @param undo_lsn <> LSN_ERROR if we are executing an UNDO
2475 : @param old_record_checksum Checksum of old_record: ignored if table does
2476 : not have live checksum; otherwise if
2477 : old_record==NULL it must be 0.
2478 :
2479 : @note
2480 : On return all pinned pages are released.
2481 :
2482 : [page_buff + EMPTY_SPACE_OFFSET] is set to
2483 : row_pos->empty_space - head_length
2484 :
2485 : @return Operation status
2486 : @retval 0 OK
2487 : @retval 1 Error
2488 : */
2489 :
2490 : static my_bool write_block_record(MARIA_HA *info,
2491 : const uchar *old_record,
2492 : const uchar *record,
2493 : MARIA_ROW *row,
2494 : MARIA_BITMAP_BLOCKS *bitmap_blocks,
2495 : my_bool head_block_is_read,
2496 : struct st_row_pos_info *row_pos,
2497 : LSN undo_lsn,
2498 : ha_checksum old_record_checksum)
2499 213763 : {
2500 : uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
2501 : uchar *row_extents_first_part, *row_extents_second_part;
2502 : uchar *field_length_data;
2503 : uchar *page_buff;
2504 : MARIA_BITMAP_BLOCK *block, *head_block;
2505 213763 : MARIA_SHARE *share= info->s;
2506 : MARIA_COLUMNDEF *column, *end_column;
2507 : MARIA_PINNED_PAGE page_link;
2508 : uint block_size, flag, head_length;
2509 : ulong *blob_lengths;
2510 : my_bool row_extents_in_use, blob_full_pages_exists;
2511 : LSN lsn;
2512 : my_off_t position;
2513 : uint save_my_errno;
2514 213763 : DBUG_ENTER("write_block_record");
2515 :
2516 213763 : LINT_INIT(row_extents_first_part);
2517 213763 : LINT_INIT(row_extents_second_part);
2518 :
2519 213763 : head_block= bitmap_blocks->block;
2520 213763 : block_size= share->block_size;
2521 :
2522 213763 : page_buff= row_pos->buff;
2523 : /* Position on head page where we should store the head part */
2524 213763 : data= row_pos->data;
2525 213763 : end_of_data= data + row_pos->length;
2526 :
2527 : /* Write header */
2528 213763 : flag= info->row_flag;
2529 213763 : row_extents_in_use= 0;
2530 213763 : if (unlikely(row->total_length > row_pos->length))
2531 : {
2532 : /* Need extent */
2533 4105 : DBUG_ASSERT(bitmap_blocks->count > 1);
2534 4105 : if (bitmap_blocks->count <= 1)
2535 4105 : goto crashed; /* Wrong in bitmap */
2536 4105 : flag|= ROW_FLAG_EXTENTS;
2537 4105 : row_extents_in_use= 1;
2538 : }
2539 : /* For now we have only a minimum header */
2540 213763 : *data++= (uchar) flag;
2541 213763 : if (flag & ROW_FLAG_TRANSID)
2542 : {
2543 48041 : transid_store(data, info->trn->trid);
2544 48041 : data+= TRANSID_SIZE;
2545 : }
2546 :
2547 213763 : if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
2548 0 : *data++= (uchar) (share->base.null_bytes -
2549 : share->base.original_null_bytes);
2550 213763 : if (row_extents_in_use)
2551 : {
2552 : /* Store first extent in header */
2553 4105 : store_key_length_inc(data, bitmap_blocks->count - 1);
2554 4105 : row_extents_first_part= data;
2555 4105 : data+= ROW_EXTENT_SIZE;
2556 : }
2557 213763 : if (share->base.max_field_lengths)
2558 213202 : store_key_length_inc(data, row->field_lengths_length);
2559 213763 : if (share->calc_checksum)
2560 : {
2561 98062 : *(data++)= (uchar) (row->checksum); /* store least significant byte */
2562 98062 : DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
2563 : }
2564 213763 : memcpy(data, record, share->base.null_bytes);
2565 213763 : data+= share->base.null_bytes;
2566 213763 : memcpy(data, row->empty_bits, share->base.pack_bytes);
2567 213763 : data+= share->base.pack_bytes;
2568 :
2569 213763 : DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2570 : (uint) (data - row_pos->data) == row->min_length);
2571 :
2572 : /*
2573 :     Allocate a buffer for the rest of the data (except blobs)
2574 :
2575 :     To avoid double copying of data, we copy as many columns as fit into
2576 :     the page. The rest goes into info->packed_row.
2577 :
2578 :     Using an extra buffer, instead of doing continuous writes to different
2579 :     pages, uses less code and means we don't have to do a complex call
2580 :     for every data segment we want to store.
2581 : */
2582 213763 : if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
2583 : row->head_length))
2584 0 : DBUG_RETURN(1);
2585 :
2586 213763 : tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */
2587 213763 : tmp_data= data;
2588 :
2589 213763 : if (row_extents_in_use)
2590 : {
2591 4105 : uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
2592 4105 : if (!tmp_data_used && tmp_data + copy_length > end_of_data)
2593 : {
2594 0 : tmp_data_used= tmp_data;
2595 0 : tmp_data= info->rec_buff;
2596 : }
2597 4105 : row_extents_second_part= tmp_data;
2598 : /*
2599 : We will copy the extents here when we have figured out the tail
2600 : positions.
2601 : */
2602 4105 : tmp_data+= copy_length;
2603 : }
2604 :
2605 : /* Copy fields that has fixed lengths (primary key etc) */
2606 : for (column= share->columndef,
2607 213763 : end_column= column + share->base.fixed_not_null_fields;
2608 1037765 : column < end_column; column++)
2609 : {
2610 824002 : if (!tmp_data_used && tmp_data + column->length > end_of_data)
2611 : {
2612 0 : tmp_data_used= tmp_data;
2613 0 : tmp_data= info->rec_buff;
2614 : }
2615 824002 : memcpy(tmp_data, record + column->offset, column->length);
2616 824002 : tmp_data+= column->length;
2617 : }
2618 :
2619 : /* Copy length of data for variable length fields */
2620 213763 : if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
2621 : {
2622 0 : tmp_data_used= tmp_data;
2623 0 : tmp_data= info->rec_buff;
2624 : }
2625 213763 : field_length_data= row->field_lengths;
2626 213763 : memcpy(tmp_data, field_length_data, row->field_lengths_length);
2627 213763 : tmp_data+= row->field_lengths_length;
2628 :
2629 213763 : DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2630 : (uint) (tmp_data - row_pos->data) == row->min_length +
2631 : share->base.fixed_not_null_fields_length +
2632 : row->field_lengths_length);
2633 :
2634 : /* Copy variable length fields and fields with null/zero */
2635 213763 : for (end_column= share->columndef + share->base.fields - share->base.blobs;
2636 829762 : column < end_column ;
2637 402236 : column++)
2638 : {
2639 : const uchar *field_pos;
2640 : ulong length;
2641 402236 : if ((record[column->null_pos] & column->null_bit) ||
2642 : (row->empty_bits[column->empty_pos] & column->empty_bit))
2643 : continue;
2644 :
2645 398727 : field_pos= record + column->offset;
2646 398727 : switch (column->type) {
2647 : case FIELD_NORMAL: /* Fixed length field */
2648 : case FIELD_SKIP_PRESPACE:
2649 : case FIELD_SKIP_ZERO: /* Fixed length field */
2650 196350 : length= column->length;
2651 196350 : break;
2652 : case FIELD_SKIP_ENDSPACE: /* CHAR */
2653 : /* Char that is space filled */
2654 201144 : if (column->length <= 255)
2655 201144 : length= (uint) (uchar) *field_length_data++;
2656 : else
2657 : {
2658 0 : length= uint2korr(field_length_data);
2659 0 : field_length_data+= 2;
2660 : }
2661 : break;
2662 : case FIELD_VARCHAR:
2663 1233 : if (column->length <= 256)
2664 : {
2665 915 : length= (uint) (uchar) *field_length_data++;
2666 915 : field_pos++; /* Skip length uchar */
2667 : }
2668 : else
2669 : {
2670 318 : length= uint2korr(field_length_data);
2671 318 : field_length_data+= 2;
2672 318 : field_pos+= 2;
2673 : }
2674 1233 : DBUG_ASSERT(length <= column->length);
2675 : break;
2676 : default: /* Wrong data */
2677 0 : DBUG_ASSERT(0);
2678 : length=0;
2679 : break;
2680 : }
2681 398727 : if (!tmp_data_used && tmp_data + length > end_of_data)
2682 : {
2683 : /* Data didn't fit in page; Change to use tmp buffer */
2684 0 : tmp_data_used= tmp_data;
2685 0 : tmp_data= info->rec_buff;
2686 : }
2687 398727 : memcpy((char*) tmp_data, field_pos, length);
2688 398727 : tmp_data+= length;
2689 : }
2690 :
2691 213763 : block= head_block + head_block->sub_blocks; /* Point to first blob data */
2692 :
2693 213763 : end_column= column + share->base.blobs;
2694 213763 : blob_lengths= row->blob_lengths;
2695 213763 : if (!tmp_data_used)
2696 : {
2697 :     /* Still room on page; Copy as many blobs as we can into this page */
2698 213763 : data= tmp_data;
2699 479176 : for (; column < end_column &&
2700 : *blob_lengths <= (ulong)(end_of_data - data);
2701 51650 : column++, blob_lengths++)
2702 : {
2703 : uchar *tmp_pos;
2704 : uint length;
2705 51650 : if (!*blob_lengths) /* Null or "" */
2706 7758 : continue;
2707 7758 : length= column->length - portable_sizeof_char_ptr;
2708 7758 : memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length,
2709 : sizeof(char*));
2710 7758 : memcpy(data, tmp_pos, *blob_lengths);
2711 7758 : data+= *blob_lengths;
2712 : /* Skip over tail page that was to be used to store blob */
2713 7758 : block++;
2714 7758 : bitmap_blocks->tail_page_skipped= 1;
2715 : }
2716 213763 : if (head_block->sub_blocks > 1)
2717 : {
2718 :       /* We have allocated pages that were not used */
2719 0 : bitmap_blocks->page_skipped= 1;
2720 : }
2721 : }
2722 : else
2723 0 : data= tmp_data_used; /* Get last used on page */
2724 :
2725 : /* Update page directory */
2726 213763 : head_length= (uint) (data - row_pos->data);
2727 213763 : DBUG_PRINT("info", ("Used head length on page: %u", head_length));
2728 213763 : DBUG_ASSERT(data <= end_of_data);
2729 213763 : if (head_length < share->base.min_block_length)
2730 : {
2731 : /* Extend row to be of size min_block_length */
2732 3591 : uint diff_length= share->base.min_block_length - head_length;
2733 3591 : bzero(data, diff_length);
2734 3591 : data+= diff_length;
2735 3591 : head_length= share->base.min_block_length;
2736 : }
2737 213763 : int2store(row_pos->dir + 2, head_length);
2738 : /* update empty space at start of block */
2739 213763 : row_pos->empty_space-= head_length;
2740 213763 : int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
2741 : /* Mark in bitmaps how the current page was actually used */
2742 213763 : head_block->empty_space= row_pos->empty_space;
2743 213763 : if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
2744 : page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
2745 0 : head_block->empty_space= 0; /* Page is full */
2746 213763 : head_block->used|= BLOCKUSED_USED;
2747 :
2748 213763 : check_directory(page_buff, share->block_size, share->base.min_block_length);
2749 :
2750 : /*
2751 : Now we have to write tail pages, as we need to store the position
2752 : to them in the row extent header.
2753 :
2754 : We first write out all blob tails, to be able to store them in
2755 : the current page or 'tmp_data'.
2756 :
2757 : Then we write the tail of the non-blob fields (The position to the
2758 : tail page is stored either in row header, the extents in the head
2759 : page or in the first full page of the non-blob data. It's never in
2760 : the tail page of the non-blob data)
2761 : */
2762 :
2763 213763 : blob_full_pages_exists= 0;
2764 213763 : if (row_extents_in_use)
2765 : {
2766 4105 : if (column != end_column) /* If blob fields */
2767 : {
2768 4105 : MARIA_COLUMNDEF *save_column= column;
2769 4105 : MARIA_BITMAP_BLOCK *save_block= block;
2770 : MARIA_BITMAP_BLOCK *end_block;
2771 4105 : ulong *save_blob_lengths= blob_lengths;
2772 :
2773 8210 : for (; column < end_column; column++, blob_lengths++)
2774 : {
2775 : uchar *blob_pos;
2776 4105 : if (!*blob_lengths) /* Null or "" */
2777 4105 : continue;
2778 4105 : if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
2779 : {
2780 : uint length;
2781 3298 : length= column->length - portable_sizeof_char_ptr;
2782 3298 : memcpy_fixed((uchar *) &blob_pos, record + column->offset + length,
2783 : sizeof(char*));
2784 3298 : length= *blob_lengths % FULL_PAGE_SIZE(block_size); /* tail size */
2785 3298 : if (length != *blob_lengths)
2786 3269 : blob_full_pages_exists= 1;
2787 3298 : if (write_tail(info, block + block->sub_blocks-1,
2788 : blob_pos + *blob_lengths - length,
2789 : length))
2790 : goto disk_err;
2791 : }
2792 : else
2793 807 : blob_full_pages_exists= 1;
2794 :
2795 11479 : for (end_block= block + block->sub_blocks; block < end_block; block++)
2796 : {
2797 : /*
2798 :           Set only a bit, to not cause the bitmap code to believe a block is
2799 :           full when there are still a lot of entries in it
2800 : */
2801 7374 : block->used|= BLOCKUSED_USED;
2802 : }
2803 : }
2804 4105 : column= save_column;
2805 4105 : block= save_block;
2806 4105 : blob_lengths= save_blob_lengths;
2807 : }
2808 :
2809 4105 : if (tmp_data_used) /* non blob data overflows */
2810 : {
2811 : MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
2812 0 : MARIA_BITMAP_BLOCK *head_tail_block= 0;
2813 : ulong length;
2814 0 : ulong data_length= (ulong) (tmp_data - info->rec_buff);
2815 :
2816 : #ifdef SANITY_CHECKS
2817 0 : DBUG_ASSERT(head_block->sub_blocks != 1);
2818 0 : if (head_block->sub_blocks == 1)
2819 0 : goto crashed; /* no reserved full or tails */
2820 : #endif
2821 : /*
2822 : Find out where to write tail for non-blob fields.
2823 :
2824 : Problem here is that the bitmap code may have allocated more
2825 : space than we need. We have to handle the following cases:
2826 :
2827 : - Bitmap code allocated a tail page we don't need.
2828 : - The last full page allocated needs to be changed to a tail page
2829 :         (Because we were able to put more data on the head page than
2830 : the bitmap allocation assumed)
2831 :
2832 : The reserved pages in bitmap_blocks for the main page has one of
2833 : the following allocations:
2834 : - Full pages, with following blocks:
2835 : # * full pages
2836 : empty page ; To be used if we change last full to tail page. This
2837 : has 'count' = 0.
2838 : tail page (optional, if last full page was part full)
2839 : - One tail page
2840 : */
2841 :
2842 0 : cur_block= head_block + 1;
2843 0 : end_block= head_block + head_block->sub_blocks;
2844 : /*
2845 :       Loop until we find a block bigger than we need or
2846 : we find the empty page block.
2847 : */
2848 0 : while (data_length >= (length= (cur_block->page_count *
2849 : FULL_PAGE_SIZE(block_size))) &&
2850 : cur_block->page_count)
2851 : {
2852 : #ifdef SANITY_CHECKS
2853 0 : DBUG_ASSERT(!((cur_block == end_block) ||
2854 : (cur_block->used & BLOCKUSED_USED)));
2855 0 : if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
2856 : goto crashed;
2857 : #endif
2858 0 : data_length-= length;
2859 0 : (cur_block++)->used|= BLOCKUSED_USED;
2860 : }
2861 0 : last_head_block= cur_block;
2862 0 : if (data_length)
2863 : {
2864 0 : if (cur_block->page_count == 0)
2865 : {
2866 : /* Skip empty filler block */
2867 0 : cur_block++;
2868 : }
2869 : #ifdef SANITY_CHECKS
2870 0 : DBUG_ASSERT(!(cur_block >= end_block));
2871 0 : if ((cur_block >= end_block))
2872 0 : goto crashed;
2873 : #endif
2874 0 : if (cur_block->used & BLOCKUSED_TAIL)
2875 : {
2876 0 : DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
2877 : /* tail written to full tail page */
2878 0 : cur_block->used|= BLOCKUSED_USED;
2879 0 : head_tail_block= cur_block;
2880 : }
2881 0 : else if (data_length > length - MAX_TAIL_SIZE(block_size))
2882 : {
2883 : /* tail written to full page */
2884 0 : cur_block->used|= BLOCKUSED_USED;
2885 0 : if ((cur_block != end_block - 1) &&
2886 : (end_block[-1].used & BLOCKUSED_TAIL))
2887 0 : bitmap_blocks->tail_page_skipped= 1;
2888 : }
2889 : else
2890 : {
2891 : /*
2892 : cur_block is a full block, followed by an empty and optional
2893 : tail block. Change cur_block to a tail block or split it
2894 : into full blocks and tail blocks.
2895 :
2896 : TODO:
2897 : If there is enough space on the following tail block, use
2898 : this instead of creating a new tail block.
2899 : */
2900 0 : DBUG_ASSERT(cur_block[1].page_count == 0);
2901 0 : if (cur_block->page_count == 1)
2902 : {
2903 : /* convert full block to tail block */
2904 0 : cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
2905 0 : head_tail_block= cur_block;
2906 : }
2907 : else
2908 : {
2909 0 : DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size));
2910 0 : DBUG_PRINT("info", ("Splitting blocks into full and tail"));
2911 0 : cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
2912 0 : cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */
2913 0 : cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
2914 0 : cur_block->page_count--;
2915 0 : cur_block->used|= BLOCKUSED_USED;
2916 0 : last_head_block= head_tail_block= cur_block+1;
2917 : }
2918 0 : if (end_block[-1].used & BLOCKUSED_TAIL)
2919 0 : bitmap_blocks->tail_page_skipped= 1;
2920 : }
2921 : }
2922 : else
2923 : {
2924 : /* Must be an empty or tail page */
2925 0 : DBUG_ASSERT(cur_block->page_count == 0 ||
2926 : cur_block->used & BLOCKUSED_TAIL);
2927 0 : if (end_block[-1].used & BLOCKUSED_TAIL)
2928 0 : bitmap_blocks->tail_page_skipped= 1;
2929 : }
2930 :
2931 : /*
2932 : Write all extents into page or tmp_data
2933 :
2934 : Note that we still don't have a correct position for the tail
2935 : of the non-blob fields.
2936 : */
2937 0 : store_extent_info(row_extents_first_part,
2938 : row_extents_second_part,
2939 : head_block+1, bitmap_blocks->count - 1);
2940 0 : if (head_tail_block)
2941 : {
2942 0 : ulong block_length= (ulong) (tmp_data - info->rec_buff);
2943 : uchar *extent_data;
2944 :
2945 0 : length= (uint) (block_length % FULL_PAGE_SIZE(block_size));
2946 0 : if (write_tail(info, head_tail_block,
2947 : info->rec_buff + block_length - length,
2948 : length))
2949 0 : goto disk_err;
2950 0 : tmp_data-= length; /* Remove the tail */
2951 0 : if (tmp_data == info->rec_buff)
2952 : {
2953 : /* We have no full blocks to write for the head part */
2954 0 : tmp_data_used= 0;
2955 : }
2956 :
2957 : /* Store the tail position for the non-blob fields */
2958 0 : if (head_tail_block == head_block + 1)
2959 : {
2960 : /*
2961 : We had a head block + tail block, which means that the
2962 : tail block is the first extent
2963 : */
2964 0 : extent_data= row_extents_first_part;
2965 : }
2966 : else
2967 : {
2968 : /*
2969 : We have a head block + some full blocks + tail block
2970 : last_head_block is pointing after the last used extent
2971 : for the head block.
2972 : */
2973 0 : extent_data= row_extents_second_part +
2974 : ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
2975 : }
2976 0 : DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT);
2977 0 : page_store(extent_data, head_tail_block->page);
2978 0 : int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count);
2979 : }
2980 : }
2981 : else
2982 4105 : store_extent_info(row_extents_first_part,
2983 : row_extents_second_part,
2984 : head_block+1, bitmap_blocks->count - 1);
2985 : }
2986 :
2987 213763 : if (share->now_transactional)
2988 : {
2989 : uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
2990 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
2991 :
2992 : /* Log REDO changes of head page */
2993 153291 : page_store(log_data + FILEID_STORE_SIZE, head_block->page);
2994 153291 : dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
2995 : row_pos->rownr);
2996 153291 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
2997 153291 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2998 153291 : log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data;
2999 153291 : log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
3000 153291 : if (translog_write_record(&lsn,
3001 : head_block_is_read ?
3002 : LOGREC_REDO_INSERT_ROW_HEAD :
3003 : LOGREC_REDO_NEW_ROW_HEAD,
3004 : info->trn,
3005 : info,
3006 : (translog_size_t) (sizeof(log_data) +
3007 : head_length),
3008 : TRANSLOG_INTERNAL_PARTS + 2, log_array,
3009 : log_data, NULL))
3010 213763 : goto disk_err;
3011 : }
3012 :
3013 : #ifdef RECOVERY_EXTRA_DEBUG
3014 : if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
3015 : {
3016 : /* Stop right after the REDO; testing incomplete log record groups */
3017 : DBUG_EXECUTE_IF("maria_flush_whole_log",
3018 : {
3019 : DBUG_PRINT("maria_flush_whole_log", ("now"));
3020 : translog_flush(translog_get_horizon());
3021 : });
3022 : DBUG_EXECUTE_IF("maria_crash",
3023 : { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
3024 : }
3025 : #endif
3026 :
3027 : /* Increase data file size, if extended */
3028 213763 : position= (my_off_t) head_block->page * block_size;
3029 213763 : if (share->state.state.data_file_length <= position)
3030 2378 : _ma_set_share_data_file_length(share, position + block_size);
3031 :
3032 213763 : if (head_block_is_read)
3033 : {
3034 : MARIA_PINNED_PAGE *page_link;
3035 : /* Head page is always the first pinned page */
3036 210501 : page_link= dynamic_element(&info->pinned_pages, 0,
3037 : MARIA_PINNED_PAGE*);
3038 210501 : pagecache_unlock_by_link(share->pagecache, page_link->link,
3039 : PAGECACHE_LOCK_WRITE_TO_READ,
3040 : PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
3041 : LSN_IMPOSSIBLE, 1, FALSE);
3042 210501 : page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
3043 210501 : page_link->changed= 1;
3044 : }
3045 : else
3046 : {
3047 3262 : if (pagecache_write(share->pagecache,
3048 : &info->dfile, head_block->page, 0,
3049 : page_buff, share->page_type,
3050 : head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
3051 : PAGECACHE_LOCK_READ,
3052 : head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
3053 : PAGECACHE_PIN,
3054 : PAGECACHE_WRITE_DELAY, &page_link.link,
3055 : LSN_IMPOSSIBLE))
3056 3262 : goto disk_err;
3057 3262 : page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
3058 3262 : page_link.changed= 1;
3059 3262 : push_dynamic(&info->pinned_pages, (void*) &page_link);
3060 : }
3061 :
3062 213763 : if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
3063 : {
3064 : /*
3065 : Log REDO writes for all full pages (head part and all blobs)
3066 : We write all here to be able to generate the UNDO record early
3067 : so that we can write the LSN for the UNDO record to all full pages.
3068 : */
3069 : uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3070 : (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
3071 : ROW_EXTENTS_ON_STACK];
3072 : uchar *log_data, *log_pos;
3073 : LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
3074 : ROW_EXTENTS_ON_STACK];
3075 : LEX_CUSTRING *log_array_pos, *log_array;
3076 : int error;
3077 3301 : translog_size_t log_entry_length= 0;
3078 3301 : uint ext_length, extents= 0, sub_extents= 0;
3079 :
3080 : /* If few extents, then allocate things on stack to avoid a malloc call */
3081 3301 : if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
3082 : {
3083 3301 : log_array= tmp_log_array;
3084 3301 : log_data= tmp_log_data;
3085 : }
3086 : else
3087 : {
3088 0 : if (!my_multi_malloc(MY_WME, &log_array,
3089 : (uint) ((bitmap_blocks->count +
3090 : TRANSLOG_INTERNAL_PARTS + 2) *
3091 : sizeof(*log_array)),
3092 : &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3093 : bitmap_blocks->count * (ROW_EXTENT_SIZE +
3094 : BLOCK_FILLER_SIZE +
3095 : SUB_RANGE_SIZE),
3096 : NullS))
3097 3301 : goto disk_err;
3098 : }
3099 3301 : log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
3100 3301 : log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
3101 :
3102 3301 : if (tmp_data_used)
3103 : {
3104 : /* Full head page */
3105 : translog_size_t block_length= (translog_size_t) (tmp_data -
3106 0 : info->rec_buff);
3107 0 : log_pos= store_page_range(log_pos, head_block+1, block_size,
3108 : (ulong) block_length, &extents);
3109 0 : log_array_pos->str= info->rec_buff;
3110 0 : log_array_pos->length= block_length;
3111 0 : log_entry_length+= block_length;
3112 0 : log_array_pos++;
3113 0 : sub_extents++;
3114 : }
3115 3301 : if (blob_full_pages_exists)
3116 : {
3117 3301 : MARIA_COLUMNDEF *tmp_column= column;
3118 3301 : ulong *tmp_blob_lengths= blob_lengths;
3119 3301 : MARIA_BITMAP_BLOCK *tmp_block= block;
3120 :
3121 : /* Full blob pages */
3122 6602 : for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
3123 : {
3124 : ulong blob_length;
3125 : uint length;
3126 :
3127 3301 : if (!*tmp_blob_lengths) /* Null or "" */
3128 3301 : continue;
3129 3301 : blob_length= *tmp_blob_lengths;
3130 3301 : length= tmp_column->length - portable_sizeof_char_ptr;
3131 : /*
3132 :             If the last part of the blob was on a tail page, change
3133 :             blob_length to reflect this
3134 : */
3135 3301 : if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3136 2659 : blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
3137 3301 : if (blob_length)
3138 : {
3139 3301 : memcpy_fixed((uchar*) &log_array_pos->str,
3140 : record + tmp_column->offset + length,
3141 : sizeof(uchar*));
3142 3301 : log_array_pos->length= blob_length;
3143 3301 : log_entry_length+= blob_length;
3144 3301 : log_array_pos++;
3145 3301 : sub_extents++;
3146 :
3147 3301 : log_pos= store_page_range(log_pos, tmp_block, block_size,
3148 : blob_length, &extents);
3149 : }
3150 3301 : tmp_block+= tmp_block->sub_blocks;
3151 : }
3152 : }
3153 :
3154 3301 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3155 3301 : ext_length= (uint) (log_pos - log_data);
3156 3301 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
3157 3301 : pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
3158 3301 : pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
3159 : sub_extents);
3160 :
3161 3301 : log_entry_length+= ext_length;
3162 : /* trn->rec_lsn is already set earlier in this function */
3163 3301 : error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
3164 : info->trn, info, log_entry_length,
3165 : (uint) (log_array_pos - log_array),
3166 : log_array, log_data, NULL);
3167 3301 : if (log_array != tmp_log_array)
3168 0 : my_free(log_array, MYF(0));
3169 3301 : if (error)
3170 213763 : goto disk_err;
3171 : }
3172 :
3173 : /* Write UNDO or CLR record */
3174 213763 : lsn= LSN_IMPOSSIBLE;
3175 213763 : if (share->now_transactional)
3176 : {
3177 153291 : LEX_CUSTRING *log_array= info->log_row_parts;
3178 :
3179 153291 : if (undo_lsn != LSN_ERROR)
3180 : {
3181 : /*
3182 : Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
3183 : in the first case, Recovery, when it sees the CLR_END in the
3184 : REDO phase, may decrement the records' count.
3185 : */
3186 15585 : if (_ma_write_clr(info, undo_lsn,
3187 : old_record ? LOGREC_UNDO_ROW_UPDATE :
3188 : LOGREC_UNDO_ROW_DELETE,
3189 : share->calc_checksum != 0,
3190 : row->checksum - old_record_checksum,
3191 : &lsn, (void*) 0))
3192 : goto disk_err;
3193 : }
3194 : else
3195 : {
3196 : uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
3197 : PAGE_STORE_SIZE + DIRPOS_STORE_SIZE +
3198 : HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
3199 : ROW_EXTENT_SIZE];
3200 : ha_checksum checksum_delta;
3201 :
3202 :       /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share the same header */
3203 137706 : lsn_store(log_data, info->trn->undo_lsn);
3204 137706 : page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
3205 : head_block->page);
3206 137706 : dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3207 : PAGE_STORE_SIZE,
3208 : row_pos->rownr);
3209 :
3210 137706 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3211 137706 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length=
3212 : (LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
3213 : DIRPOS_STORE_SIZE);
3214 137706 : store_checksum_in_rec(share, checksum_delta,
3215 : row->checksum - old_record_checksum,
3216 : log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3217 : PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
3218 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length);
3219 : compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
3220 :
3221 137706 : if (!old_record)
3222 : {
3223 : /* Store undo_lsn in case we are aborting the insert */
3224 128018 : row->orig_undo_lsn= info->trn->undo_lsn;
3225 : /* Write UNDO log record for the INSERT */
3226 128018 : if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
3227 : info->trn, info,
3228 : (translog_size_t)
3229 : log_array[TRANSLOG_INTERNAL_PARTS +
3230 : 0].length,
3231 : TRANSLOG_INTERNAL_PARTS + 1,
3232 : log_array,
3233 : log_data + LSN_STORE_SIZE, &checksum_delta))
3234 : goto disk_err;
3235 : }
3236 : else
3237 : {
3238 : /* Write UNDO log record for the UPDATE */
3239 : uchar *log_pos= (log_data +
3240 9688 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length);
3241 : size_t row_length, extents_length;
3242 : uint row_parts_count;
3243 :
3244 : /*
3245 :             Write head length and extents of the original row so that
3246 :             during UNDO we can put it back in its original position
3247 : */
3248 9688 : int2store(log_pos, info->cur_row.head_length);
3249 9688 : pagerange_store(log_pos + 2, info->cur_row.extents_count);
3250 9688 : log_pos+= 2 + PAGERANGE_STORE_SIZE;
3251 9688 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
3252 : PAGERANGE_STORE_SIZE);
3253 9688 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
3254 : info->cur_row.extents;
3255 9688 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
3256 : extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
3257 :
3258 9688 : row_length= fill_update_undo_parts(info, old_record, record,
3259 : log_array +
3260 : TRANSLOG_INTERNAL_PARTS + 2,
3261 : &row_parts_count);
3262 9688 : if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
3263 : info,
3264 : (translog_size_t)
3265 : (log_array[TRANSLOG_INTERNAL_PARTS +
3266 : 0].length + extents_length +
3267 : row_length),
3268 : TRANSLOG_INTERNAL_PARTS + 2 +
3269 : row_parts_count,
3270 : log_array,
3271 : log_data + LSN_STORE_SIZE,
3272 : &checksum_delta))
3273 213763 : goto disk_err;
3274 : }
3275 : }
3276 : }
3277 :   /* Release unused space in used pages */
3278 213763 : if (_ma_bitmap_release_unused(info, bitmap_blocks))
3279 213763 : goto disk_err;
3280 213763 : _ma_unpin_all_pages(info, lsn);
3281 :
3282 213763 : if (tmp_data_used)
3283 : {
3284 : /*
3285 : Write data stored in info->rec_buff to pages
3286 : This is the char/varchar data that didn't fit into the head page.
3287 : */
3288 0 : DBUG_ASSERT(bitmap_blocks->count != 0);
3289 0 : if (write_full_pages(info, lsn, head_block + 1,
3290 : info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
3291 : goto disk_err;
3292 : }
3293 :
3294 : /* Write rest of blobs (data, but no tails as they are already written) */
3295 4105 : for (; column < end_column; column++, blob_lengths++)
3296 : {
3297 : uchar *blob_pos;
3298 : uint length;
3299 : ulong blob_length;
3300 4105 : if (!*blob_lengths) /* Null or "" */
3301 4105 : continue;
3302 4105 : length= column->length - portable_sizeof_char_ptr;
3303 4105 : memcpy_fixed((uchar*) &blob_pos, record + column->offset + length,
3304 : sizeof(char*));
3305 : /* remove tail part */
3306 4105 : blob_length= *blob_lengths;
3307 4105 : if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3308 3298 : blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
3309 :
3310 4105 : if (blob_length && write_full_pages(info, lsn, block,
3311 : blob_pos, blob_length))
3312 4105 : goto disk_err;
3313 4105 : block+= block->sub_blocks;
3314 : }
3315 :
3316 213763 : _ma_finalize_row(info);
3317 213763 : DBUG_RETURN(0);
3318 :
3319 0 : crashed:
3320 : /* Something was wrong with data on page */
3321 0 : my_errno= HA_ERR_WRONG_IN_RECORD;
3322 :
3323 0 : disk_err:
3324 : /**
3325 :     @todo RECOVERY we are going to let dirty pages go to disk while we have
3326 :     logged UNDO; this violates WAL. We must mark the table corrupted!
3327 :
3328 :     @todo RECOVERY we have written some REDOs without a closing UNDO;
3329 :     it's possible that a later operation by this transaction succeeds and then
3330 :     Recovery would glue the "orphan REDOs" to the succeeded operation and
3331 :     execute the failed REDOs. We need some "abort this group" mark in the
3332 :     log, or mark the table corrupted (then the user will repair it and thus
3333 :     REDOs will be skipped).
3334 :
3335 :     @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
3336 :     should take a MARIA_HA* as an argument, and if it
3337 :     fails when flushing a page to disk it should call
3338 :     (*the_maria_ha->write_error_func)(the_maria_ha)
3339 :     and this hook will mark the table corrupted.
3340 :     Maybe the hook should be stored in the pagecache's block structure, or in a
3341 :     hash "file->maria_ha*".
3342 :
3343 : @todo RECOVERY we should distinguish below between log write error and
3344 : table write error. The former should stop Maria immediately, the latter
3345 : should mark the table corrupted.
3346 : */
3347 : /*
3348 :     Unpin all pinned pages so that they don't cause problems for the disk
3349 :     cache. This is safe to call even if we already called
3350 :     _ma_unpin_all_pages() above.
3350 : */
3351 0 : save_my_errno= my_errno;
3352 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3353 0 : my_errno= save_my_errno;
3354 0 : DBUG_RETURN(1);
3355 : }
3356 :
3357 :
3358 : /*
3359 :   @brief Allocate space for a record and write it
3360 :
3361 : @fn allocate_and_write_block_record()
3362 : @param info Maria handler
3363 : @param record Record to write
3364 : @param row Information about fields in 'record'
3365 : @param undo_lsn <> LSN_ERROR if we are executing an UNDO
3366 :
3367 : @return
3368 : @retval 0 ok
3369 : @retval 1 Error
3370 : */
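     :
     : /*
     :   Orientation note (added): the rowid stored in row->lastpos below packs the
     :   head page number together with the directory entry number on that page,
     :   via ma_recordpos().  A minimal stand-alone sketch of the idea; the real
     :   macros are defined elsewhere and the 8-bit directory part used here is an
     :   illustrative assumption, not the authoritative layout:
     :
     :     typedef unsigned long long sketch_rowid;
     :
     :     static inline sketch_rowid sketch_make_rowid(unsigned long long page,
     :                                                  unsigned int dir_entry)
     :     {
     :       return (page << 8) | dir_entry;          combine page + entry on page
     :     }
     :     static inline unsigned long long sketch_rowid_page(sketch_rowid pos)
     :     {
     :       return pos >> 8;                         back to the head page number
     :     }
     :     static inline unsigned int sketch_rowid_dir_entry(sketch_rowid pos)
     :     {
     :       return (unsigned int) (pos & 0xff);      back to the directory entry
     :     }
     : */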
3371 :
3372 : static my_bool allocate_and_write_block_record(MARIA_HA *info,
3373 : const uchar *record,
3374 : MARIA_ROW *row,
3375 : LSN undo_lsn)
3376 185370 : {
3377 : struct st_row_pos_info row_pos;
3378 185370 : MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
3379 : int save_my_errno;
3380 185370 : DBUG_ENTER("allocate_and_write_block_record");
3381 :
3382 185370 : _ma_bitmap_flushable(info, 1);
3383 185370 : if (_ma_bitmap_find_place(info, row, blocks))
3384 185370 : goto err; /* Error reading bitmap */
3385 :
3386 : /*
3387 : Sleep; a checkpoint will happen and should not send this over-allocated
3388 : bitmap to disk but rather wait.
3389 : */
3390 185370 : DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););
3391 :
3392 : /* page will be pinned & locked by get_head_or_tail_page */
3393 185370 : if (get_head_or_tail_page(info, blocks->block, info->buff,
3394 : row->space_on_head_page, HEAD_PAGE,
3395 : PAGECACHE_LOCK_WRITE, &row_pos))
3396 185370 : goto err;
3397 185370 : row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
3398 185370 : if (info->s->calc_checksum)
3399 : {
3400 76241 : if (undo_lsn == LSN_ERROR)
3401 76241 : row->checksum= (info->s->calc_checksum)(info, record);
3402 : else
3403 : {
3404 : /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
3405 0 : DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
3406 : }
3407 : }
3408 185370 : if (write_block_record(info, (uchar*) 0, record, row,
3409 : blocks, blocks->block->org_bitmap_value != 0,
3410 : &row_pos, undo_lsn, 0))
3411 185370 :     goto err;                             /* Error writing record */
3412 185370 : DBUG_PRINT("exit", ("rowid: %lu (%lu:%u)", (ulong) row->lastpos,
3413 : (ulong) ma_recordpos_to_page(row->lastpos),
3414 : ma_recordpos_to_dir_entry(row->lastpos)));
3415 : /* Now let checkpoint happen but don't commit */
3416 185370 : DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
3417 185370 : DBUG_RETURN(0);
3418 :
3419 0 : err:
3420 0 : save_my_errno= my_errno;
3421 0 : if (info->non_flushable_state)
3422 0 : _ma_bitmap_flushable(info, -1);
3423 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3424 0 : my_errno= save_my_errno;
3425 0 : DBUG_RETURN(1);
3426 : }
3427 :
3428 :
3429 : /*
3430 : Write a record and return rowid for it
3431 :
3432 : SYNOPSIS
3433 : _ma_write_init_block_record()
3434 : info Maria handler
3435 : record Record to write
3436 :
3437 : NOTES
3438 : This is done BEFORE we write the keys to the row!
3439 :
3440 : RETURN
3441 : HA_OFFSET_ERROR Something went wrong
3442 : # Rowid for row
3443 : */
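     :
     : /*
     :   Hedged caller-side sketch (the real caller is the generic write path,
     :   which does much more around key insertion; 'handler' and 'rec' are
     :   placeholder names and my_errno handling is an assumption):
     :
     :     MARIA_RECORD_POS pos= _ma_write_init_block_record(handler, rec);
     :     if (pos == HA_OFFSET_ERROR)
     :       return 1;                          error; my_errno assumed to be set
     :     ... then write the keys that point at 'pos'; if a duplicate unique key
     :     ... is found, the row is removed again with
     :     ... _ma_write_abort_block_record()
     : */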
3444 :
3445 : MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
3446 : const uchar *record)
3447 185370 : {
3448 185370 : DBUG_ENTER("_ma_write_init_block_record");
3449 :
3450 185370 : calc_record_size(info, record, &info->cur_row);
3451 185370 : if (allocate_and_write_block_record(info, record,
3452 : &info->cur_row, LSN_ERROR))
3453 0 : DBUG_RETURN(HA_OFFSET_ERROR);
3454 185370 : DBUG_RETURN(info->cur_row.lastpos);
3455 : }
3456 :
3457 :
3458 : /*
3459 : Dummy function for (*info->s->write_record)()
3460 :
3461 : Nothing to do here, as we already wrote the record in
3462 : _ma_write_init_block_record()
3463 : */
3464 :
3465 : my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
3466 : const uchar *record __attribute__ ((unused)))
3467 116072 : {
3468 116072 : return 0; /* Row already written */
3469 : }
3470 :
3471 :
3472 : /**
3473 : @brief Remove row written by _ma_write_block_record() and log undo
3474 :
3475 : @param info Maria handler
3476 :
3477 : @note
3478 : This is called in case we got a duplicate unique key while
3479 : writing keys.
3480 :
3481 : @return Operation status
3482 : @retval 0 OK
3483 : @retval 1 Error
3484 : */
3485 :
3486 : my_bool _ma_write_abort_block_record(MARIA_HA *info)
3487 58018 : {
3488 58018 : my_bool res= 0;
3489 58018 : MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3490 : MARIA_BITMAP_BLOCK *block, *end;
3491 58018 : LSN lsn= LSN_IMPOSSIBLE;
3492 58018 : MARIA_SHARE *share= info->s;
3493 58018 : DBUG_ENTER("_ma_write_abort_block_record");
3494 :
3495 58018 : _ma_bitmap_flushable(info, 1);
3496 58018 : if (delete_head_or_tail(info,
3497 : ma_recordpos_to_page(info->cur_row.lastpos),
3498 : ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
3499 : 0))
3500 0 : res= 1;
3501 116340 : for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
3502 304 : block++)
3503 : {
3504 304 : if (block->used & BLOCKUSED_USED)
3505 : {
3506 304 : if (block->used & BLOCKUSED_TAIL)
3507 : {
3508 : /*
3509 : block->page_count is set to the tail directory entry number in
3510 : write_block_record()
3511 : */
3512 123 : if (delete_head_or_tail(info, block->page,
3513 : block->page_count & ~TAIL_BIT,
3514 : 0, 0))
3515 0 : res= 1;
3516 : }
3517 : else
3518 : {
3519 181 : if (free_full_page_range(info, block->page, block->page_count))
3520 0 : res= 1;
3521 : }
3522 : }
3523 : }
3524 58018 : if (share->now_transactional)
3525 : {
3526 39641 : if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
3527 : LOGREC_UNDO_ROW_INSERT,
3528 : share->calc_checksum != 0,
3529 : (ha_checksum) 0 - info->cur_row.checksum,
3530 : &lsn, (void*) 0))
3531 0 : res= 1;
3532 : }
3533 58018 : _ma_bitmap_flushable(info, -1);
3534 58018 : _ma_unpin_all_pages_and_finalize_row(info, lsn);
3535 58018 : DBUG_RETURN(res);
3536 : }
3537 :
3538 :
3539 : /*
3540 : Update a record
3541 :
3542 : NOTES
3543 :     For the moment, we assume that info->cur_row.extents is always updated
3544 : when a row is read. In the future we may decide to read this on demand
3545 : for rows split into many extents.
3546 : */
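     :
     : /*
     :   Added sketch of the decision made below: an update stays on the original
     :   head page only if the old head entry plus the page's free space can hold
     :   the new row in full (simplified; the code below is authoritative):
     :
     :     org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
     :     if (org_empty_size + cur_row->head_length >= new_row->total_length)
     :       free the old tails and full pages, then rewrite the row in place
     :       on the same head page
     :     else
     :       free the old tails and full pages, then let
     :       _ma_bitmap_find_new_place() allocate extra pages; the head entry on
     :       the original page is kept (and the page possibly compacted)
     : */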
3547 :
3548 : static my_bool _ma_update_block_record2(MARIA_HA *info,
3549 : MARIA_RECORD_POS record_pos,
3550 : const uchar *oldrec,
3551 : const uchar *record,
3552 : LSN undo_lsn)
3553 12808 : {
3554 12808 : MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3555 : uchar *buff;
3556 12808 : MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
3557 : MARIA_PINNED_PAGE page_link;
3558 : uint rownr, org_empty_size, head_length;
3559 12808 : uint block_size= info->s->block_size;
3560 : uchar *dir;
3561 : pgcache_page_no_t page;
3562 : struct st_row_pos_info row_pos;
3563 : my_bool res;
3564 : ha_checksum old_checksum;
3565 12808 : MARIA_SHARE *share= info->s;
3566 12808 : DBUG_ENTER("_ma_update_block_record2");
3567 12808 : DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
3568 :
3569 : #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
3570 : DBUG_DUMP("oldrec", oldrec, share->base.reclength);
3571 : DBUG_DUMP("newrec", record, share->base.reclength);
3572 : #endif
3573 :
3574 : /*
3575 : Checksums of new and old rows were computed by callers already; new
3576 : row's was put into cur_row, old row's was put into new_row.
3577 : */
3578 12808 : old_checksum= new_row->checksum;
3579 12808 : new_row->checksum= cur_row->checksum;
3580 12808 : calc_record_size(info, record, new_row);
3581 12808 : page= ma_recordpos_to_page(record_pos);
3582 :
3583 12808 : _ma_bitmap_flushable(info, 1);
3584 12808 : buff= pagecache_read(share->pagecache,
3585 : &info->dfile, (pgcache_page_no_t) page, 0, 0,
3586 : share->page_type,
3587 : PAGECACHE_LOCK_WRITE, &page_link.link);
3588 12808 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3589 12808 : page_link.changed= buff != 0;
3590 12808 : push_dynamic(&info->pinned_pages, (void*) &page_link);
3591 12808 : if (!buff)
3592 12808 : goto err;
3593 :
3594 12808 : org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
3595 12808 : rownr= ma_recordpos_to_dir_entry(record_pos);
3596 12808 : dir= dir_entry_pos(buff, block_size, rownr);
3597 :
3598 12808 : if ((org_empty_size + cur_row->head_length) >= new_row->total_length)
3599 : {
3600 : uint rec_offset, length;
3601 : MARIA_BITMAP_BLOCK block;
3602 :
3603 : /*
3604 : We can fit the new row in the same page as the original head part
3605 : of the row
3606 : */
3607 12429 : block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
3608 : org_empty_size);
3609 12429 : if (extend_area_on_page(info, buff, dir, rownr, block_size,
3610 : new_row->total_length, &org_empty_size,
3611 : &rec_offset, &length))
3612 12429 : goto err;
3613 :
3614 12429 : row_pos.buff= buff;
3615 12429 : row_pos.rownr= rownr;
3616 12429 : row_pos.empty_space= org_empty_size;
3617 12429 : row_pos.dir= dir;
3618 12429 : row_pos.data= buff + rec_offset;
3619 12429 : row_pos.length= length;
3620 12429 : blocks->block= █
3621 12429 : blocks->count= 1;
3622 12429 : block.page= page;
3623 12429 : block.sub_blocks= 1;
3624 12429 : block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
3625 12429 : block.empty_space= row_pos.empty_space;
3626 :
3627 12429 : if (*cur_row->tail_positions &&
3628 : delete_tails(info, cur_row->tail_positions))
3629 12429 : goto err;
3630 12429 : if (cur_row->extents_count && free_full_pages(info, cur_row))
3631 12429 : goto err;
3632 12429 : res= write_block_record(info, oldrec, record, new_row, blocks,
3633 : 1, &row_pos, undo_lsn, old_checksum);
3634 :     /* We can't update or delete this row without re-reading it */
3635 12429 : info->update&= ~HA_STATE_AKTIV;
3636 12429 : DBUG_RETURN(res);
3637 : }
3638 : /* Delete old row */
3639 379 : if (*cur_row->tail_positions &&
3640 : delete_tails(info, cur_row->tail_positions))
3641 379 : goto err;
3642 379 : if (cur_row->extents_count && free_full_pages(info, cur_row))
3643 379 : goto err;
3644 :
3645 379 : head_length= uint2korr(dir + 2);
3646 379 : if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
3647 : org_empty_size, blocks))
3648 379 : goto err;
3649 :
3650 : /*
3651 :      Allocate all the space in the block for the record
3652 :      TODO:
3653 :      Improve this to compact the page if we can fit one more blob into
3654 :      the head page
3655 : */
3656 379 : if ((head_length < new_row->space_on_head_page ||
3657 : (new_row->total_length <= head_length &&
3658 : org_empty_size + head_length >= new_row->total_length)))
3659 : {
3660 296 : _ma_compact_block_page(buff, block_size, rownr, 1,
3661 : info->trn->min_read_from,
3662 : share->base.min_block_length);
3663 296 : org_empty_size= 0;
3664 296 : head_length= uint2korr(dir + 2);
3665 : }
3666 :
3667 379 : row_pos.buff= buff;
3668 379 : row_pos.rownr= rownr;
3669 379 : row_pos.empty_space= org_empty_size + head_length;
3670 379 : row_pos.dir= dir;
3671 379 : row_pos.data= buff + uint2korr(dir);
3672 379 : row_pos.length= head_length;
3673 379 : if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
3674 : &row_pos, undo_lsn, old_checksum)))
3675 379 : goto err;
3676 379 : DBUG_RETURN(0);
3677 :
3678 0 : err:
3679 0 : if (info->non_flushable_state)
3680 0 : _ma_bitmap_flushable(info, -1);
3681 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3682 0 : DBUG_RETURN(1);
3683 : }
3684 :
3685 :
3686 : /*
3687 :   @brief Store new row in its original position
3688 :
3689 :   @note
3690 :   This is basically a copy of _ma_update_block_record2().
3691 :   When we have a purge thread for deleted rows, we can remove this function
3692 : and use _ma_update_block_record2 instead.
3693 :
3694 : This is the main reason we don't make a lot of subfunctions that are
3695 : common between _ma_update_block_record2() and this function.
3696 : */
3697 :
3698 : static my_bool _ma_update_at_original_place(MARIA_HA *info,
3699 : pgcache_page_no_t page,
3700 : uint rownr,
3701 : uint length_on_head_page,
3702 : uint extent_count,
3703 : const uchar *extent_info,
3704 : const uchar *oldrec,
3705 : const uchar *record,
3706 : LSN undo_lsn)
3707 1671 : {
3708 : MARIA_BITMAP_BLOCKS *blocks;
3709 : MARIA_BITMAP_BLOCK *block;
3710 1671 : MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
3711 : MARIA_PINNED_PAGE page_link;
3712 1671 : MARIA_SHARE *share= info->s;
3713 : ha_checksum old_checksum;
3714 : uint org_empty_size, empty_size;
3715 1671 : uint block_size= info->s->block_size;
3716 : uchar *dir, *buff;
3717 : struct st_row_pos_info row_pos;
3718 : my_bool res;
3719 : uint rec_offset, length;
3720 1671 : DBUG_ENTER("_ma_update_at_original_place");
3721 :
3722 : #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
3723 : DBUG_DUMP("oldrec", oldrec, share->base.reclength);
3724 : DBUG_DUMP("newrec", record, share->base.reclength);
3725 : #endif
3726 :
3727 : /*
3728 : Checksums of new and old rows were computed by callers already; new
3729 : row's was put into cur_row, old row's was put into new_row.
3730 : */
3731 1671 : old_checksum= new_row->checksum;
3732 1671 : new_row->checksum= cur_row->checksum;
3733 1671 : calc_record_size(info, record, new_row);
3734 :
3735 1671 : _ma_bitmap_flushable(info, 1);
3736 1671 : buff= pagecache_read(share->pagecache,
3737 : &info->dfile, (pgcache_page_no_t) page, 0, 0,
3738 : share->page_type,
3739 : PAGECACHE_LOCK_WRITE, &page_link.link);
3740 1671 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3741 1671 : page_link.changed= buff != 0;
3742 1671 : push_dynamic(&info->pinned_pages, (void*) &page_link);
3743 1671 : if (!buff)
3744 1671 : goto err;
3745 :
3746 1671 : org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
3747 1671 : dir= dir_entry_pos(buff, block_size, rownr);
3748 :
3749 1671 : if ((org_empty_size + cur_row->head_length) < length_on_head_page)
3750 : {
3751 0 : my_errno= HA_ERR_WRONG_IN_RECORD;
3752 0 : goto err;
3753 : }
3754 :
3755 : /*
3756 : We can fit the new row in the same page as the original head part
3757 : of the row
3758 : */
3759 1671 : empty_size= org_empty_size;
3760 1671 : if (extend_area_on_page(info, buff, dir, rownr, block_size,
3761 : length_on_head_page, &empty_size,
3762 : &rec_offset, &length))
3763 1671 : goto err;
3764 :
3765 1671 : row_pos.buff= buff;
3766 1671 : row_pos.rownr= rownr;
3767 1671 : row_pos.empty_space= empty_size;
3768 1671 : row_pos.dir= dir;
3769 1671 : row_pos.data= buff + rec_offset;
3770 1671 : row_pos.length= length_on_head_page;
3771 :
3772 : /* Delete old row */
3773 1671 : if (*cur_row->tail_positions &&
3774 : delete_tails(info, cur_row->tail_positions))
3775 1671 : goto err;
3776 1671 : if (cur_row->extents_count && free_full_pages(info, cur_row))
3777 1671 : goto err;
3778 :
3779 : /* Change extent information to be usable by write_block_record() */
3780 1671 : blocks= &cur_row->insert_blocks;
3781 1671 : if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
3782 1671 : goto err;
3783 1671 : block= blocks->block;
3784 1671 : block->empty_space= row_pos.empty_space;
3785 1671 : block->org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
3786 : org_empty_size);
3787 1671 : DBUG_ASSERT(block->org_bitmap_value ==
3788 : _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
3789 1671 : block->used|= BLOCKUSED_USE_ORG_BITMAP;
3790 :
3791 : /*
3792 :     We have to use <= below as the new_row may be smaller than the original
3793 :     row, because the new row doesn't have a transaction id
3794 : */
3795 :
3796 1671 : DBUG_ASSERT(blocks->count > 1 ||
3797 : max(new_row->total_length, share->base.min_block_length) <=
3798 : length_on_head_page);
3799 :
3800 1671 : if ((res= write_block_record(info, oldrec, record, new_row, blocks,
3801 : 1, &row_pos, undo_lsn, old_checksum)))
3802 1671 : goto err;
3803 1671 : DBUG_RETURN(0);
3804 :
3805 0 : err:
3806 0 : if (info->non_flushable_state)
3807 0 : _ma_bitmap_flushable(info, -1);
3808 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3809 0 : DBUG_RETURN(1);
3810 : }
3811 :
3812 :
3813 : /* Wrapper for _ma_update_block_record2() used by maria_update() */
3814 :
3815 : my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
3816 : const uchar *orig_rec, const uchar *new_rec)
3817 12808 : {
3818 12808 : return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
3819 : LSN_ERROR);
3820 : }
3821 :
3822 :
3823 : /*
3824 : Delete a directory entry
3825 :
3826 : SYNOPSIS
3827 : delete_dir_entry()
3828 : buff Page buffer
3829 : block_size Block size
3830 : record_number Record number to delete
3831 :       empty_space_res         Empty space on page after delete
3832 :
3833 : RETURN
3834 : -1 Error on page
3835 : 0 ok
3836 : 1 Page is now empty
3837 : */
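     :
     : /*
     :   Hedged usage sketch (this mirrors how delete_head_or_tail() further below
     :   reacts to the three possible results; 'rownr' is a placeholder):
     :
     :     uint empty_space;
     :     int res= delete_dir_entry(buff, block_size, rownr, &empty_space);
     :     if (res < 0)
     :       return 1;                              directory was corrupted
     :     if (res == 0)
     :       log a LOGREC_REDO_PURGE_ROW_{HEAD,TAIL} record (page still has rows)
     :     else
     :       log a LOGREC_REDO_FREE_HEAD_OR_TAIL record (page is now unallocated)
     :     then update the bitmap with 'empty_space'
     : */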
3838 :
3839 : static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
3840 : uint *empty_space_res)
3841 230881 : {
3842 230881 : uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
3843 : uint length, empty_space;
3844 : uchar *dir;
3845 230881 : DBUG_ENTER("delete_dir_entry");
3846 :
3847 : #ifdef SANITY_CHECKS
3848 230881 : if (record_number >= number_of_records ||
3849 : record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
3850 : PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
3851 : {
3852 0 : DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
3853 : record_number, number_of_records));
3854 :
3855 0 : DBUG_RETURN(-1);
3856 : }
3857 : #endif
3858 :
3859 230881 : check_directory(buff, block_size, 0);
3860 230881 : empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
3861 230881 : dir= dir_entry_pos(buff, block_size, record_number);
3862 230881 : length= uint2korr(dir + 2);
3863 :
3864 230881 : if (record_number == number_of_records - 1)
3865 : {
3866 : /* Delete this entry and all following free directory entries */
3867 130021 : uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
3868 130021 : number_of_records--;
3869 130021 : dir+= DIR_ENTRY_SIZE;
3870 130021 : empty_space+= DIR_ENTRY_SIZE;
3871 :
3872 : /* Unlink and free the next empty ones */
3873 339924 : while (dir < end && dir[0] == 0 && dir[1] == 0)
3874 : {
3875 79882 : number_of_records--;
3876 79882 : if (dir[2] == END_OF_DIR_FREE_LIST)
3877 66521 : buff[DIR_FREE_OFFSET]= dir[3];
3878 : else
3879 : {
3880 13361 : uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
3881 13361 : DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
3882 : number_of_records);
3883 13361 : prev_entry[3]= dir[3];
3884 : }
3885 79882 : if (dir[3] != END_OF_DIR_FREE_LIST)
3886 : {
3887 75690 : uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
3888 75690 : DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
3889 : number_of_records);
3890 75690 : next_entry[2]= dir[2];
3891 : }
3892 79882 : dir+= DIR_ENTRY_SIZE;
3893 79882 : empty_space+= DIR_ENTRY_SIZE;
3894 : }
3895 :
3896 130021 : if (number_of_records == 0)
3897 : {
3898 : /* All entries on page deleted */
3899 3955 : DBUG_PRINT("info", ("Page marked as unallocated"));
3900 3955 : buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
3901 : #ifdef IDENTICAL_PAGES_AFTER_RECOVERY
3902 : {
3903 : dir= dir_entry_pos(buff, block_size, record_number);
3904 : bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
3905 : }
3906 : #endif
3907 3955 : *empty_space_res= block_size;
3908 3955 : DBUG_RETURN(1);
3909 : }
3910 126066 : buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
3911 : }
3912 : else
3913 : {
3914 : /* Update directory */
3915 100860 : dir[0]= dir[1]= 0;
3916 100860 : dir[2]= END_OF_DIR_FREE_LIST;
3917 100860 : if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
3918 : {
3919 : /* Relink next entry to point to newly freed entry */
3920 98626 : uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
3921 98626 : DBUG_ASSERT(uint2korr(next_entry) == 0 &&
3922 : next_entry[2] == END_OF_DIR_FREE_LIST);
3923 98626 : next_entry[2]= record_number;
3924 : }
3925 100860 : buff[DIR_FREE_OFFSET]= record_number;
3926 : }
3927 226926 : empty_space+= length;
3928 :
3929 226926 : int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
3930 226926 : buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
3931 :
3932 226926 : *empty_space_res= empty_space;
3933 :
3934 226926 : check_directory(buff, block_size, 0);
3935 226926 : DBUG_RETURN(0);
3936 : }
3937 :
3938 :
3939 : /*
3940 :   Delete a head or tail part
3941 :
3942 : SYNOPSIS
3943 : delete_head_or_tail()
3944 : info Maria handler
3945 :       page             Page (not file offset!) on which the row is
     :       record_number    Directory entry number of the row on the page
3946 :       head             1 if this is a head page
3947 : from_update 1 if we are called from update. In this case we
3948 : leave the page as write locked as we may put
3949 : the new row into the old position.
3950 :
3951 : RETURN
3952 : 0 ok
3953 : 1 error
3954 : */
3955 :
3956 : static my_bool delete_head_or_tail(MARIA_HA *info,
3957 : pgcache_page_no_t page, uint record_number,
3958 : my_bool head, my_bool from_update)
3959 175909 : {
3960 175909 : MARIA_SHARE *share= info->s;
3961 : uint empty_space;
3962 175909 : uint block_size= share->block_size;
3963 : uchar *buff;
3964 : LSN lsn;
3965 : MARIA_PINNED_PAGE page_link;
3966 : int res;
3967 : enum pagecache_page_lock lock_at_write, lock_at_unpin;
3968 175909 : DBUG_ENTER("delete_head_or_tail");
3969 175909 : DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
3970 : (ulong) ma_recordpos(page, record_number),
3971 : (ulong) page, record_number));
3972 :
3973 175909 : buff= pagecache_read(share->pagecache,
3974 : &info->dfile, page, 0, 0,
3975 : share->page_type,
3976 : PAGECACHE_LOCK_WRITE, &page_link.link);
3977 175909 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3978 175909 : page_link.changed= buff != 0;
3979 175909 : push_dynamic(&info->pinned_pages, (void*) &page_link);
3980 175909 : if (!buff)
3981 0 : DBUG_RETURN(1);
3982 175909 : DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
3983 : (head ? HEAD_PAGE : TAIL_PAGE));
3984 :
3985 175909 : if (from_update)
3986 : {
3987 34659 : lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
3988 34659 : lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
3989 : }
3990 : else
3991 : {
3992 141250 : lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
3993 141250 : lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
3994 : }
3995 :
3996 175909 : res= delete_dir_entry(buff, block_size, record_number, &empty_space);
3997 175909 : if (res < 0)
3998 0 : DBUG_RETURN(1);
3999 175909 : if (res == 0) /* after our deletion, page is still not empty */
4000 : {
4001 : uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
4002 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
4003 171954 : if (share->now_transactional)
4004 : {
4005 : /* Log REDO data */
4006 130175 : page_store(log_data + FILEID_STORE_SIZE, page);
4007 130175 : dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
4008 : record_number);
4009 :
4010 130175 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
4011 130175 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
4012 130175 : if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
4013 : LOGREC_REDO_PURGE_ROW_TAIL),
4014 : info->trn, info,
4015 : (translog_size_t) sizeof(log_data),
4016 : TRANSLOG_INTERNAL_PARTS + 1, log_array,
4017 : log_data, NULL))
4018 0 : DBUG_RETURN(1);
4019 : }
4020 : }
4021 : else /* page is now empty */
4022 : {
4023 3955 : if (share->now_transactional)
4024 : {
4025 : uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
4026 : LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
4027 3195 : page_store(log_data + FILEID_STORE_SIZE, page);
4028 3195 : log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
4029 3195 : log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
4030 3195 : if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
4031 : info->trn, info,
4032 : (translog_size_t) sizeof(log_data),
4033 : TRANSLOG_INTERNAL_PARTS + 1, log_array,
4034 : log_data, NULL))
4035 0 : DBUG_RETURN(1);
4036 : }
4037 3955 : DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
4038 : }
4039 :
4040 175909 : pagecache_unlock_by_link(share->pagecache, page_link.link,
4041 : lock_at_write,
4042 : PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
4043 : LSN_IMPOSSIBLE, 1, FALSE);
4044 175909 : page_link.unlock= lock_at_unpin;
4045 175909 : set_dynamic(&info->pinned_pages, (void*) &page_link,
4046 : info->pinned_pages.elements-1);
4047 :
4048 175909 : DBUG_PRINT("info", ("empty_space: %u", empty_space));
4049 :
4050 : /*
4051 : If there is not enough space for all possible tails, mark the
4052 : page full
4053 : */
4054 175909 : if (!head && !enough_free_entries(buff, share->block_size,
4055 : 1 + share->base.blobs))
4056 0 : empty_space= 0;
4057 :
4058 175909 : DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
4059 : }
4060 :
4061 :
4062 : /*
4063 :   Delete all tails
4064 :
4065 : SYNOPSIS
4066 : delete_tails()
4067 : info Handler
4068 : tails Pointer to vector of tail positions, ending with 0
4069 :
4070 : RETURN
4071 : 0 ok
4072 : 1 error
4073 : */
4074 :
4075 : static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
4076 115215 : {
4077 115215 : my_bool res= 0;
4078 115215 : DBUG_ENTER("delete_tails");
4079 118055 : for (; *tails; tails++)
4080 : {
4081 2840 : if (delete_head_or_tail(info,
4082 : ma_recordpos_to_page(*tails),
4083 : ma_recordpos_to_dir_entry(*tails), 0, 1))
4084 0 : res= 1;
4085 : }
4086 115215 : DBUG_RETURN(res);
4087 : }
4088 :
4089 :
4090 : /*
4091 : Delete a record
4092 :
4093 : NOTES
4094 : For the moment, we assume that info->cur_row.extents is always updated
4095 : when a row is read. In the future we may decide to read this on demand
4096 : for rows with many splits.
4097 : */
4098 :
4099 : my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
4100 83109 : {
4101 : pgcache_page_no_t page;
4102 : uint record_number;
4103 83109 : MARIA_SHARE *share= info->s;
4104 83109 : LSN lsn= LSN_IMPOSSIBLE;
4105 83109 : DBUG_ENTER("_ma_delete_block_record");
4106 :
4107 83109 : page= ma_recordpos_to_page(info->cur_row.lastpos);
4108 83109 : record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
4109 83109 : DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
4110 : (ulong) page, record_number));
4111 :
4112 83109 : _ma_bitmap_flushable(info, 1);
4113 83109 : if (delete_head_or_tail(info, page, record_number, 1, 0) ||
4114 : delete_tails(info, info->cur_row.tail_positions))
4115 : goto err;
4116 :
4117 83109 : if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
4118 83109 : goto err;
4119 :
4120 83109 : if (share->now_transactional)
4121 : {
4122 : uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
4123 : DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
4124 : HA_CHECKSUM_STORE_SIZE];
4125 : uchar *log_pos;
4126 : size_t row_length;
4127 : uint row_parts_count, extents_length;
4128 : ha_checksum checksum_delta;
4129 :
4130 : /* Write UNDO record */
4131 59439 : lsn_store(log_data, info->trn->undo_lsn);
4132 59439 : page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
4133 59439 : log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
4134 59439 : dirpos_store(log_pos, record_number);
4135 59439 : log_pos+= DIRPOS_STORE_SIZE;
4136 59439 : int2store(log_pos, info->cur_row.head_length);
4137 59439 : log_pos+= 2;
4138 59439 : pagerange_store(log_pos, info->cur_row.extents_count);
4139 59439 : log_pos+= PAGERANGE_STORE_SIZE;
4140 :
4141 59439 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
4142 59439 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
4143 : sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
4144 59439 : store_checksum_in_rec(share, checksum_delta,
4145 : (ha_checksum) 0 - info->cur_row.checksum, log_pos,
4146 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
4147 : 0].length);
4148 59439 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
4149 : info->cur_row.extents;
4150 59439 : info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
4151 : extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
4152 :
4153 59439 : row_length= fill_insert_undo_parts(info, record,
4154 : (info->log_row_parts +
4155 : TRANSLOG_INTERNAL_PARTS + 2),
4156 : &row_parts_count);
4157 :
4158 59439 : if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
4159 : info,
4160 : (translog_size_t)
4161 : (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
4162 : 0].length + row_length +
4163 : extents_length),
4164 : TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
4165 : info->log_row_parts,
4166 : log_data + LSN_STORE_SIZE,
4167 : &checksum_delta))
4168 83109 : goto err;
4169 : }
4170 :
4171 83109 : _ma_bitmap_flushable(info, -1);
4172 83109 : _ma_unpin_all_pages_and_finalize_row(info, lsn);
4173 83109 : DBUG_RETURN(0);
4174 :
4175 0 : err:
4176 0 : _ma_bitmap_flushable(info, -1);
4177 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
4178 0 : DBUG_RETURN(1);
4179 : }
4180 :
4181 :
4182 : /****************************************************************************
4183 : Reading of records
4184 : ****************************************************************************/
4185 :
4186 : /*
4187 : Read position to record from record directory at end of page
4188 :
4189 : SYNOPSIS
4190 : get_record_position()
4191 : buff page buffer
4192 : block_size block size for page
4193 :      record_number   Record number in the page directory
4194 : end_of_data pointer to end of data for record
4195 :
4196 : RETURN
4197 : 0 Error in data
4198 : # Pointer to start of record.
4199 : In this case *end_of_data is set.
4200 : */
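     :
     : /*
     :   Added note: dir_entry_pos() (defined elsewhere in this file) is assumed
     :   to locate directory entry N counted backwards from the page end, roughly:
     :
     :     dir= buff + block_size - PAGE_SUFFIX_SIZE - (N + 1) * DIR_ENTRY_SIZE;
     :
     :   which is why the sanity checks below can bound record_number by the space
     :   left between the page header and the directory.
     : */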
4201 :
4202 : static uchar *get_record_position(uchar *buff, uint block_size,
4203 : uint record_number, uchar **end_of_data)
4204 265498 : {
4205 265498 : uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
4206 : uchar *dir;
4207 : uchar *data;
4208 : uint offset, length;
4209 :
4210 : #ifdef SANITY_CHECKS
4211 265498 : if (record_number >= number_of_records ||
4212 : record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
4213 : DIR_ENTRY_SIZE))
4214 : {
4215 0 : DBUG_PRINT("error",
4216 : ("Wrong row number: record_number: %u number_of_records: %u",
4217 : record_number, number_of_records));
4218 0 : return 0;
4219 : }
4220 : #endif
4221 :
4222 265498 : dir= dir_entry_pos(buff, block_size, record_number);
4223 265498 : offset= uint2korr(dir);
4224 265498 : length= uint2korr(dir + 2);
4225 : #ifdef SANITY_CHECKS
4226 265498 : if (offset < PAGE_HEADER_SIZE ||
4227 : offset + length > (block_size -
4228 : number_of_records * DIR_ENTRY_SIZE -
4229 : PAGE_SUFFIX_SIZE))
4230 : {
4231 0 : DBUG_PRINT("error",
4232 : ("Wrong row position: record_number: %u offset: %u "
4233 : "length: %u number_of_records: %u",
4234 : record_number, offset, length, number_of_records));
4235 0 : return 0;
4236 : }
4237 : #endif
4238 265498 : data= buff + offset;
4239 265498 : *end_of_data= data + length;
4240 265498 : return data;
4241 : }
4242 :
4243 :
4244 : /*
4245 : Init extent
4246 :
4247 : NOTES
4248 :      'extent' is a cursor over the pages to read (see the layout sketch
4249 :      following this comment)
4249 : */
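     :
     : /*
     :   Layout of one ROW_EXTENT entry as this file decodes it (the 5 + 2 byte
     :   split is implied by page_korr()/uint5korr() and ROW_EXTENT_PAGE_SIZE and
     :   is stated here as an assumption):
     :
     :     uchar *ext;                                   one ROW_EXTENT_SIZE entry
     :     pgcache_page_no_t page= uint5korr(ext);                     start page
     :     uint count_word= uint2korr(ext + ROW_EXTENT_PAGE_SIZE);
     :     if (count_word & TAIL_BIT)
     :       count_word & ~(TAIL_BIT | START_EXTENT_BIT)   is the row number of
     :                                                     the tail on that page
     :     else
     :       count_word & ~START_EXTENT_BIT                is the number of
     :                                                     consecutive full pages
     : */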
4250 :
4251 : static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
4252 : uint extents, MARIA_RECORD_POS *tail_positions)
4253 13566 : {
4254 : uint page_count;
4255 13566 : extent->extent= extent_info;
4256 13566 : extent->extent_count= extents;
4257 13566 : extent->page= page_korr(extent_info); /* First extent */
4258 13566 : page_count= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
4259 : ~START_EXTENT_BIT);
4260 13566 : extent->tail= page_count & TAIL_BIT;
4261 13566 : if (extent->tail)
4262 : {
4263 121 : extent->page_count= 1;
4264 121 : extent->tail_row_nr= page_count & ~TAIL_BIT;
4265 : }
4266 : else
4267 13445 : extent->page_count= page_count;
4268 13566 : extent->tail_positions= tail_positions;
4269 13566 : extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
4270 : }
4271 :
4272 :
4273 : /*
4274 : Read next extent
4275 :
4276 : SYNOPSIS
4277 : read_next_extent()
4278 : info Maria handler
4279 : extent Pointer to current extent (this is updated to point
4280 : to next)
4281 : end_of_data Pointer to end of data in read block (out)
4282 :
4283 : NOTES
4284 : New block is read into info->buff
4285 :
4286 : RETURN
4287 : 0 Error; my_errno is set
4288 : # Pointer to start of data in read block
4289 : In this case end_of_data is updated to point to end of data.
4290 : */
4291 :
4292 : static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
4293 : uchar **end_of_data)
4294 48356 : {
4295 48356 : MARIA_SHARE *share= info->s;
4296 : uchar *buff, *data;
4297 : MARIA_PINNED_PAGE page_link;
4298 : enum pagecache_page_lock lock;
4299 48356 : DBUG_ENTER("read_next_extent");
4300 :
4301 48356 : if (!extent->page_count)
4302 : {
4303 : uint page_count;
4304 9406 : if (!--extent->extent_count)
4305 9406 : goto crashed;
4306 9406 : extent->extent+= ROW_EXTENT_SIZE;
4307 9406 : extent->page= page_korr(extent->extent);
4308 9406 : page_count= (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
4309 : ~START_EXTENT_BIT);
4310 9406 : if (!page_count)
4311 9406 : goto crashed;
4312 9406 : extent->tail= page_count & TAIL_BIT;
4313 9406 : if (extent->tail)
4314 9406 : extent->tail_row_nr= page_count & ~TAIL_BIT;
4315 : else
4316 0 : extent->page_count= page_count;
4317 9406 : DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
4318 : (ulong) extent->page, extent->page_count,
4319 : extent->tail != 0));
4320 : }
4321 48356 : extent->first_extent= 0;
4322 :
4323 48356 : lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
4324 48356 : if (extent->tail)
4325 9527 : lock= extent->lock_for_tail_pages;
4326 :
4327 48356 : buff= pagecache_read(share->pagecache,
4328 : &info->dfile, extent->page, 0,
4329 : info->buff, share->page_type,
4330 : lock, &page_link.link);
4331 48356 : if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
4332 : {
4333 : /* Read during UNDO */
4334 0 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
4335 0 : page_link.changed= buff != 0;
4336 0 : push_dynamic(&info->pinned_pages, (void*) &page_link);
4337 : }
4338 48356 : if (!buff)
4339 : {
4340 : /* check if we tried to read over end of file (ie: bad data in record) */
4341 0 : if ((extent->page + 1) * share->block_size >
4342 : share->state.state.data_file_length)
4343 0 : goto crashed;
4344 0 : DBUG_RETURN(0);
4345 : }
4346 :
4347 48356 : if (!extent->tail)
4348 : {
4349 : /* Full data page */
4350 38829 : if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
4351 38829 : goto crashed;
4352 38829 : extent->page++; /* point to next page */
4353 38829 : extent->page_count--;
4354 38829 : *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
4355 38829 : info->cur_row.full_page_count++; /* For maria_chk */
4356 38829 : DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
4357 : }
4358 :
4359 : /* Found tail */
4360 9527 : if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
4361 9527 : goto crashed;
4362 9527 : *(extent->tail_positions++)= ma_recordpos(extent->page,
4363 : extent->tail_row_nr);
4364 9527 : info->cur_row.tail_count++; /* For maria_chk */
4365 :
4366 9527 : if (!(data= get_record_position(buff, share->block_size,
4367 : extent->tail_row_nr,
4368 : end_of_data)))
4369 9527 : goto crashed;
4370 9527 : extent->data_start= data;
4371 9527 : extent->page_count= 0; /* No more data in extent */
4372 9527 : DBUG_RETURN(data);
4373 :
4374 :
4375 0 : crashed:
4376 0 : my_errno= HA_ERR_WRONG_IN_RECORD; /* File crashed */
4377 0 : DBUG_PRINT("error", ("wrong extent information"));
4378 0 : DBUG_RETURN(0);
4379 : }
4380 :
4381 :
4382 : /*
4383 : Read data that may be split over many blocks
4384 :
4385 : SYNOPSIS
4386 : read_long_data()
4387 : info Maria handler
4388 : to Store result string here (this is allocated)
4389 : extent Pointer to current extent position
4390 : data Current position in buffer
4391 : end_of_data End of data in buffer
4392 :
4393 : NOTES
4394 : When we have to read a new buffer, it's read into info->buff
4395 :
4396 :     This loop is implemented by goto's instead of a for() loop as
4397 :     the code is notably smaller and faster this way (and it's not nice
4398 :     to jump into a for() loop or into a 'then' clause)
4399 :
4400 : RETURN
4401 : 0 ok
4402 : 1 error
4403 : */
4404 :
4405 : static my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
4406 : MARIA_EXTENT_CURSOR *extent,
4407 : uchar **data, uchar **end_of_data)
4408 1191884 : {
4409 1191884 : DBUG_ENTER("read_long_data");
4410 1191884 : DBUG_PRINT("enter", ("length: %lu left_length: %u",
4411 : length, (uint) (*end_of_data - *data)));
4412 1191884 : DBUG_ASSERT(*data <= *end_of_data);
4413 :
4414 : /*
4415 :     Fields are never split in the middle. This means that if length > rest-of-data
4416 : we should start reading from the next extent. The reason we may have
4417 : data left on the page is that if the fixed part of the row was less than
4418 : min_block_length the head block was extended to min_block_length.
4419 :
4420 : This may change in the future, which is why we have the loop written
4421 : the way it's written.
4422 : */
4423 1191884 : if (extent->first_extent && length > (ulong) (*end_of_data - *data))
4424 12274 : *end_of_data= *data;
4425 :
4426 : for(;;)
4427 : {
4428 : uint left_length;
4429 1240240 : left_length= (uint) (*end_of_data - *data);
4430 1240240 : if (likely(left_length >= length))
4431 : {
4432 1191884 : memcpy(to, *data, length);
4433 1191884 : (*data)+= length;
4434 1191884 : DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
4435 1191884 : DBUG_RETURN(0);
4436 : }
4437 48356 : memcpy(to, *data, left_length);
4438 48356 : to+= left_length;
4439 48356 : length-= left_length;
4440 48356 : if (!(*data= read_next_extent(info, extent, end_of_data)))
4441 : break;
4442 : }
4443 0 : DBUG_RETURN(1);
4444 : }
4445 :
4446 :
4447 : /*
4448 : Read a record from page (helper function for _ma_read_block_record())
4449 :
4450 : SYNOPSIS
4451 : _ma_read_block_record2()
4452 : info Maria handler
4453 : record Store record here
4454 : data Start of head data for row
4455 : end_of_data End of data for row
4456 :
4457 : NOTES
4458 :     The head page is already read by the caller
4459 :     The following data is updated in info->cur_row:
4460 :
4461 :     cur_row.head_length is set to size of entry in head block
4462 :     cur_row.tail_positions is set to point to all tail blocks
4463 :     cur_row.extents points to extents data
4464 :     cur_row.extents_count contains number of extents
4465 : cur_row.empty_bits is set to empty bits
4466 : cur_row.field_lengths contains packed length of all fields
4467 : cur_row.blob_length contains total length of all blobs
4468 : cur_row.checksum contains checksum of read record.
4469 :
4470 : RETURN
4471 : 0 ok
4472 : # Error code
4473 : */
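     :
     : /*
     :   Condensed view of the head-part layout, in the order the code below
     :   decodes it (each bracketed element is optional, controlled by the flag
     :   byte or the table definition):
     :
     :     flag                         1 byte, ROW_FLAG_* bits
     :     [transaction header]         skipped via total_header_size[]
     :     [extent count + 1st extent]  if ROW_FLAG_EXTENTS
     :     [field-length data length]   if base.max_field_lengths
     :     [row checksum]               1 byte, if share->calc_checksum
     :     null bits                    base.original_null_bytes
     :     empty bits                   base.pack_bytes
     :     field offsets                base.field_offsets * FIELD_OFFSET_SIZE
     :     remaining extents            may continue on further pages
     :     fixed-size not-null fields
     :     field-length array           may continue on further pages
     :     variable-length / blob data  may continue on further pages
     : */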
4474 :
4475 : int _ma_read_block_record2(MARIA_HA *info, uchar *record,
4476 : uchar *data, uchar *end_of_data)
4477 609072 : {
4478 609072 : MARIA_SHARE *share= info->s;
4479 : uchar *field_length_data, *blob_buffer, *start_of_data;
4480 : uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
4481 609072 : my_bool found_blob= 0;
4482 : MARIA_EXTENT_CURSOR extent;
4483 : MARIA_COLUMNDEF *column, *end_column;
4484 609072 : MARIA_ROW *cur_row= &info->cur_row;
4485 609072 : DBUG_ENTER("_ma_read_block_record2");
4486 :
4487 609072 : LINT_INIT(field_length_data);
4488 609072 : LINT_INIT(blob_buffer);
4489 :
4490 609072 : start_of_data= data;
4491 609072 : flag= (uint) (uchar) data[0];
4492 609072 : cur_null_bytes= share->base.original_null_bytes;
4493 609072 : null_bytes= share->base.null_bytes;
4494 609072 : cur_row->head_length= (uint) (end_of_data - data);
4495 609072 : cur_row->full_page_count= cur_row->tail_count= 0;
4496 609072 : cur_row->blob_length= 0;
4497 :
4498 609072 : if (flag & ROW_FLAG_TRANSID)
4499 : {
4500 40152 : cur_row->trid= transid_korr(data+1);
4501 40152 : if (!info->trn)
4502 0 : DBUG_RETURN(my_errno= HA_ERR_WRONG_IN_RECORD); /* File crashed */
4503 40152 : if (!trnman_can_read_from(info->trn, cur_row->trid))
4504 0 : DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
4505 : }
4506 :
4507 :   /* Skip trans header (for now, until we have MVCC support) */
4508 609072 : data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
4509 609072 : if (flag & ROW_FLAG_NULLS_EXTENDED)
4510 0 : cur_null_bytes+= data[-1];
4511 :
4512 609072 : row_extents= 0;
4513 609072 : if (flag & ROW_FLAG_EXTENTS)
4514 : {
4515 : uint row_extent_size;
4516 : /*
4517 : Record is split over many data pages.
4518 : Get number of extents and first extent
4519 : */
4520 12274 : get_key_length(row_extents, data);
4521 12274 : cur_row->extents_count= row_extents;
4522 12274 : row_extent_size= row_extents * ROW_EXTENT_SIZE;
4523 12274 : if (cur_row->extents_buffer_length < row_extent_size &&
4524 : _ma_alloc_buffer(&cur_row->extents,
4525 : &cur_row->extents_buffer_length,
4526 : row_extent_size))
4527 0 : DBUG_RETURN(my_errno);
4528 12274 : memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
4529 12274 : data+= ROW_EXTENT_SIZE;
4530 12274 : init_extent(&extent, cur_row->extents, row_extents,
4531 : cur_row->tail_positions);
4532 : }
4533 : else
4534 : {
4535 596798 : cur_row->extents_count= 0;
4536 596798 : (*cur_row->tail_positions)= 0;
4537 596798 : extent.page_count= 0;
4538 596798 : extent.extent_count= 1;
4539 : }
4540 609072 : extent.first_extent= 1;
4541 :
4542 609072 : field_lengths= 0;
4543 609072 : if (share->base.max_field_lengths)
4544 : {
4545 607848 : get_key_length(field_lengths, data);
4546 607848 : cur_row->field_lengths_length= field_lengths;
4547 : #ifdef SANITY_CHECKS
4548 607848 : if (field_lengths > share->base.max_field_lengths)
4549 609072 : goto err;
4550 : #endif
4551 : }
4552 :
4553 609072 : if (share->calc_checksum)
4554 284046 : cur_row->checksum= (uint) (uchar) *data++;
4555 : /* data now points on null bits */
4556 609072 : memcpy(record, data, cur_null_bytes);
4557 609072 : if (unlikely(cur_null_bytes != null_bytes))
4558 : {
4559 : /*
4560 : This only happens if we have added more NULL columns with
4561 :       ALTER TABLE and are fetching an old, not yet modified row
4562 : */
4563 0 : bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
4564 : }
4565 609072 : data+= null_bytes;
4566 : /* We copy the empty bits to be able to use them for delete/update */
4567 609072 : memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
4568 609072 : data+= share->base.pack_bytes;
4569 :
4570 : /* TODO: Use field offsets, instead of just skipping them */
4571 609072 : data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4572 :
4573 : /*
4574 : Read row extents (note that first extent was already read into
4575 : cur_row->extents above)
4576 : */
4577 609072 : if (row_extents > 1)
4578 : {
4579 9406 : if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
4580 : (row_extents - 1) * ROW_EXTENT_SIZE,
4581 : &extent, &data, &end_of_data))
4582 0 : DBUG_RETURN(my_errno);
4583 : }
4584 :
4585 : /*
4586 : Data now points to start of fixed length field data that can't be null
4587 : or 'empty'. Note that these fields can't be split over blocks.
4588 : */
4589 : for (column= share->columndef,
4590 609072 : end_column= column + share->base.fixed_not_null_fields;
4591 3020918 : column < end_column; column++)
4592 : {
4593 2411846 : uint column_length= column->length;
4594 2411846 : if (data + column_length > end_of_data &&
4595 : !(data= read_next_extent(info, &extent, &end_of_data)))
4596 2411846 : goto err;
4597 2411846 : memcpy(record + column->offset, data, column_length);
4598 2411846 : data+= column_length;
4599 : }
4600 :
4601 : /* Read array of field lengths. This may be stored in several extents */
4602 609072 : if (field_lengths)
4603 : {
4604 581820 : field_length_data= cur_row->field_lengths;
4605 581820 : if (read_long_data(info, field_length_data, field_lengths, &extent,
4606 : &data, &end_of_data))
4607 0 : DBUG_RETURN(my_errno);
4608 : }
4609 :
4610 : /* Read variable length data. Each of these may be split over many extents */
4611 609072 : for (end_column= share->columndef + share->base.fields;
4612 1907126 : column < end_column; column++)
4613 : {
4614 1298054 : enum en_fieldtype type= column->type;
4615 1298054 : uchar *field_pos= record + column->offset;
4616 : /* First check if field is present in record */
4617 1298054 : if ((record[column->null_pos] & column->null_bit) ||
4618 : (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
4619 : {
4620 126931 : bfill(record + column->offset, column->fill_length,
4621 : type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
4622 126931 : continue;
4623 : }
4624 1171123 : switch (type) {
4625 : case FIELD_NORMAL: /* Fixed length field */
4626 : case FIELD_SKIP_PRESPACE:
4627 : case FIELD_SKIP_ZERO: /* Fixed length field */
4628 571553 : if (data + column->length > end_of_data &&
4629 : !(data= read_next_extent(info, &extent, &end_of_data)))
4630 571553 : goto err;
4631 571553 : memcpy(field_pos, data, column->length);
4632 571553 : data+= column->length;
4633 571553 : break;
4634 : case FIELD_SKIP_ENDSPACE: /* CHAR */
4635 : {
4636 : /* Char that is space filled */
4637 : uint length;
4638 574357 : if (column->length <= 255)
4639 574357 : length= (uint) (uchar) *field_length_data++;
4640 : else
4641 : {
4642 0 : length= uint2korr(field_length_data);
4643 0 : field_length_data+= 2;
4644 : }
4645 : #ifdef SANITY_CHECKS
4646 574357 : if (length > column->length)
4647 574357 : goto err;
4648 : #endif
4649 574357 : if (read_long_data(info, field_pos, length, &extent, &data,
4650 : &end_of_data))
4651 0 : DBUG_RETURN(my_errno);
4652 574357 : bfill(field_pos + length, column->length - length, ' ');
4653 574357 : break;
4654 : }
4655 : case FIELD_VARCHAR:
4656 : {
4657 : ulong length;
4658 1965 : if (column->length <= 256)
4659 : {
4660 1461 : length= (uint) (uchar) (*field_pos++= *field_length_data++);
4661 : }
4662 : else
4663 : {
4664 504 : length= uint2korr(field_length_data);
4665 504 : field_pos[0]= field_length_data[0];
4666 504 : field_pos[1]= field_length_data[1];
4667 504 : field_pos+= 2;
4668 504 : field_length_data+= 2;
4669 : }
4670 : #ifdef SANITY_CHECKS
4671 1965 : if (length > column->length)
4672 1965 : goto err;
4673 : #endif
4674 1965 : if (read_long_data(info, field_pos, length, &extent, &data,
4675 : &end_of_data))
4676 0 : DBUG_RETURN(my_errno);
4677 : break;
4678 : }
4679 : case FIELD_BLOB:
4680 : {
4681 23248 : uint column_size_length= column->length - portable_sizeof_char_ptr;
4682 : ulong blob_length= _ma_calc_blob_length(column_size_length,
4683 23248 : field_length_data);
4684 :
4685 23248 : if (!found_blob)
4686 : {
4687 : /* Calculate total length for all blobs */
4688 19055 : ulong blob_lengths= 0;
4689 19055 : uchar *length_data= field_length_data;
4690 19055 : MARIA_COLUMNDEF *blob_field= column;
4691 :
4692 19055 : found_blob= 1;
4693 42622 : for (; blob_field < end_column; blob_field++)
4694 : {
4695 : uint size_length;
4696 23567 : if ((record[blob_field->null_pos] & blob_field->null_bit) ||
4697 : (cur_row->empty_bits[blob_field->empty_pos] &
4698 : blob_field->empty_bit))
4699 : continue;
4700 23248 : size_length= blob_field->length - portable_sizeof_char_ptr;
4701 23248 : blob_lengths+= _ma_calc_blob_length(size_length, length_data);
4702 23248 : length_data+= size_length;
4703 : }
4704 19055 : cur_row->blob_length= blob_lengths;
4705 19055 : DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
4706 19055 : if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
4707 : blob_lengths))
4708 0 : DBUG_RETURN(my_errno);
4709 19055 : blob_buffer= info->blob_buff;
4710 : }
4711 :
4712 23248 : memcpy(field_pos, field_length_data, column_size_length);
4713 23248 : memcpy_fixed(field_pos + column_size_length, (uchar *) &blob_buffer,
4714 : sizeof(char*));
4715 23248 : field_length_data+= column_size_length;
4716 :
4717 : /*
4718 :         After we have read one extent, each blob is in its own extent
4719 : */
4720 23248 : if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
4721 12274 : end_of_data= data; /* Force read of next extent */
4722 :
4723 23248 : if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
4724 : &end_of_data))
4725 0 : DBUG_RETURN(my_errno);
4726 23248 : blob_buffer+= blob_length;
4727 23248 : break;
4728 : }
4729 : default:
4730 : #ifdef EXTRA_DEBUG
4731 0 : DBUG_ASSERT(0); /* purecov: deadcode */
4732 : #endif
4733 : goto err;
4734 : }
4735 : continue;
4736 : }
4737 :
4738 609072 : if (row_extents)
4739 : {
4740 12274 : DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
4741 : extent.page_count, extent.extent_count));
4742 12274 : *extent.tail_positions= 0; /* End marker */
4743 12274 : if (extent.page_count)
4744 12274 : goto err;
4745 12274 : if (extent.extent_count > 1)
4746 : {
4747 0 : if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
4748 : (extent.extent_count-1) * ROW_EXTENT_SIZE))
4749 : {
4750 0 : DBUG_PRINT("error", ("Data in extent is not zero"));
4751 0 : DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
4752 : (extent.extent_count-1) * ROW_EXTENT_SIZE);
4753 0 : goto err;
4754 : }
4755 : }
4756 : }
4757 : else
4758 : {
4759 596798 : DBUG_PRINT("info", ("Row read"));
4760 : /*
4761 :       data should normally point to end_of_data. The only exception is if
4762 :       the row is very short, in which case we allocated 'min_block_length' bytes
4763 :       to allow the row to expand.
4764 : */
4765 596798 : if (data != end_of_data && (uint) (end_of_data - start_of_data) >
4766 : share->base.min_block_length)
4767 609072 : goto err;
4768 : }
4769 : #ifdef EXTRA_DEBUG
4770 609072 : if (share->calc_checksum)
4771 : {
4772 :     /* Ensure that the row checksum is correct */
4773 284046 : DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
4774 : cur_row->checksum);
4775 : }
4776 : #endif
4777 609072 : info->update|= HA_STATE_AKTIV; /* We have an active record */
4778 609072 : DBUG_RETURN(0);
4779 :
4780 0 : err:
4781 : /* Something was wrong with data on record */
4782 0 : DBUG_PRINT("error", ("Found record with wrong data"));
4783 0 : DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
4784 : }
4785 :
4786 :
4787 : /** @brief Read positions to tail blocks and full blocks
4788 :
4789 : @fn read_row_extent_info()
4790 : @param info Handler
4791 :
4792 : @notes
4793 : This function is a simpler version of _ma_read_block_record2()
4794 : The data about the used pages is stored in info->cur_row.
4795 :
4796 : @return Status
4797 : @retval 0 ok
4798 : @retval 1 Error. my_errno contains error number
4799 : */
4800 :
4801 : static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
4802 : uint record_number)
4803 31819 : {
4804 31819 : MARIA_SHARE *share= info->s;
4805 : MARIA_EXTENT_CURSOR extent;
4806 : MARIA_RECORD_POS *tail_pos;
4807 : uchar *data, *end_of_data;
4808 : uint flag, row_extents, row_extents_size, field_lengths;
4809 : uchar *extents, *end;
4810 31819 : DBUG_ENTER("read_row_extent_info");
4811 :
4812 31819 : if (!(data= get_record_position(buff, share->block_size,
4813 : record_number, &end_of_data)))
4814 0 : DBUG_RETURN(1); /* Wrong in record */
4815 :
4816 31819 : flag= (uint) (uchar) data[0];
4817 : /* Skip trans header */
4818 31819 : data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
4819 :
4820 31819 : row_extents= 0;
4821 31819 : row_extents_size= 0;
4822 31819 : if (flag & ROW_FLAG_EXTENTS)
4823 : {
4824 : /*
4825 : Record is split over many data pages.
4826 : Get number of extents and first extent
4827 : */
4828 1292 : get_key_length(row_extents, data);
4829 1292 : row_extents_size= row_extents * ROW_EXTENT_SIZE;
4830 1292 : if (info->cur_row.extents_buffer_length < row_extents_size &&
4831 : _ma_alloc_buffer(&info->cur_row.extents,
4832 : &info->cur_row.extents_buffer_length,
4833 : row_extents_size))
4834 0 : DBUG_RETURN(1);
4835 1292 : memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
4836 1292 : data+= ROW_EXTENT_SIZE;
4837 1292 : init_extent(&extent, info->cur_row.extents, row_extents,
4838 : info->cur_row.tail_positions);
4839 1292 : extent.first_extent= 1;
4840 : }
4841 31819 : info->cur_row.extents_count= row_extents;
4842 :
4843 31819 : if (share->base.max_field_lengths)
4844 31819 : get_key_length(field_lengths, data);
4845 :
4846 31819 : if (share->calc_checksum)
4847 31819 : info->cur_row.checksum= (uint) (uchar) *data++;
4848 31819 : if (row_extents > 1)
4849 : {
4850 1088 : data+= share->base.null_bytes;
4851 1088 : data+= share->base.pack_bytes;
4852 1088 : data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4853 :
4854 : /*
4855 : Read row extents (note that the first extent was already read into
4856 : info->cur_row.extents above).
4857 : Lock tails with a write lock as we will delete them later.
4858 : */
4859 1088 : extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
4860 1088 : if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
4861 : row_extents_size - ROW_EXTENT_SIZE,
4862 : &extent, &data, &end_of_data))
4863 0 : DBUG_RETURN(1);
4864 : }
4865 :
4866 : /* Update tail_positions with pointers to the tails */
4867 31819 : tail_pos= info->cur_row.tail_positions;
4868 31819 : for (extents= info->cur_row.extents, end= extents + row_extents_size;
4869 66018 : extents < end;
4870 2380 : extents+= ROW_EXTENT_SIZE)
4871 : {
4872 2380 : pgcache_page_no_t page= uint5korr(extents);
4873 2380 : uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
4874 2380 : if (page_count & TAIL_BIT)
4875 1088 : *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
4876 : START_EXTENT_BIT)));
4877 : }
4878 31819 : *tail_pos= 0; /* End marker */
4879 31819 : DBUG_RETURN(0);
4880 : }
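
/*
  Illustrative sketch (not part of the compiled code): how a single ROW_EXTENT
  entry is decoded by the loop above.  It assumes the layout used in this
  file: 5 bytes page number followed by 2 bytes page count, where TAIL_BIT
  marks a tail page and the remaining bits then hold the directory entry
  number on that tail page.

    const uchar *entry= info->cur_row.extents;      // one ROW_EXTENT_SIZE entry
    pgcache_page_no_t page= uint5korr(entry);
    uint page_count= uint2korr(entry + ROW_EXTENT_PAGE_SIZE);
    if (page_count & TAIL_BIT)
    {
      // Tail page: remember the rowid of the tail entry
      uint dirpos= page_count & ~(TAIL_BIT | START_EXTENT_BIT);
      MARIA_RECORD_POS tail= ma_recordpos(page, dirpos);
    }
    else
    {
      // Full pages: 'page_count' consecutive full pages starting at 'page'
    }
*/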
4881 :
4882 :
4883 : /*
4884 : Read a record based on record position
4885 :
4886 : @fn _ma_read_block_record()
4887 : @param info Maria handler
4888 : @param record Store record here
4889 : @param record_pos Record position
4890 :
4891 : @return Status
4892 : @retval 0 ok
4893 : @retval # Error number
4894 : */
4895 :
4896 : int _ma_read_block_record(MARIA_HA *info, uchar *record,
4897 : MARIA_RECORD_POS record_pos)
4898 224152 : {
4899 224152 : MARIA_SHARE *share= info->s;
4900 : uchar *data, *end_of_data, *buff;
4901 : uint offset;
4902 224152 : uint block_size= share->block_size;
4903 224152 : DBUG_ENTER("_ma_read_block_record");
4904 224152 : DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
4905 :
4906 224152 : offset= ma_recordpos_to_dir_entry(record_pos);
4907 :
4908 224152 : if (!(buff= pagecache_read(share->pagecache,
4909 : &info->dfile, ma_recordpos_to_page(record_pos), 0,
4910 : info->buff, share->page_type,
4911 : PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
4912 0 : DBUG_RETURN(my_errno);
4913 224152 : DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE);
4914 224152 : if (!(data= get_record_position(buff, block_size, offset, &end_of_data)))
4915 : {
4916 0 : DBUG_PRINT("error", ("Wrong directory entry in data block"));
4917 0 : my_errno= HA_ERR_RECORD_DELETED; /* File crashed */
4918 0 : DBUG_RETURN(HA_ERR_RECORD_DELETED);
4919 : }
4920 224152 : DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
4921 : }
4922 :
4923 :
4924 : /* compare unique constraint between stored rows */
4925 :
4926 : my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
4927 : const uchar *record, MARIA_RECORD_POS pos)
4928 42 : {
4929 : uchar *org_rec_buff, *old_record;
4930 : size_t org_rec_buff_size;
4931 : int error;
4932 42 : DBUG_ENTER("_ma_cmp_block_unique");
4933 :
4934 42 : if (!(old_record= my_alloca(info->s->base.reclength)))
4935 0 : DBUG_RETURN(1);
4936 :
4937 : /* Don't let the compare destroy blobs that may be in use */
4938 42 : org_rec_buff= info->rec_buff;
4939 42 : org_rec_buff_size= info->rec_buff_size;
4940 42 : if (info->s->base.blobs)
4941 : {
4942 : /* Force realloc of record buffer*/
4943 0 : info->rec_buff= 0;
4944 0 : info->rec_buff_size= 0;
4945 : }
4946 42 : error= _ma_read_block_record(info, old_record, pos);
4947 42 : if (!error)
4948 42 : error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
4949 42 : if (info->s->base.blobs)
4950 : {
4951 0 : my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
4952 0 : info->rec_buff= org_rec_buff;
4953 0 : info->rec_buff_size= org_rec_buff_size;
4954 : }
4955 42 : DBUG_PRINT("exit", ("result: %d", error));
4956 : my_afree(old_record);
4957 42 : DBUG_RETURN(error != 0);
4958 : }
4959 :
4960 :
4961 : /****************************************************************************
4962 : Table scan
4963 : ****************************************************************************/
4964 :
4965 : /*
4966 : Allocate buffers for table scan
4967 :
4968 : SYNOPSIS
4969 : _ma_scan_init_block_record(MARIA_HA *info)
4970 :
4971 : IMPLEMENTATION
4972 : We allocate one buffer for the current bitmap and one buffer for the
4973 : current page
4974 :
4975 : RETURN
4976 : 0 ok
4977 : 1 error (couldn't allocate memory or disk error)
4978 : */
4979 :
4980 : my_bool _ma_scan_init_block_record(MARIA_HA *info)
4981 1240 : {
4982 1240 : MARIA_SHARE *share= info->s;
4983 1240 : DBUG_ENTER("_ma_scan_init_block_record");
4984 : /*
4985 : bitmap_buff may already be allocated if this is the second call to
4986 : rnd_init() without a rnd_end() in between, see sql/handler.h
4987 : */
4988 1240 : if (!(info->scan.bitmap_buff ||
4989 : ((info->scan.bitmap_buff=
4990 : (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME))))))
4991 0 : DBUG_RETURN(1);
4992 1240 : info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
4993 1240 : info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.total_size;
4994 :
4995 : /* Set scan variables to get _ma_scan_block_record() to start by reading the bitmap */
4996 1240 : info->scan.number_of_rows= 0;
4997 1240 : info->scan.bitmap_pos= info->scan.bitmap_end;
4998 1240 : info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
4999 : /*
5000 : We need to flush what's in memory (bitmap.map) to the page cache; otherwise,
5001 : as we are going to read bitmaps from the page cache in the table scan (see
5002 : _ma_scan_block_record()), we may miss recently inserted rows (the bitmap page
5003 : in the page cache would be too old).
5004 : */
5005 1240 : DBUG_RETURN(_ma_bitmap_flush(info->s));
5006 : }
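
/*
  A minimal sketch (illustrative arithmetic only) of the scan start-up trick
  above: bitmap_page is set to (pgcache_page_no_t) 0 - pages_covered so that
  the "Read next bitmap" step in _ma_scan_block_record(), which does

    info->scan.bitmap_page+= share->bitmap.pages_covered;

  wraps the value back to 0, i.e. the scan begins by reading the first bitmap
  page of the data file.  Setting bitmap_pos= bitmap_end at the same time
  makes the in-memory bitmap look exhausted, which forces that read on the
  first call.
*/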
5007 :
5008 :
5009 : /* Free buffers allocated by _ma_scan_init_block_record() */
5010 :
5011 : void _ma_scan_end_block_record(MARIA_HA *info)
5012 1366 : {
5013 1366 : DBUG_ENTER("_ma_scan_end_block_record");
5014 1366 : my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR));
5015 1366 : info->scan.bitmap_buff= 0;
5016 1366 : if (info->scan_save)
5017 : {
5018 0 : my_free(info->scan_save, MYF(0));
5019 0 : info->scan_save= 0;
5020 : }
5021 1366 : DBUG_VOID_RETURN;
5022 : }
5023 :
5024 :
5025 : /**
5026 : @brief Save current scan position
5027 :
5028 : @note
5029 : For the moment we can only remember one position, but this is
5030 : good enough for MySQL usage
5031 :
5032 : @warning
5033 : When this function is called, we assume that the thread is not deleting
5034 : or updating the current row before _ma_scan_restore_block_record()
5035 : is called!
5036 :
5037 : @return
5038 : @retval 0 ok
5039 : @retval HA_ERR_OUT_OF_MEM Could not allocate memory to hold position
5040 : */
5041 :
5042 : int _ma_scan_remember_block_record(MARIA_HA *info,
5043 : MARIA_RECORD_POS *lastpos)
5044 0 : {
5045 : uchar *bitmap_buff;
5046 0 : DBUG_ENTER("_ma_scan_remember_block_record");
5047 0 : if (!(info->scan_save))
5048 : {
5049 0 : if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) +
5050 : info->s->block_size * 2,
5051 : MYF(MY_WME))))
5052 0 : DBUG_RETURN(HA_ERR_OUT_OF_MEM);
5053 0 : info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
5054 : ALIGN_SIZE(sizeof(*info->scan_save)));
5055 : }
5056 : /* Point to the last read row */
5057 0 : *lastpos= info->cur_row.nextpos - 1;
5058 0 : info->scan.dir+= DIR_ENTRY_SIZE;
5059 :
5060 : /* Remember used bitmap and used head page */
5061 0 : bitmap_buff= info->scan_save->bitmap_buff;
5062 0 : memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
5063 0 : info->scan_save->bitmap_buff= bitmap_buff;
5064 0 : memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2);
5065 0 : DBUG_RETURN(0);
5066 : }
5067 :
5068 :
5069 : /**
5070 : @brief Restore the scan block to its original values
5071 :
5072 : @note
5073 : In theory we could swap bitmap buffers instead of copy them.
5074 : For the moment we don't do that because there are variables pointing
5075 : inside the buffers and it's a bit of hassle to either make them relative
5076 : or repoint them.
5077 : */
5078 :
5079 : void _ma_scan_restore_block_record(MARIA_HA *info,
5080 : MARIA_RECORD_POS lastpos)
5081 0 : {
5082 : uchar *bitmap_buff;
5083 0 : DBUG_ENTER("_ma_scan_restore_block_record");
5084 :
5085 0 : info->cur_row.nextpos= lastpos;
5086 0 : bitmap_buff= info->scan.bitmap_buff;
5087 0 : memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
5088 0 : info->scan.bitmap_buff= bitmap_buff;
5089 0 : memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);
5090 :
5091 0 : DBUG_VOID_RETURN;
5092 : }
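
/*
  Hypothetical usage pattern for the two functions above during a table scan
  (the caller-side variable name is illustrative only).  Note the warning on
  _ma_scan_remember_block_record(): the current row must not be deleted or
  updated before the position is restored.

    MARIA_RECORD_POS saved_pos;
    if (_ma_scan_remember_block_record(info, &saved_pos))
      return 1;                                // could not allocate scan_save
    // ... other work that moves the scan position ...
    _ma_scan_restore_block_record(info, saved_pos);
    // the scan now continues from the row after 'saved_pos'
*/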
5093 :
5094 :
5095 : /*
5096 : Read next record while scanning table
5097 :
5098 : SYNOPSIS
5099 : _ma_scan_block_record()
5100 : info Maria handler
5101 : record Store found here
5102 : record_pos Value stored in info->cur_row.nextpos after last call
5103 : skip_deleted
5104 :
5105 : NOTES
5106 : - One must have called maria_scan_init() before this
5107 : - In this version, we don't actually need record_pos; we could just as
5108 : easily use a variable in info->scan
5109 :
5110 : IMPLEMENTATION
5111 : Current code uses a lot of goto's to separate the different kinds of
5112 : states we may be in. This gives us a minimum of executed if's for
5113 : the normal cases. I tried several different ways to code this, but
5114 : the current one was in the end the most readable and fastest.
5115 :
5116 : RETURN
5117 : 0 ok
5118 : # Error code
5119 : */
5120 :
5121 : int _ma_scan_block_record(MARIA_HA *info, uchar *record,
5122 : MARIA_RECORD_POS record_pos,
5123 : my_bool skip_deleted __attribute__ ((unused)))
5124 364179 : {
5125 : uint block_size;
5126 : my_off_t filepos;
5127 364179 : MARIA_SHARE *share= info->s;
5128 364179 : DBUG_ENTER("_ma_scan_block_record");
5129 :
5130 : restart_record_read:
5131 : /* Find next row in current page */
5132 370187 : while (likely(record_pos < info->scan.number_of_rows))
5133 : {
5134 : uint length, offset;
5135 : uchar *data, *end_of_data;
5136 : int error;
5137 :
5138 404522 : while (!(offset= uint2korr(info->scan.dir)))
5139 : {
5140 54829 : info->scan.dir-= DIR_ENTRY_SIZE;
5141 54829 : record_pos++;
5142 : #ifdef SANITY_CHECKS
5143 54829 : if (info->scan.dir < info->scan.dir_end)
5144 : {
5145 0 : DBUG_ASSERT(0);
5146 : goto err;
5147 : }
5148 : #endif
5149 : }
5150 : /* found row */
5151 349693 : info->cur_row.lastpos= info->scan.row_base_page + record_pos;
5152 349693 : info->cur_row.nextpos= record_pos + 1;
5153 349693 : data= info->scan.page_buff + offset;
5154 349693 : length= uint2korr(info->scan.dir + 2);
5155 349693 : end_of_data= data + length;
5156 349693 : info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */
5157 : #ifdef SANITY_CHECKS
5158 349693 : if (end_of_data > info->scan.dir_end ||
5159 : offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
5160 : {
5161 0 : DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
5162 0 : DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE));
5163 0 : DBUG_ASSERT(!(length < share->base.min_block_length));
5164 : goto err;
5165 : }
5166 : #endif
5167 349693 : DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
5168 349693 : error= _ma_read_block_record2(info, record, data, end_of_data);
5169 349693 : if (error != HA_ERR_ROW_NOT_VISIBLE)
5170 349693 : DBUG_RETURN(error);
5171 0 : record_pos++;
5172 : }
5173 :
5174 : /* Find next head page in current bitmap */
5175 21193 : restart_bitmap_scan:
5176 21193 : block_size= share->block_size;
5177 21193 : if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
5178 : {
5179 20494 : uchar *data= info->scan.bitmap_pos;
5180 20494 : longlong bits= info->scan.bits;
5181 20494 : uint bit_pos= info->scan.bit_pos;
5182 :
5183 : do
5184 : {
5185 83070 : while (likely(bits))
5186 : {
5187 63865 : uint pattern= (uint) (bits & 7);
5188 63865 : bits >>= 3;
5189 63865 : bit_pos++;
5190 63865 : if (pattern > 0 && pattern <= 4)
5191 : {
5192 : /* Found head page; Read it */
5193 : pgcache_page_no_t page;
5194 6008 : info->scan.bitmap_pos= data;
5195 6008 : info->scan.bits= bits;
5196 6008 : info->scan.bit_pos= bit_pos;
5197 6008 : page= (info->scan.bitmap_page + 1 +
5198 : (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
5199 6008 : info->scan.row_base_page= ma_recordpos(page, 0);
5200 6008 : if (!(pagecache_read(share->pagecache,
5201 : &info->dfile,
5202 : page, 0, info->scan.page_buff,
5203 : share->page_type,
5204 : PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
5205 0 : DBUG_RETURN(my_errno);
5206 6008 : if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
5207 : HEAD_PAGE))
5208 : {
5209 : /*
5210 : This may happen if someone has been deleting all rows
5211 : from a page since we read the bitmap, so it may be ok.
5212 : Print warning in debug log and continue.
5213 : */
5214 0 : DBUG_PRINT("warning",
5215 : ("Found page of type %d when expecting head page",
5216 : (info->scan.page_buff[PAGE_TYPE_OFFSET] &
5217 : PAGE_TYPE_MASK)));
5218 0 : continue;
5219 : }
5220 6008 : if ((info->scan.number_of_rows=
5221 : (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
5222 : {
5223 0 : DBUG_PRINT("error", ("Wrong page header"));
5224 0 : DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
5225 : }
5226 6008 : DBUG_PRINT("info", ("Page %lu has %u rows",
5227 : (ulong) page, info->scan.number_of_rows));
5228 6008 : info->scan.dir= (info->scan.page_buff + block_size -
5229 : PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
5230 6008 : info->scan.dir_end= (info->scan.dir -
5231 : (info->scan.number_of_rows - 1) *
5232 : DIR_ENTRY_SIZE);
5233 6008 : record_pos= 0;
5234 6008 : goto restart_record_read;
5235 : }
5236 : }
5237 21763134 : for (data+= 6; data < info->scan.bitmap_end; data+= 6)
5238 : {
5239 21748648 : bits= uint6korr(data);
5240 : /* Skip unallocated pages and blob / full tail pages */
5241 21748648 : if (bits && bits != LL(07777777777777777))
5242 21743929 : break;
5243 : }
5244 19205 : bit_pos= 0;
5245 19205 : } while (data < info->scan.bitmap_end);
5246 : }
5247 :
5248 : /* Read next bitmap */
5249 15185 : info->scan.bitmap_page+= share->bitmap.pages_covered;
5250 15185 : filepos= (my_off_t) info->scan.bitmap_page * block_size;
5251 15185 : if (unlikely(filepos >= share->state.state.data_file_length))
5252 : {
5253 14486 : DBUG_PRINT("info", ("Found end of file"));
5254 14486 : DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
5255 : }
5256 699 : DBUG_PRINT("info", ("Reading bitmap at %lu",
5257 : (ulong) info->scan.bitmap_page));
5258 699 : if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
5259 : info->scan.bitmap_page,
5260 : 0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
5261 : PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
5262 0 : DBUG_RETURN(my_errno);
5263 : /* Skip scanning 'bits' in bitmap scan code */
5264 699 : info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
5265 699 : info->scan.bits= 0;
5266 699 : goto restart_bitmap_scan;
5267 :
5268 0 : err:
5269 0 : DBUG_PRINT("error", ("Wrong data on page"));
5270 0 : DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
5271 : }
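
/*
  Sketch of the bitmap decoding done by the scan loop above (bitmap_data is a
  hypothetical pointer into the bitmap buffer).  Each 6-byte word of the
  bitmap describes 16 pages with 3 bits per page; the scan only cares whether
  a page is a head page that may contain rows (patterns 1-4, as tested above).
  The full meaning of the patterns is defined in ma_bitmap.c.

    ulonglong bits= uint6korr(bitmap_data);          // 16 pages * 3 bits
    for (uint i= 0; i < 16; i++, bits>>= 3)
    {
      uint pattern= (uint) (bits & 7);
      if (pattern > 0 && pattern <= 4)
        ;                                            // head page: read it
    }
*/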
5272 :
5273 :
5274 : /*
5275 : Compare a row against a stored one
5276 :
5277 : NOTES
5278 : Not implemented, as block record is not supposed to be used in a shared
5279 : global environment
5280 : */
5281 :
5282 : my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
5283 : const uchar *record __attribute__ ((unused)))
5284 102793 : {
5285 102793 : return 0;
5286 : }
5287 :
5288 :
5289 : /*
5290 : Store an integer with simple packing
5291 :
5292 : SYNOPSIS
5293 : ma_store_length()
5294 : to Store the packed integer here
5295 : nr Integer to store
5296 :
5297 : NOTES
5298 : This is mostly used to store field numbers and lengths of strings.
5299 : We have to cast the result of LL() because of a bug in the Forte CC
5300 : compiler.
5301 :
5302 : Packing used is:
5303 : nr < 251 is stored as is (in 1 byte)
5304 : Numbers that require 1-4 bytes are stored as char(250+byte_length), data
5305 : Bigger numbers are stored as 255, data as ulonglong (not yet done).
5306 :
5307 : RETURN
5308 : Position in 'to' after the packed length
5309 : */
5310 :
5311 : uchar *ma_store_length(uchar *to, ulong nr)
5312 79656 : {
5313 79656 : if (nr < 251)
5314 : {
5315 76523 : *to=(uchar) nr;
5316 76523 : return to+1;
5317 : }
5318 3133 : if (nr < 65536)
5319 : {
5320 3127 : if (nr <= 255)
5321 : {
5322 0 : to[0]= (uchar) 251;
5323 0 : to[1]= (uchar) nr;
5324 0 : return to+2;
5325 : }
5326 3127 : to[0]= (uchar) 252;
5327 3127 : int2store(to+1, nr);
5328 3127 : return to+3;
5329 : }
5330 6 : if (nr < 16777216)
5331 : {
5332 6 : *to++= (uchar) 253;
5333 6 : int3store(to, nr);
5334 6 : return to+3;
5335 : }
5336 0 : *to++= (uchar) 254;
5337 0 : int4store(to, nr);
5338 0 : return to+4;
5339 : }
5340 :
5341 :
5342 : /* Calculate how many bytes are needed to store a number */
5343 :
5344 : uint ma_calc_length_for_store_length(ulong nr)
5345 9688 : {
5346 9688 : if (nr < 251)
5347 9688 : return 1;
5348 0 : if (nr < 65536)
5349 : {
5350 0 : if (nr <= 255)
5351 0 : return 2;
5352 0 : return 3;
5353 : }
5354 0 : if (nr < 16777216)
5355 0 : return 4;
5356 0 : return 5;
5357 : }
5358 :
5359 :
5360 : /* Retrieve a stored number */
5361 :
5362 : static ulong ma_get_length(const uchar **packet)
5363 20698 : {
5364 20698 : reg1 const uchar *pos= *packet;
5365 20698 : if (*pos < 251)
5366 : {
5367 19627 : (*packet)++;
5368 19627 : return (ulong) *pos;
5369 : }
5370 1071 : if (*pos == 251)
5371 : {
5372 0 : (*packet)+= 2;
5373 0 : return (ulong) pos[1];
5374 : }
5375 1071 : if (*pos == 252)
5376 : {
5377 1071 : (*packet)+= 3;
5378 1071 : return (ulong) uint2korr(pos+1);
5379 : }
5380 0 : if (*pos == 253)
5381 : {
5382 0 : (*packet)+= 4;
5383 0 : return (ulong) uint3korr(pos+1);
5384 : }
5385 0 : DBUG_ASSERT(*pos == 254);
5386 0 : (*packet)+= 5;
5387 0 : return (ulong) uint4korr(pos+1);
5388 : }
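
/*
  Round-trip example (illustrative only) for the length packing implemented by
  ma_store_length() / ma_get_length() above.  For nr= 300 the value does not
  fit in one byte, so it is stored as the marker 252 followed by a 2-byte
  little-endian integer:

    uchar buff[5], *end;                       // 5 bytes is the worst case
    const uchar *pos= buff;
    end= ma_store_length(buff, 300);           // buff= { 252, 0x2C, 0x01 }
    DBUG_ASSERT((uint) (end - buff) == ma_calc_length_for_store_length(300));
    DBUG_ASSERT(ma_get_length(&pos) == 300 && pos == end);
*/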
5389 :
5390 :
5391 : /*
5392 : Fill array with pointers to field parts to be stored in log for insert
5393 :
5394 : SYNOPSIS
5395 : fill_insert_undo_parts()
5396 : info Maria handler
5397 : record Inserted row
5398 : log_parts Store pointers to changed memory areas here
5399 : log_parts_count See RETURN
5400 :
5401 : NOTES
5402 : We have information in info->cur_row about the read row.
5403 :
5404 : RETURN
5405 : length of data in log_parts.
5406 : log_parts_count contains number of used log_parts
5407 : */
5408 :
5409 : static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
5410 : LEX_CUSTRING *log_parts,
5411 : uint *log_parts_count)
5412 59439 : {
5413 59439 : MARIA_SHARE *share= info->s;
5414 : MARIA_COLUMNDEF *column, *end_column;
5415 59439 : uchar *field_lengths= info->cur_row.field_lengths;
5416 : size_t row_length;
5417 59439 : MARIA_ROW *cur_row= &info->cur_row;
5418 : LEX_CUSTRING *start_log_parts;
5419 59439 : DBUG_ENTER("fill_insert_undo_parts");
5420 :
5421 59439 : start_log_parts= log_parts;
5422 :
5423 : /* Store null bits */
5424 59439 : log_parts->str= record;
5425 59439 : log_parts->length= share->base.null_bytes;
5426 59439 : row_length= log_parts->length;
5427 59439 : log_parts++;
5428 :
5429 : /* Store the bitmap over packed (zero length or all-zero) fields */
5430 59439 : log_parts->str= info->cur_row.empty_bits;
5431 59439 : log_parts->length= share->base.pack_bytes;
5432 59439 : row_length+= log_parts->length;
5433 59439 : log_parts++;
5434 :
5435 59439 : if (share->base.max_field_lengths)
5436 : {
5437 : /* Store lengths of all non-empty CHAR, VARCHAR and BLOB fields */
5438 59377 : log_parts->str= field_lengths - 2;
5439 59377 : log_parts->length= info->cur_row.field_lengths_length+2;
5440 59377 : int2store(log_parts->str, info->cur_row.field_lengths_length);
5441 59377 : row_length+= log_parts->length;
5442 59377 : log_parts++;
5443 : }
5444 :
5445 59439 : if (share->base.blobs)
5446 : {
5447 : /*
5448 : Store total blob length to make buffer allocation easier during UNDO
5449 : */
5450 15293 : log_parts->str= info->length_buff;
5451 15293 : log_parts->length= (uint) (ma_store_length((uchar *) log_parts->str,
5452 : info->cur_row.blob_length) -
5453 : (uchar*) log_parts->str);
5454 15293 : row_length+= log_parts->length;
5455 15293 : log_parts++;
5456 : }
5457 :
5458 : /* Handle constant length fields that are always present */
5459 : for (column= share->columndef,
5460 59439 : end_column= column+ share->base.fixed_not_null_fields;
5461 353136 : column < end_column;
5462 234258 : column++)
5463 : {
5464 234258 : log_parts->str= record + column->offset;
5465 234258 : log_parts->length= column->length;
5466 234258 : row_length+= log_parts->length;
5467 234258 : log_parts++;
5468 : }
5469 :
5470 : /* Handle NULL fields and CHAR/VARCHAR fields */
5471 59439 : for (end_column= share->columndef + share->base.fields - share->base.blobs;
5472 236366 : column < end_column;
5473 117488 : column++)
5474 : {
5475 : const uchar *column_pos;
5476 : size_t column_length;
5477 117488 : if ((record[column->null_pos] & column->null_bit) ||
5478 : cur_row->empty_bits[column->empty_pos] & column->empty_bit)
5479 : continue;
5480 :
5481 116453 : column_pos= record+ column->offset;
5482 116453 : column_length= column->length;
5483 :
5484 116453 : switch (column->type) {
5485 : case FIELD_CHECK:
5486 : case FIELD_NORMAL: /* Fixed length field */
5487 : case FIELD_ZERO:
5488 : case FIELD_SKIP_PRESPACE: /* Not packed */
5489 : case FIELD_SKIP_ZERO: /* Fixed length field */
5490 : break;
5491 : case FIELD_SKIP_ENDSPACE: /* CHAR */
5492 : {
5493 58060 : if (column->length <= 255)
5494 58060 : column_length= *field_lengths++;
5495 : else
5496 : {
5497 0 : column_length= uint2korr(field_lengths);
5498 0 : field_lengths+= 2;
5499 : }
5500 : break;
5501 : }
5502 : case FIELD_VARCHAR:
5503 : {
5504 162 : if (column->fill_length == 1)
5505 120 : column_length= *field_lengths;
5506 : else
5507 42 : column_length= uint2korr(field_lengths);
5508 162 : field_lengths+= column->fill_length;
5509 162 : column_pos+= column->fill_length;
5510 162 : break;
5511 : }
5512 : default:
5513 0 : DBUG_ASSERT(0);
5514 : }
5515 116453 : log_parts->str= column_pos;
5516 116453 : log_parts->length= column_length;
5517 116453 : row_length+= log_parts->length;
5518 116453 : log_parts++;
5519 : }
5520 :
5521 : /* Add blobs */
5522 74950 : for (end_column+= share->base.blobs; column < end_column; column++)
5523 : {
5524 15511 : const uchar *field_pos= record + column->offset;
5525 15511 : uint size_length= column->length - portable_sizeof_char_ptr;
5526 15511 : ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
5527 :
5528 : /*
5529 : We don't have to check for null, as blob_length is guaranteed to be 0
5530 : if the blob is null
5531 : */
5532 15511 : if (blob_length)
5533 : {
5534 : uchar *blob_pos;
5535 2026 : memcpy_fixed(&blob_pos, record + column->offset + size_length,
5536 : sizeof(blob_pos));
5537 2026 : log_parts->str= blob_pos;
5538 2026 : log_parts->length= blob_length;
5539 2026 : row_length+= log_parts->length;
5540 2026 : log_parts++;
5541 : }
5542 : }
5543 59439 : *log_parts_count= (uint) (log_parts - start_log_parts);
5544 59439 : DBUG_RETURN(row_length);
5545 : }
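
/*
  Example (illustrative only) of the log_parts array built above for a table
  with one fixed NOT NULL INT column, one VARCHAR and one BLOB, where both the
  VARCHAR and the BLOB are non-empty:

    log_parts[0]  null bits                 (share->base.null_bytes bytes)
    log_parts[1]  empty bits                (share->base.pack_bytes bytes)
    log_parts[2]  2-byte length + packed field lengths
    log_parts[3]  packed total blob length  (stored with ma_store_length())
    log_parts[4]  the fixed INT column      (column->length bytes)
    log_parts[5]  the VARCHAR data          (without its length prefix)
    log_parts[6]  the BLOB data             (blob_length bytes)

  The function returns the sum of all part lengths and sets *log_parts_count
  to 7 in this example.
*/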
5546 :
5547 :
5548 : /*
5549 : Fill array with pointers to field parts to be stored in log for update
5550 :
5551 : SYNOPSIS
5552 : fill_update_undo_parts()
5553 : info Maria handler
5554 : oldrec Original row
5555 : newrec New row
5556 : log_parts Store pointers to changed memory areas here
5557 : log_parts_count See RETURN
5558 :
5559 : IMPLEMENTATION
5560 : Format of undo record:
5561 :
5562 : Fields are stored in same order as the field array.
5563 :
5564 : Offset to changed field data (packed)
5565 :
5566 : For each changed field
5567 : Fieldnumber (packed)
5568 : Length, if variable length field (packed)
5569 :
5570 : For each changed field
5571 : Data
5572 :
5573 : Packing is done with ma_store_length()
5574 :
5575 : The reason we store field numbers & lengths separately from the data (i.e.,
5576 : not after each other) is to get better CPU caching when we loop over
5577 : fields (as we probably don't have to access the data for each field when we
5578 : want to read an old row through the undo log record).
5579 :
5580 : As a special case, we use '255' for the field number of the null bitmap.
5581 :
5582 : RETURN
5583 : length of data in log_parts.
5584 : log_parts_count contains number of used log_parts
5585 : */
5586 :
5587 : static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
5588 : const uchar *newrec,
5589 : LEX_CUSTRING *log_parts,
5590 : uint *log_parts_count)
5591 9688 : {
5592 9688 : MARIA_SHARE *share= info->s;
5593 : MARIA_COLUMNDEF *column, *end_column;
5594 9688 : MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
5595 : uchar *field_data, *start_field_data;
5596 9688 : uchar *old_field_lengths= old_row->field_lengths;
5597 9688 : uchar *new_field_lengths= new_row->field_lengths;
5598 9688 : size_t row_length= 0;
5599 : uint field_lengths;
5600 : LEX_CUSTRING *start_log_parts;
5601 : my_bool new_column_is_empty;
5602 9688 : DBUG_ENTER("fill_update_undo_parts");
5603 :
5604 9688 : start_log_parts= log_parts;
5605 :
5606 : /*
5607 : First log part is for the field numbers and lengths of the changed fields.
5608 : The +4 is to reserve space for the packed length of this data.
5609 : */
5610 9688 : start_field_data= field_data= info->update_field_data + 4;
5611 9688 : log_parts++;
5612 :
5613 9688 : if (memcmp(oldrec, newrec, share->base.null_bytes))
5614 : {
5615 : /* Store changed null bits */
5616 0 : *field_data++= (uchar) 255; /* Special case */
5617 0 : log_parts->str= oldrec;
5618 0 : log_parts->length= share->base.null_bytes;
5619 0 : row_length= log_parts->length;
5620 0 : log_parts++;
5621 : }
5622 :
5623 : /* Handle constant length fields */
5624 : for (column= share->columndef,
5625 9688 : end_column= column+ share->base.fixed_not_null_fields;
5626 44338 : column < end_column;
5627 24962 : column++)
5628 : {
5629 24962 : if (memcmp(oldrec + column->offset, newrec + column->offset,
5630 : column->length))
5631 : {
5632 22541 : field_data= ma_store_length(field_data,
5633 : (uint) (column - share->columndef));
5634 22541 : log_parts->str= oldrec + column->offset;
5635 22541 : log_parts->length= column->length;
5636 22541 : row_length+= column->length;
5637 22541 : log_parts++;
5638 : }
5639 : }
5640 :
5641 : /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
5642 9688 : for (end_column= share->columndef + share->base.fields;
5643 37370 : column < end_column;
5644 17994 : column++)
5645 : {
5646 : const uchar *new_column_pos, *old_column_pos;
5647 : size_t new_column_length, old_column_length;
5648 :
5649 : /* First check if old column is null or empty */
5650 17994 : if (oldrec[column->null_pos] & column->null_bit)
5651 : {
5652 : /*
5653 : It's safe to skip this one as either the new column is also null
5654 : (no change) or the new_column is not null, in which case the null-bit
5655 : maps differed and we have already stored the null bitmap.
5656 : */
5657 17688 : continue;
5658 : }
5659 17688 : if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
5660 : {
5661 1545 : if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
5662 241 : continue; /* Both are empty; skip */
5663 :
5664 : /* Store the old zero-length (empty) column */
5665 241 : field_data= ma_store_length(field_data,
5666 : (uint) (column - share->columndef));
5667 241 : field_data= ma_store_length(field_data, 0);
5668 241 : continue;
5669 : }
5670 : /*
5671 : Remember if the 'new' value is empty (as in this case we must always
5672 : log the original value).
5673 : */
5674 16143 : new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
5675 : (new_row->empty_bits[column->empty_pos] &
5676 : column->empty_bit));
5677 :
5678 16143 : old_column_pos= oldrec + column->offset;
5679 16143 : new_column_pos= newrec + column->offset;
5680 16143 : old_column_length= new_column_length= column->length;
5681 :
5682 16143 : switch (column->type) {
5683 : case FIELD_CHECK:
5684 : case FIELD_NORMAL: /* Fixed length field */
5685 : case FIELD_ZERO:
5686 : case FIELD_SKIP_PRESPACE: /* Not packed */
5687 : case FIELD_SKIP_ZERO: /* Fixed length field */
5688 : break;
5689 : case FIELD_VARCHAR:
5690 418 : new_column_length--; /* Skip length prefix */
5691 418 : old_column_pos+= column->fill_length;
5692 418 : new_column_pos+= column->fill_length;
5693 : /* Fall through */
5694 : case FIELD_SKIP_ENDSPACE: /* CHAR */
5695 : {
5696 8253 : if (new_column_length <= 255)
5697 : {
5698 8145 : old_column_length= *old_field_lengths++;
5699 8145 : if (!new_column_is_empty)
5700 8092 : new_column_length= *new_field_lengths++;
5701 : }
5702 : else
5703 : {
5704 108 : old_column_length= uint2korr(old_field_lengths);
5705 108 : old_field_lengths+= 2;
5706 108 : if (!new_column_is_empty)
5707 : {
5708 108 : new_column_length= uint2korr(new_field_lengths);
5709 108 : new_field_lengths+= 2;
5710 : }
5711 : }
5712 : break;
5713 : }
5714 : case FIELD_BLOB:
5715 : {
5716 2535 : uint size_length= column->length - portable_sizeof_char_ptr;
5717 2535 : old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
5718 2535 : memcpy_fixed((uchar*) &old_column_pos,
5719 : oldrec + column->offset + size_length,
5720 : sizeof(old_column_pos));
5721 2535 : if (!new_column_is_empty)
5722 : {
5723 2490 : new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
5724 2490 : memcpy_fixed((uchar*) &new_column_pos,
5725 : newrec + column->offset + size_length,
5726 : sizeof(old_column_pos));
5727 : }
5728 : break;
5729 : }
5730 : default:
5731 0 : DBUG_ASSERT(0);
5732 : }
5733 :
5734 16143 : if (new_column_is_empty || new_column_length != old_column_length ||
5735 : memcmp(old_column_pos, new_column_pos, new_column_length))
5736 : {
5737 15826 : field_data= ma_store_length(field_data,
5738 : (ulong) (column - share->columndef));
5739 15826 : field_data= ma_store_length(field_data, (ulong) old_column_length);
5740 :
5741 15826 : log_parts->str= old_column_pos;
5742 15826 : log_parts->length= old_column_length;
5743 15826 : row_length+= old_column_length;
5744 15826 : log_parts++;
5745 : }
5746 : }
5747 :
5748 9688 : *log_parts_count= (uint) (log_parts - start_log_parts);
5749 :
5750 : /* Store the length of the field-number/length data just before that data */
5751 9688 : field_lengths= (uint) (field_data - start_field_data);
5752 9688 : start_log_parts->str= ((start_field_data -
5753 : ma_calc_length_for_store_length(field_lengths)));
5754 9688 : ma_store_length((uchar*)start_log_parts->str, field_lengths);
5755 9688 : start_log_parts->length= (size_t) (field_data - start_log_parts->str);
5756 9688 : row_length+= start_log_parts->length;
5757 9688 : DBUG_RETURN(row_length);
5758 : }
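
/*
  Example (illustrative only) of the undo parts produced above when a single
  fixed-length column, number 2 in share->columndef, has changed:

    log_parts[0]  packed total length of the field-number/length block,
                  followed by ma_store_length(2) for the changed field
    log_parts[1]  the old value of column 2  (column->length bytes)

  For a changed VARCHAR/BLOB the field number is followed by the packed old
  length, and the data part points at the old column data (for blobs via the
  stored blob pointer).  If the null bitmap changed, field number 255 is added
  to the header block and an extra part with the old null bytes comes before
  the column data parts.
*/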
5759 :
5760 : /***************************************************************************
5761 : In-write hooks called under log's lock when log record is written
5762 : ***************************************************************************/
5763 :
5764 : /**
5765 : @brief Sets transaction's rec_lsn if needed
5766 :
5767 : A transaction sometimes writes a REDO even before the page is in the
5768 : pagecache (example: brand new head or tail pages; full pages). So, if
5769 : Checkpoint happens just after the REDO write, it needs to know that the
5770 : REDO phase must start before this REDO. Scanning the pagecache cannot
5771 : tell that as the page is not in the cache. So, transaction sets its rec_lsn
5772 : to the REDO's LSN or somewhere before, and Checkpoint reads the
5773 : transaction's rec_lsn.
5774 :
5775 : @return Operation status, always 0 (success)
5776 : */
5777 :
5778 : my_bool write_hook_for_redo(enum translog_record_type type
5779 : __attribute__ ((unused)),
5780 : TRN *trn, MARIA_HA *tbl_info
5781 : __attribute__ ((unused)),
5782 : LSN *lsn, void *hook_arg
5783 : __attribute__ ((unused)))
5784 1732407 : {
5785 : /*
5786 : Users of dummy_transaction_object must keep this TRN clean as it
5787 : is used by many threads (like those manipulating non-transactional
5788 : tables). It might be dangerous if one user sets rec_lsn or some other
5789 : member and it is picked up by another user (like putting this rec_lsn into
5790 : a page of a non-transactional table); it's safer if all members stay 0. So
5791 : non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
5792 : call this hook; we trust them but verify ;)
5793 : */
5794 1732407 : DBUG_ASSERT(trn->trid != 0);
5795 : /*
5796 : If the hook stays so simple, it would be faster to pass
5797 : !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
5798 : to translog_write_record(), like Monty did in his original code, and not
5799 : have a hook. For now we keep it like this.
5800 : */
5801 1732407 : if (trn->rec_lsn == 0)
5802 1704962 : trn->rec_lsn= *lsn;
5803 1732407 : return 0;
5804 : }
5805 :
5806 :
5807 : /**
5808 : @brief Sets transaction's undo_lsn, first_undo_lsn if needed
5809 :
5810 : @return Operation status, always 0 (success)
5811 : */
5812 :
5813 : my_bool write_hook_for_undo(enum translog_record_type type
5814 : __attribute__ ((unused)),
5815 : TRN *trn, MARIA_HA *tbl_info
5816 : __attribute__ ((unused)),
5817 : LSN *lsn, void *hook_arg
5818 : __attribute__ ((unused)))
5819 1325193 : {
5820 1325193 : DBUG_ASSERT(trn->trid != 0);
5821 1325193 : trn->undo_lsn= *lsn;
5822 1325193 : if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
5823 682 : trn->first_undo_lsn=
5824 : trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
5825 1325193 : return 0;
5826 : /*
5827 : when we implement purging, we will specialize this hook: UNDO_PURGE
5828 : records will additionally set trn->undo_purge_lsn
5829 : */
5830 : }
5831 :
5832 :
5833 : /**
5834 : @brief Sets the table's records count and checksum and others to 0, then
5835 : calls the generic REDO hook.
5836 :
5837 : @return Operation status, always 0 (success)
5838 : */
5839 :
5840 : my_bool write_hook_for_redo_delete_all(enum translog_record_type type
5841 : __attribute__ ((unused)),
5842 : TRN *trn, MARIA_HA *tbl_info
5843 : __attribute__ ((unused)),
5844 : LSN *lsn, void *hook_arg)
5845 0 : {
5846 0 : _ma_reset_status(tbl_info);
5847 0 : return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
5848 : }
5849 :
5850 :
5851 : /**
5852 : @brief Updates "records" and "checksum" and calls the generic UNDO hook
5853 :
5854 : @return Operation status, always 0 (success)
5855 : */
5856 :
5857 : my_bool write_hook_for_undo_row_insert(enum translog_record_type type
5858 : __attribute__ ((unused)),
5859 : TRN *trn, MARIA_HA *tbl_info,
5860 : LSN *lsn, void *hook_arg)
5861 128018 : {
5862 128018 : MARIA_SHARE *share= tbl_info->s;
5863 128018 : share->state.state.records++;
5864 128018 : share->state.state.checksum+= *(ha_checksum *)hook_arg;
5865 128018 : return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
5866 : }
5867 :
5868 :
5869 : /**
5870 : @brief Updates "records" and "checksum" and calls the generic UNDO hook
5871 :
5872 : @return Operation status, always 0 (success)
5873 : */
5874 :
5875 : my_bool write_hook_for_undo_row_delete(enum translog_record_type type
5876 : __attribute__ ((unused)),
5877 : TRN *trn, MARIA_HA *tbl_info,
5878 : LSN *lsn, void *hook_arg)
5879 59439 : {
5880 59439 : MARIA_SHARE *share= tbl_info->s;
5881 59439 : share->state.state.records--;
5882 59439 : share->state.state.checksum+= *(ha_checksum *)hook_arg;
5883 59439 : return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
5884 : }
5885 :
5886 :
5887 : /**
5888 : @brief Updates "checksum" and calls the generic UNDO hook
5889 :
5890 : @return Operation status, always 0 (success)
5891 : */
5892 :
5893 : my_bool write_hook_for_undo_row_update(enum translog_record_type type
5894 : __attribute__ ((unused)),
5895 : TRN *trn, MARIA_HA *tbl_info,
5896 : LSN *lsn, void *hook_arg)
5897 9688 : {
5898 9688 : MARIA_SHARE *share= tbl_info->s;
5899 9688 : share->state.state.checksum+= *(ha_checksum *)hook_arg;
5900 9688 : return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
5901 : }
5902 :
5903 :
5904 : my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
5905 : __attribute__ ((unused)),
5906 : TRN *trn, MARIA_HA *tbl_info,
5907 : LSN *lsn, void *hook_arg)
5908 0 : {
5909 : /*
5910 : We are going to call maria_delete_all_rows(), but without logging and
5911 : syncing, as an optimization (if we crash before commit, executing the UNDO
5912 : will empty the table; if we crash after commit, we have flushed and forced the files).
5913 : Status still needs to be reset under log mutex, in case of a concurrent
5914 : checkpoint.
5915 : */
5916 0 : _ma_reset_status(tbl_info);
5917 0 : return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
5918 : }
5919 :
5920 :
5921 : /**
5922 : @brief Updates table's lsn_of_file_id.
5923 :
5924 : @return Operation status, always 0 (success)
5925 : */
5926 :
5927 : my_bool write_hook_for_file_id(enum translog_record_type type
5928 : __attribute__ ((unused)),
5929 : TRN *trn
5930 : __attribute__ ((unused)),
5931 : MARIA_HA *tbl_info,
5932 : LSN *lsn,
5933 : void *hook_arg
5934 : __attribute__ ((unused)))
5935 392 : {
5936 392 : DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
5937 392 : tbl_info->s->lsn_of_file_id= *lsn;
5938 392 : return 0;
5939 : }
5940 :
5941 :
5942 : /**
5943 : Updates transaction's rec_lsn when committing.
5944 :
5945 : A transaction writes its commit record before being committed in trnman, so
5946 : if Checkpoint happens just between the COMMIT record log write and the
5947 : commit in trnman, it will record that transaction is not committed. Assume
5948 : the transaction (trn1) did an INSERT; after the checkpoint, a second
5949 : transaction (trn2) does a DELETE of what trn1 has inserted. Then we crash;
5950 : the Checkpoint record says that trn1 was not committed, and the REDO phase
5951 : starts from the Checkpoint record's LSN. So it will not find the COMMIT
5952 : record of trn1 and will want to roll back trn1, which will fail because the
5953 : row/key it wants to delete does not exist anymore.
5954 : To avoid this, Checkpoint needs to know that the REDO phase must start
5955 : before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
5956 : record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
5957 : will know.
5958 :
5959 : @note so after commit trn->rec_lsn is a "commit LSN", which could be of
5960 : use later.
5961 :
5962 : @return Operation status, always 0 (success)
5963 : */
5964 :
5965 : my_bool write_hook_for_commit(enum translog_record_type type
5966 : __attribute__ ((unused)),
5967 : TRN *trn,
5968 : MARIA_HA *tbl_info
5969 : __attribute__ ((unused)),
5970 : LSN *lsn,
5971 : void *hook_arg
5972 : __attribute__ ((unused)))
5973 583 : {
5974 583 : trn->rec_lsn= *lsn;
5975 583 : return 0;
5976 : }
5977 :
5978 :
5979 : /***************************************************************************
5980 : Applying of REDO log records
5981 : ***************************************************************************/
5982 :
5983 : /*
5984 : Apply changes to head and tail pages
5985 :
5986 : SYNOPSIS
5987 : _ma_apply_redo_insert_row_head_or_tail()
5988 : info Maria handler
5989 : lsn LSN to put on page
5990 : page_type HEAD_PAGE or TAIL_PAGE
5991 : new_page True if this is first entry on page
5992 : header Header (without FILEID)
5993 : data Data to be put on page
5994 : data_length Length of data
5995 :
5996 : NOTE
5997 : Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
5998 : LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
5999 :
6000 : RETURN
6001 : 0 ok
6002 : # Error number
6003 : */
6004 :
6005 : uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
6006 : uint page_type,
6007 : my_bool new_page,
6008 : const uchar *header,
6009 : const uchar *data,
6010 : size_t data_length)
6011 160791 : {
6012 160791 : MARIA_SHARE *share= info->s;
6013 : pgcache_page_no_t page;
6014 : uint rownr, empty_space;
6015 160791 : uint block_size= share->block_size;
6016 : uint rec_offset;
6017 : uchar *buff, *dir;
6018 : uint result;
6019 : MARIA_PINNED_PAGE page_link;
6020 : enum pagecache_page_lock unlock_method;
6021 : enum pagecache_page_pin unpin_method;
6022 : my_off_t end_of_page;
6023 : uint error;
6024 160791 : DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
6025 :
6026 160791 : page= page_korr(header);
6027 160791 : rownr= dirpos_korr(header + PAGE_STORE_SIZE);
6028 :
6029 160791 : DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
6030 : (ulong) ma_recordpos(page, rownr),
6031 : (ulong) page, rownr, (uint) data_length));
6032 :
6033 160791 : share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6034 : STATE_NOT_MOVABLE);
6035 :
6036 160791 : end_of_page= (page + 1) * share->block_size;
6037 160791 : if (end_of_page > share->state.state.data_file_length)
6038 : {
6039 1510 : DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
6040 : (ulong) share->state.state.data_file_length,
6041 : (ulong) end_of_page));
6042 : /*
6043 : New page at end of file. Note that the test above is also positive if
6044 : data_file_length is not a multiple of block_size (system crashed while
6045 : writing the last page): in this case we just extend the last page and
6046 : fill it entirely with zeroes, then the REDO will put correct data on
6047 : it.
6048 : */
6049 1510 : unlock_method= PAGECACHE_LOCK_WRITE;
6050 1510 : unpin_method= PAGECACHE_PIN;
6051 :
6052 1510 : DBUG_ASSERT(rownr == 0 && new_page);
6053 1510 : if (rownr != 0 || !new_page)
6054 : goto crashed_file;
6055 :
6056 1510 : buff= info->keyread_buff;
6057 1510 : info->keyread_buff_used= 1;
6058 1510 : make_empty_page(info, buff, page_type, 1);
6059 1510 : empty_space= (block_size - PAGE_OVERHEAD_SIZE);
6060 1510 : rec_offset= PAGE_HEADER_SIZE;
6061 1510 : dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
6062 : }
6063 : else
6064 : {
6065 159281 : unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
6066 159281 : unpin_method= PAGECACHE_PIN_LEFT_PINNED;
6067 :
6068 159281 : share->pagecache->readwrite_flags&= ~MY_WME;
6069 159281 : buff= pagecache_read(share->pagecache, &info->dfile,
6070 : page, 0, 0,
6071 : PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
6072 : &page_link.link);
6073 159281 : share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
6074 159281 : if (!buff)
6075 : {
6076 : /* Skip errors when reading outside of file and uninitialized pages */
6077 8 : if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
6078 : my_errno != HA_ERR_WRONG_CRC))
6079 : goto err;
6080 : /* Create new page */
6081 8 : buff= pagecache_block_link_to_buffer(page_link.link);
6082 8 : buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
6083 : }
6084 159273 : else if (lsn_korr(buff) >= lsn) /* Test if already applied */
6085 : {
6086 : /* Fix bitmap, just in case */
6087 97775 : empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6088 97775 : if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6089 97775 : goto err;
6090 97775 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6091 : PAGECACHE_LOCK_WRITE_UNLOCK,
6092 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6093 : LSN_IMPOSSIBLE, 0, FALSE);
6094 97775 : DBUG_RETURN(0);
6095 : }
6096 :
6097 61506 : if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
6098 : {
6099 : /*
6100 : This is a page that has been freed before and now should be
6101 : changed to a new type.
6102 : */
6103 758 : if (!new_page)
6104 758 : goto crashed_file;
6105 758 : make_empty_page(info, buff, page_type, 0);
6106 758 : empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
6107 758 : (void) extend_directory(page_type == HEAD_PAGE ? info: 0, buff,
6108 : block_size, 0, rownr, &empty_space);
6109 758 : rec_offset= PAGE_HEADER_SIZE;
6110 758 : dir= dir_entry_pos(buff, block_size, rownr);
6111 758 : empty_space+= uint2korr(dir+2);
6112 : }
6113 : else
6114 : {
6115 60748 : uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
6116 : uint length;
6117 :
6118 60748 : DBUG_ASSERT(!new_page);
6119 60748 : dir= dir_entry_pos(buff, block_size, rownr);
6120 60748 : empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6121 :
6122 60748 : if (max_entry <= rownr)
6123 : {
6124 : /* Add directory entry first in directory and data last on page */
6125 42244 : if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff,
6126 : block_size, max_entry, rownr, &empty_space))
6127 60748 : goto crashed_file;
6128 : }
6129 60748 : if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff,
6130 : dir, rownr, block_size,
6131 : (uint) data_length, &empty_space,
6132 : &rec_offset, &length))
6133 63016 : goto crashed_file;
6134 : }
6135 : }
6136 : /* Copy data */
6137 63016 : int2store(dir+2, data_length);
6138 63016 : memcpy(buff + rec_offset, data, data_length);
6139 63016 : empty_space-= (uint) data_length;
6140 63016 : int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
6141 :
6142 : /*
6143 : If the page was not read before, write it but keep it pinned.
6144 : We don't update its LSN yet. When we have processed all REDOs for this page
6145 : in the current REDO's group, we will stamp the page with the UNDO's LSN
6146 : (if we stamped it now, the next REDO in this group for this page would be
6147 : skipped) and unpin it then.
6148 : */
6149 63016 : result= 0;
6150 63016 : if (unlock_method == PAGECACHE_LOCK_WRITE &&
6151 : pagecache_write(share->pagecache,
6152 : &info->dfile, page, 0,
6153 : buff, PAGECACHE_PLAIN_PAGE,
6154 : unlock_method, unpin_method,
6155 : PAGECACHE_WRITE_DELAY, &page_link.link,
6156 : LSN_IMPOSSIBLE))
6157 0 : result= my_errno;
6158 :
6159 : /* Fix bitmap */
6160 63016 : if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6161 63016 : goto err;
6162 :
6163 63016 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6164 63016 : page_link.changed= 1;
6165 63016 : push_dynamic(&info->pinned_pages, (void*) &page_link);
6166 :
6167 : /*
6168 : Data page and bitmap page are in place, we can update data_file_length in
6169 : case we extended the file. We could not do it earlier: bitmap code tests
6170 : data_file_length to know if it has to create a new page or not.
6171 : */
6172 63016 : set_if_bigger(share->state.state.data_file_length, end_of_page);
6173 63016 : DBUG_RETURN(result);
6174 :
6175 0 : crashed_file:
6176 0 : my_errno= HA_ERR_WRONG_IN_RECORD;
6177 0 : err:
6178 0 : error= my_errno;
6179 0 : if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
6180 0 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6181 : PAGECACHE_LOCK_WRITE_UNLOCK,
6182 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6183 : LSN_IMPOSSIBLE, 0, FALSE);
6184 0 : _ma_mark_file_crashed(share);
6185 0 : DBUG_ASSERT(0); /* catch recovery errors early */
6186 : DBUG_RETURN((my_errno= error));
6187 : }
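
/*
  Sketch of the idempotency test shared by the REDO appliers in this file: a
  REDO is skipped when the page already carries an LSN at least as new as the
  record being applied (the LSN is stored at the start of every data page).
  The snippet below is a simplified outline, not the exact code:

    buff= pagecache_read(share->pagecache, &info->dfile, page, 0, 0,
                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                         &page_link.link);
    if (lsn_korr(buff) >= lsn)
    {
      // Already applied: refresh the bitmap if needed, then unlock and unpin
    }
    else
    {
      // Apply the change and keep the page pinned; the caller stamps the page
      // with the UNDO's LSN once the whole REDO group has been processed
    }
*/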
6188 :
6189 :
6190 : /*
6191 : Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
6192 :
6193 : SYNOPSIS
6194 : _ma_apply_redo_purge_row_head_or_tail()
6195 : info Maria handler
6196 : lsn LSN to put on page
6197 : page_type HEAD_PAGE or TAIL_PAGE
6198 : header Header (without FILEID)
6199 :
6200 : NOTES
6201 : This function is very similar to delete_head_or_tail()
6202 :
6203 : RETURN
6204 : 0 ok
6205 : # Error number
6206 : */
6207 :
6208 : uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
6209 : uint page_type,
6210 : const uchar *header)
6211 118894 : {
6212 118894 : MARIA_SHARE *share= info->s;
6213 : pgcache_page_no_t page;
6214 : uint rownr, empty_space;
6215 118894 : uint block_size= share->block_size;
6216 : uchar *buff;
6217 : int result;
6218 : uint error;
6219 : MARIA_PINNED_PAGE page_link;
6220 118894 : DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");
6221 :
6222 118894 : page= page_korr(header);
6223 118894 : rownr= dirpos_korr(header+PAGE_STORE_SIZE);
6224 118894 : DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
6225 : (ulong) ma_recordpos(page, rownr),
6226 : (ulong) page, rownr));
6227 :
6228 118894 : share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6229 : STATE_NOT_MOVABLE);
6230 :
6231 118894 : if (!(buff= pagecache_read(share->pagecache, &info->dfile,
6232 : page, 0, 0,
6233 : PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
6234 : &page_link.link)))
6235 118894 : goto err;
6236 :
6237 118894 : if (lsn_korr(buff) >= lsn)
6238 : {
6239 : /*
6240 : Already applied
6241 : Note that in case the page is no longer a head or tail page,
6242 : a future redo will fix the bitmap.
6243 : */
6244 63922 : if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
6245 : {
6246 3385 : empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
6247 3385 : if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
6248 : empty_space))
6249 63922 : goto err;
6250 : }
6251 63922 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6252 : PAGECACHE_LOCK_WRITE_UNLOCK,
6253 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6254 : LSN_IMPOSSIBLE, 0, FALSE);
6255 63922 : DBUG_RETURN(0);
6256 : }
6257 :
6258 54972 : DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);
6259 :
6260 54972 : if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
6261 : {
6262 0 : my_errno= HA_ERR_WRONG_IN_RECORD;
6263 0 : goto err;
6264 : }
6265 :
6266 54972 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6267 54972 : page_link.changed= 1;
6268 54972 : push_dynamic(&info->pinned_pages, (void*) &page_link);
6269 :
6270 54972 : result= 0;
6271 : /* This will work even if the page was marked as UNALLOCATED_PAGE */
6272 54972 : if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6273 0 : result= my_errno;
6274 :
6275 54972 : DBUG_RETURN(result);
6276 :
6277 0 : err:
6278 0 : error= my_errno;
6279 0 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6280 : PAGECACHE_LOCK_WRITE_UNLOCK,
6281 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6282 : LSN_IMPOSSIBLE, 0, FALSE);
6283 0 : _ma_mark_file_crashed(share);
6284 0 : DBUG_ASSERT(0);
6285 : DBUG_RETURN((my_errno= error));
6286 :
6287 : }
6288 :
6289 :
6290 : /**
6291 : @brief Apply LOGREC_REDO_FREE_BLOCKS
6292 :
6293 : @param info Maria handler
6294 : @param header Header (without FILEID)
6295 :
6296 : @note It marks the pages free in the bitmap
6297 :
6298 : @return Operation status
6299 : @retval 0 OK
6300 : @retval 1 Error
6301 : */
6302 :
6303 : uint _ma_apply_redo_free_blocks(MARIA_HA *info,
6304 : LSN lsn __attribute__((unused)),
6305 : const uchar *header)
6306 5544 : {
6307 5544 : MARIA_SHARE *share= info->s;
6308 : uint ranges;
6309 5544 : DBUG_ENTER("_ma_apply_redo_free_blocks");
6310 :
6311 5544 : share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6312 : STATE_NOT_MOVABLE);
6313 :
6314 5544 : ranges= pagerange_korr(header);
6315 5544 : header+= PAGERANGE_STORE_SIZE;
6316 5544 : DBUG_ASSERT(ranges > 0);
6317 :
6318 11088 : while (ranges--)
6319 : {
6320 : my_bool res;
6321 : uint page_range;
6322 : pgcache_page_no_t page, start_page;
6323 :
6324 5544 : start_page= page= page_korr(header);
6325 5544 : header+= PAGE_STORE_SIZE;
6326 : /* Page range may have this bit set to indicate a tail page */
6327 5544 : page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
6328 5544 : DBUG_ASSERT(page_range > 0);
6329 :
6330 5544 : header+= PAGERANGE_STORE_SIZE;
6331 :
6332 5544 : DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range));
6333 :
6334 : /** @todo leave bitmap lock to the bitmap code... */
6335 5544 : pthread_mutex_lock(&share->bitmap.bitmap_lock);
6336 5544 : res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
6337 : page_range);
6338 5544 : pthread_mutex_unlock(&share->bitmap.bitmap_lock);
6339 5544 : if (res)
6340 : {
6341 0 : _ma_mark_file_crashed(share);
6342 0 : DBUG_ASSERT(0);
6343 : DBUG_RETURN(res);
6344 : }
6345 : }
6346 5544 : DBUG_RETURN(0);
6347 : }
6348 :
6349 :
6350 : /**
6351 : @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
6352 :
6353 : @param info Maria handler
6354 : @param header Header (without FILEID)
6355 :
6356 : @note It marks the page free in the bitmap, and sets the directory's count
6357 : to 0.
6358 :
6359 : @return Operation status
6360 : @retval 0 OK
6361 : @retval 1 Error
6362 : */
6363 :
6364 : uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
6365 : const uchar *header)
6366 4624 : {
6367 4624 : MARIA_SHARE *share= info->s;
6368 : uchar *buff;
6369 : pgcache_page_no_t page;
6370 : MARIA_PINNED_PAGE page_link;
6371 : my_bool res;
6372 4624 : DBUG_ENTER("_ma_apply_redo_free_head_or_tail");
6373 :
6374 4624 : share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6375 : STATE_NOT_MOVABLE);
6376 :
6377 4624 : page= page_korr(header);
6378 :
6379 4624 : if (!(buff= pagecache_read(share->pagecache,
6380 : &info->dfile,
6381 : page, 0, 0,
6382 : PAGECACHE_PLAIN_PAGE,
6383 : PAGECACHE_LOCK_WRITE, &page_link.link)))
6384 : {
6385 0 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6386 : PAGECACHE_LOCK_WRITE_UNLOCK,
6387 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6388 : LSN_IMPOSSIBLE, 0, FALSE);
6389 0 : goto err;
6390 : }
6391 4624 : if (lsn_korr(buff) >= lsn)
6392 : {
6393 : /* Already applied */
6394 2549 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6395 : PAGECACHE_LOCK_WRITE_UNLOCK,
6396 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6397 : LSN_IMPOSSIBLE, 0, FALSE);
6398 : }
6399 : else
6400 : {
6401 2075 : buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
6402 : #ifdef IDENTICAL_PAGES_AFTER_RECOVERY
6403 : {
6404 : uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
6405 : uchar *dir= dir_entry_pos(buff, share->block_size,
6406 : number_of_records-1);
6407 : buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;
6408 : bzero(dir, number_of_records * DIR_ENTRY_SIZE);
6409 : }
6410 : #endif
6411 :
6412 2075 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6413 2075 : page_link.changed= 1;
6414 2075 : push_dynamic(&info->pinned_pages, (void*) &page_link);
6415 : }
6416 : /** @todo leave bitmap lock to the bitmap code... */
6417 4624 : pthread_mutex_lock(&share->bitmap.bitmap_lock);
6418 4624 : res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
6419 4624 : pthread_mutex_unlock(&share->bitmap.bitmap_lock);
6420 4624 : if (res)
6421 4624 : goto err;
6422 4624 : DBUG_RETURN(0);
6423 :
6424 0 : err:
6425 0 : _ma_mark_file_crashed(share);
6426 0 : DBUG_ASSERT(0);
6427 : DBUG_RETURN(1);
6428 : }
6429 :
6430 :
6431 : /**
6432 : @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
6433 :
6434 : @param info Maria handler
6435 :   @param  lsn             LSN to put on pages
6436 : @param header Header (with FILEID)
6437 : @param redo_lsn REDO record's LSN
6438 : @param[out] number_of_blobs Number of blobs found in log record
6439 : @param[out] number_of_ranges Number of ranges found
6440 : @param[out] first_page First page touched
6441 : @param[out] last_page Last page touched
6442 :
6443 :   @note Writes full pages (full head & blob pages)
6444 :
6445 : @return Operation status
6446 : @retval 0 OK
6447 : @retval !=0 Error
6448 : */
6449 :
6450 : uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
6451 : LSN lsn, const uchar *header,
6452 : LSN redo_lsn,
6453 : uint * const number_of_blobs,
6454 : uint * const number_of_ranges,
6455 : pgcache_page_no_t * const first_page,
6456 : pgcache_page_no_t * const last_page)
6457 6269 : {
6458 6269 : MARIA_SHARE *share= info->s;
6459 : const uchar *data;
6460 6269 : uint data_size= FULL_PAGE_SIZE(share->block_size);
6461 : uint blob_count, ranges;
6462 : uint16 sid;
6463 6269 : pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
6464 6269 : DBUG_ENTER("_ma_apply_redo_insert_row_blobs");
6465 :
6466 6269 : share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6467 : STATE_NOT_MOVABLE);
6468 :
6469 6269 : sid= fileid_korr(header);
6470 6269 : header+= FILEID_STORE_SIZE;
6471 6269 : *number_of_ranges= ranges= pagerange_korr(header);
6472 6269 : header+= PAGERANGE_STORE_SIZE;
6473 6269 : *number_of_blobs= blob_count= pagerange_korr(header);
6474 6269 : header+= PAGERANGE_STORE_SIZE;
6475 6269 : DBUG_ASSERT(ranges >= blob_count);
6476 :
6477 6269 : data= (header + ranges * ROW_EXTENT_SIZE +
6478 : blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));
6479 :
6480 18807 : while (blob_count--)
6481 : {
6482 : uint sub_ranges, empty_space;
6483 :
6484 6269 : sub_ranges= uint2korr(header);
6485 6269 : header+= SUB_RANGE_SIZE;
6486 6269 : empty_space= uint2korr(header);
6487 6269 : header+= BLOCK_FILLER_SIZE;
6488 6269 : DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
6489 6269 : ranges-= sub_ranges;
6490 :
6491 18807 : while (sub_ranges--)
6492 : {
6493 : uint i;
6494 : uint res;
6495 : uint page_range;
6496 : pgcache_page_no_t page, start_page;
6497 : uchar *buff;
6498 :
6499 6269 : start_page= page= page_korr(header);
6500 6269 : header+= PAGE_STORE_SIZE;
6501 6269 : page_range= pagerange_korr(header);
6502 6269 : header+= PAGERANGE_STORE_SIZE;
6503 :
6504 21678 : for (i= page_range; i-- > 0 ; page++)
6505 : {
6506 : MARIA_PINNED_PAGE page_link;
6507 : enum pagecache_page_lock unlock_method;
6508 : enum pagecache_page_pin unpin_method;
6509 : uint length;
6510 :
6511 15409 : set_if_smaller(first_page2, page);
6512 15409 : set_if_bigger(last_page2, page);
6513 15409 : if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
6514 15409 : continue;
6515 :
6516 15409 : if (((page + 1) * share->block_size) >
6517 : share->state.state.data_file_length)
6518 : {
6519 : /* New page or half written page at end of file */
6520 2619 : DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
6521 : (ulong) share->state.state.data_file_length,
6522 : (ulong) ((page + 1 ) * share->block_size)));
6523 2619 : share->state.state.data_file_length= (page + 1) * share->block_size;
6524 2619 : buff= info->keyread_buff;
6525 2619 : info->keyread_buff_used= 1;
6526 2619 : make_empty_page(info, buff, BLOB_PAGE, 0);
6527 2619 : unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
6528 2619 : unpin_method= PAGECACHE_PIN_LEFT_UNPINNED;
6529 : }
6530 : else
6531 : {
6532 12790 : share->pagecache->readwrite_flags&= ~MY_WME;
6533 12790 : buff= pagecache_read(share->pagecache,
6534 : &info->dfile,
6535 : page, 0, 0,
6536 : PAGECACHE_PLAIN_PAGE,
6537 : PAGECACHE_LOCK_WRITE, &page_link.link);
6538 12790 : share->pagecache->readwrite_flags= share->pagecache->
6539 : org_readwrite_flags;
6540 12790 : if (!buff)
6541 : {
6542 1495 : if (my_errno != HA_ERR_FILE_TOO_SHORT &&
6543 : my_errno != HA_ERR_WRONG_CRC)
6544 : {
6545 :             /* A real read error, not just a read past the end of the file */
6546 0 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6547 : PAGECACHE_LOCK_WRITE_UNLOCK,
6548 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6549 : LSN_IMPOSSIBLE, 0, FALSE);
6550 0 : goto err;
6551 : }
6552 : /*
6553 :             The physical file was too short; create a new page. It can
6554 :             happen that recovery started with a file of N pages, wrote page
6555 :             N+2 into the pagecache (which increased data_file_length but not
6556 :             the physical file length), and now reads page N+1: that read fails.
6557 : */
6558 1495 : buff= pagecache_block_link_to_buffer(page_link.link);
6559 1495 : make_empty_page(info, buff, BLOB_PAGE, 0);
6560 : }
6561 : else
6562 : {
6563 : #ifndef DBUG_OFF
6564 11295 : uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
6565 : #endif
6566 11295 : if (lsn_korr(buff) >= lsn)
6567 : {
6568 : /* Already applied */
6569 9428 : DBUG_PRINT("info", ("already applied %llu >= %llu",
6570 : lsn_korr(buff), lsn));
6571 9428 : pagecache_unlock_by_link(share->pagecache, page_link.link,
6572 : PAGECACHE_LOCK_WRITE_UNLOCK,
6573 : PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6574 : LSN_IMPOSSIBLE, 0, FALSE);
6575 9428 : continue;
6576 : }
6577 1867 : DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
6578 : (found_page_type == (uchar) UNALLOCATED_PAGE));
6579 : }
6580 3362 : unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
6581 3362 : unpin_method= PAGECACHE_UNPIN;
6582 : }
6583 :
6584 : /*
6585 :         Blob pages are never updated twice in the same redo-undo chain,
6586 :         so it is safe to update the LSN for them here
6587 : */
6588 5981 : lsn_store(buff, lsn);
6589 5981 : buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
6590 :
6591 5981 : length= data_size;
6592 5981 : if (i == 0 && sub_ranges == 0)
6593 : {
6594 : /*
6595 : Last page may be only partly filled. We zero the rest, like
6596 : write_full_pages() does.
6597 : */
6598 2392 : length-= empty_space;
6599 2392 : bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
6600 : empty_space);
6601 : }
6602 5981 : memcpy(buff+ PAGE_TYPE_OFFSET + 1, data, length);
6603 5981 : data+= length;
6604 5981 : if (pagecache_write(share->pagecache,
6605 : &info->dfile, page, 0,
6606 : buff, PAGECACHE_PLAIN_PAGE,
6607 : unlock_method, unpin_method,
6608 : PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
6609 15409 : goto err;
6610 : }
6611 : /** @todo leave bitmap lock to the bitmap code... */
6612 6269 : pthread_mutex_lock(&share->bitmap.bitmap_lock);
6613 6269 : res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, start_page,
6614 : page_range);
6615 6269 : pthread_mutex_unlock(&share->bitmap.bitmap_lock);
6616 6269 : if (res)
6617 12538 : goto err;
6618 : }
6619 : }
6620 6269 : *first_page= first_page2;
6621 6269 : *last_page= last_page2;
6622 6269 : DBUG_RETURN(0);
6623 :
6624 0 : err:
6625 0 : _ma_mark_file_crashed(share);
6626 0 : DBUG_ASSERT(0);
6627 : DBUG_RETURN(1);
6628 : }
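/*
  Illustration (not engine code): _ma_apply_redo_insert_row_blobs() above
  walks a header laid out as

    fileid, number_of_ranges, number_of_blobs,
    per blob:      sub_ranges, empty_space on the blob's last page,
    per sub range: start page, page count,

  followed by the full-page data, with the last page of each blob
  zero-padded by empty_space.  The guarded-out sketch mirrors the 'data='
  computation that locates the start of the page data; the sizes are
  hypothetical stand-ins for the engine's ROW_EXTENT_SIZE, SUB_RANGE_SIZE
  and BLOCK_FILLER_SIZE constants.
*/
#if 0                                   /* example sketch, never compiled */
#include <stddef.h>

enum { EX_ROW_EXTENT_SIZE= 7, EX_SUB_RANGE_SIZE= 2, EX_BLOCK_FILLER_SIZE= 2 };

/* Offset, counted from just after the two range counters, of the first
   byte of full-page data. */
static size_t ex_page_data_offset(unsigned ranges, unsigned blob_count)
{
  return (size_t) ranges * EX_ROW_EXTENT_SIZE +
         (size_t) blob_count * (EX_SUB_RANGE_SIZE + EX_BLOCK_FILLER_SIZE);
}
#endif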
6629 :
6630 :
6631 : /****************************************************************************
6632 : Applying of UNDO entries
6633 : ****************************************************************************/
6634 :
6635 : /** Execute undo of a row insert (delete the inserted row) */
6636 :
6637 : my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
6638 : const uchar *header)
6639 31819 : {
6640 : pgcache_page_no_t page;
6641 : uint rownr;
6642 : uchar *buff;
6643 31819 : my_bool res= 1;
6644 : MARIA_PINNED_PAGE page_link;
6645 31819 : MARIA_SHARE *share= info->s;
6646 : ha_checksum checksum;
6647 : LSN lsn;
6648 31819 : DBUG_ENTER("_ma_apply_undo_row_insert");
6649 :
6650 31819 : page= page_korr(header);
6651 31819 : header+= PAGE_STORE_SIZE;
6652 31819 : rownr= dirpos_korr(header);
6653 31819 : header+= DIRPOS_STORE_SIZE;
6654 31819 : DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
6655 : (ulong) ma_recordpos(page, rownr),
6656 : (ulong) page, rownr));
6657 :
6658 31819 : buff= pagecache_read(share->pagecache,
6659 : &info->dfile, page, 0,
6660 : 0, share->page_type,
6661 : PAGECACHE_LOCK_WRITE,
6662 : &page_link.link);
6663 31819 : page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6664 31819 : page_link.changed= buff != 0;
6665 31819 : push_dynamic(&info->pinned_pages, (void*) &page_link);
6666 31819 : if (!buff)
6667 31819 : goto err;
6668 :
6669 31819 : if (read_row_extent_info(info, buff, rownr))
6670 31819 : goto err;
6671 :
6672 31819 : _ma_bitmap_flushable(info, 1);
6673 31819 : if (delete_head_or_tail(info, page, rownr, 1, 1) ||
6674 : delete_tails(info, info->cur_row.tail_positions))
6675 : goto err;
6676 :
6677 31819 : if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
6678 31819 : goto err;
6679 :
6680 31819 : checksum= 0;
6681 31819 : if (share->calc_checksum)
6682 31819 : checksum= (ha_checksum) 0 - ha_checksum_korr(header);
6683 31819 : info->last_auto_increment= ~ (ulonglong) 0;
6684 31819 : if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
6685 : share->calc_checksum != 0, checksum, &lsn, (void*) 0))
6686 31819 : goto err;
6687 :
6688 31819 : res= 0;
6689 31819 : err:
6690 31819 : if (info->non_flushable_state)
6691 31819 : _ma_bitmap_flushable(info, -1);
6692 31819 : _ma_unpin_all_pages_and_finalize_row(info, lsn);
6693 31819 : DBUG_RETURN(res);
6694 : }
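/*
  Illustration (not engine code): the 'checksum= (ha_checksum) 0 -
  ha_checksum_korr(header)' line above relies on unsigned wrap-around: the
  live table checksum is maintained by adding per-row checksums, so adding
  the two's-complement negation of a row's checksum removes that row's
  contribution.  The guarded-out sketch shows the cancellation with a
  32-bit stand-in for ha_checksum.
*/
#if 0                                   /* example sketch, never compiled */
#include <stdint.h>
#include <assert.h>

static void ex_checksum_cancellation(void)
{
  uint32_t table_sum= 0;
  uint32_t row_sum= 0xDEADBEEFu;        /* arbitrary per-row checksum */

  table_sum+= row_sum;                  /* row inserted */
  table_sum+= (uint32_t) 0 - row_sum;   /* undo of the insert */
  assert(table_sum == 0);               /* contribution fully removed */
}
#endif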
6695 :
6696 :
6697 : /** Execute undo of a row delete (insert the row back where it was) */
6698 :
6699 : my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
6700 : const uchar *header, size_t header_length
6701 : __attribute__((unused)))
6702 13914 : {
6703 13914 : MARIA_SHARE *share= info->s;
6704 : MARIA_ROW row;
6705 : MARIA_COLUMNDEF *column, *end_column;
6706 : MARIA_BITMAP_BLOCKS *blocks;
6707 : struct st_row_pos_info row_pos;
6708 : uchar *record;
6709 : const uchar *null_bits, *field_length_data, *extent_info;
6710 : pgcache_page_no_t page;
6711 : ulong *blob_lengths;
6712 : uint *null_field_lengths, extent_count, rownr, length_on_head_page;
6713 13914 : DBUG_ENTER("_ma_apply_undo_row_delete");
6714 :
6715 : /*
6716 :     Use the current row as a base; we need to make a copy, as we will
6717 :     change some buffers to point directly into 'header'.
6718 : */
6719 13914 : memcpy(&row, &info->cur_row, sizeof(row));
6720 :
6721 13914 : page= page_korr(header);
6722 13914 : header+= PAGE_STORE_SIZE;
6723 13914 : rownr= dirpos_korr(header);
6724 13914 : header+= DIRPOS_STORE_SIZE;
6725 13914 : length_on_head_page= uint2korr(header);
6726 13914 : header+= 2;
6727 13914 : extent_count= pagerange_korr(header);
6728 13914 : header+= PAGERANGE_STORE_SIZE;
6729 13914 : DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
6730 : (ulong) ma_recordpos(page, rownr),
6731 : (ulong) page, rownr));
6732 :
6733 13914 : if (share->calc_checksum)
6734 : {
6735 : /*
6736 : We extract the checksum delta here, saving a recomputation in
6737 : allocate_and_write_block_record(). It's only an optimization.
6738 : */
6739 13914 : row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
6740 13914 : header+= HA_CHECKSUM_STORE_SIZE;
6741 : }
6742 13914 : extent_info= header;
6743 13914 : header+= extent_count * ROW_EXTENT_SIZE;
6744 :
6745 13914 : null_field_lengths= row.null_field_lengths;
6746 13914 : blob_lengths= row.blob_lengths;
6747 :
6748 : /*
6749 : Fill in info->cur_row with information about the row, like in
6750 : calc_record_size(), to be used by write_block_record()
6751 : */
6752 :
6753 13914 : row.normal_length= row.char_length= row.varchar_length=
6754 : row.blob_length= row.extents_count= row.field_lengths_length= 0;
6755 :
6756 13914 : null_bits= header;
6757 13914 : header+= share->base.null_bytes;
6758 : /* This will not be changed */
6759 13914 : row.empty_bits= (uchar*) header;
6760 13914 : header+= share->base.pack_bytes;
6761 13914 : if (share->base.max_field_lengths)
6762 : {
6763 13914 : row.field_lengths_length= uint2korr(header);
6764 13914 : row.field_lengths= (uchar*) header + 2 ;
6765 13914 : header+= 2 + row.field_lengths_length;
6766 : }
6767 13914 : if (share->base.blobs)
6768 7286 : row.blob_length= ma_get_length(&header);
6769 :
6770 :   /* We need to build up a record (without blobs) in a temporary buffer */
6771 13914 : if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
6772 0 : DBUG_RETURN(1);
6773 :
6774 13914 : memcpy(record, null_bits, share->base.null_bytes);
6775 :
6776 : /* Copy field information from header to record */
6777 :
6778 : /* Handle constant length fields that are always present */
6779 : for (column= share->columndef,
6780 13914 : end_column= column+ share->base.fixed_not_null_fields;
6781 82560 : column < end_column;
6782 54732 : column++)
6783 : {
6784 54732 : memcpy(record + column->offset, header, column->length);
6785 54732 : header+= column->length;
6786 : }
6787 :
6788 : /* Handle NULL fields and CHAR/VARCHAR fields */
6789 13914 : field_length_data= row.field_lengths;
6790 13914 : for (end_column= share->columndef + share->base.fields;
6791 62678 : column < end_column;
6792 34850 : column++, null_field_lengths++)
6793 : {
6794 34850 : if ((record[column->null_pos] & column->null_bit) ||
6795 : row.empty_bits[column->empty_pos] & column->empty_bit)
6796 : {
6797 6599 : if (column->type != FIELD_BLOB)
6798 207 : *null_field_lengths= 0;
6799 : else
6800 6392 : *blob_lengths++= 0;
6801 6599 : if (share->calc_checksum)
6802 6599 : bfill(record + column->offset, column->fill_length,
6803 : column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
6804 : continue;
6805 : }
6806 28251 : switch (column->type) {
6807 : case FIELD_CHECK:
6808 : case FIELD_NORMAL: /* Fixed length field */
6809 : case FIELD_ZERO:
6810 : case FIELD_SKIP_PRESPACE: /* Not packed */
6811 : case FIELD_SKIP_ZERO: /* Fixed length field */
6812 13634 : row.normal_length+= column->length;
6813 13634 : *null_field_lengths= column->length;
6814 13634 : memcpy(record + column->offset, header, column->length);
6815 13634 : header+= column->length;
6816 13634 : break;
6817 : case FIELD_SKIP_ENDSPACE: /* CHAR */
6818 : {
6819 : uint length;
6820 13591 : if (column->length <= 255)
6821 13591 : length= (uint) *field_length_data++;
6822 : else
6823 : {
6824 0 : length= uint2korr(field_length_data);
6825 0 : field_length_data+= 2;
6826 : }
6827 13591 : row.char_length+= length;
6828 13591 : *null_field_lengths= length;
6829 13591 : memcpy(record + column->offset, header, length);
6830 13591 : if (share->calc_checksum)
6831 13591 : bfill(record + column->offset + length, (column->length - length),
6832 : ' ');
6833 13591 : header+= length;
6834 13591 : break;
6835 : }
6836 : case FIELD_VARCHAR:
6837 : {
6838 : uint length;
6839 0 : uchar *field_pos= record + column->offset;
6840 :
6841 : /* 256 is correct as this includes the length uchar */
6842 0 : if (column->fill_length == 1)
6843 : {
6844 0 : field_pos[0]= *field_length_data;
6845 0 : length= (uint) *field_length_data;
6846 : }
6847 : else
6848 : {
6849 0 : field_pos[0]= field_length_data[0];
6850 0 : field_pos[1]= field_length_data[1];
6851 0 : length= uint2korr(field_length_data);
6852 : }
6853 0 : field_length_data+= column->fill_length;
6854 0 : field_pos+= column->fill_length;
6855 0 : row.varchar_length+= length;
6856 0 : *null_field_lengths= length;
6857 0 : memcpy(field_pos, header, length);
6858 0 : header+= length;
6859 0 : break;
6860 : }
6861 : case FIELD_BLOB:
6862 : {
6863 : /* Copy length of blob and pointer to blob data to record */
6864 1026 : uchar *field_pos= record + column->offset;
6865 1026 : uint size_length= column->length - portable_sizeof_char_ptr;
6866 1026 : ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
6867 :
6868 1026 : memcpy(field_pos, field_length_data, size_length);
6869 1026 : field_length_data+= size_length;
6870 1026 : memcpy(field_pos + size_length, &header, sizeof(&header));
6871 1026 : header+= blob_length;
6872 1026 : *blob_lengths++= blob_length;
6873 1026 : break;
6874 : }
6875 : default:
6876 0 : DBUG_ASSERT(0);
6877 : }
6878 : }
6879 13914 : row.head_length= (info->row_base_length +
6880 : share->base.fixed_not_null_fields_length +
6881 : row.field_lengths_length +
6882 : size_to_store_key_length(row.field_lengths_length) +
6883 : row.normal_length +
6884 : row.char_length + row.varchar_length);
6885 13914 : row.total_length= (row.head_length + row.blob_length);
6886 13914 : if (row.total_length < share->base.min_block_length)
6887 24 : row.total_length= share->base.min_block_length;
6888 :
6889 : /*
6890 :     The row is now generated. Next we need to insert the record on the
6891 :     original pages, with the original size on each page.
6892 : */
6893 :
6894 13914 : _ma_bitmap_flushable(info, 1);
6895 : /* Change extent information to be usable by write_block_record() */
6896 13914 : blocks= &row.insert_blocks;
6897 13914 : if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
6898 13914 : goto err;
6899 13914 : blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
6900 : &share->bitmap,
6901 : page);
6902 13914 : blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;
6903 :
6904 : /* Read head page and allocate data for rowid */
6905 13914 : if (get_rowpos_in_head_or_tail_page(info, blocks->block,
6906 : info->buff,
6907 : length_on_head_page,
6908 : HEAD_PAGE, PAGECACHE_LOCK_WRITE,
6909 : rownr, &row_pos))
6910 13914 : goto err;
6911 :
6912 13914 : if (share->calc_checksum)
6913 : {
6914 13914 : DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
6915 : }
6916 13914 : if (write_block_record(info, (uchar*) 0, record, &row,
6917 : blocks, blocks->block->org_bitmap_value != 0,
6918 : &row_pos, undo_lsn, 0))
6919 13914 : goto err;
6920 :
6921 13914 : my_free(record, MYF(0));
6922 13914 : DBUG_RETURN(0);
6923 :
6924 0 : err:
6925 0 : if (info->non_flushable_state)
6926 0 : _ma_bitmap_flushable(info, -1);
6927 0 : _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
6928 0 : my_free(record, MYF(0));
6929 0 : DBUG_RETURN(1);
6930 : }
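/*
  Illustration (not engine code): when rebuilding the deleted row above, a
  FIELD_SKIP_ENDSPACE (CHAR) column is stored in the undo header as a
  length plus the non-space prefix only; the rest of the column is refilled
  with spaces so that the recomputed checksum matches the original row.
  The guarded-out sketch shows just that restore step; the function name
  and parameters are hypothetical.
*/
#if 0                                   /* example sketch, never compiled */
#include <string.h>

/* Assumes packed_length <= column_length, as guaranteed by the log data. */
static void ex_restore_char_field(unsigned char *record_field,
                                  unsigned column_length,
                                  const unsigned char *packed,
                                  unsigned packed_length)
{
  memcpy(record_field, packed, packed_length);          /* stored prefix */
  memset(record_field + packed_length, ' ',
         column_length - packed_length);                /* space padding */
}
#endif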
6931 :
6932 :
6933 : /**
6934 : Execute undo of a row update
6935 :
6936 : @fn _ma_apply_undo_row_update()
6937 :
6938 : @return Operation status
6939 : @retval 0 OK
6940 : @retval 1 Error
6941 : */
6942 :
6943 : my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
6944 : const uchar *header,
6945 : size_t header_length
6946 : __attribute__((unused)))
6947 1671 : {
6948 1671 : MARIA_SHARE *share= info->s;
6949 : MARIA_RECORD_POS record_pos;
6950 : const uchar *field_length_data, *field_length_data_end, *extent_info;
6951 : uchar *current_record, *orig_record;
6952 : pgcache_page_no_t page;
6953 : ha_checksum checksum_delta;
6954 : uint rownr, field_length_header, extent_count, length_on_head_page;
6955 1671 : int error= 1;
6956 1671 : DBUG_ENTER("_ma_apply_undo_row_update");
6957 1671 : LINT_INIT(checksum_delta);
6958 :
6959 1671 : page= page_korr(header);
6960 1671 : header+= PAGE_STORE_SIZE;
6961 1671 : rownr= dirpos_korr(header);
6962 1671 : header+= DIRPOS_STORE_SIZE;
6963 1671 : record_pos= ma_recordpos(page, rownr);
6964 1671 : DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
6965 : (ulong) record_pos, (ulong) page, rownr));
6966 :
6967 1671 : if (share->calc_checksum)
6968 : {
6969 1671 : checksum_delta= ha_checksum_korr(header);
6970 1671 : header+= HA_CHECKSUM_STORE_SIZE;
6971 : }
6972 1671 : length_on_head_page= uint2korr(header);
6973 1671 : header+= 2;
6974 1671 : extent_count= pagerange_korr(header);
6975 1671 : header+= PAGERANGE_STORE_SIZE;
6976 1671 : extent_info= header;
6977 1671 : header+= extent_count * ROW_EXTENT_SIZE;
6978 :
6979 : /*
6980 : Set header to point to old field values, generated by
6981 : fill_update_undo_parts()
6982 : */
6983 1671 : field_length_header= ma_get_length(&header);
6984 1671 : field_length_data= (uchar*) header;
6985 1671 : header+= field_length_header;
6986 1671 : field_length_data_end= header;
6987 :
6988 : /* Allocate buffer for current row & original row */
6989 1671 : if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
6990 0 : DBUG_RETURN(1);
6991 1671 : orig_record= current_record+ share->base.reclength;
6992 :
6993 : /* Read current record */
6994 1671 : if (_ma_read_block_record(info, current_record, record_pos))
6995 1671 : goto err;
6996 :
6997 1671 : if (*field_length_data == 255)
6998 : {
6999 :     /* The null bitmap changed; the original null bits follow in the header */
7000 0 : field_length_data++;
7001 0 : memcpy(orig_record, header, share->base.null_bytes);
7002 0 : header+= share->base.null_bytes;
7003 : }
7004 : else
7005 1671 : memcpy(orig_record, current_record, share->base.null_bytes);
7006 1671 : bitmap_clear_all(&info->changed_fields);
7007 :
7008 11878 : while (field_length_data < field_length_data_end)
7009 : {
7010 8536 : uint field_nr= ma_get_length(&field_length_data), field_length;
7011 8536 : MARIA_COLUMNDEF *column= share->columndef + field_nr;
7012 8536 : uchar *orig_field_pos= orig_record + column->offset;
7013 :
7014 8536 : bitmap_set_bit(&info->changed_fields, field_nr);
7015 8536 : if (field_nr >= share->base.fixed_not_null_fields)
7016 : {
7017 3205 : if (!(field_length= ma_get_length(&field_length_data)))
7018 : {
7019 : /* Null field or empty field */
7020 87 : bfill(orig_field_pos, column->fill_length,
7021 : column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
7022 87 : continue;
7023 : }
7024 : }
7025 : else
7026 5331 : field_length= column->length;
7027 :
7028 8449 : switch (column->type) {
7029 : case FIELD_CHECK:
7030 : case FIELD_NORMAL: /* Fixed length field */
7031 : case FIELD_ZERO:
7032 : case FIELD_SKIP_PRESPACE: /* Not packed */
7033 5331 : memcpy(orig_field_pos, header, column->length);
7034 5331 : header+= column->length;
7035 5331 : break;
7036 : case FIELD_SKIP_ZERO: /* Number */
7037 : case FIELD_SKIP_ENDSPACE: /* CHAR */
7038 : {
7039 : uint diff;
7040 2779 : memcpy(orig_field_pos, header, field_length);
7041 2779 : if ((diff= (column->length - field_length)))
7042 1460 : bfill(orig_field_pos + column->length - diff, diff,
7043 : column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
7044 2779 : header+= field_length;
7045 : }
7046 2779 : break;
7047 : case FIELD_VARCHAR:
7048 0 : if (column->length <= 256)
7049 : {
7050 0 : *orig_field_pos++= (uchar) field_length;
7051 : }
7052 : else
7053 : {
7054 0 : int2store(orig_field_pos, field_length);
7055 0 : orig_field_pos+= 2;
7056 : }
7057 0 : memcpy(orig_field_pos, header, field_length);
7058 0 : header+= field_length;
7059 0 : break;
7060 : case FIELD_BLOB:
7061 : {
7062 339 : uint size_length= column->length - portable_sizeof_char_ptr;
7063 339 : _ma_store_blob_length(orig_field_pos, size_length, field_length);
7064 339 : memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header));
7065 339 : header+= field_length;
7066 339 : break;
7067 : }
7068 : default:
7069 0 : DBUG_ASSERT(0);
7070 : }
7071 : }
7072 1671 : copy_not_changed_fields(info, &info->changed_fields,
7073 : orig_record, current_record);
7074 :
7075 1671 : if (share->calc_checksum)
7076 : {
7077 1671 : info->new_row.checksum= checksum_delta +
7078 : (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
7079 : /* verify that record's content is sane */
7080 1671 : DBUG_ASSERT(info->new_row.checksum ==
7081 : (*share->calc_checksum)(info, current_record));
7082 : }
7083 :
7084 1671 : info->last_auto_increment= ~ (ulonglong) 0;
7085 :   /* Now that the records are up to date, execute the update back to the original values */
7086 1671 : if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
7087 : extent_count, extent_info,
7088 : current_record, orig_record, undo_lsn))
7089 1671 : goto err;
7090 :
7091 1671 : error= 0;
7092 1671 : err:
7093 1671 : my_free(current_record, MYF(0));
7094 1671 : DBUG_RETURN(error);
7095 : }
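/*
  Illustration (not engine code): the undo of an update above restores only
  the columns listed in the undo record; every other column of the original
  row is copied from the current on-disk row (bitmap_set_bit() +
  copy_not_changed_fields()).  The guarded-out sketch shows that selective
  copy with a simplified byte-array bitmap; all names are hypothetical.
*/
#if 0                                   /* example sketch, never compiled */
#include <string.h>

struct ex_column { unsigned offset, length; };

static void ex_copy_not_changed(unsigned char *orig,
                                const unsigned char *current,
                                const struct ex_column *columns,
                                const unsigned char *changed_bits,
                                unsigned field_count)
{
  unsigned i;
  for (i= 0; i < field_count; i++)
  {
    if (!(changed_bits[i / 8] & (1u << (i % 8))))
      memcpy(orig + columns[i].offset,          /* field not in undo log: */
             current + columns[i].offset,       /* take the current value */
             columns[i].length);
    /* changed fields were already filled in from the undo header */
  }
}
#endif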
7096 :
7097 :
7098 : /**
7099 : Execute undo of a bulk insert which used repair
7100 :
7101 : @return Operation status
7102 : @retval 0 OK
7103 : @retval 1 Error
7104 : */
7105 :
7106 : my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
7107 0 : {
7108 : my_bool error;
7109 : LSN lsn;
7110 0 : DBUG_ENTER("_ma_apply_undo_bulk_insert");
7111 : /*
7112 :     We delete all rows and re-enable the indexes, as bulk insert had
7113 :     disabled the non-unique ones.
7114 : */
7115 0 : error= (maria_delete_all_rows(info) ||
7116 : maria_enable_indexes(info) ||
7117 :           /* we enabled indexes, so the full state info must be written below */
7118 : _ma_state_info_write(info->s,
7119 : MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
7120 : MA_STATE_INFO_WRITE_FULL_INFO |
7121 : MA_STATE_INFO_WRITE_LOCK) ||
7122 : _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
7123 : FALSE, 0, &lsn, NULL));
7124 0 : DBUG_RETURN(error);
7125 : }
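/*
  Illustration (not engine code): like the other undo appliers in this
  file, _ma_apply_undo_bulk_insert() finishes by writing a CLR
  (compensation log record) through _ma_write_clr(), so that a later
  recovery restart knows this UNDO has already been executed and does not
  run it again.  The guarded-out sketch only names the two pieces of
  information such a record carries here; the types are hypothetical.
*/
#if 0                                   /* example sketch, never compiled */
typedef unsigned long long ex_lsn;

struct ex_clr
{
  ex_lsn    undone_undo_lsn;            /* which UNDO this compensates */
  long long checksum_delta;             /* table checksum adjustment, if any */
};
#endif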
7126 :
7127 :
7128 : /**
7129 : @brief Get the TRANSLOG_ADDRESS to flush up to
7130 :
7131 : @param page Page's content
7132 : @param page_no Page's number (<offset>/<page length>)
7133 : @param data_ptr Callback data pointer (pointer to MARIA_SHARE)
7134 :
7135 : @note
7136 : Usable for data (non-bitmap) and index pages
7137 :
7138 : @retval LSN to flush up to
7139 : */
7140 :
7141 : TRANSLOG_ADDRESS
7142 : maria_page_get_lsn(uchar *page,
7143 : pgcache_page_no_t page_no __attribute__((unused)),
7144 : uchar* data_ptr __attribute__((unused)))
7145 0 : {
7146 : #ifndef DBUG_OFF
7147 0 : const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
7148 0 : DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
7149 : share->now_transactional);
7150 : #endif
7151 0 : return lsn_korr(page);
7152 : }
7153 :
7154 :
7155 : /**
7156 : @brief Enable reading of all rows, ignoring versioning
7157 :
7158 : @note
7159 :     This is mainly useful in single-user applications, like maria_pack,
7160 :     where we want to be able to read all rows without having to read the
7161 :     transaction id from the control file.
7162 : */
7163 :
7164 : void maria_ignore_trids(MARIA_HA *info)
7165 820 : {
7166 820 : if (info->s->base.born_transactional)
7167 : {
7168 514 : if (!info->trn)
7169 470 : _ma_set_trn_for_table(info, &dummy_transaction_object);
7170 : /* Ignore transaction id when row is read */
7171 514 : info->trn->min_read_from= ~(TrID) 0;
7172 : }
7173 : }
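/*
  Illustration (not engine code): intended use of maria_ignore_trids() in a
  single-user tool such as maria_pack.  How the table handle is obtained is
  elided; only the call order matters in this guarded-out sketch.
*/
#if 0                                   /* example sketch, never compiled */
static void ex_prepare_full_scan(MARIA_HA *info)
{
  /* Make every row visible, regardless of which transaction wrote it */
  maria_ignore_trids(info);
  /* ... proceed with a normal full scan of the table ... */
}
#endif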
|