1 : /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 :
3 : This program is free software; you can redistribute it and/or modify
4 : it under the terms of the GNU General Public License as published by
5 : the Free Software Foundation; version 2 of the License.
6 :
7 : This program is distributed in the hope that it will be useful,
8 : but WITHOUT ANY WARRANTY; without even the implied warranty of
9 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 : GNU General Public License for more details.
11 :
12 : You should have received a copy of the GNU General Public License
13 : along with this program; if not, write to the Free Software
14 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
15 :
16 : /* Pack MARIA file */
17 :
18 : #ifndef USE_MY_FUNC
19 : #define USE_MY_FUNC /* We need at least my_malloc */
20 : #endif
21 :
22 : #include "maria_def.h"
23 : #include <queues.h>
24 : #include <my_tree.h>
25 : #include "mysys_err.h"
26 : #ifdef MSDOS
27 : #include <io.h>
28 : #endif
29 : #ifndef __GNU_LIBRARY__
30 : #define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
31 : #endif
32 : #include <my_getopt.h>
33 : #include <assert.h>
34 :
35 : #if SIZEOF_LONG_LONG > 4
36 : #define BITS_SAVED 64
37 : #else
38 : #define BITS_SAVED 32
39 : #endif
40 :
41 : #define IS_OFFSET ((uint) 32768) /* Bit marking that a tree element is an offset, not a char */
42 : #define HEAD_LENGTH 32
43 : #define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
44 :
45 : #define DATA_TMP_EXT ".TMD"
46 : #define OLD_EXT ".OLD"
47 : #define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
48 :
49 : struct st_file_buffer {
50 : File file;
51 : uchar *buffer,*pos,*end;
52 : my_off_t pos_in_file;
53 : int bits;
54 : ulonglong bitbucket;
55 : };
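/*
  Note on the bit-level output state above: 'bitbucket' accumulates up to
  BITS_SAVED bits before they are flushed to 'buffer', and 'bits' tracks
  how much room is left.  The fragment below is only an illustrative
  sketch of that technique (variable names are made up; the real code is
  in write_bits()/flush_bits() further down in this file):

    ulonglong bucket= 0;                  // pending output bits
    int room= BITS_SAVED;                 // free bits left in 'bucket'
    // to emit the low 'nbits' bits of 'value':
    //   if (nbits <= room)
    //     bucket|= value << (room-= nbits);   // pack towards the high end
    //   else
    //     ... flush the full 'bucket' to 'buffer', then restart with value
*/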
56 :
57 : struct st_huff_tree;
58 : struct st_huff_element;
59 :
60 : typedef struct st_huff_counts {
61 : uint field_length,max_zero_fill;
62 : uint pack_type;
63 : uint max_end_space,max_pre_space,length_bits,min_space;
64 : ulong max_length;
65 : enum en_fieldtype field_type;
66 : struct st_huff_tree *tree; /* Tree for field */
67 : my_off_t counts[256];
68 : my_off_t end_space[8];
69 : my_off_t pre_space[8];
70 : my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
71 : TREE int_tree; /* Tree for detecting distinct column values. */
72 : uchar *tree_buff; /* Column values, 'field_length' each. */
73 : uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
74 : } HUFF_COUNTS;
75 :
76 : typedef struct st_huff_element HUFF_ELEMENT;
77 :
78 : /*
79 : WARNING: It is crucial for the optimizations in calc_packed_length()
80 : that 'count' is the first element of 'HUFF_ELEMENT'.
81 : */
82 : struct st_huff_element {
83 : my_off_t count;
84 : union un_element {
85 : struct st_nod {
86 : HUFF_ELEMENT *left,*right;
87 : } nod;
88 : struct st_leaf {
89 : HUFF_ELEMENT *null;
90 : uint element_nr; /* Number of element */
91 : } leaf;
92 : } a;
93 : };
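/*
  Illustration of the WARNING above: because 'count' is the first member,
  a HUFF_ELEMENT pointer may be read as a pointer to its count alone.
  A minimal sketch (not code from this file):

    HUFF_ELEMENT *el= some_element;
    my_off_t cnt= *(my_off_t*) el;    // same value as el->count

  compare_huff_elements() below uses exactly this cast to compare queue
  entries as plain my_off_t values.
*/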
94 :
95 :
96 : typedef struct st_huff_tree {
97 : HUFF_ELEMENT *root,*element_buffer;
98 : HUFF_COUNTS *counts;
99 : uint tree_number;
100 : uint elements;
101 : my_off_t bytes_packed;
102 : uint tree_pack_length;
103 : uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
104 : ulonglong *code;
105 : uchar *code_len;
106 : } HUFF_TREE;
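/*
  'code' and 'code_len' together form the per-tree encoding table: entry i
  is meant to hold the Huffman code bits and the code length (in bits) for
  element number i.  Assuming that layout (the encoder itself is outside
  this excerpt), emitting one value would look roughly like:

    write_bits(tree->code[i], (uint) tree->code_len[i]);
*/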
107 :
108 :
109 : typedef struct st_isam_mrg {
110 : MARIA_HA **file,**current,**end;
111 : uint free_file;
112 : uint count;
113 : uint min_pack_length; /* These are used by packed data */
114 : uint max_pack_length;
115 : uint ref_length;
116 : uint max_blob_length;
117 : my_off_t records;
118 : /* true if at least one source file has at least one disabled index */
119 : my_bool src_file_has_indexes_disabled;
120 : } PACK_MRG_INFO;
121 :
122 :
123 : extern int main(int argc,char * *argv);
124 : static void get_options(int *argc,char ***argv);
125 : static MARIA_HA *open_maria_file(char *name,int mode);
126 : static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
127 : static int compress(PACK_MRG_INFO *file,char *join_name);
128 : static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
129 : static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
130 : uint trees,
131 : HUFF_COUNTS *huff_counts,
132 : uint fields);
133 : static int compare_tree(void* cmp_arg __attribute__((unused)),
134 : const uchar *s,const uchar *t);
135 : static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
136 : static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
137 : my_off_t records);
138 : static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
139 : uint max_space_length,my_off_t *space_counts,
140 : my_off_t tot_space_count,
141 : enum en_fieldtype field_type);
142 : static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
143 : static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
144 : static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
145 : static int save_counts_in_queue(uchar *key,element_count count,
146 : HUFF_TREE *tree);
147 : static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
148 : static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
149 : static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
150 : static void make_traverse_code_tree(HUFF_TREE *huff_tree,
151 : HUFF_ELEMENT *element,uint size,
152 : ulonglong code);
153 : static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
154 : my_off_t tot_elements,my_off_t filelength);
155 : static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
156 : static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
157 : static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
158 : HUFF_ELEMENT *element,
159 : uint *offset);
160 : static uint max_bit(uint value);
161 : static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
162 : static char *make_new_name(char *new_name,char *old_name);
163 : static char *make_old_name(char *new_name,char *old_name);
164 : static void init_file_buffer(File file,pbool read_buffer);
165 : static int flush_buffer(ulong neaded_length);
166 : static void end_file_buffer(void);
167 : static void write_bits(ulonglong value, uint bits);
168 : static void flush_bits(void);
169 : static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
170 : my_off_t new_length, ha_checksum crc);
171 : static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
172 : my_off_t new_length, ha_checksum crc);
173 : static int mrg_close(PACK_MRG_INFO *mrg);
174 : static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
175 : static void mrg_reset(PACK_MRG_INFO *mrg);
176 : #if !defined(DBUG_OFF)
177 : static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
178 : static int fakecmp(my_off_t **count1, my_off_t **count2);
179 : #endif
180 :
181 :
182 : static int error_on_write=0,test_only=0,verbose=0,silent=0,
183 : write_loop=0,force_pack=0, isamchk_neaded=0;
184 : static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
185 : static my_bool backup, opt_wait;
186 : /*
187 : tree_buff_length is somewhat arbitrary. The bigger it is the better
188 : the chance to win in terms of compression factor. On the other hand,
189 : this table becomes part of the compressed file header. And its length
190 : is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191 : */
192 : static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193 : static char tmp_dir[FN_REFLEN]={0},*join_table;
194 : static my_off_t intervall_length;
195 : static ha_checksum glob_crc;
196 : static struct st_file_buffer file_buffer;
197 : static QUEUE queue;
198 : static HUFF_COUNTS *global_count;
199 : static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
200 : static const char *load_default_groups[]= { "mariapack",0 };
201 :
202 : /* The main program */
203 :
204 : int main(int argc, char **argv)
205 30 : {
206 : int error,ok;
207 : PACK_MRG_INFO merge;
208 : char **default_argv;
209 30 : MY_INIT(argv[0]);
210 :
211 30 : load_defaults("my",load_default_groups,&argc,&argv);
212 30 : default_argv= argv;
213 30 : get_options(&argc,&argv);
214 30 : maria_init();
215 :
216 30 : error=ok=isamchk_neaded=0;
217 30 : if (join_table)
218 : { /* Join files into one */
219 5 : if (open_maria_files(&merge,argv,(uint) argc) ||
220 : compress(&merge,join_table))
221 0 : error=1;
222 : }
223 50 : else while (argc--)
224 : {
225 : MARIA_HA *isam_file;
226 25 : if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
227 0 : error=1;
228 : else
229 : {
230 25 : merge.file= &isam_file;
231 25 : merge.current=0;
232 25 : merge.free_file=0;
233 25 : merge.count=1;
234 25 : if (compress(&merge,0))
235 0 : error=1;
236 : else
237 25 : ok=1;
238 : }
239 : }
240 30 : if (ok && isamchk_neaded && !silent)
241 0 : puts("Remember to run maria_chk -rq on compressed tables");
242 30 : VOID(fflush(stdout));
243 30 : VOID(fflush(stderr));
244 30 : free_defaults(default_argv);
245 30 : maria_end();
246 30 : my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
247 30 : exit(error ? 2 : 0);
248 : #ifndef _lint
249 : return 0; /* No compiler warning */
250 : #endif
251 : }
252 :
253 : enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
254 :
255 : static struct my_option my_long_options[] =
256 : {
257 : #ifdef __NETWARE__
258 : {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
259 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
260 : #endif
261 : {"backup", 'b', "Make a backup of the table as table_name.OLD.",
262 : (uchar**) &backup, (uchar**) &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
263 : {"character-sets-dir", OPT_CHARSETS_DIR_MP,
264 : "Directory where character sets are.", (uchar**) &charsets_dir,
265 : (uchar**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
266 : {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
267 : 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
268 : {"force", 'f',
269 : "Force packing of table even if it gets bigger or if tempfile exists.",
270 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
271 : {"join", 'j',
272 : "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
273 : (uchar**) &join_table, (uchar**) &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
274 : 0, 0, 0},
275 : {"help", '?', "Display this help and exit.",
276 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
277 : {"silent", 's', "Be more silent.",
278 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
279 : {"tmpdir", 'T', "Use temporary directory to store temporary table.",
280 : 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
281 : {"test", 't', "Don't pack table, only test packing it.",
282 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
283 : {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
284 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
285 : {"version", 'V', "Output version information and exit.",
286 : 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
287 : {"wait", 'w', "Wait and retry if table is in use.", (uchar**) &opt_wait,
288 : (uchar**) &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
289 : { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
290 : };
291 :
292 : #include <help_start.h>
293 :
294 : static void print_version(void)
295 0 : {
296 0 : VOID(printf("%s Ver 1.0 for %s on %s\n",
297 : my_progname, SYSTEM_TYPE, MACHINE_TYPE));
298 : NETWARE_SET_SCREEN_MODE(1);
299 : }
300 :
301 :
302 : static void usage(void)
303 0 : {
304 0 : print_version();
305 0 : puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
306 0 : puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
307 0 : puts("and you are welcome to modify and redistribute it under the GPL license\n");
308 :
309 0 : puts("Pack a MARIA-table to take much less space.");
310 0 : puts("Keys are not updated, you must run maria_chk -rq on the index (.MAI) file");
311 0 : puts("afterwards to update the keys.");
312 0 : puts("You should give the .MAI file as the filename argument.");
313 0 : puts("To unpack a packed table, run maria_chk -u on the table");
314 :
315 0 : VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
316 0 : my_print_help(my_long_options);
317 0 : print_defaults("my", load_default_groups);
318 0 : my_print_variables(my_long_options);
319 : }
320 :
321 : #include <help_end.h>
322 :
323 : static my_bool
324 : get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
325 : char *argument)
326 65 : {
327 : uint length;
328 :
329 65 : switch(optid) {
330 : #ifdef __NETWARE__
331 : case OPT_AUTO_CLOSE:
332 : setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
333 : break;
334 : #endif
335 : case 'f':
336 30 : force_pack= 1;
337 30 : tmpfile_createflag= O_RDWR | O_TRUNC;
338 30 : break;
339 : case 's':
340 30 : write_loop= verbose= 0;
341 30 : silent= 1;
342 30 : break;
343 : case 't':
344 0 : test_only= 1;
345 : /* Avoid resetting 'verbose' if it was already set > 1. */
346 0 : if (! verbose)
347 0 : verbose= 1;
348 : break;
349 : case 'T':
350 0 : length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
351 0 : if (length != dirname_length(tmp_dir))
352 : {
353 0 : tmp_dir[length]=FN_LIBCHAR;
354 0 : tmp_dir[length+1]=0;
355 : }
356 : break;
357 : case 'v':
358 0 : verbose++; /* Allow for selecting the level of verbosity. */
359 0 : silent= 0;
360 0 : break;
361 : case '#':
362 0 : DBUG_PUSH(argument ? argument : "d:t:o,/tmp/maria_pack.trace");
363 0 : break;
364 : case 'V':
365 0 : print_version();
366 0 : exit(0);
367 : case 'I':
368 : case '?':
369 0 : usage();
370 0 : exit(0);
371 : }
372 65 : return 0;
373 : }
374 :
375 : /* reads options */
376 : /* Initiates DEBUG - but no debugging here ! */
377 :
378 : static void get_options(int *argc,char ***argv)
379 30 : {
380 : int ho_error;
381 :
382 30 : my_progname= argv[0][0];
383 30 : if (isatty(fileno(stdout)))
384 0 : write_loop=1;
385 :
386 30 : if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
387 0 : exit(ho_error);
388 :
389 30 : if (!*argc)
390 : {
391 0 : usage();
392 0 : exit(1);
393 : }
394 30 : if (join_table)
395 : {
396 5 : backup=0; /* Not needed */
397 5 : tmp_dir[0]=0;
398 : }
399 : return;
400 : }
401 :
402 :
403 : static MARIA_HA *open_maria_file(char *name,int mode)
404 35 : {
405 : MARIA_HA *isam_file;
406 : MARIA_SHARE *share;
407 35 : DBUG_ENTER("open_maria_file");
408 :
409 35 : if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
410 : (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
411 : HA_OPEN_ABORT_IF_LOCKED))))
412 : {
413 0 : VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
414 0 : DBUG_RETURN(0);
415 : }
416 35 : share=isam_file->s;
417 35 : if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
418 : {
419 0 : if (!force_pack)
420 : {
421 0 : VOID(fprintf(stderr, "%s is already compressed\n", name));
422 0 : VOID(maria_close(isam_file));
423 0 : DBUG_RETURN(0);
424 : }
425 0 : if (verbose)
426 0 : puts("Recompressing already compressed table");
427 0 : share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifying it */
428 : }
429 35 : if (! force_pack && share->state.state.records != 0 &&
430 : (share->state.state.records <= 1 ||
431 : share->state.state.data_file_length < 1024))
432 : {
433 0 : VOID(fprintf(stderr, "%s is too small to compress\n", name));
434 0 : VOID(maria_close(isam_file));
435 0 : DBUG_RETURN(0);
436 : }
437 35 : VOID(maria_lock_database(isam_file,F_WRLCK));
438 35 : maria_ignore_trids(isam_file);
439 35 : DBUG_RETURN(isam_file);
440 : }
441 :
442 :
443 : static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
444 5 : {
445 : uint i,j;
446 5 : mrg->count=0;
447 5 : mrg->current=0;
448 5 : mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
449 5 : mrg->free_file=1;
450 5 : mrg->src_file_has_indexes_disabled= 0;
451 15 : for (i=0; i < count ; i++)
452 : {
453 10 : if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
454 10 : goto error;
455 :
456 10 : mrg->src_file_has_indexes_disabled|=
457 : ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
458 : mrg->file[i]->s->base.keys);
459 : }
460 : /* Check that files are identical */
461 10 : for (j=0 ; j < count-1 ; j++)
462 : {
463 : MARIA_COLUMNDEF *m1,*m2,*end;
464 5 : if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
465 : mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
466 : goto diff_file;
467 5 : m1=mrg->file[j]->s->columndef;
468 5 : end=m1+mrg->file[j]->s->base.fields;
469 5 : m2=mrg->file[j+1]->s->columndef;
470 15 : for ( ; m1 != end ; m1++,m2++)
471 : {
472 10 : if (m1->type != m2->type || m1->length != m2->length)
473 : goto diff_file;
474 : }
475 : }
476 5 : mrg->count=count;
477 5 : return 0;
478 :
479 0 : diff_file:
480 0 : VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
481 : my_progname, names[j], names[j+1]));
482 : error:
483 0 : while (i--)
484 0 : maria_close(mrg->file[i]);
485 0 : my_free(mrg->file, MYF(0));
486 0 : return 1;
487 : }
488 :
489 :
490 : static int compress(PACK_MRG_INFO *mrg,char *result_table)
491 30 : {
492 : int error;
493 : File new_file,join_maria_file;
494 : MARIA_HA *isam_file;
495 : MARIA_SHARE *share;
496 : char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
497 : uint i,header_length,fields,trees,used_trees;
498 : my_off_t old_length,new_length,tot_elements;
499 : HUFF_COUNTS *huff_counts;
500 : HUFF_TREE *huff_trees;
501 30 : DBUG_ENTER("compress");
502 :
503 30 : isam_file=mrg->file[0]; /* Take this as an example */
504 30 : share=isam_file->s;
505 30 : new_file=join_maria_file= -1;
506 30 : trees=fields=0;
507 30 : huff_trees=0;
508 30 : huff_counts=0;
509 30 : maria_block_size= isam_file->s->block_size;
510 :
511 : /* Create temporary or join file */
512 30 : if (backup)
513 0 : VOID(fn_format(org_name,isam_file->s->open_file_name.str,
514 : "",MARIA_NAME_DEXT, 2));
515 : else
516 30 : VOID(fn_format(org_name,isam_file->s->open_file_name.str,
517 : "",MARIA_NAME_DEXT, 2+4+16));
518 :
519 30 : if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
520 : maria_block_size, MY_WME) == 0)
521 : {
522 0 : fprintf(stderr, "Can't initialize page cache\n");
523 0 : goto err;
524 : }
525 :
526 35 : if (!test_only && result_table)
527 : {
528 : /* Make a new indexfile based on first file in list */
529 : uint length;
530 : uchar *buff;
531 5 : strmov(org_name,result_table); /* Fix error messages */
532 5 : VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2));
533 5 : if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
534 : < 0)
535 5 : goto err;
536 5 : length=(uint) share->base.keystart;
537 5 : if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
538 5 : goto err;
539 5 : if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
540 : my_write(join_maria_file,buff,length,
541 : MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
542 : {
543 0 : my_free(buff,MYF(0));
544 0 : goto err;
545 : }
546 5 : my_free(buff,MYF(0));
547 5 : VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2));
548 : }
549 25 : else if (!tmp_dir[0])
550 25 : VOID(make_new_name(new_name,org_name));
551 : else
552 0 : VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
553 30 : if (!test_only &&
554 : (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
555 30 : goto err;
556 :
557 : /* Start calculating statistics */
558 :
559 30 : mrg->records=0;
560 65 : for (i=0 ; i < mrg->count ; i++)
561 35 : mrg->records+=mrg->file[i]->s->state.state.records;
562 :
563 30 : DBUG_PRINT("info", ("Compressing %s: (%lu records)",
564 : result_table ? new_name : org_name,
565 : (ulong) mrg->records));
566 30 : if (write_loop || verbose)
567 : {
568 0 : VOID(printf("Compressing %s: (%lu records)\n",
569 : result_table ? new_name : org_name, (ulong) mrg->records));
570 : }
571 30 : trees=fields=share->base.fields;
572 30 : huff_counts=init_huff_count(isam_file,mrg->records);
573 30 : QUICK_SAFEMALLOC;
574 :
575 : /*
576 : Read the whole data file(s) for statistics.
577 : */
578 30 : DBUG_PRINT("info", ("- Calculating statistics"));
579 30 : if (write_loop || verbose)
580 0 : VOID(printf("- Calculating statistics\n"));
581 30 : if (get_statistic(mrg,huff_counts))
582 30 : goto err;
583 30 : NORMAL_SAFEMALLOC;
584 30 : old_length=0;
585 65 : for (i=0; i < mrg->count ; i++)
586 35 : old_length+= (mrg->file[i]->s->state.state.data_file_length -
587 : mrg->file[i]->s->state.state.empty);
588 :
589 : /*
590 : Create a global priority queue in preparation for making
591 : temporary Huffman trees.
592 : */
593 30 : if (init_queue(&queue,256,0,0,compare_huff_elements,0))
594 30 : goto err;
595 :
596 : /*
597 : For each column, check whether to use pre-space-compress, end-space-
598 : compress, empty-field-compress or zero-field-compress.
599 : */
600 30 : check_counts(huff_counts,fields,mrg->records);
601 :
602 : /*
603 : Build a Huffman tree for each column.
604 : */
605 30 : huff_trees=make_huff_trees(huff_counts,trees);
606 :
607 : /*
608 : If the packed length of combined columns is less than the sum of
609 : the non-combined columns, then create common Huffman trees for them.
610 : We do this only for uchar compressed columns, not for distinct values
611 : compressed columns.
612 : */
613 30 : if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
614 30 : goto err;
615 :
616 : /*
617 : Assign codes to all uchar or column values.
618 : */
619 30 : if (make_huff_decode_table(huff_trees,fields))
620 30 : goto err;
621 :
622 : /* Prepare a file buffer. */
623 30 : init_file_buffer(new_file,0);
624 :
625 : /*
626 : Reserve space in the target file for the fixed compressed file header.
627 : */
628 30 : file_buffer.pos_in_file=HEAD_LENGTH;
629 30 : if (! test_only)
630 30 : VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
631 :
632 : /*
633 : Write field infos: field type, pack type, length bits, tree number.
634 : */
635 30 : write_field_info(huff_counts,fields,used_trees);
636 :
637 : /*
638 : Write decode trees.
639 : */
640 30 : if (!(tot_elements=write_huff_tree(huff_trees,trees)))
641 30 : goto err;
642 :
643 : /*
644 : Calculate the total length of the compression info header.
645 : This includes the fixed compressed file header, the column compression
646 : type descriptions, and the decode trees.
647 : */
648 30 : header_length=(uint) file_buffer.pos_in_file+
649 : (uint) (file_buffer.pos-file_buffer.buffer);
650 :
651 : /*
652 : Compress the source file into the target file.
653 : */
654 30 : DBUG_PRINT("info", ("- Compressing file"));
655 30 : if (write_loop || verbose)
656 0 : VOID(printf("- Compressing file\n"));
657 30 : error=compress_maria_file(mrg,huff_counts);
658 30 : new_length=file_buffer.pos_in_file;
659 30 : if (!error && !test_only)
660 : {
661 : uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
662 30 : bzero(buff,sizeof(buff));
663 30 : error=my_write(file_buffer.file,buff,sizeof(buff),
664 : MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
665 : }
666 :
667 : /*
668 : Write the fixed compressed file header.
669 : */
670 30 : if (!error)
671 30 : error=write_header(mrg,header_length,used_trees,tot_elements,
672 : new_length);
673 :
674 : /* Flush the file buffer. */
675 30 : end_file_buffer();
676 :
677 : /* Display statistics. */
678 30 : DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
679 : "Mean total length: %6ld",
680 : mrg->min_pack_length, mrg->max_pack_length,
681 : (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
682 30 : if (verbose && mrg->records)
683 0 : VOID(printf("Min record length: %6d Max length: %6d "
684 : "Mean total length: %6ld\n", mrg->min_pack_length,
685 : mrg->max_pack_length, (ulong) (new_length/mrg->records)));
686 :
687 : /* Close source and target file. */
688 30 : if (!test_only)
689 : {
690 30 : error|=my_close(new_file,MYF(MY_WME));
691 30 : if (!result_table)
692 : {
693 25 : error|=my_close(isam_file->dfile.file, MYF(MY_WME));
694 25 : isam_file->dfile.file= -1; /* Tell maria_close file is closed */
695 25 : isam_file->s->bitmap.file.file= -1;
696 : }
697 : }
698 :
699 : /* Cleanup. */
700 30 : free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
701 30 : if (! test_only && ! error)
702 : {
703 30 : if (result_table)
704 : {
705 5 : error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
706 : }
707 : else
708 : {
709 25 : if (backup)
710 : {
711 0 : if (my_rename(org_name,make_old_name(temp_name,
712 : isam_file->s->open_file_name.str),
713 : MYF(MY_WME)))
714 0 : error=1;
715 : else
716 : {
717 0 : if (tmp_dir[0])
718 0 : error=my_copy(new_name,org_name,MYF(MY_WME));
719 : else
720 0 : error=my_rename(new_name,org_name,MYF(MY_WME));
721 0 : if (!error)
722 : {
723 0 : VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
724 0 : if (tmp_dir[0])
725 0 : VOID(my_delete(new_name,MYF(MY_WME)));
726 : }
727 : }
728 : }
729 : else
730 : {
731 25 : if (tmp_dir[0])
732 : {
733 0 : error=my_copy(new_name,org_name,
734 : MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
735 0 : if (!error)
736 0 : VOID(my_delete(new_name,MYF(MY_WME)));
737 : }
738 : else
739 25 : error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
740 : }
741 25 : if (! error)
742 25 : error=save_state(isam_file,mrg,new_length,glob_crc);
743 : }
744 : }
745 30 : error|=mrg_close(mrg);
746 30 : if (join_maria_file >= 0)
747 5 : error|=my_close(join_maria_file,MYF(MY_WME));
748 30 : if (error)
749 : {
750 0 : VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
751 0 : VOID(my_delete(new_name,MYF(MY_WME)));
752 0 : DBUG_RETURN(-1);
753 : }
754 30 : if (write_loop || verbose)
755 : {
756 0 : if (old_length)
757 0 : VOID(printf("%.4g%% \n",
758 : (((longlong) (old_length - new_length)) * 100.0 /
759 : (longlong) old_length)));
760 : else
761 0 : puts("Empty file saved in compressed format");
762 : }
763 30 : DBUG_RETURN(0);
764 :
765 0 : err:
766 0 : end_pagecache(maria_pagecache, 1);
767 0 : free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
768 0 : if (new_file >= 0)
769 0 : VOID(my_close(new_file,MYF(0)));
770 0 : if (join_maria_file >= 0)
771 0 : VOID(my_close(join_maria_file,MYF(0)));
772 0 : mrg_close(mrg);
773 0 : VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
774 0 : DBUG_RETURN(-1);
775 : }
776 :
777 : /* Allocate and initialize a huff_count struct for each field */
778 :
779 : static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
780 30 : {
781 : reg2 uint i;
782 : reg1 HUFF_COUNTS *count;
783 30 : if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
784 : sizeof(HUFF_COUNTS),
785 : MYF(MY_ZEROFILL | MY_WME))))
786 : {
787 90 : for (i=0 ; i < info->s->base.fields ; i++)
788 : {
789 : enum en_fieldtype type;
790 60 : count[i].field_length=info->s->columndef[i].length;
791 60 : type= count[i].field_type= (enum en_fieldtype) info->s->columndef[i].type;
792 60 : if (type == FIELD_INTERVALL ||
793 : type == FIELD_CONSTANT ||
794 : type == FIELD_ZERO)
795 0 : type = FIELD_NORMAL;
796 60 : if (count[i].field_length <= 8 &&
797 : (type == FIELD_NORMAL ||
798 : type == FIELD_SKIP_ZERO))
799 27 : count[i].max_zero_fill= count[i].field_length;
800 : /*
801 : For every column initialize a tree, which is used to detect distinct
802 : column values. 'int_tree' works together with 'tree_buff' and
803 : 'tree_pos'. Its keys are implemented as pointers into 'tree_buff'.
804 : This is accomplished by '-1' as the element size.
805 : */
806 60 : init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
807 : NULL);
808 60 : if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
809 60 : count[i].tree_pos=count[i].tree_buff =
810 : my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
811 : MYF(MY_WME));
812 : }
813 : }
814 30 : return count;
815 : }
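/*
  Note on 'int_tree' as initialized above: compare_tree() (defined later
  in this file) gets no per-column context, so get_statistic() stores the
  current column in the file-global 'global_count' just before inserting.
  The calling pattern, as used in get_statistic(), is:

    global_count= count;
    element= tree_insert(&count->int_tree, pos, 0,
                         count->int_tree.custom_arg);
*/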
816 :
817 :
818 : /* Free memory used by counts and trees */
819 :
820 : static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
821 : HUFF_COUNTS *huff_counts,
822 : uint fields)
823 30 : {
824 : register uint i;
825 :
826 30 : if (huff_trees)
827 : {
828 90 : for (i=0 ; i < trees ; i++)
829 : {
830 60 : if (huff_trees[i].element_buffer)
831 30 : my_free(huff_trees[i].element_buffer,MYF(0));
832 60 : if (huff_trees[i].code)
833 30 : my_free(huff_trees[i].code,MYF(0));
834 : }
835 30 : my_free(huff_trees,MYF(0));
836 : }
837 30 : if (huff_counts)
838 : {
839 90 : for (i=0 ; i < fields ; i++)
840 : {
841 60 : if (huff_counts[i].tree_buff)
842 : {
843 0 : my_free(huff_counts[i].tree_buff,MYF(0));
844 0 : delete_tree(&huff_counts[i].int_tree);
845 : }
846 : }
847 30 : my_free(huff_counts, MYF(0));
848 : }
849 30 : delete_queue(&queue); /* This is safe to free */
850 : return;
851 : }
852 :
853 : /* Read through old file and gather some statistics */
854 :
855 : static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
856 30 : {
857 : int error;
858 : uint length, null_bytes;
859 : ulong reclength,max_blob_length;
860 : uchar *record,*pos,*next_pos,*end_pos,*start_pos;
861 : ha_rows record_count;
862 : HUFF_COUNTS *count,*end_count;
863 : TREE_ELEMENT *element;
864 : ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
865 30 : DBUG_ENTER("get_statistic");
866 :
867 30 : reclength= mrg->file[0]->s->base.reclength;
868 30 : null_bytes= mrg->file[0]->s->base.null_bytes;
869 30 : record=(uchar*) my_alloca(reclength);
870 30 : end_count=huff_counts+mrg->file[0]->s->base.fields;
871 30 : record_count=0; glob_crc=0;
872 30 : max_blob_length=0;
873 :
874 : /* Check how to calculate checksum */
875 30 : if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
876 9 : calc_checksum= _ma_static_checksum;
877 : else
878 21 : calc_checksum= _ma_checksum;
879 :
880 30 : mrg_reset(mrg);
881 645 : while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
882 : {
883 585 : ulong tot_blob_length=0;
884 585 : if (! error)
885 : {
886 : /* glob_crc is a checksum over all bytes of all records. */
887 525 : glob_crc+= (*calc_checksum)(mrg->file[0],record);
888 :
889 : /* Count the incidence of values separately for every column. */
890 525 : for (pos=record + null_bytes, count=huff_counts ;
891 2100 : count < end_count ;
892 : count++,
893 1050 : pos=next_pos)
894 : {
895 1050 : next_pos=end_pos=(start_pos=pos)+count->field_length;
896 :
897 : /*
898 : Put the whole column value in a tree if there is room for it.
899 : 'int_tree' is used to quickly check for duplicate values.
900 : 'tree_buff' collects as many distinct column values as
901 : possible. Its size is tree_buff_length if the field length is > 1,
902 : else 2 bytes. Each value is 'field_length' bytes big. If there
903 : are more distinct column values than fit into the buffer, we
904 : give up with this tree. BLOBs and VARCHARs do not have a
905 : tree_buff as it can only be used with fixed length columns.
906 : For the special case of field length == 1, we handle only the
907 : case that there is only one distinct value in the table(s).
908 : Otherwise, we can have a maximum of 256 distinct values. This
909 : is then handled by the normal Huffman tree build.
910 :
911 : Another limit for collecting distinct column values is the
912 : number of values itself. Since we would need to build a
913 : Huffman tree for the values, we are limited by the 'IS_OFFSET'
914 : constant. This constant expresses a bit which is used to
915 : determine if a tree element holds a final value or an offset
916 : to a child element. Hence, all values and offsets need to be
917 : smaller than 'IS_OFFSET'. A tree element is implemented with
918 : two integer values, one for the left branch and one for the
919 : right branch. For the extreme case that the first element
920 : points to the last element, the number of integers in the tree
921 : must be less than or equal to IS_OFFSET. So the number of elements
922 : must be less than or equal to IS_OFFSET / 2.
923 :
924 : WARNING: At first, we insert a pointer into the record buffer
925 : as the key for the tree. If we got a new distinct value, which
926 : is really inserted into the tree, instead of being counted
927 : only, we will copy the column value from the record buffer to
928 : 'tree_buff' and adjust the key pointer of the tree accordingly.
929 : */
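/*
  Worked numbers for the limits above (illustrative only): IS_OFFSET is
  32768, so at most IS_OFFSET / 2 = 16384 distinct values can be collected
  per column.  Independently, 'tree_buff' is roughly 65536 bytes, so a
  column with field_length 16 can store at most about 65536 / 16 = 4096
  distinct values before the tree is abandoned.
*/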
930 1050 : if (count->tree_buff)
931 : {
932 1050 : global_count=count;
933 1050 : if (!(element=tree_insert(&count->int_tree,pos, 0,
934 : count->int_tree.custom_arg)) ||
935 : (element->count == 1 &&
936 : (count->tree_buff + tree_buff_length <
937 : count->tree_pos + count->field_length)) ||
938 : (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
939 : (count->field_length == 1 &&
940 : count->int_tree.elements_in_tree > 1))
941 : {
942 0 : delete_tree(&count->int_tree);
943 0 : my_free(count->tree_buff,MYF(0));
944 0 : count->tree_buff=0;
945 : }
946 : else
947 : {
948 : /*
949 : If tree_insert() succeeds, it either creates a new element
950 : or increments the counter of an existing element.
951 : */
952 1050 : if (element->count == 1)
953 : {
954 : /* Copy the new column value into 'tree_buff'. */
955 855 : memcpy(count->tree_pos,pos,(size_t) count->field_length);
956 : /* Adjust the key pointer in the tree. */
957 855 : tree_set_pointer(element,count->tree_pos);
958 : /* Point behind the last column value so far. */
959 855 : count->tree_pos+=count->field_length;
960 : }
961 : }
962 : }
963 :
964 : /* Save character counters and space-counts and zero-field-counts */
965 1050 : if (count->field_type == FIELD_NORMAL ||
966 : count->field_type == FIELD_SKIP_ENDSPACE)
967 : {
968 : /* Ignore trailing space. */
969 0 : for ( ; end_pos > pos ; end_pos--)
970 990 : if (end_pos[-1] != ' ')
971 0 : break;
972 : /* Empty fields are just counted. Go to the next record. */
973 990 : if (end_pos == pos)
974 : {
975 0 : count->empty_fields++;
976 0 : count->max_zero_fill=0;
977 0 : continue;
978 : }
979 : /*
980 : Count the total of all trailing spaces and the number of
981 : short trailing spaces. Remember the longest trailing space.
982 : */
983 990 : length= (uint) (next_pos-end_pos);
984 990 : count->tot_end_space+=length;
985 990 : if (length < 8)
986 990 : count->end_space[length]++;
987 990 : if (count->max_end_space < length)
988 0 : count->max_end_space = length;
989 : }
990 :
991 1050 : if (count->field_type == FIELD_NORMAL ||
992 : count->field_type == FIELD_SKIP_PRESPACE)
993 : {
994 : /* Ignore leading space. */
995 3135 : for (pos=start_pos; pos < end_pos ; pos++)
996 3135 : if (pos[0] != ' ')
997 2325 : break;
998 : /* Empty fields are just counted. Go to the next record. */
999 810 : if (end_pos == pos)
1000 : {
1001 0 : count->empty_fields++;
1002 0 : count->max_zero_fill=0;
1003 0 : continue;
1004 : }
1005 : /*
1006 : Count the total of all leading spaces and the number of
1007 : short leading spaces. Remember the longest leading space.
1008 : */
1009 810 : length= (uint) (pos-start_pos);
1010 810 : count->tot_pre_space+=length;
1011 810 : if (length < 8)
1012 810 : count->pre_space[length]++;
1013 810 : if (count->max_pre_space < length)
1014 55 : count->max_pre_space = length;
1015 : }
1016 :
1017 : /* Calculate pos, end_pos, and max_length for variable length fields. */
1018 1050 : if (count->field_type == FIELD_BLOB)
1019 : {
1020 0 : uint field_length=count->field_length -portable_sizeof_char_ptr;
1021 0 : ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1022 0 : memcpy_fixed((char*) &pos, start_pos+field_length,sizeof(char*));
1023 0 : end_pos=pos+blob_length;
1024 0 : tot_blob_length+=blob_length;
1025 0 : set_if_bigger(count->max_length,blob_length);
1026 : }
1027 1050 : else if (count->field_type == FIELD_VARCHAR)
1028 : {
1029 0 : uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1030 0 : length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1031 : uint2korr(start_pos));
1032 0 : pos= start_pos+pack_length;
1033 0 : end_pos= pos+length;
1034 0 : set_if_bigger(count->max_length,length);
1035 : }
1036 :
1037 : /* Evaluate 'max_zero_fill' for short fields. */
1038 1050 : if (count->field_length <= 8 &&
1039 : (count->field_type == FIELD_NORMAL ||
1040 : count->field_type == FIELD_SKIP_ZERO))
1041 : {
1042 : uint i;
1043 : /* Zero fields are just counted. Go to the next record. */
1044 465 : if (!memcmp(start_pos, zero_string, count->field_length))
1045 : {
1046 0 : count->zero_fields++;
1047 0 : continue;
1048 : }
1049 : /*
1050 : max_zero_fill starts with field_length. It is decreased every
1051 : time a shorter "zero trailer" is found. It is set to zero when
1052 : an empty field is found (see above). This suggests that the
1053 : variable should be called 'min_zero_fill'.
1054 : */
1055 930 : for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1056 0 : i++) ;
1057 465 : if (i < count->max_zero_fill)
1058 27 : count->max_zero_fill=i;
1059 : }
1060 :
1061 : /* Ignore zero fields and check fields. */
1062 1050 : if (count->field_type == FIELD_ZERO ||
1063 : count->field_type == FIELD_CHECK)
1064 : continue;
1065 :
1066 : /*
1067 : Count the incidence of every uchar value in the
1068 : significant field value.
1069 : */
1070 13425 : for ( ; pos < end_pos ; pos++)
1071 13425 : count->counts[(uchar) *pos]++;
1072 :
1073 : /* Step to next field. */
1074 : }
1075 :
1076 525 : if (tot_blob_length > max_blob_length)
1077 0 : max_blob_length=tot_blob_length;
1078 525 : record_count++;
1079 525 : if (write_loop && record_count % WRITE_COUNT == 0)
1080 : {
1081 0 : VOID(printf("%lu\r", (ulong) record_count));
1082 0 : VOID(fflush(stdout));
1083 : }
1084 : }
1085 60 : else if (error != HA_ERR_RECORD_DELETED)
1086 : {
1087 0 : VOID(fprintf(stderr, "Got error %d while reading rows\n", error));
1088 0 : break;
1089 : }
1090 :
1091 : /* Step to next record. */
1092 : }
1093 30 : if (write_loop)
1094 : {
1095 0 : VOID(printf(" \r"));
1096 0 : VOID(fflush(stdout));
1097 : }
1098 :
1099 : /*
1100 : If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1101 : codes.
1102 : */
1103 30 : DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1104 :
1105 30 : DBUG_PRINT("info", ("Found the following number of incidents "
1106 : "of the uchar codes:"));
1107 30 : if (verbose >= 2)
1108 0 : VOID(printf("Found the following number of incidents "
1109 : "of the uchar codes:\n"));
1110 90 : for (count= huff_counts ; count < end_count; count++)
1111 : {
1112 : uint idx;
1113 : my_off_t total_count;
1114 : char llbuf[32];
1115 :
1116 60 : DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1117 60 : if (verbose >= 2)
1118 0 : VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
1119 60 : if (count->tree_buff)
1120 : {
1121 60 : DBUG_PRINT("info", ("number of distinct values: %u",
1122 : (uint) ((count->tree_pos - count->tree_buff) /
1123 : count->field_length)));
1124 60 : if (verbose >= 2)
1125 0 : VOID(printf("number of distinct values: %u\n",
1126 : (uint) ((count->tree_pos - count->tree_buff) /
1127 : count->field_length)));
1128 : }
1129 60 : total_count= 0;
1130 15420 : for (idx= 0; idx < 256; idx++)
1131 : {
1132 15360 : if (count->counts[idx])
1133 : {
1134 600 : total_count+= count->counts[idx];
1135 600 : DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1136 : llstr((longlong) count->counts[idx], llbuf)));
1137 600 : if (verbose >= 2)
1138 0 : VOID(printf("counts[0x%02x]: %12s\n", idx,
1139 : llstr((longlong) count->counts[idx], llbuf)));
1140 : }
1141 : }
1142 60 : DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1143 : llbuf)));
1144 60 : if ((verbose >= 2) && total_count)
1145 : {
1146 0 : VOID(printf("total: %12s\n",
1147 : llstr((longlong) total_count, llbuf)));
1148 : }
1149 : }
1150 :
1151 30 : mrg->records=record_count;
1152 30 : mrg->max_blob_length=max_blob_length;
1153 : my_afree(record);
1154 30 : DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1155 : }
1156 :
1157 : static int compare_huff_elements(void *not_used __attribute__((unused)),
1158 : uchar *a, uchar *b)
1159 23005 : {
1160 23005 : return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1161 : (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1162 : }
1163 :
1164 : /* For each tree, check whether to use pre-space-compress, end-space-
1165 : compress, empty-field-compress or zero-field-compress */
1166 :
1167 : static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1168 : my_off_t records)
1169 30 : {
1170 : uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1171 : my_off_t old_length,new_length,length;
1172 30 : DBUG_ENTER("check_counts");
1173 :
1174 30 : bzero((uchar*) field_count,sizeof(field_count));
1175 30 : space_fields=fill_zero_fields=0;
1176 :
1177 90 : for (; trees-- ; huff_counts++)
1178 : {
1179 60 : if (huff_counts->field_type == FIELD_BLOB)
1180 : {
1181 0 : huff_counts->length_bits=max_bit(huff_counts->max_length);
1182 0 : goto found_pack;
1183 : }
1184 60 : else if (huff_counts->field_type == FIELD_VARCHAR)
1185 : {
1186 0 : huff_counts->length_bits=max_bit(huff_counts->max_length);
1187 0 : goto found_pack;
1188 : }
1189 60 : else if (huff_counts->field_type == FIELD_CHECK)
1190 : {
1191 0 : huff_counts->bytes_packed=0;
1192 0 : huff_counts->counts[0]=0;
1193 0 : goto found_pack;
1194 : }
1195 :
1196 60 : huff_counts->field_type=FIELD_NORMAL;
1197 60 : huff_counts->pack_type=0;
1198 :
1199 : /* Check for zero-filled records (in this column), or zero records. */
1200 60 : if (huff_counts->zero_fields || ! records)
1201 : {
1202 : my_off_t old_space_count;
1203 : /*
1204 : If there are only zero filled records (in this column),
1205 : or no records at all, we are done.
1206 : */
1207 0 : if (huff_counts->zero_fields == records)
1208 : {
1209 0 : huff_counts->field_type= FIELD_ZERO;
1210 0 : huff_counts->bytes_packed=0;
1211 0 : huff_counts->counts[0]=0;
1212 0 : goto found_pack;
1213 : }
1214 : /* Remember the number of significant spaces. */
1215 0 : old_space_count=huff_counts->counts[' '];
1216 : /* Add all leading and trailing spaces. */
1217 0 : huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1218 : huff_counts->tot_pre_space +
1219 : huff_counts->empty_fields *
1220 : huff_counts->field_length);
1221 : /* Check, what the compressed length of this would be. */
1222 0 : old_length=calc_packed_length(huff_counts,0)+records/8;
1223 : /* Get the number of zero bytes. */
1224 0 : length=huff_counts->zero_fields*huff_counts->field_length;
1225 : /* Add it to the counts. */
1226 0 : huff_counts->counts[0]+=length;
1227 : /* Check, what the compressed length of this would be. */
1228 0 : new_length=calc_packed_length(huff_counts,0);
1229 : /* If the compression without the zeroes would be shorter, we are done. */
1230 0 : if (old_length < new_length && huff_counts->field_length > 1)
1231 : {
1232 0 : huff_counts->field_type=FIELD_SKIP_ZERO;
1233 0 : huff_counts->counts[0]-=length;
1234 0 : huff_counts->bytes_packed=old_length- records/8;
1235 0 : goto found_pack;
1236 : }
1237 : /* Remove the insignificant spaces, but keep the zeroes. */
1238 0 : huff_counts->counts[' ']=old_space_count;
1239 : }
1240 : /* Check, what the compressed length of this column would be. */
1241 60 : huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1242 :
1243 : /*
1244 : If there are enough empty records (in this column),
1245 : treating them specially may pay off.
1246 : */
1247 60 : if (huff_counts->empty_fields)
1248 : {
1249 0 : if (huff_counts->field_length > 2 &&
1250 : huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1251 : (1+max_bit(max(huff_counts->max_pre_space,
1252 : huff_counts->max_end_space))) <
1253 : records * max_bit(huff_counts->field_length))
1254 : {
1255 0 : huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1256 : }
1257 : else
1258 : {
1259 0 : length=huff_counts->empty_fields*huff_counts->field_length;
1260 0 : if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1261 : {
1262 0 : huff_counts->tot_end_space+=length;
1263 0 : huff_counts->max_end_space=huff_counts->field_length;
1264 0 : if (huff_counts->field_length < 8)
1265 0 : huff_counts->end_space[huff_counts->field_length]+=
1266 : huff_counts->empty_fields;
1267 : }
1268 0 : if (huff_counts->tot_pre_space)
1269 : {
1270 0 : huff_counts->tot_pre_space+=length;
1271 0 : huff_counts->max_pre_space=huff_counts->field_length;
1272 0 : if (huff_counts->field_length < 8)
1273 0 : huff_counts->pre_space[huff_counts->field_length]+=
1274 : huff_counts->empty_fields;
1275 : }
1276 : }
1277 : }
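/*
  Worked example of the empty-fields test above (hypothetical numbers,
  and assuming max_bit() returns the number of bits needed to represent
  its argument): with records = 1000, empty_fields = 600, field_length = 20
  and a longest leading/trailing space of 7, the left side is
  600 + 400 * (1 + max_bit(7)) = 600 + 400 * 4 = 2200 bits, while the
  right side is 1000 * max_bit(20) = 1000 * 5 = 5000 bits, so
  PACK_TYPE_SPACE_FIELDS would be chosen.
*/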
1278 :
1279 : /*
1280 : If there are enough trailing spaces (in this column),
1281 : treating them specially may pay off.
1282 : */
1283 60 : if (huff_counts->tot_end_space)
1284 : {
1285 0 : huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1286 0 : if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1287 : huff_counts->end_space,
1288 : huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1289 0 : goto found_pack;
1290 0 : huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1291 : }
1292 :
1293 : /*
1294 : If there are enough leading spaces (in this column),
1295 : treating them specially may pay off.
1296 : */
1297 60 : if (huff_counts->tot_pre_space)
1298 : {
1299 30 : if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1300 : huff_counts->pre_space,
1301 : huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1302 : goto found_pack;
1303 : }
1304 :
1305 60 : found_pack: /* Found field-packing */
1306 :
1307 : /* Test if we can use zero-fill */
1308 :
1309 60 : if (huff_counts->max_zero_fill &&
1310 : (huff_counts->field_type == FIELD_NORMAL ||
1311 : huff_counts->field_type == FIELD_SKIP_ZERO))
1312 : {
1313 0 : huff_counts->counts[0]-=huff_counts->max_zero_fill*
1314 : (huff_counts->field_type == FIELD_SKIP_ZERO ?
1315 : records - huff_counts->zero_fields : records);
1316 0 : huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1317 0 : huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1318 : }
1319 :
1320 : /* Test if intervall-field is better */
1321 :
1322 60 : if (huff_counts->tree_buff)
1323 : {
1324 : HUFF_TREE tree;
1325 :
1326 60 : DBUG_EXECUTE_IF("forceintervall",
1327 : huff_counts->bytes_packed= ~ (my_off_t) 0;);
1328 60 : tree.element_buffer=0;
1329 60 : if (!make_huff_tree(&tree,huff_counts) &&
1330 : tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1331 : {
1332 0 : if (tree.elements == 1)
1333 0 : huff_counts->field_type=FIELD_CONSTANT;
1334 : else
1335 0 : huff_counts->field_type=FIELD_INTERVALL;
1336 0 : huff_counts->pack_type=0;
1337 : }
1338 : else
1339 : {
1340 60 : my_free(huff_counts->tree_buff,MYF(0));
1341 60 : delete_tree(&huff_counts->int_tree);
1342 60 : huff_counts->tree_buff=0;
1343 : }
1344 60 : if (tree.element_buffer)
1345 60 : my_free(tree.element_buffer,MYF(0));
1346 : }
1347 60 : if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1348 0 : space_fields++;
1349 60 : if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1350 0 : fill_zero_fields++;
1351 60 : field_count[huff_counts->field_type]++;
1352 : }
1353 30 : DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1354 : "empty-zero: %3d empty-fill: %3d",
1355 : field_count[FIELD_NORMAL],space_fields,
1356 : field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1357 30 : DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1358 : "intervall-fields: %3d zero: %3d",
1359 : field_count[FIELD_SKIP_PRESPACE],
1360 : field_count[FIELD_SKIP_ENDSPACE],
1361 : field_count[FIELD_INTERVALL],
1362 : field_count[FIELD_ZERO]));
1363 30 : if (verbose)
1364 0 : VOID(printf("\nnormal: %3d empty-space: %3d "
1365 : "empty-zero: %3d empty-fill: %3d\n"
1366 : "pre-space: %3d end-space: %3d "
1367 : "intervall-fields: %3d zero: %3d\n",
1368 : field_count[FIELD_NORMAL],space_fields,
1369 : field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1370 : field_count[FIELD_SKIP_PRESPACE],
1371 : field_count[FIELD_SKIP_ENDSPACE],
1372 : field_count[FIELD_INTERVALL],
1373 : field_count[FIELD_ZERO]));
1374 30 : DBUG_VOID_RETURN;
1375 : }
1376 :
1377 :
1378 : /* Test if we can use space-compression and empty-field-compression */
1379 :
1380 : static int
1381 : test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1382 : uint max_space_length, my_off_t *space_counts,
1383 : my_off_t tot_space_count, enum en_fieldtype field_type)
1384 30 : {
1385 : int min_pos;
1386 : uint length_bits,i;
1387 : my_off_t space_count,min_space_count,min_pack,new_length,skip;
1388 :
1389 30 : length_bits=max_bit(max_space_length);
1390 :
1391 : /* Default no end_space-packing */
1392 30 : space_count=huff_counts->counts[(uint) ' '];
1393 30 : min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1394 30 : min_pack=calc_packed_length(huff_counts,0);
1395 30 : min_pos= -2;
1396 30 : huff_counts->counts[(uint) ' ']=space_count;
1397 :
1398 : /* Test with always storing the space count */
1399 30 : new_length=huff_counts->bytes_packed+length_bits*records/8;
1400 30 : if (new_length+1 < min_pack)
1401 : {
1402 30 : min_pos= -1;
1403 30 : min_pack=new_length;
1404 30 : min_space_count=space_count;
1405 : }
1406 : /* Test with length-flag */
1407 270 : for (skip=0L, i=0 ; i < 8 ; i++)
1408 : {
1409 240 : if (space_counts[i])
1410 : {
1411 55 : if (i)
1412 55 : huff_counts->counts[(uint) ' ']+=space_counts[i];
1413 55 : skip+=huff_counts->pre_space[i];
1414 55 : new_length=calc_packed_length(huff_counts,0)+
1415 : (records+(records-skip)*(1+length_bits))/8;
1416 55 : if (new_length < min_pack)
1417 : {
1418 0 : min_pos=(int) i;
1419 0 : min_pack=new_length;
1420 0 : min_space_count=huff_counts->counts[(uint) ' '];
1421 : }
1422 : }
1423 : }
1424 :
1425 30 : huff_counts->counts[(uint) ' ']=min_space_count;
1426 30 : huff_counts->bytes_packed=min_pack;
1427 30 : switch (min_pos) {
1428 : case -2:
1429 0 : return(0); /* No space-compress */
1430 : case -1: /* Always space-count */
1431 30 : huff_counts->field_type=field_type;
1432 30 : huff_counts->min_space=0;
1433 30 : huff_counts->length_bits=max_bit(max_space_length);
1434 30 : break;
1435 : default:
1436 0 : huff_counts->field_type=field_type;
1437 0 : huff_counts->min_space=(uint) min_pos;
1438 0 : huff_counts->pack_type|=PACK_TYPE_SELECTED;
1439 0 : huff_counts->length_bits=max_bit(max_space_length);
1440 : break;
1441 : }
1442 30 : return(1); /* Using space-compress */
1443 : }
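/*
  Small worked example of the trade-off tested above (hypothetical
  numbers, assuming max_bit(5) = 3): with records = 1000 and
  max_space_length = 5, the "always space-count" alternative adds
  length_bits * records / 8 = 3 * 1000 / 8 = 375 bytes on top of
  huff_counts->bytes_packed.  It is chosen when that total is more than
  one byte smaller than the default cost min_pack, where all spaces are
  Huffman-coded as literal ' ' characters.
*/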
1444 :
1445 :
1446 : /* Make a huff_tree for each huff_count */
1447 :
1448 : static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1449 30 : {
1450 : uint tree;
1451 : HUFF_TREE *huff_tree;
1452 30 : DBUG_ENTER("make_huff_trees");
1453 :
1454 30 : if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1455 : MYF(MY_WME | MY_ZEROFILL))))
1456 0 : DBUG_RETURN(0);
1457 :
1458 90 : for (tree=0 ; tree < trees ; tree++)
1459 : {
1460 60 : if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1461 : {
1462 0 : while (tree--)
1463 0 : my_free(huff_tree[tree].element_buffer,MYF(0));
1464 0 : my_free(huff_tree,MYF(0));
1465 0 : DBUG_RETURN(0);
1466 : }
1467 : }
1468 30 : DBUG_RETURN(huff_tree);
1469 : }
1470 :
1471 : /*
1472 : Build a Huffman tree.
1473 :
1474 : SYNOPSIS
1475 : make_huff_tree()
1476 : huff_tree The Huffman tree.
1477 : huff_counts The counts.
1478 :
1479 : DESCRIPTION
1480 : Build a Huffman tree according to huff_counts->counts or
1481 : huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1482 : tree_buff_length of distinct column values. In that case, whole
1483 : values can be Huffman encoded instead of single bytes.
1484 :
1485 : RETURN
1486 : 0 OK
1487 : != 0 Error
1488 : */
1489 :
1490 : static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1491 150 : {
1492 : uint i,found,bits_packed,first,last;
1493 : my_off_t bytes_packed;
1494 : HUFF_ELEMENT *a,*b,*new_huff_el;
1495 :
1496 150 : first=last=0;
1497 150 : if (huff_counts->tree_buff)
1498 : {
1499 : /* Calculate the number of distinct values in tree_buff. */
1500 60 : found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1501 : huff_counts->field_length;
1502 60 : first=0; last=found-1;
1503 : }
1504 : else
1505 : {
1506 : /* Count the number of uchar codes found in the column. */
1507 23130 : for (i=found=0 ; i < 256 ; i++)
1508 : {
1509 23040 : if (huff_counts->counts[i])
1510 : {
1511 1000 : if (! found++)
1512 90 : first=i;
1513 1000 : last=i;
1514 : }
1515 : }
1516 90 : if (found < 2)
1517 0 : found=2;
1518 : }
1519 :
1520 : /* When using 'tree_buff' we can have more than 256 values. */
1521 150 : if (queue.max_elements < found)
1522 : {
1523 0 : delete_queue(&queue);
1524 0 : if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1525 0 : return -1;
1526 : }
1527 :
1528 : /* Allocate or reallocate an element buffer for the Huffman tree. */
1529 150 : if (!huff_tree->element_buffer)
1530 : {
1531 120 : if (!(huff_tree->element_buffer=
1532 : (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1533 0 : return 1;
1534 : }
1535 : else
1536 : {
1537 : HUFF_ELEMENT *temp;
1538 30 : if (!(temp=
1539 : (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1540 : found*2*sizeof(HUFF_ELEMENT),
1541 : MYF(MY_WME))))
1542 0 : return 1;
1543 30 : huff_tree->element_buffer=temp;
1544 : }
1545 :
1546 150 : huff_counts->tree=huff_tree;
1547 150 : huff_tree->counts=huff_counts;
1548 150 : huff_tree->min_chr=first;
1549 150 : huff_tree->max_chr=last;
1550 150 : huff_tree->char_bits=max_bit(last-first);
1551 150 : huff_tree->offset_bits=max_bit(found-1)+1;
1552 :
1553 150 : if (huff_counts->tree_buff)
1554 : {
1555 60 : huff_tree->elements=0;
1556 60 : huff_tree->tree_pack_length=(1+15+16+5+5+
1557 : (huff_tree->char_bits+1)*found+
1558 : (huff_tree->offset_bits+1)*
1559 : (found-2)+7)/8 +
1560 : (uint) (huff_tree->counts->tree_pos-
1561 : huff_tree->counts->tree_buff);
1562 : /*
1563 : Put a HUFF_ELEMENT into the queue for every distinct column value.
1564 :
1565 : tree_walk() calls save_counts_in_queue() for every element in
1566 : 'int_tree'. This takes elements from the target tree's element
1567 : buffer and places references to them into the buffer of the
1568 : priority queue. We insert in column value order, but the order is
1569 : in fact irrelevant here. We will establish the correct order
1570 : later.
1571 : */
1572 60 : tree_walk(&huff_counts->int_tree,
1573 : (int (*)(void*, element_count,void*)) save_counts_in_queue,
1574 : (uchar*) huff_tree, left_root_right);
1575 : }
1576 : else
1577 : {
1578 90 : huff_tree->elements=found;
1579 90 : huff_tree->tree_pack_length=(9+9+5+5+
1580 : (huff_tree->char_bits+1)*found+
1581 : (huff_tree->offset_bits+1)*
1582 : (found-2)+7)/8;
1583 : /*
1584 : Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
1585 :
1586 : The elements are taken from the target tree's element buffer.
1587 : Instead of using queue_insert(), we just place references to the
1588 : elements into the buffer of the priority queue. We insert in byte
1589 : value order, but the order is in fact irrelevant here. We will
1590 : establish the correct order later.
1591 : */
1592 5635 : for (i=first, found=0 ; i <= last ; i++)
1593 : {
1594 5545 : if (huff_counts->counts[i])
1595 : {
1596 1000 : new_huff_el=huff_tree->element_buffer+(found++);
1597 1000 : new_huff_el->count=huff_counts->counts[i];
1598 1000 : new_huff_el->a.leaf.null=0;
1599 1000 : new_huff_el->a.leaf.element_nr=i;
1600 1000 : queue.root[found]=(uchar*) new_huff_el;
1601 : }
1602 : }
1603 : /*
1604 : If there is only a single uchar value in this field in all records,
1605 : add a second element with zero incidence. This is required to enter
1606 : the loop, which builds the Huffman tree.
1607 : */
1608 90 : while (found < 2)
1609 : {
1610 0 : new_huff_el=huff_tree->element_buffer+(found++);
1611 0 : new_huff_el->count=0;
1612 0 : new_huff_el->a.leaf.null=0;
1613 0 : if (last)
1614 0 : new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1615 : else
1616 0 : new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1617 0 : queue.root[found]=(uchar*) new_huff_el;
1618 : }
1619 : }
1620 :
1621 : /* Make a queue from the queue buffer. */
1622 150 : queue.elements=found;
1623 :
1624 : /*
1625 : Make a priority queue from the queue. Construct its index so that we
1626 : have a partially ordered tree.
1627 : */
1628 1010 : for (i=found/2 ; i > 0 ; i--)
1629 860 : _downheap(&queue,i);
1630 :
1631 : /* The Huffman algorithm. */
1632 150 : bytes_packed=0; bits_packed=0;
1633 1855 : for (i=1 ; i < found ; i++)
1634 : {
1635 : /*
1636 : Pop the top element from the queue (the one with the least incidence).
1637 : Popping from a priority queue includes a re-ordering of the queue,
1638 : to get the next least incidence element to the top.
1639 : */
1640 1705 : a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1641 : /*
1642 : Copy the next least incidence element. The queue implementation
1643 : reserves root[0] for temporary purposes. root[1] is the top.
1644 : */
1645 1705 : b=(HUFF_ELEMENT*) queue.root[1];
1646 : /* Get a new element from the element buffer. */
1647 1705 : new_huff_el=huff_tree->element_buffer+found+i;
1648 : /* The new element gets the sum of the two least incidence elements. */
1649 1705 : new_huff_el->count=a->count+b->count;
1650 : /*
1651 : The Huffman algorithm assigns another bit to the code for a byte
1652 :       every time that byte's incidence is combined (directly or indirectly)
1653 :       into a new element as one of the two least incidence elements.
1654 : This means that one more bit per incidence of that uchar is required
1655 : in the resulting file. So we add the new combined incidence as the
1656 : number of bits by which the result grows.
1657 : */
1658 1705 : bits_packed+=(uint) (new_huff_el->count & 7);
1659 1705 : bytes_packed+=new_huff_el->count/8;
1660 : /* The new element points to its children, lesser in left. */
1661 1705 : new_huff_el->a.nod.left=a;
1662 1705 : new_huff_el->a.nod.right=b;
1663 : /*
1664 : Replace the copied top element by the new element and re-order the
1665 : queue.
1666 : */
1667 1705 : queue.root[1]=(uchar*) new_huff_el;
1668 1705 : queue_replaced(&queue);
1669 : }
1670 150 : huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1671 150 : huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1672 150 : return 0;
1673 : }
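
/*
  Worked example for the accounting above (illustrative only, not part of
  the original source): with three byte values whose counts are A=4, B=1
  and C=1, the loop first merges B and C into a node of count 2 (adding 2
  to the running bit total), then merges that node with A into a node of
  count 6 (adding 6 more).  The total of 8 bits equals the true packed
  size 4*1 + 1*2 + 1*2, i.e. incidence times code length, which is why
  summing each combined count into bits_packed/bytes_packed gives the
  packed data size without walking the finished tree.
*/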
1674 :
1675 : static int compare_tree(void* cmp_arg __attribute__((unused)),
1676 : register const uchar *s, register const uchar *t)
1677 3285 : {
1678 : uint length;
1679 31010 : for (length=global_count->field_length; length-- ;)
1680 27530 : if (*s++ != *t++)
1681 3090 : return (int) s[-1] - (int) t[-1];
1682 195 : return 0;
1683 : }
1684 :
1685 : /*
1686 : Organize distinct column values and their incidences into a priority queue.
1687 :
1688 : SYNOPSIS
1689 : save_counts_in_queue()
1690 : key The column value.
1691 : count The incidence of this value.
1692 : tree The Huffman tree to be built later.
1693 :
1694 : DESCRIPTION
1695 : We use the element buffer of the targeted tree. The distinct column
1696 : values are organized in a priority queue first. The Huffman
1697 : algorithm will later organize the elements into a Huffman tree. For
1698 : the time being, we just place references to the elements into the
1699 : queue buffer. The buffer will later be organized into a priority
1700 : queue.
1701 :
1702 : RETURN
1703 : 0
1704 : */
1705 :
1706 : static int save_counts_in_queue(uchar *key, element_count count,
1707 : HUFF_TREE *tree)
1708 855 : {
1709 : HUFF_ELEMENT *new_huff_el;
1710 :
1711 855 : new_huff_el=tree->element_buffer+(tree->elements++);
1712 855 : new_huff_el->count=count;
1713 855 : new_huff_el->a.leaf.null=0;
1714 855 : new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1715 : tree->counts->field_length;
1716 855 : queue.root[tree->elements]=(uchar*) new_huff_el;
1717 855 : return 0;
1718 : }
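
/*
  Illustrative note (not part of the original source): element_nr is the
  index of the value inside tree_buff.  E.g. with field_length == 4, a
  'key' pointing at the third stored value (tree_buff + 8) yields
  element_nr == 2.
*/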
1719 :
1720 :
1721 : /*
1722 : Calculate length of file if given counts should be used.
1723 :
1724 : SYNOPSIS
1725 : calc_packed_length()
1726 : huff_counts The counts for a column of the table(s).
1727 : add_tree_lenght If the decode tree length should be added.
1728 :
1729 : DESCRIPTION
1730 :     We need to follow the Huffman algorithm until we know how many bits
1731 : are required for each uchar code. But we do not need the resulting
1732 : Huffman tree. Hence, we can leave out some steps which are essential
1733 : in make_huff_tree().
1734 :
1735 : RETURN
1736 : Number of bytes required to compress this table column.
1737 : */
1738 :
1739 : static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1740 : uint add_tree_lenght)
1741 175 : {
1742 : uint i,found,bits_packed,first,last;
1743 : my_off_t bytes_packed;
1744 : HUFF_ELEMENT element_buffer[256];
1745 175 : DBUG_ENTER("calc_packed_length");
1746 :
1747 : /*
1748 : WARNING: We use a small hack for efficiency: Instead of placing
1749 : references to HUFF_ELEMENTs into the queue, we just insert
1750 : references to the counts of the uchar codes which appeared in this
1751 : table column. During the Huffman algorithm they are successively
1752 : replaced by references to HUFF_ELEMENTs. This works, because
1753 : HUFF_ELEMENTs have the incidence count at their beginning.
1754 :     Regardless of whether the queue array contains references to counts of
1755 : type my_off_t or references to HUFF_ELEMENTs which have the count of
1756 : type my_off_t at their beginning, it always points to a count of the
1757 : same type.
1758 :
1759 : Instead of using queue_insert(), we just copy the references into
1760 : the buffer of the priority queue. We insert in uchar value order, but
1761 : the order is in fact irrelevant here. We will establish the correct
1762 : order later.
1763 : */
1764 175 : first=last=0;
1765 44975 : for (i=found=0 ; i < 256 ; i++)
1766 : {
1767 44800 : if (huff_counts->counts[i])
1768 : {
1769 1670 : if (! found++)
1770 175 : first=i;
1771 1670 : last=i;
1772 : /* We start with root[1], which is the queues top element. */
1773 1670 : queue.root[found]=(uchar*) &huff_counts->counts[i];
1774 : }
1775 : }
1776 175 : if (!found)
1777 0 : DBUG_RETURN(0); /* Empty tree */
1778 : /*
1779 : If there is only a single uchar value in this field in all records,
1780 : add a second element with zero incidence. This is required to enter
1781 : the loop, which follows the Huffman algorithm.
1782 : */
1783 175 : if (found < 2)
1784 0 : queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1785 :
1786 : /* Make a queue from the queue buffer. */
1787 175 : queue.elements=found;
1788 :
1789 175 : bytes_packed=0; bits_packed=0;
1790 : /* Add the length of the coding table, which would become part of the file. */
1791 175 : if (add_tree_lenght)
1792 30 : bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1793 : (max_bit(found-1)+1+1)*(found-2) +7)/8;
1794 :
1795 : /*
1796 : Make a priority queue from the queue. Construct its index so that we
1797 : have a partially ordered tree.
1798 : */
1799 1055 : for (i=(found+1)/2 ; i > 0 ; i--)
1800 880 : _downheap(&queue,i);
1801 :
1802 : /* The Huffman algorithm. */
1803 1670 : for (i=0 ; i < found-1 ; i++)
1804 : {
1805 : my_off_t *a;
1806 : my_off_t *b;
1807 : HUFF_ELEMENT *new_huff_el;
1808 :
1809 : /*
1810 : Pop the top element from the queue (the one with the least
1811 : incidence). Popping from a priority queue includes a re-ordering
1812 : of the queue, to get the next least incidence element to the top.
1813 : */
1814 1495 : a= (my_off_t*) queue_remove(&queue, 0);
1815 : /*
1816 : Copy the next least incidence element. The queue implementation
1817 : reserves root[0] for temporary purposes. root[1] is the top.
1818 : */
1819 1495 : b= (my_off_t*) queue.root[1];
1820 : /* Create a new element in a local (automatic) buffer. */
1821 1495 : new_huff_el= element_buffer + i;
1822 : /* The new element gets the sum of the two least incidence elements. */
1823 1495 : new_huff_el->count= *a + *b;
1824 : /*
1825 : The Huffman algorithm assigns another bit to the code for a byte
1826 :       every time that byte's incidence is combined (directly or indirectly)
1827 :       into a new element as one of the two least incidence elements.
1828 : This means that one more bit per incidence of that uchar is required
1829 : in the resulting file. So we add the new combined incidence as the
1830 : number of bits by which the result grows.
1831 : */
1832 1495 : bits_packed+=(uint) (new_huff_el->count & 7);
1833 1495 : bytes_packed+=new_huff_el->count/8;
1834 : /*
1835 : Replace the copied top element by the new element and re-order the
1836 : queue. This successively replaces the references to counts by
1837 : references to HUFF_ELEMENTs.
1838 : */
1839 1495 : queue.root[1]=(uchar*) new_huff_el;
1840 1495 : queue_replaced(&queue);
1841 : }
1842 175 : DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1843 : }
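
/*
  Minimal self-contained sketch of the same computation (illustrative
  only, not part of the original source; wrapped in #if 0 so it is never
  compiled).  It returns the number of code bits needed for one column,
  using a naive O(n*n) search for the two smallest counts instead of the
  mysys priority queue, and it ignores the optional coding-table length.
*/
#if 0
static unsigned long long sketch_packed_bits(const my_off_t counts[256])
{
  unsigned long long work[256];
  unsigned long long bits= 0;
  unsigned int n= 0, i;

  for (i= 0; i < 256; i++)
    if (counts[i])
      work[n++]= counts[i];                     /* Only bytes that occur */
  while (n > 1)
  {
    unsigned int a= 0, b= 1, j;
    if (work[b] < work[a]) { a= 1; b= 0; }
    for (j= 2; j < n; j++)                      /* Two smallest counts */
    {
      if (work[j] < work[a]) { b= a; a= j; }
      else if (work[j] < work[b]) b= j;
    }
    work[a]+= work[b];                          /* Merge the two elements */
    bits+= work[a];                             /* One more bit per incidence */
    work[b]= work[--n];                         /* Drop the merged-away slot */
  }
  return bits;
}
#endif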
1844 :
1845 :
1846 : /* Remove trees that don't give any compression */
1847 :
1848 : static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1849 30 : {
1850 : uint k,tree_number;
1851 : HUFF_COUNTS count,*i,*j,*last_count;
1852 :
1853 30 : last_count=huff_counts+trees;
1854 90 : for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1855 : {
1856 60 : if (!i->tree->tree_number)
1857 : {
1858 30 : i->tree->tree_number= ++tree_number;
1859 30 : if (i->tree_buff)
1860 30 : continue; /* Don't join intervall */
1861 60 : for (j=i+1 ; j < last_count ; j++)
1862 : {
1863 30 : if (! j->tree->tree_number && ! j->tree_buff)
1864 : {
1865 7710 : for (k=0 ; k < 256 ; k++)
1866 7680 : count.counts[k]=i->counts[k]+j->counts[k];
1867 30 : if (calc_packed_length(&count,1) <=
1868 : i->tree->bytes_packed + j->tree->bytes_packed+
1869 : i->tree->tree_pack_length+j->tree->tree_pack_length+
1870 : ALLOWED_JOIN_DIFF)
1871 : {
1872 30 : memcpy_fixed((uchar*) i->counts,(uchar*) count.counts,
1873 : sizeof(count.counts[0])*256);
1874 30 : my_free((uchar*) j->tree->element_buffer,MYF(0));
1875 30 : j->tree->element_buffer=0;
1876 30 : j->tree=i->tree;
1877 30 : bmove((uchar*) i->counts,(uchar*) count.counts,
1878 : sizeof(count.counts[0])*256);
1879 30 : if (make_huff_tree(i->tree,i))
1880 0 : return (uint) -1;
1881 : }
1882 : }
1883 : }
1884 : }
1885 : }
1886 30 : DBUG_PRINT("info", ("Original trees: %d After join: %d",
1887 : trees, tree_number));
1888 30 : if (verbose)
1889 0 : VOID(printf("Original trees: %d After join: %d\n", trees, tree_number));
1890 30 : return tree_number; /* Return trees left */
1891 : }
1892 :
1893 :
1894 : /*
1895 : Fill in huff_tree encode tables.
1896 :
1897 : SYNOPSIS
1898 : make_huff_decode_table()
1899 : huff_tree An array of HUFF_TREE which are to be encoded.
1900 : trees The number of HUFF_TREE in the array.
1901 :
1902 : RETURN
1903 : 0 success
1904 : != 0 error
1905 : */
1906 :
1907 : static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1908 30 : {
1909 : uint elements;
1910 90 : for ( ; trees-- ; huff_tree++)
1911 : {
1912 60 : if (huff_tree->tree_number > 0)
1913 : {
1914 30 : elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1915 30 : if (!(huff_tree->code =
1916 : (ulonglong*) my_malloc(elements*
1917 : (sizeof(ulonglong) + sizeof(uchar)),
1918 : MYF(MY_WME | MY_ZEROFILL))))
1919 0 : return 1;
1920 30 : huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1921 30 : make_traverse_code_tree(huff_tree, huff_tree->root,
1922 : 8 * sizeof(ulonglong), LL(0));
1923 : }
1924 : }
1925 30 : return 0;
1926 : }
1927 :
1928 :
1929 : static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1930 : HUFF_ELEMENT *element,
1931 : uint size, ulonglong code)
1932 770 : {
1933 : uint chr;
1934 770 : if (!element->a.leaf.null)
1935 : {
1936 400 : chr=element->a.leaf.element_nr;
1937 400 : huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1938 400 : huff_tree->code[chr]= (code >> size);
1939 400 : if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1940 95 : huff_tree->height= 8 * sizeof(ulonglong) - size;
1941 : }
1942 : else
1943 : {
1944 370 : size--;
1945 370 : make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1946 370 : make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1947 : code + (((ulonglong) 1) << size));
1948 : }
1949 : return;
1950 : }
1951 :
1952 :
1953 : /*
1954 : Convert a value into binary digits.
1955 :
1956 : SYNOPSIS
1957 : bindigits()
1958 : value The value.
1959 : length The number of low order bits to convert.
1960 :
1961 : NOTE
1962 : The result string is in static storage. It is reused on every call.
1963 : So you cannot use it twice in one expression.
1964 :
1965 : RETURN
1966 : A pointer to a static NUL-terminated string.
1967 : */
1968 :
1969 : static char *bindigits(ulonglong value, uint bits)
1970 13825 : {
1971 : static char digits[72];
1972 13825 : char *ptr= digits;
1973 13825 : uint idx= bits;
1974 :
1975 13825 : DBUG_ASSERT(idx < sizeof(digits));
1976 48275 : while (idx)
1977 34450 : *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1978 13825 : *ptr= '\0';
1979 13825 : return digits;
1980 : }
1981 :
1982 :
1983 : /*
1984 : Convert a value into hexadecimal digits.
1985 :
1986 : SYNOPSIS
1987 : hexdigits()
1988 : value The value.
1989 :
1990 : NOTE
1991 : The result string is in static storage. It is reused on every call.
1992 : So you cannot use it twice in one expression.
1993 :
1994 : RETURN
1995 : A pointer to a static NUL-terminated string.
1996 : */
1997 :
1998 : static char *hexdigits(ulonglong value)
1999 13825 : {
2000 : static char digits[20];
2001 13825 : char *ptr= digits;
2002 13825 : uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2003 :
2004 13825 : DBUG_ASSERT(idx < sizeof(digits));
2005 235025 : while (idx)
2006 : {
2007 221200 : if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2008 3200 : *(ptr - 1)+= 'a' - '9' - 1;
2009 : }
2010 13825 : *ptr= '\0';
2011 13825 : return digits;
2012 : }
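
/*
  Usage sketch (illustrative only, not part of the original source): each
  helper has its own static buffer, so mixing them in one expression is
  fine, e.g.
    printf("0x%s = %s\n", hexdigits(code), bindigits(code, len));
  but calling the same helper twice in one argument list, e.g.
    printf("%s %s\n", hexdigits(a), hexdigits(b));
  would print the same buffer twice, as the NOTE sections above warn.
*/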
2013 :
2014 :
2015 : /* Write header to new packed data file */
2016 :
2017 : static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2018 : my_off_t tot_elements,my_off_t filelength)
2019 30 : {
2020 30 : uchar *buff= (uchar*) file_buffer.pos;
2021 :
2022 30 : bzero(buff,HEAD_LENGTH);
2023 30 : memcpy_fixed(buff,maria_pack_file_magic,4);
2024 30 : int4store(buff+4,head_length);
2025 30 : int4store(buff+8, mrg->min_pack_length);
2026 30 : int4store(buff+12,mrg->max_pack_length);
2027 30 : int4store(buff+16,tot_elements);
2028 30 : int4store(buff+20,intervall_length);
2029 30 : int2store(buff+24,trees);
2030 30 : buff[26]=(char) mrg->ref_length;
2031 : /* Save record pointer length */
2032 30 : buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
2033 30 : if (test_only)
2034 0 : return 0;
2035 30 : VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
2036 30 : return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2037 : MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2038 : }
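
/*
  Resulting header layout (illustrative summary of the stores above,
  HEAD_LENGTH bytes in total):
     0.. 3  maria_pack_file_magic
     4.. 7  head_length
     8..11  min_pack_length
    12..15  max_pack_length
    16..19  tot_elements
    20..23  intervall_length
    24..25  number of trees
    26      ref_length
    27      record pointer length
    28..31  unused (zero)
*/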
2039 :
2040 : /* Write fieldinfo to new packed file */
2041 :
2042 : static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2043 30 : {
2044 : reg1 uint i;
2045 : uint huff_tree_bits;
2046 30 : huff_tree_bits=max_bit(trees ? trees-1 : 0);
2047 :
2048 30 : DBUG_PRINT("info", (" "));
2049 30 : DBUG_PRINT("info", ("column types:"));
2050 30 : DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2051 30 : DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2052 30 : DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2053 30 : DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2054 30 : DBUG_PRINT("info", ("FIELD_BLOB 4"));
2055 30 : DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2056 30 : DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2057 30 : DBUG_PRINT("info", ("FIELD_ZERO 7"));
2058 30 : DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2059 30 : DBUG_PRINT("info", ("FIELD_CHECK 9"));
2060 30 : DBUG_PRINT("info", (" "));
2061 30 : DBUG_PRINT("info", ("pack type as a set of flags:"));
2062 30 : DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2063 30 : DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2064 30 : DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2065 30 : DBUG_PRINT("info", (" "));
2066 30 : if (verbose >= 2)
2067 : {
2068 0 : VOID(printf("\n"));
2069 0 : VOID(printf("column types:\n"));
2070 0 : VOID(printf("FIELD_NORMAL 0\n"));
2071 0 : VOID(printf("FIELD_SKIP_ENDSPACE 1\n"));
2072 0 : VOID(printf("FIELD_SKIP_PRESPACE 2\n"));
2073 0 : VOID(printf("FIELD_SKIP_ZERO 3\n"));
2074 0 : VOID(printf("FIELD_BLOB 4\n"));
2075 0 : VOID(printf("FIELD_CONSTANT 5\n"));
2076 0 : VOID(printf("FIELD_INTERVALL 6\n"));
2077 0 : VOID(printf("FIELD_ZERO 7\n"));
2078 0 : VOID(printf("FIELD_VARCHAR 8\n"));
2079 0 : VOID(printf("FIELD_CHECK 9\n"));
2080 0 : VOID(printf("\n"));
2081 0 : VOID(printf("pack type as a set of flags:\n"));
2082 0 : VOID(printf("PACK_TYPE_SELECTED 1\n"));
2083 0 : VOID(printf("PACK_TYPE_SPACE_FIELDS 2\n"));
2084 0 : VOID(printf("PACK_TYPE_ZERO_FILL 4\n"));
2085 0 : VOID(printf("\n"));
2086 : }
2087 90 : for (i=0 ; i++ < fields ; counts++)
2088 : {
2089 60 : write_bits((ulonglong) (int) counts->field_type, 5);
2090 60 : write_bits(counts->pack_type,6);
2091 60 : if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2092 0 : write_bits(counts->max_zero_fill,5);
2093 : else
2094 60 : write_bits(counts->length_bits,5);
2095 60 : write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2096 60 : DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2097 : "lbits: %2u tree: %2u length: %4u",
2098 : i , counts->field_type, counts->pack_type,
2099 : counts->max_zero_fill, counts->length_bits,
2100 : counts->tree->tree_number, counts->field_length));
2101 60 : if (verbose >= 2)
2102 0 : VOID(printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2103 : "tree: %2u length: %4u\n", i , counts->field_type,
2104 : counts->pack_type, counts->max_zero_fill, counts->length_bits,
2105 : counts->tree->tree_number, counts->field_length));
2106 : }
2107 30 : flush_bits();
2108 : return;
2109 : }
2110 :
2111 : /* Write all huff_trees to the new data file. Return the total count of
2112 :    elements in all trees.
2113 :    Returns 0 on error */
2114 :
2115 : static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2116 30 : {
2117 : uint i,int_length;
2118 : uint tree_no;
2119 : uint codes;
2120 30 : uint errors= 0;
2121 : uint *packed_tree,*offset,length;
2122 : my_off_t elements;
2123 :
2124 : /* Find the highest number of elements in the trees. */
2125 90 : for (i=length=0 ; i < trees ; i++)
2126 60 : if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2127 30 : length=huff_tree[i].elements;
2128 : /*
2129 : Allocate a buffer for packing a decode tree. Two numbers per element
2130 : (left child and right child).
2131 : */
2132 30 : if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2133 : {
2134 0 : my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2135 0 : return 0;
2136 : }
2137 :
2138 30 : DBUG_PRINT("info", (" "));
2139 30 : if (verbose >= 2)
2140 0 : VOID(printf("\n"));
2141 30 : tree_no= 0;
2142 30 : intervall_length=0;
2143 90 : for (elements=0; trees-- ; huff_tree++)
2144 : {
2145 : /* Skip columns that have been joined with other columns. */
2146 60 : if (huff_tree->tree_number == 0)
2147 30 : continue; /* Deleted tree */
2148 30 : tree_no++;
2149 30 : DBUG_PRINT("info", (" "));
2150 30 : if (verbose >= 3)
2151 0 : VOID(printf("\n"));
2152 : /* Count the total number of elements (byte codes or column values). */
2153 30 : elements+=huff_tree->elements;
2154 30 : huff_tree->max_offset=2;
2155 : /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2156 30 : if (huff_tree->elements <= 1)
2157 0 : offset=packed_tree;
2158 : else
2159 30 : offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2160 :
2161 : /* This should be the same as 'length' above. */
2162 30 : huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2163 :
2164 : /*
2165 : Since we check this during collecting the distinct column values,
2166 : this should never happen.
2167 : */
2168 30 : if (huff_tree->max_offset >= IS_OFFSET)
2169 : { /* This should be impossible */
2170 0 : VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2171 : huff_tree->max_offset));
2172 : my_afree(packed_tree);
2173 0 : return 0;
2174 : }
2175 :
2176 30 : DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
2177 : "char_bits: %u\n",
2178 : (ulong) (file_buffer.pos - file_buffer.buffer),
2179 : huff_tree->elements, (ulong) (offset - packed_tree),
2180 : huff_tree->char_bits));
2181 30 : if (!huff_tree->counts->tree_buff)
2182 : {
2183 : /* We do a uchar compression on this column. Mark with bit 0. */
2184 30 : write_bits(0,1);
2185 30 : write_bits(huff_tree->min_chr,8);
2186 30 : write_bits(huff_tree->elements,9);
2187 30 : write_bits(huff_tree->char_bits,5);
2188 30 : write_bits(huff_tree->offset_bits,5);
2189 30 : int_length=0;
2190 : }
2191 : else
2192 : {
2193 0 : int_length=(uint) (huff_tree->counts->tree_pos -
2194 : huff_tree->counts->tree_buff);
2195 : /* We have distinct column values for this column. Mark with bit 1. */
2196 0 : write_bits(1,1);
2197 0 : write_bits(huff_tree->elements,15);
2198 0 : write_bits(int_length,16);
2199 0 : write_bits(huff_tree->char_bits,5);
2200 0 : write_bits(huff_tree->offset_bits,5);
2201 0 : intervall_length+=int_length;
2202 : }
2203 30 : DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2204 : "offset_bits: %2u %s: %5u codelen: %2u",
2205 : tree_no, huff_tree->elements, huff_tree->char_bits,
2206 : huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2207 : "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2208 : int_length : huff_tree->min_chr, huff_tree->height));
2209 30 : if (verbose >= 2)
2210 0 : VOID(printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2211 : "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2212 : huff_tree->char_bits, huff_tree->offset_bits,
2213 : huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2214 : huff_tree->counts->tree_buff ? int_length :
2215 : huff_tree->min_chr, huff_tree->height));
2216 :
2217 : /* Check that the code tree length matches the element count. */
2218 30 : length=(uint) (offset-packed_tree);
2219 30 : if (length != huff_tree->elements*2-2)
2220 : {
2221 0 : VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2222 : length, huff_tree->elements * 2 - 2));
2223 0 : errors++;
2224 0 : break;
2225 : }
2226 :
2227 770 : for (i=0 ; i < length ; i++)
2228 : {
2229 740 : if (packed_tree[i] & IS_OFFSET)
2230 340 : write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2231 : huff_tree->offset_bits+1);
2232 : else
2233 400 : write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2234 740 : DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2235 : i, (packed_tree[i] & IS_OFFSET) ?
2236 : " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2237 : packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2238 740 : if (verbose >= 3)
2239 0 : VOID(printf("tree[0x%04x]: %s0x%04x\n",
2240 : i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2241 : (packed_tree[i] & IS_OFFSET) ?
2242 : packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2243 : }
2244 30 : flush_bits();
2245 :
2246 : /*
2247 : Display coding tables and check their correctness.
2248 : */
2249 30 : codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2250 7710 : for (i= 0; i < codes; i++)
2251 : {
2252 : ulonglong code;
2253 : uint bits;
2254 : uint len;
2255 : uint idx;
2256 :
2257 7680 : if (! (len= huff_tree->code_len[i]))
2258 400 : continue;
2259 400 : DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2260 : hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2261 : bindigits(huff_tree->code[i],
2262 : huff_tree->code_len[i])));
2263 400 : if (verbose >= 3)
2264 0 : VOID(printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2265 : hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2266 : bindigits(huff_tree->code[i], huff_tree->code_len[i])));
2267 :
2268 : /* Check that the encode table decodes correctly. */
2269 400 : code= 0;
2270 400 : bits= 0;
2271 400 : idx= 0;
2272 400 : DBUG_EXECUTE_IF("forcechkerr1", len--;);
2273 400 : DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2274 400 : DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2275 : for (;;)
2276 : {
2277 2060 : if (! len)
2278 : {
2279 0 : VOID(fflush(stdout));
2280 0 : VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2281 : hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
2282 0 : errors++;
2283 0 : break;
2284 : }
2285 2060 : code<<= 1;
2286 2060 : code|= (huff_tree->code[i] >> (--len)) & 1;
2287 2060 : bits++;
2288 2060 : if (bits > 8 * sizeof(code))
2289 : {
2290 0 : VOID(fflush(stdout));
2291 0 : VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2292 : bits, (uint) (8 * sizeof(code))));
2293 0 : errors++;
2294 0 : break;
2295 : }
2296 2060 : idx+= (uint) code & 1;
2297 2060 : if (idx >= length)
2298 : {
2299 0 : VOID(fflush(stdout));
2300 0 : VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2301 : idx, length));
2302 0 : errors++;
2303 0 : break;
2304 : }
2305 2060 : if (packed_tree[idx] & IS_OFFSET)
2306 1660 : idx+= packed_tree[idx] & ~IS_OFFSET;
2307 : else
2308 1660 : break; /* Hit a leaf. This contains the result value. */
2309 1660 : }
2310 400 : if (errors)
2311 400 : break;
2312 :
2313 400 : DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2314 400 : if (packed_tree[idx] != i)
2315 : {
2316 0 : VOID(fflush(stdout));
2317 0 : VOID(fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2318 : packed_tree[idx], i));
2319 0 : errors++;
2320 0 : break;
2321 : }
2322 : } /*end for (codes)*/
2323 30 : if (errors)
2324 30 : break;
2325 :
2326 : /* Write column values in case of distinct column value compression. */
2327 30 : if (huff_tree->counts->tree_buff)
2328 : {
2329 0 : for (i=0 ; i < int_length ; i++)
2330 : {
2331 0 : write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2332 0 : DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2333 : i, (uchar) huff_tree->counts->tree_buff[i]));
2334 0 : if (verbose >= 3)
2335 0 : VOID(printf("column_values[0x%04x]: 0x%02x\n",
2336 : i, (uchar) huff_tree->counts->tree_buff[i]));
2337 : }
2338 : }
2339 30 : flush_bits();
2340 : }
2341 30 : DBUG_PRINT("info", (" "));
2342 30 : if (verbose >= 2)
2343 0 : VOID(printf("\n"));
2344 : my_afree(packed_tree);
2345 30 : if (errors)
2346 : {
2347 0 : VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
2348 0 : return 0;
2349 : }
2350 30 : return elements;
2351 : }
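
/*
  Per-tree stream layout as written above (illustrative summary, not part
  of the original source):
    1 bit                 0 = uchar tree, 1 = distinct-value tree
    8 / 15 bits           min_chr            / element count
    9 / 16 bits           element count      / tree_buff length
    5 bits                char_bits
    5 bits                offset_bits
    2*elements-2 entries  each either an offset (offset_bits+1 bits, high
                          bit set) or a leaf value minus min_chr
                          (char_bits+1 bits), then flushed to a byte
    distinct-value trees  additionally the column values, 8 bits each,
                          then flushed to a byte
*/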
2352 :
2353 :
2354 : static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2355 : uint *offset)
2356 370 : {
2357 : uint *prev_offset;
2358 :
2359 370 : prev_offset= offset;
2360 : /*
2361 : 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2362 :     then there is no left child and hence no right child either. This
2363 :     is a property of a binary tree: an element is either a node with two
2364 :     children, or a leaf without children.
2365 :
2366 : The current element is always a node with two childs. Go left first.
2367 : */
2368 370 : if (!element->a.nod.left->a.leaf.null)
2369 : {
2370 : /* Store the uchar code or the index of the column value. */
2371 215 : prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2372 215 : offset+=2;
2373 : }
2374 : else
2375 : {
2376 : /*
2377 : Recursively traverse the tree to the left. Mark it as an offset to
2378 : another tree node (in contrast to a uchar code or column value index).
2379 : */
2380 155 : prev_offset[0]= IS_OFFSET+2;
2381 155 : offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2382 : }
2383 :
2384 : /* Now, check the right child. */
2385 370 : if (!element->a.nod.right->a.leaf.null)
2386 : {
2387 : /* Store the uchar code or the index of the column value. */
2388 185 : prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2389 185 : return offset;
2390 : }
2391 : else
2392 : {
2393 : /*
2394 : Recursively traverse the tree to the right. Mark it as an offset to
2395 : another tree node (in contrast to a uchar code or column value index).
2396 : */
2397 185 : uint temp=(uint) (offset-prev_offset-1);
2398 185 : prev_offset[1]= IS_OFFSET+ temp;
2399 185 : if (huff_tree->max_offset < temp)
2400 80 : huff_tree->max_offset = temp;
2401 185 : return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2402 : }
2403 : }
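
/*
  Worked example (illustrative only, not part of the original source):
  for a three-leaf tree whose root has an inner node on the left (with
  leaves 'a' and 'b') and leaf 'c' on the right, the function fills
  packed_tree[] = { IS_OFFSET+2, 'c', 'a', 'b' }: slot 0 points two slots
  ahead to the inner node, slot 1 is the right leaf, slots 2 and 3 are
  the inner node's leaves.  The check loop in write_huff_tree() follows
  exactly these relative offsets while decoding.
*/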
2404 :
2405 : /* Get the number of bits needed to represent 'value' */
2406 :
2407 : static uint max_bit(register uint value)
2408 480 : {
2409 480 : reg2 uint power=1;
2410 :
2411 2470 : while ((value>>=1))
2412 1510 : power++;
2413 480 : return (power);
2414 : }
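
/*
  Worked values (illustrative only): max_bit(0) == max_bit(1) == 1,
  max_bit(3) == 2, max_bit(4) == 3, max_bit(255) == 8; i.e. for
  value > 0 the result is floor(log2(value)) + 1.
*/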
2415 :
2416 :
2417 : static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2418 30 : {
2419 : int error;
2420 : uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
2421 : uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
2422 : my_off_t record_count;
2423 : char llbuf[32];
2424 : ulong length,pack_length;
2425 : uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2426 : HUFF_COUNTS *count,*end_count;
2427 : HUFF_TREE *tree;
2428 30 : MARIA_HA *isam_file=mrg->file[0];
2429 30 : uint pack_version= (uint) isam_file->s->pack.version;
2430 30 : DBUG_ENTER("compress_maria_file");
2431 :
2432 : /* Allocate a buffer for the records (excluding blobs). */
2433 30 : if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2434 0 :     DBUG_RETURN(-1);
2435 :
2436 30 : end_count=huff_counts+isam_file->s->base.fields;
2437 30 : min_record_length= (uint) ~0;
2438 30 : max_record_length=0;
2439 30 : null_bytes= isam_file->s->base.null_bytes;
2440 :
2441 : /*
2442 : Calculate the maximum number of bits required to pack the records.
2443 : Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2444 : The tree height determines the maximum number of bits per value.
2445 : Some fields skip leading or trailing spaces or zeroes. The skipped
2446 : number of bytes is encoded by 'length_bits' bits.
2447 : Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2448 : and varchar get a leading 0 bit.
2449 : */
2450 30 : max_calc_length= null_bytes;
2451 90 : for (i= 0 ; i < isam_file->s->base.fields ; i++)
2452 : {
2453 60 : if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2454 60 : huff_counts[i].max_zero_fill=0;
2455 60 : if (huff_counts[i].field_type == FIELD_CONSTANT ||
2456 : huff_counts[i].field_type == FIELD_ZERO ||
2457 : huff_counts[i].field_type == FIELD_CHECK)
2458 : continue;
2459 60 : if (huff_counts[i].field_type == FIELD_INTERVALL)
2460 0 : max_calc_length+=huff_counts[i].tree->height;
2461 60 : else if (huff_counts[i].field_type == FIELD_BLOB ||
2462 : huff_counts[i].field_type == FIELD_VARCHAR)
2463 0 : max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2464 : else
2465 60 : max_calc_length+=
2466 : (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2467 : huff_counts[i].tree->height+huff_counts[i].length_bits;
2468 : }
2469 30 : max_calc_length= (max_calc_length + 7) / 8;
2470 30 : pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
2471 30 : record_count=0;
2472 : /* 'max_blob_length' is the max length of all blobs of a record. */
2473 30 : pack_blob_length= isam_file->s->base.blobs ?
2474 : _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2475 30 : max_pack_length=pack_ref_length+pack_blob_length;
2476 :
2477 30 : DBUG_PRINT("fields", ("==="));
2478 30 : mrg_reset(mrg);
2479 645 : while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2480 : {
2481 585 : ulong tot_blob_length=0;
2482 585 : if (! error)
2483 : {
2484 525 : if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
2485 : null_bytes))
2486 525 : break;
2487 525 : record_pos= file_buffer.pos;
2488 525 : file_buffer.pos+= max_pack_length;
2489 525 : if (null_bytes)
2490 : {
2491 : /* Copy null bits 'as is' */
2492 525 : memcpy(file_buffer.pos, record, null_bytes);
2493 525 : file_buffer.pos+= null_bytes;
2494 : }
2495 525 : for (start_pos=record+null_bytes, count= huff_counts;
2496 2100 : count < end_count ;
2497 1050 : count++)
2498 : {
2499 1050 : end_pos=start_pos+(field_length=count->field_length);
2500 1050 : tree=count->tree;
2501 :
2502 1050 : DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2503 : "lbits: %2u tree: %2u length: %4u",
2504 : (ulong) (count - huff_counts + 1),
2505 : count->field_type,
2506 : count->pack_type, count->max_zero_fill,
2507 : count->length_bits, count->tree->tree_number,
2508 : count->field_length));
2509 :
2510 : /* Check if the column contains spaces only. */
2511 1050 : if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2512 : {
2513 0 :           for (pos=start_pos ; pos < end_pos && *pos == ' ' ; pos++) ;
2514 0 : if (pos == end_pos)
2515 : {
2516 0 : DBUG_PRINT("fields",
2517 : ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2518 0 : DBUG_PRINT("fields", ("---"));
2519 0 : write_bits(1,1);
2520 0 : start_pos=end_pos;
2521 0 : continue;
2522 : }
2523 0 : DBUG_PRINT("fields",
2524 : ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2525 0 : write_bits(0,1);
2526 : }
2527 1050 : end_pos-=count->max_zero_fill;
2528 1050 : field_length-=count->max_zero_fill;
2529 :
2530 1050 : switch (count->field_type) {
2531 : case FIELD_SKIP_ZERO:
2532 0 : if (!memcmp(start_pos, zero_string, field_length))
2533 : {
2534 0 : DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2535 0 : write_bits(1,1);
2536 0 : start_pos=end_pos;
2537 0 : break;
2538 : }
2539 0 : DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2540 0 : write_bits(0,1);
2541 : /* Fall through */
2542 : case FIELD_NORMAL:
2543 525 : DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2544 : (ulong) (end_pos - start_pos)));
2545 13125 : for ( ; start_pos < end_pos ; start_pos++)
2546 : {
2547 12600 : DBUG_PRINT("fields",
2548 : ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2549 : (uchar) *start_pos,
2550 : hexdigits(tree->code[(uchar) *start_pos]),
2551 : (uint) tree->code_len[(uchar) *start_pos],
2552 : bindigits(tree->code[(uchar) *start_pos],
2553 : (uint) tree->code_len[(uchar) *start_pos])));
2554 12600 : write_bits(tree->code[(uchar) *start_pos],
2555 : (uint) tree->code_len[(uchar) *start_pos]);
2556 : }
2557 : break;
2558 : case FIELD_SKIP_ENDSPACE:
2559 0 : for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2560 0 : length= (ulong) (end_pos - pos);
2561 0 : if (count->pack_type & PACK_TYPE_SELECTED)
2562 : {
2563 0 : if (length > count->min_space)
2564 : {
2565 0 : DBUG_PRINT("fields",
2566 : ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2567 0 : DBUG_PRINT("fields",
2568 : ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2569 : length, field_length, count->length_bits));
2570 0 : write_bits(1,1);
2571 0 : write_bits(length,count->length_bits);
2572 : }
2573 : else
2574 : {
2575 0 : DBUG_PRINT("fields",
2576 : ("FIELD_SKIP_ENDSPACE not more than min_space, "
2577 : "bits: 1"));
2578 0 : write_bits(0,1);
2579 0 : pos=end_pos;
2580 : }
2581 : }
2582 : else
2583 : {
2584 0 : DBUG_PRINT("fields",
2585 : ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2586 : length, field_length, count->length_bits));
2587 0 : write_bits(length,count->length_bits);
2588 : }
2589 : /* Encode all significant bytes. */
2590 0 : DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2591 : (ulong) (pos - start_pos)));
2592 0 : for ( ; start_pos < pos ; start_pos++)
2593 : {
2594 0 : DBUG_PRINT("fields",
2595 : ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2596 : (uchar) *start_pos,
2597 : hexdigits(tree->code[(uchar) *start_pos]),
2598 : (uint) tree->code_len[(uchar) *start_pos],
2599 : bindigits(tree->code[(uchar) *start_pos],
2600 : (uint) tree->code_len[(uchar) *start_pos])));
2601 0 : write_bits(tree->code[(uchar) *start_pos],
2602 : (uint) tree->code_len[(uchar) *start_pos]);
2603 : }
2604 0 : start_pos=end_pos;
2605 0 : break;
2606 : case FIELD_SKIP_PRESPACE:
2607 525 : for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2608 525 : length= (ulong) (pos - start_pos);
2609 525 : if (count->pack_type & PACK_TYPE_SELECTED)
2610 : {
2611 0 : if (length > count->min_space)
2612 : {
2613 0 : DBUG_PRINT("fields",
2614 : ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2615 0 : DBUG_PRINT("fields",
2616 : ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2617 : length, field_length, count->length_bits));
2618 0 : write_bits(1,1);
2619 0 : write_bits(length,count->length_bits);
2620 : }
2621 : else
2622 : {
2623 0 : DBUG_PRINT("fields",
2624 : ("FIELD_SKIP_PRESPACE not more than min_space, "
2625 : "bits: 1"));
2626 0 : pos=start_pos;
2627 0 : write_bits(0,1);
2628 : }
2629 : }
2630 : else
2631 : {
2632 525 : DBUG_PRINT("fields",
2633 : ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2634 : length, field_length, count->length_bits));
2635 525 : write_bits(length,count->length_bits);
2636 : }
2637 : /* Encode all significant bytes. */
2638 525 : DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2639 : (ulong) (end_pos - start_pos)));
2640 1350 : for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2641 : {
2642 825 : DBUG_PRINT("fields",
2643 : ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2644 : (uchar) *start_pos,
2645 : hexdigits(tree->code[(uchar) *start_pos]),
2646 : (uint) tree->code_len[(uchar) *start_pos],
2647 : bindigits(tree->code[(uchar) *start_pos],
2648 : (uint) tree->code_len[(uchar) *start_pos])));
2649 825 : write_bits(tree->code[(uchar) *start_pos],
2650 : (uint) tree->code_len[(uchar) *start_pos]);
2651 : }
2652 : break;
2653 : case FIELD_CONSTANT:
2654 : case FIELD_ZERO:
2655 : case FIELD_CHECK:
2656 0 : DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2657 0 : start_pos=end_pos;
2658 0 : break;
2659 : case FIELD_INTERVALL:
2660 0 : global_count=count;
2661 0 : pos=(uchar*) tree_search(&count->int_tree, start_pos,
2662 : count->int_tree.custom_arg);
2663 0 : intervall=(uint) (pos - count->tree_buff)/field_length;
2664 0 : DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2665 0 : DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2666 : intervall, hexdigits(tree->code[intervall]),
2667 : (uint) tree->code_len[intervall]));
2668 0 : write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2669 0 : start_pos=end_pos;
2670 0 : break;
2671 : case FIELD_BLOB:
2672 : {
2673 : ulong blob_length= _ma_calc_blob_length(field_length-
2674 : portable_sizeof_char_ptr,
2675 0 : start_pos);
2676 : /* Empty blobs are encoded with a single 1 bit. */
2677 0 : if (!blob_length)
2678 : {
2679 0 : DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2680 0 : write_bits(1,1);
2681 : }
2682 : else
2683 : {
2684 : uchar *blob,*blob_end;
2685 0 : DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2686 0 : write_bits(0,1);
2687 : /* Write the blob length. */
2688 0 : DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2689 : blob_length, count->length_bits));
2690 0 : write_bits(blob_length,count->length_bits);
2691 0 : memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
2692 : sizeof(char*));
2693 0 : blob_end=blob+blob_length;
2694 : /* Encode the blob bytes. */
2695 0 : for ( ; blob < blob_end ; blob++)
2696 : {
2697 0 : DBUG_PRINT("fields",
2698 : ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2699 : (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2700 : (uint) tree->code_len[(uchar) *blob],
2701 :                            bindigits(tree->code[(uchar) *blob],
2702 :                                      (uint)tree->code_len[(uchar) *blob])));
2703 0 : write_bits(tree->code[(uchar) *blob],
2704 : (uint) tree->code_len[(uchar) *blob]);
2705 : }
2706 0 : tot_blob_length+=blob_length;
2707 : }
2708 0 : start_pos= end_pos;
2709 0 : break;
2710 : }
2711 : case FIELD_VARCHAR:
2712 : {
2713 0 : uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2714 : ulong col_length= (var_pack_length == 1 ?
2715 : (uint) *(uchar*) start_pos :
2716 0 : uint2korr(start_pos));
2717 : /* Empty varchar are encoded with a single 1 bit. */
2718 0 : if (!col_length)
2719 : {
2720 0 : DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2721 0 : write_bits(1,1); /* Empty varchar */
2722 : }
2723 : else
2724 : {
2725 0 : uchar *end= start_pos + var_pack_length + col_length;
2726 0 : DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2727 0 : write_bits(0,1);
2728 : /* Write the varchar length. */
2729 0 : DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2730 : col_length, count->length_bits));
2731 0 : write_bits(col_length,count->length_bits);
2732 : /* Encode the varchar bytes. */
2733 0 : for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2734 : {
2735 0 : DBUG_PRINT("fields",
2736 : ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2737 : (uchar) *start_pos,
2738 : hexdigits(tree->code[(uchar) *start_pos]),
2739 : (uint) tree->code_len[(uchar) *start_pos],
2740 : bindigits(tree->code[(uchar) *start_pos],
2741 : (uint)tree->code_len[(uchar) *start_pos])));
2742 0 : write_bits(tree->code[(uchar) *start_pos],
2743 : (uint) tree->code_len[(uchar) *start_pos]);
2744 : }
2745 : }
2746 0 : start_pos= end_pos;
2747 0 : break;
2748 : }
2749 : case FIELD_LAST:
2750 : case FIELD_enum_val_count:
2751 0 : abort(); /* Impossible */
2752 : }
2753 1050 : start_pos+=count->max_zero_fill;
2754 1050 : DBUG_PRINT("fields", ("---"));
2755 : }
2756 525 : flush_bits();
2757 525 : length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
2758 525 : pack_length= _ma_save_pack_length(pack_version, record_pos, length);
2759 525 : if (pack_blob_length)
2760 0 : pack_length+= _ma_save_pack_length(pack_version,
2761 : record_pos + pack_length,
2762 : tot_blob_length);
2763 525 : DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2764 : "length-bytes: %lu", (ulong) record_count, length,
2765 : tot_blob_length, pack_length));
2766 525 : DBUG_PRINT("fields", ("==="));
2767 :
2768 : /* Correct file buffer if the header was smaller */
2769 525 : if (pack_length != max_pack_length)
2770 : {
2771 0 : bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2772 0 : file_buffer.pos-= (max_pack_length-pack_length);
2773 : }
2774 525 : if (length < (ulong) min_record_length)
2775 55 : min_record_length=(uint) length;
2776 525 : if (length > (ulong) max_record_length)
2777 35 : max_record_length=(uint) length;
2778 525 : record_count++;
2779 525 : if (write_loop && record_count % WRITE_COUNT == 0)
2780 : {
2781 0 : VOID(printf("%lu\r", (ulong) record_count));
2782 0 : VOID(fflush(stdout));
2783 : }
2784 : }
2785 60 : else if (error != HA_ERR_RECORD_DELETED)
2786 585 : break;
2787 : }
2788 30 : if (error == HA_ERR_END_OF_FILE)
2789 30 : error=0;
2790 : else
2791 : {
2792 0 : VOID(fprintf(stderr, "%s: Got error %d reading records\n",
2793 : my_progname, error));
2794 : }
2795 30 : if (verbose >= 2)
2796 0 : VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
2797 :
2798 : my_afree(record);
2799 30 : mrg->ref_length=max_pack_length;
2800 30 : mrg->min_pack_length=max_record_length ? min_record_length : 0;
2801 30 : mrg->max_pack_length=max_record_length;
2802 30 : DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2803 : }
2804 :
2805 :
2806 : static char *make_new_name(char *new_name, char *old_name)
2807 25 : {
2808 25 : return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2809 : }
2810 :
2811 : static char *make_old_name(char *new_name, char *old_name)
2812 0 : {
2813 0 : return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2814 : }
2815 :
2816 : /* Routines for the bit-writing buffer */
2817 :
2818 : static void init_file_buffer(File file, pbool read_buffer)
2819 30 : {
2820 30 : file_buffer.file=file;
2821 30 : file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2822 : MYF(MY_WME));
2823 30 : file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2824 30 : file_buffer.pos_in_file=0;
2825 30 : error_on_write=0;
2826 30 : if (read_buffer)
2827 : {
2828 :
2829 0 : file_buffer.pos=file_buffer.end;
2830 0 : file_buffer.bits=0;
2831 : }
2832 : else
2833 : {
2834 30 : file_buffer.pos=file_buffer.buffer;
2835 30 : file_buffer.bits=BITS_SAVED;
2836 : }
2837 30 : file_buffer.bitbucket= 0;
2838 : }
2839 :
2840 :
2841 : static int flush_buffer(ulong neaded_length)
2842 555 : {
2843 : ulong length;
2844 :
2845 : /*
2846 : file_buffer.end is 8 bytes lower than the real end of the buffer.
2847 : This is done so that the end-of-buffer condition does not need to be
2848 : checked for every uchar (see write_bits()). Consequently,
2849 : file_buffer.pos can become greater than file_buffer.end. The
2850 : algorithms in the other functions ensure that there will never be
2851 : more than 8 bytes written to the buffer without an end-of-buffer
2852 : check. So the buffer cannot be overrun. But we need to check for the
2853 : near-to-buffer-end condition to avoid a negative result, which is
2854 :     cast to unsigned and thus becomes huge.
2855 : */
2856 555 : if ((file_buffer.pos < file_buffer.end) &&
2857 : ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2858 525 : return 0;
2859 30 : length=(ulong) (file_buffer.pos-file_buffer.buffer);
2860 30 : file_buffer.pos=file_buffer.buffer;
2861 30 : file_buffer.pos_in_file+=length;
2862 30 : if (test_only)
2863 0 : return 0;
2864 30 : if (error_on_write|| my_write(file_buffer.file,
2865 : (const uchar*) file_buffer.buffer,
2866 : length,
2867 : MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2868 : {
2869 0 : error_on_write=1;
2870 0 : return 1;
2871 : }
2872 :
2873 30 : if (neaded_length != ~(ulong) 0 &&
2874 : (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2875 : {
2876 : uchar *tmp;
2877 0 : neaded_length+=256; /* some margin */
2878 0 : tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
2879 0 : if (!tmp)
2880 0 : return 1;
2881 0 : file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
2882 0 : file_buffer.buffer= tmp;
2883 0 : file_buffer.end= (tmp+neaded_length-8);
2884 : }
2885 30 : return 0;
2886 : }
2887 :
2888 :
2889 : static void end_file_buffer(void)
2890 30 : {
2891 30 : my_free(file_buffer.buffer, MYF(0));
2892 : }
2893 :
2894 : /* Output the 'bits' low bits of 'value' */
2895 :
2896 : static void write_bits(register ulonglong value, register uint bits)
2897 15080 : {
2898 15080 : DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2899 : (bits == 8 * sizeof(value)));
2900 :
2901 15080 : if ((file_buffer.bits-= (int) bits) >= 0)
2902 : {
2903 14705 : file_buffer.bitbucket|= value << file_buffer.bits;
2904 : }
2905 : else
2906 : {
2907 : reg3 ulonglong bit_buffer;
2908 375 : bits= (uint) -file_buffer.bits;
2909 375 : bit_buffer= (file_buffer.bitbucket |
2910 : ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2911 : #if BITS_SAVED == 64
2912 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2913 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2914 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2915 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2916 : #endif
2917 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2918 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2919 375 : *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2920 375 : *file_buffer.pos++= (uchar) (bit_buffer);
2921 :
2922 375 : if (bits != 8 * sizeof(value))
2923 375 : value&= (((ulonglong) 1) << bits) - 1;
2924 375 : if (file_buffer.pos >= file_buffer.end)
2925 0 : VOID(flush_buffer(~ (ulong) 0));
2926 375 : file_buffer.bits=(int) (BITS_SAVED - bits);
2927 375 : file_buffer.bitbucket= value << (BITS_SAVED - bits);
2928 : }
2929 : return;
2930 : }
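
/*
  Worked example (illustrative only, not part of the original source),
  assuming BITS_SAVED == 64: directly after init_file_buffer(),
  bits == 64 and bitbucket == 0.  write_bits(0x5, 3) then leaves
  bits == 61 and bitbucket == 0x5 << 61, i.e. new bits are stacked at the
  top of the 64-bit word.  When a call would overflow the word, the full
  64 bits are emitted as 8 bytes, most significant byte first, and the
  low bits of 'value' that did not fit start the next word.
*/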
2931 :
2932 : /* Flush bits in bit_buffer to buffer */
2933 :
2934 : static void flush_bits(void)
2935 615 : {
2936 : int bits;
2937 : ulonglong bit_buffer;
2938 :
2939 615 : bits= file_buffer.bits & ~7;
2940 615 : bit_buffer= file_buffer.bitbucket >> bits;
2941 615 : bits= BITS_SAVED - bits;
2942 3615 : while (bits > 0)
2943 : {
2944 2385 : bits-= 8;
2945 2385 : *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2946 : }
2947 615 : if (file_buffer.pos >= file_buffer.end)
2948 0 : VOID(flush_buffer(~ (ulong) 0));
2949 615 : file_buffer.bits= BITS_SAVED;
2950 615 : file_buffer.bitbucket= 0;
2951 : }
2952 :
2953 :
2954 : /****************************************************************************
2955 : ** functions to handle the joined files
2956 : ****************************************************************************/
2957 :
2958 : static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
2959 : my_off_t new_length,
2960 : ha_checksum crc)
2961 25 : {
2962 25 : MARIA_SHARE *share=isam_file->s;
2963 25 : uint options=mi_uint2korr(share->state.header.options);
2964 : uint key;
2965 25 : DBUG_ENTER("save_state");
2966 :
2967 25 : options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2968 25 : mi_int2store(share->state.header.options,options);
2969 :   /* Save the original file type in case we have to undo the packing later */
2970 25 : share->state.header.org_data_file_type= share->state.header.data_file_type;
2971 25 : share->state.header.data_file_type= COMPRESSED_RECORD;
2972 :
2973 25 : share->state.state.data_file_length=new_length;
2974 25 : share->state.state.del=0;
2975 25 : share->state.state.empty=0;
2976 25 : share->state.dellink= HA_OFFSET_ERROR;
2977 25 : share->state.split=(ha_rows) mrg->records;
2978 25 : share->state.version=(ulong) time((time_t*) 0);
2979 25 : if (share->base.born_transactional)
2980 10 : share->state.create_rename_lsn= share->state.is_of_horizon=
2981 : share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
2982 25 : if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
2983 : {
2984 : /*
2985 : Some indexes are disabled, cannot use current key_file_length value
2986 : as an estimate of upper bound of index file size. Use packed data file
2987 : size instead.
2988 : */
2989 0 : share->state.state.key_file_length= new_length;
2990 : }
2991 : /*
2992 : If there are no disabled indexes, keep key_file_length value from
2993 : original file so "maria_chk -rq" can use this value (this is necessary
2994 : because index size cannot be easily calculated for fulltext keys)
2995 : */
2996 25 : maria_clear_all_keys_active(share->state.key_map);
2997 50 : for (key=0 ; key < share->base.keys ; key++)
2998 25 : share->state.key_root[key]= HA_OFFSET_ERROR;
2999 25 : share->state.key_del= HA_OFFSET_ERROR;
3000 25 : share->state.state.checksum= crc; /* Save crc in file */
3001 25 : share->changed=1; /* Force write of header */
3002 25 : share->state.open_count=0;
3003 25 : share->global_changed=0;
3004 25 : VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)));
3005 25 : if (share->base.keys)
3006 25 : isamchk_neaded=1;
3007 25 : DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
3008 : &share->state,
3009 : MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3010 : MA_STATE_INFO_WRITE_FULL_INFO));
3011 : }
3012 :
3013 :
3014 : static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3015 : ha_checksum crc)
3016 5 : {
3017 : MARIA_STATE_INFO state;
3018 5 : MARIA_HA *isam_file=mrg->file[0];
3019 : uint options;
3020 5 : DBUG_ENTER("save_state_mrg");
3021 :
3022 5 : state= isam_file->s->state;
3023 5 : options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3024 : HA_OPTION_READ_ONLY_DATA);
3025 5 : mi_int2store(state.header.options,options);
3026 :   /* Save the original file type in case we have to undo the packing later */
3027 5 : state.header.org_data_file_type= state.header.data_file_type;
3028 5 : state.header.data_file_type= COMPRESSED_RECORD;
3029 :
3030 5 : state.state.data_file_length=new_length;
3031 5 : state.state.del=0;
3032 5 : state.state.empty=0;
3033 5 : state.state.records=state.split=(ha_rows) mrg->records;
3034 5 : state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
3035 : LSN_NEEDS_NEW_STATE_LSNS;
3036 :
3037 : /* See comment above in save_state about key_file_length handling. */
3038 5 : if (mrg->src_file_has_indexes_disabled)
3039 : {
3040 0 : isam_file->s->state.state.key_file_length=
3041 : max(isam_file->s->state.state.key_file_length, new_length);
3042 : }
3043 5 : state.dellink= HA_OFFSET_ERROR;
3044 5 : state.version=(ulong) time((time_t*) 0);
3045 5 : maria_clear_all_keys_active(state.key_map);
3046 5 : state.state.checksum=crc;
3047 5 : if (isam_file->s->base.keys)
3048 5 : isamchk_neaded=1;
3049 5 : state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3050 5 : DBUG_RETURN (_ma_state_info_write_sub(file, &state,
3051 : MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3052 : MA_STATE_INFO_WRITE_FULL_INFO));
3053 : }
3054 :
3055 :
3056 : /* reset for mrg_rrnd */
3057 :
3058 : static void mrg_reset(PACK_MRG_INFO *mrg)
3059 60 : {
3060 60 : if (mrg->current)
3061 : {
3062 30 : maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3063 30 : mrg->current=0;
3064 : }
3065 : }
3066 :
3067 : static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3068 1230 : {
3069 : int error;
3070 : MARIA_HA *isam_info;
3071 : my_off_t filepos;
3072 :
3073 1230 : if (!info->current)
3074 : {
3075 60 : isam_info= *(info->current=info->file);
3076 60 : info->end=info->current+info->count;
3077 60 : maria_reset(isam_info);
3078 60 : maria_extra(isam_info, HA_EXTRA_CACHE, 0);
3079 60 : if ((error= maria_scan_init(isam_info)))
3080 0 : return(error);
3081 : }
3082 : else
3083 1170 : isam_info= *info->current;
3084 :
3085 : for (;;)
3086 : {
3087 1240 : if (!(error= maria_scan(isam_info, buf)) ||
3088 : error != HA_ERR_END_OF_FILE)
3089 1170 : return (error);
3090 70 : maria_scan_end(isam_info);
3091 70 : maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3092 70 : if (info->current+1 == info->end)
3093 60 : return(HA_ERR_END_OF_FILE);
3094 10 : info->current++;
3095 10 : isam_info= *info->current;
3096 10 : filepos=isam_info->s->pack.header_length;
3097 10 : maria_reset(isam_info);
3098 10 : maria_extra(isam_info,HA_EXTRA_CACHE, 0);
3099 10 : if ((error= maria_scan_init(isam_info)))
3100 0 : return(error);
3101 : }
3102 : }
3103 :
3104 :
3105 : static int mrg_close(PACK_MRG_INFO *mrg)
3106 30 : {
3107 : uint i;
3108 30 : int error=0;
3109 30 : DBUG_ENTER("mrg_close");
3110 :
3111 65 : for (i=0 ; i < mrg->count ; i++)
3112 35 : error|=maria_close(mrg->file[i]);
3113 30 : if (mrg->free_file)
3114 5 : my_free(mrg->file, MYF(0));
3115 30 : DBUG_RETURN(error);
3116 : }
3117 :
3118 :
3119 : #if !defined(DBUG_OFF)
3120 : /*
3121 : Fake the counts to get big Huffman codes.
3122 :
3123 : SYNOPSIS
3124 : fakebigcodes()
3125 : huff_counts A pointer to the counts array.
3126 : end_count A pointer past the counts array.
3127 :
3128 : DESCRIPTION
3129 :
3130 : Huffman coding works by removing the two least frequent values from
3131 :     the list of values and adding a new value with the sum of their
3132 :     incidences, in a loop, until only one value is left. Every time a
3133 : value is reused for a new value, it gets one more bit for its
3134 : encoding. Hence, the least frequent values get the longest codes.
3135 :
3136 : To get a maximum code length for a value, two of the values must
3137 : have an incidence of 1. As their sum is 2, the next infrequent value
3138 : must have at least an incidence of 2, then 4, 8, 16 and so on. This
3139 : means that one needs 2**n bytes (values) for a code length of n
3140 :     bits. However, using more distinct values either forces the use of
3141 :     longer codes or reaches the code length with fewer total bytes (values).
3142 :
3143 : To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3144 : I assign counts of 1 to the two most frequent values, a count of 2
3145 : for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3146 : the remaining values get 1. That way every possible uchar has an
3147 : assigned code, though not all codes are used if not all uchar values
3148 : are present in the column.
3149 :
3150 : This strategy would work with distinct column values too, but
3151 : requires that at least 64(32) values are present. To make things
3152 : easier here, I cancel all distinct column values and force byte
3153 : compression for all columns.
3154 :
3155 : RETURN
3156 : void
3157 : */
3158 :
3159 : static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3160 0 : {
3161 : HUFF_COUNTS *count;
3162 : my_off_t *cur_count_p;
3163 : my_off_t *end_count_p;
3164 : my_off_t **cur_sort_p;
3165 : my_off_t **end_sort_p;
3166 : my_off_t *sort_counts[256];
3167 : my_off_t total;
3168 0 : DBUG_ENTER("fakebigcodes");
3169 :
3170 0 : for (count= huff_counts; count < end_count; count++)
3171 : {
3172 : /*
3173 : Remove distinct column values.
3174 : */
3175 0 :     if (count->tree_buff)
3176 :     {
3177 0 :       my_free(count->tree_buff, MYF(0));
3178 0 :       delete_tree(&count->int_tree);
3179 0 :       count->tree_buff= NULL;
3180 0 :       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3181 : }
3182 :
3183 : /*
3184 : Sort counts by decreasing incidence.
3185 : */
3186 0 : cur_count_p= count->counts;
3187 0 : end_count_p= cur_count_p + 256;
3188 0 : cur_sort_p= sort_counts;
3189 0 : while (cur_count_p < end_count_p)
3190 0 : *(cur_sort_p++)= cur_count_p++;
3191 0 : (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3192 :
3193 : /*
3194 : Assign faked counts.
3195 : */
3196 0 : cur_sort_p= sort_counts;
3197 : #if SIZEOF_LONG_LONG > 4
3198 0 : end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3199 : #else
3200 : end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3201 : #endif
3202 : /* Most frequent value gets a faked count of 1. */
3203 0 : **(cur_sort_p++)= 1;
3204 0 : total= 1;
3205 0 : while (cur_sort_p < end_sort_p)
3206 : {
3207 0 : **(cur_sort_p++)= total;
3208 0 : total<<= 1;
3209 : }
3210 : /* Set the last value. */
3211 0 : **(cur_sort_p++)= --total;
3212 : /*
3213 : Set the remaining counts.
3214 : */
3215 0 : end_sort_p= sort_counts + 256;
3216 0 : while (cur_sort_p < end_sort_p)
3217 0 : **(cur_sort_p++)= 1;
3218 : }
3219 0 : DBUG_VOID_RETURN;
3220 : }
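
/*
  Worked result (illustrative only, not part of the original source),
  64-bit build: after the loop above the sorted counts are overwritten
  with 1, 1, 2, 4, ..., 2**61, 2**62 - 1 for the 64 most frequent byte
  values and with 1 for the remaining 192, which is what forces the very
  long Huffman codes described above.
*/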
3221 :
3222 :
3223 : /*
3224 : Compare two counts for reverse sorting.
3225 :
3226 : SYNOPSIS
3227 : fakecmp()
3228 : count1 One count.
3229 : count2 Another count.
3230 :
3231 : RETURN
3232 : 1 count1 < count2
3233 : 0 count1 == count2
3234 : -1 count1 > count2
3235 : */
3236 :
3237 : static int fakecmp(my_off_t **count1, my_off_t **count2)
3238 0 : {
3239 0 : return ((**count1 < **count2) ? 1 :
3240 : (**count1 > **count2) ? -1 : 0);
3241 : }
3242 : #endif
|