1 : /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 :
3 : This program is free software; you can redistribute it and/or modify
4 : it under the terms of the GNU General Public License as published by
5 : the Free Software Foundation; version 2 of the License.
6 :
7 : This program is distributed in the hope that it will be useful,
8 : but WITHOUT ANY WARRANTY; without even the implied warranty of
9 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 : GNU General Public License for more details.
11 :
12 : You should have received a copy of the GNU General Public License
13 : along with this program; if not, write to the Free Software
14 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
15 :
16 : /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
17 :
18 : /* TODO: add caching - pre-read several index entries at once */
19 :
20 : /*
21 : Added optimization for full-text queries with plus-words. It was
22 : implemented by sharing maximal document id (max_docid) variable
23 : inside plus subtree. max_docid could be used by any word in plus
24 : subtree, but it could be updated by plus-word only.
25 :
26 : Fulltext "smarter index merge" optimization assumes that rows
27 : it gets are ordered by doc_id. That is not the case when we
28 : search for a word with truncation operator. It may return
29 : rows in random order. Thus we may not use "smarter index merge"
30 : optimization with "trunc-words".
31 :
32 : The idea is: there is no need to search for docid smaller than
33 : biggest docid inside current plus subtree or any upper plus subtree.
34 :
35 : Examples:
36 : +word1 word2
37 : share same max_docid
38 : max_docid updated by word1
39 : +word1 +(word2 word3)
40 : share same max_docid
41 : max_docid updated by word1
42 : +(word1 -word2) +(+word3 word4)
43 : share same max_docid
44 : max_docid updated by word3
45 : +word1 word2 (+word3 word4 (+word5 word6))
46 : three subexpressions (including the top-level one),
47 : every one has its own max_docid, updated by its plus word.
48 : but for the search word6 uses
49 : max(word1.max_docid, word3.max_docid, word5.max_docid),
50 : while word4 uses, accordingly,
51 : max(word1.max_docid, word3.max_docid).
52 : */
53 :
54 : #define FT_CORE
55 : #include "ma_ftdefs.h"
56 :
57 : /* search with boolean queries */
58 :
59 : static double _wghts[11]=
60 : {
61 : 0.131687242798354,
62 : 0.197530864197531,
63 : 0.296296296296296,
64 : 0.444444444444444,
65 : 0.666666666666667,
66 : 1.000000000000000,
67 : 1.500000000000000,
68 : 2.250000000000000,
69 : 3.375000000000000,
70 : 5.062500000000000,
71 : 7.593750000000000};
72 : static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
73 :
74 : static double _nwghts[11]=
75 : {
76 : -0.065843621399177,
77 : -0.098765432098766,
78 : -0.148148148148148,
79 : -0.222222222222222,
80 : -0.333333333333334,
81 : -0.500000000000000,
82 : -0.750000000000000,
83 : -1.125000000000000,
84 : -1.687500000000000,
85 : -2.531250000000000,
86 : -3.796875000000000};
87 : static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
88 :
89 : #define FTB_FLAG_TRUNC 1
90 : /* At most one of the following flags can be set */
91 : #define FTB_FLAG_YES 2
92 : #define FTB_FLAG_NO 4
93 : #define FTB_FLAG_WONLY 8
94 :
95 : typedef struct st_ftb_expr FTB_EXPR;
96 : struct st_ftb_expr
97 : {
98 : FTB_EXPR *up;
99 : uint flags;
100 : /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
101 : my_off_t docid[2];
102 : my_off_t max_docid;
103 : float weight;
104 : float cur_weight;
105 : LIST *phrase; /* phrase words */
106 : LIST *document; /* for phrase search */
107 : uint yesses; /* number of "yes" words matched */
108 : uint nos; /* number of "no" words matched */
109 : uint ythresh; /* number of "yes" words in expr */
110 : uint yweaks; /* number of "yes" words for scan only */
111 : };
112 :
113 : typedef struct st_ftb_word
114 : {
115 : FTB_EXPR *up;
116 : uint flags;
117 : /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
118 : my_off_t docid[2]; /* for index search and for scan */
119 : my_off_t key_root;
120 : FTB_EXPR *max_docid_expr;
121 : MARIA_KEYDEF *keyinfo;
122 : struct st_ftb_word *prev;
123 : float weight;
124 : uint ndepth;
125 : uint len;
126 : uchar off;
127 : uchar word[1];
128 : } FTB_WORD;
129 :
130 : typedef struct st_ft_info
131 : {
132 : struct _ft_vft *please;
133 : MARIA_HA *info;
134 : CHARSET_INFO *charset;
135 : FTB_EXPR *root;
136 : FTB_WORD **list;
137 : FTB_WORD *last_word;
138 : MEM_ROOT mem_root;
139 : QUEUE queue;
140 : TREE no_dupes;
141 : my_off_t lastpos;
142 : uint keynr;
143 : uchar with_scan;
144 : enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
145 : } FTB;
146 :
147 : static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
148 0 : {
149 : int i;
150 :
151 : /* if a==curdoc, take it as a < b */
152 0 : if (v && a->docid[0] == *v)
153 0 : return -1;
154 :
155 : /* ORDER BY docid, ndepth DESC */
156 0 : i=CMP_NUM(a->docid[0], b->docid[0]);
157 0 : if (!i)
158 0 : i=CMP_NUM(b->ndepth,a->ndepth);
159 0 : return i;
160 : }
161 :
162 : static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
163 0 : {
164 : /* ORDER BY word, ndepth */
165 : int i= ha_compare_text(cs, (uchar*) (*a)->word + 1,(*a)->len - 1,
166 0 : (uchar*) (*b)->word + 1,(*b)->len - 1, 0, 0);
167 0 : if (!i)
168 0 : i=CMP_NUM((*a)->ndepth, (*b)->ndepth);
169 0 : return i;
170 : }
171 :
172 :
173 : typedef struct st_my_ftb_param
174 : {
175 : FTB *ftb;
176 : FTB_EXPR *ftbe;
177 : uchar *up_quot;
178 : uint depth;
179 : } MY_FTB_PARAM;
180 :
181 :
182 : static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
183 : char *word, int word_len,
184 : MYSQL_FTPARSER_BOOLEAN_INFO *info)
185 0 : {
186 0 : MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
187 : FTB_WORD *ftbw;
188 : FTB_EXPR *ftbe, *tmp_expr;
189 : FT_WORD *phrase_word;
190 : LIST *tmp_element;
191 0 : int r= info->weight_adjust;
192 : float weight= (float)
193 0 : (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
194 :
195 0 : switch (info->type) {
196 : case FT_TOKEN_WORD:
197 0 : ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
198 : sizeof(FTB_WORD) +
199 : (info->trunc ? MARIA_MAX_KEY_BUFF :
200 : word_len * ftb_param->ftb->charset->mbmaxlen +
201 : HA_FT_WLEN +
202 : ftb_param->ftb->info->s->rec_reflength));
203 0 : ftbw->len= word_len + 1;
204 0 : ftbw->flags= 0;
205 0 : ftbw->off= 0;
206 0 : if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
207 0 : if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
208 0 : if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
209 0 : ftbw->weight= weight;
210 0 : ftbw->up= ftb_param->ftbe;
211 0 : ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
212 0 : ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
213 0 : ftbw->key_root= HA_OFFSET_ERROR;
214 0 : memcpy(ftbw->word + 1, word, word_len);
215 0 : ftbw->word[0]= word_len;
216 0 : if (info->yesno > 0) ftbw->up->ythresh++;
217 0 : ftb_param->ftb->queue.max_elements++;
218 0 : ftbw->prev= ftb_param->ftb->last_word;
219 0 : ftb_param->ftb->last_word= ftbw;
220 0 : ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
221 0 : for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
222 0 : if (! (tmp_expr->flags & FTB_FLAG_YES))
223 0 : break;
224 0 : ftbw->max_docid_expr= tmp_expr;
225 : /* fall through */
226 : case FT_TOKEN_STOPWORD:
227 0 : if (! ftb_param->up_quot) break;
228 0 : phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
229 0 : tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
230 0 : phrase_word->pos= (uchar*) word;
231 0 : phrase_word->len= word_len;
232 0 : tmp_element->data= (void *)phrase_word;
233 0 : ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
234 : /* Allocate document list at this point.
235 : It allows to avoid huge amount of allocs/frees for each row.*/
236 0 : tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
237 0 : tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
238 0 : ftb_param->ftbe->document=
239 : list_add(ftb_param->ftbe->document, tmp_element);
240 0 : break;
241 : case FT_TOKEN_LEFT_PAREN:
242 0 : ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
243 0 : ftbe->flags= 0;
244 0 : if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
245 0 : if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
246 0 : ftbe->weight= weight;
247 0 : ftbe->up= ftb_param->ftbe;
248 0 : ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
249 0 : ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
250 0 : ftbe->phrase= NULL;
251 0 : ftbe->document= 0;
252 0 : if (info->quot) ftb_param->ftb->with_scan|= 2;
253 0 : if (info->yesno > 0) ftbe->up->ythresh++;
254 0 : ftb_param->ftbe= ftbe;
255 0 : ftb_param->depth++;
256 0 : ftb_param->up_quot= (uchar*) info->quot;
257 0 : break;
258 : case FT_TOKEN_RIGHT_PAREN:
259 0 : if (ftb_param->ftbe->document)
260 : {
261 : /* Circuit document list */
262 0 : for (tmp_element= ftb_param->ftbe->document;
263 0 : tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
264 0 : tmp_element->next= ftb_param->ftbe->document;
265 0 : ftb_param->ftbe->document->prev= tmp_element;
266 : }
267 0 : info->quot= 0;
268 0 : if (ftb_param->ftbe->up)
269 : {
270 0 : DBUG_ASSERT(ftb_param->depth);
271 0 : ftb_param->ftbe= ftb_param->ftbe->up;
272 0 : ftb_param->depth--;
273 0 : ftb_param->up_quot= 0;
274 : }
275 : break;
276 : case FT_TOKEN_EOF:
277 : default:
278 : break;
279 : }
280 0 : return(0);
281 : }
282 :
283 :
284 : static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
285 : char *query, int len)
286 0 : {
287 0 : MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
288 : MYSQL_FTPARSER_BOOLEAN_INFO info;
289 0 : CHARSET_INFO *cs= ftb_param->ftb->charset;
290 0 : uchar **start= (uchar**) &query;
291 0 : uchar *end= (uchar*) query + len;
292 : FT_WORD w;
293 :
294 0 : info.prev= ' ';
295 0 : info.quot= 0;
296 0 : while (maria_ft_get_word(cs, start, end, &w, &info))
297 0 : param->mysql_add_word(param, (char *) w.pos, w.len, &info);
298 0 : return(0);
299 : }
300 :
301 :
302 : static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
303 : struct st_mysql_ftparser *parser)
304 0 : {
305 : MYSQL_FTPARSER_PARAM *param;
306 : MY_FTB_PARAM ftb_param;
307 0 : DBUG_ENTER("_ftb_parse_query");
308 0 : DBUG_ASSERT(parser);
309 :
310 0 : if (ftb->state != UNINITIALIZED)
311 0 : DBUG_RETURN(0);
312 0 : if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
313 0 : DBUG_RETURN(1);
314 :
315 0 : ftb_param.ftb= ftb;
316 0 : ftb_param.depth= 0;
317 0 : ftb_param.ftbe= ftb->root;
318 0 : ftb_param.up_quot= 0;
319 :
320 0 : param->mysql_parse= ftb_parse_query_internal;
321 0 : param->mysql_add_word= ftb_query_add_word;
322 0 : param->mysql_ftparam= (void *)&ftb_param;
323 0 : param->cs= ftb->charset;
324 0 : param->doc= (char*) query;
325 0 : param->length= len;
326 0 : param->flags= 0;
327 0 : param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
328 0 : DBUG_RETURN(parser->parse(param));
329 : }
330 :
331 :
332 : static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)),
333 : const void *a,const void *b)
334 0 : {
335 0 : return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
336 : }
337 :
338 :
339 : /* returns 1 if the search was finished (must-word wasn't found) */
340 :
341 : static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
342 0 : {
343 : int r;
344 0 : int subkeys=1;
345 : my_bool can_go_down;
346 0 : MARIA_HA *info=ftb->info;
347 0 : uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength;
348 0 : uchar *lastkey_buf= ftbw->word+ftbw->off;
349 : MARIA_KEY key;
350 0 : LINT_INIT(off);
351 :
352 0 : if (ftbw->flags & FTB_FLAG_TRUNC)
353 0 : lastkey_buf+=ftbw->len;
354 :
355 0 : if (init_search)
356 : {
357 0 : ftbw->key_root=info->s->state.key_root[ftb->keynr];
358 0 : ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
359 0 : key.keyinfo= ftbw->keyinfo;
360 0 : key.data= ftbw->word;
361 0 : key.data_length= ftbw->len;
362 0 : key.ref_length= 0;
363 0 : key.flag= 0;
364 :
365 0 : r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
366 : }
367 : else
368 : {
369 0 : uint sflag= SEARCH_BIGGER;
370 0 : my_off_t max_docid=0;
371 : FTB_EXPR *tmp;
372 :
373 0 : for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
374 0 : set_if_bigger(max_docid, tmp->max_docid);
375 :
376 0 : if (ftbw->docid[0] < max_docid)
377 : {
378 0 : sflag|= SEARCH_SAME;
379 0 : _ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN),
380 : max_docid);
381 : }
382 :
383 0 : key.keyinfo= ftbw->keyinfo;
384 0 : key.data= lastkey_buf;
385 0 : key.data_length= USE_WHOLE_KEY;
386 0 : key.ref_length= 0;
387 0 : key.flag= 0;
388 :
389 0 : r= _ma_search(info, &key, sflag, ftbw->key_root);
390 : }
391 :
392 0 : can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
393 : /* Skip rows inserted by concurrent insert */
394 0 : while (!r)
395 : {
396 0 : if (can_go_down)
397 : {
398 : /* going down ? */
399 0 : off= info->last_key.data_length + info->last_key.ref_length - extra;
400 0 : subkeys=ft_sintXkorr(info->last_key.data + off);
401 : }
402 0 : if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length)
403 : break;
404 0 : r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root);
405 : }
406 :
407 0 : if (!r && !ftbw->off)
408 : {
409 0 : r= ha_compare_text(ftb->charset,
410 : info->last_key.data+1,
411 : info->last_key.data_length + info->last_key.ref_length-
412 : extra-1,
413 : (uchar*) ftbw->word+1,
414 : ftbw->len-1,
415 : (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0);
416 : }
417 :
418 0 : if (r) /* not found */
419 : {
420 0 : if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
421 : {
422 0 : ftbw->docid[0]=HA_OFFSET_ERROR;
423 0 : if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
424 : {
425 : /*
426 : This word MUST BE present in every document returned,
427 : so we can stop the search right now
428 : */
429 0 : ftb->state=INDEX_DONE;
430 0 : return 1; /* search is done */
431 : }
432 : else
433 0 : return 0;
434 : }
435 :
436 : /* going up to the first-level tree to continue search there */
437 0 : _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
438 0 : ftbw->key_root=info->s->state.key_root[ftb->keynr];
439 0 : ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
440 0 : ftbw->off=0;
441 0 : return _ft2_search(ftb, ftbw, 0);
442 : }
443 :
444 : /* matching key found */
445 0 : memcpy(lastkey_buf, info->last_key.data,
446 : info->last_key.data_length + info->last_key.ref_length);
447 0 : if (lastkey_buf == ftbw->word)
448 0 : ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra;
449 :
450 : /* going down ? */
451 0 : if (subkeys<0)
452 : {
453 : /*
454 : yep, going down, to the second-level tree
455 : TODO here: subkey-based optimization
456 : */
457 0 : ftbw->off=off;
458 0 : ftbw->key_root= info->cur_row.lastpos;
459 0 : ftbw->keyinfo=& info->s->ft2_keyinfo;
460 0 : r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root);
461 0 : DBUG_ASSERT(r==0); /* found something */
462 0 : memcpy(lastkey_buf+off, info->last_key.data,
463 : info->last_key.data_length + info->last_key.ref_length);
464 : }
465 0 : ftbw->docid[0]= info->cur_row.lastpos;
466 0 : if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
467 0 : ftbw->max_docid_expr->max_docid= info->cur_row.lastpos;
468 0 : return 0;
469 : }
470 :
471 : static void _ftb_init_index_search(FT_INFO *ftb)
472 0 : {
473 : int i;
474 : FTB_WORD *ftbw;
475 :
476 0 : if ((ftb->state != READY && ftb->state !=INDEX_DONE) ||
477 : ftb->keynr == NO_SUCH_KEY)
478 : return;
479 0 : ftb->state=INDEX_SEARCH;
480 :
481 0 : for (i=ftb->queue.elements; i; i--)
482 : {
483 0 : ftbw=(FTB_WORD *)(ftb->queue.root[i]);
484 :
485 0 : if (ftbw->flags & FTB_FLAG_TRUNC)
486 : {
487 : /*
488 : special treatment for truncation operator
489 : 1. there are some (besides this) +words
490 : | no need to search in the index, it can never ADD new rows
491 : | to the result, and to remove half-matched rows we do scan anyway
492 : 2. -trunc*
493 : | same as 1.
494 : 3. in 1 and 2, +/- need not be on the same expr. level,
495 : but can be on any upper level, as in +word +(trunc1* trunc2*)
496 : 4. otherwise
497 : | We have to index-search for this prefix.
498 : | It may cause duplicates, as in the index (sorted by <word,docid>)
499 : | <aaaa,row1>
500 : | <aabb,row2>
501 : | <aacc,row1>
502 : | Searching for "aa*" will find row1 twice...
503 : */
504 : FTB_EXPR *ftbe;
505 0 : for (ftbe=(FTB_EXPR*)ftbw;
506 0 : ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
507 0 : ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
508 : {
509 0 : if (ftbe->flags & FTB_FLAG_NO || /* 2 */
510 : ftbe->up->ythresh - ftbe->up->yweaks >
511 : (uint) test(ftbe->flags & FTB_FLAG_YES)) /* 1 */
512 : {
513 0 : FTB_EXPR *top_ftbe=ftbe->up;
514 0 : ftbw->docid[0]=HA_OFFSET_ERROR;
515 0 : for (ftbe=(FTB_EXPR *)ftbw;
516 0 : ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
517 0 : ftbe=ftbe->up)
518 0 : ftbe->up->yweaks++;
519 0 : ftbe=0;
520 0 : break;
521 : }
522 : }
523 0 : if (!ftbe)
524 0 : continue;
525 : /* 4 */
526 0 : if (!is_tree_inited(& ftb->no_dupes))
527 0 : init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
528 : _ftb_no_dupes_cmp,0,0,0);
529 : else
530 0 : reset_tree(& ftb->no_dupes);
531 : }
532 :
533 0 : ftbw->off=0; /* in case of reinit */
534 0 : if (_ft2_search(ftb, ftbw, 1))
535 0 : return;
536 : }
537 0 : queue_fix(& ftb->queue);
538 : }
539 :
540 :
541 : FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr,
542 : uchar *query,
543 : uint query_len, CHARSET_INFO *cs)
544 0 : {
545 : FTB *ftb;
546 : FTB_EXPR *ftbe;
547 : FTB_WORD *ftbw;
548 :
549 0 : if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
550 0 : return 0;
551 0 : ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean;
552 0 : ftb->state=UNINITIALIZED;
553 0 : ftb->info=info;
554 0 : ftb->keynr=keynr;
555 0 : ftb->charset=cs;
556 0 : DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
557 0 : ftb->with_scan=0;
558 0 : ftb->lastpos=HA_OFFSET_ERROR;
559 0 : bzero(& ftb->no_dupes, sizeof(TREE));
560 0 : ftb->last_word= 0;
561 :
562 0 : init_alloc_root(&ftb->mem_root, 1024, 1024);
563 0 : ftb->queue.max_elements= 0;
564 0 : if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
565 0 : goto err;
566 0 : ftbe->weight=1;
567 0 : ftbe->flags=FTB_FLAG_YES;
568 0 : ftbe->nos=1;
569 0 : ftbe->up=0;
570 0 : ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
571 0 : ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
572 0 : ftbe->phrase= NULL;
573 0 : ftbe->document= 0;
574 0 : ftb->root=ftbe;
575 0 : if (unlikely(_ftb_parse_query(ftb, query, query_len,
576 : keynr == NO_SUCH_KEY ? &ft_default_parser :
577 : info->s->keyinfo[keynr].parser)))
578 0 : goto err;
579 : /*
580 : Hack: instead of init_queue, we'll use reinit queue to be able
581 : to alloc queue with alloc_root()
582 : */
583 0 : if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
584 : (ftb->queue.max_elements + 1) *
585 : sizeof(void *))))
586 0 : goto err;
587 0 : reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
588 : (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
589 0 : for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
590 0 : queue_insert(&ftb->queue, (uchar *)ftbw);
591 0 : ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
592 : sizeof(FTB_WORD *)*ftb->queue.elements);
593 0 : memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
594 0 : my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
595 : (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
596 0 : if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
597 0 : ftb->state=READY;
598 0 : return ftb;
599 0 : err:
600 0 : free_root(& ftb->mem_root, MYF(0));
601 0 : my_free(ftb, MYF(0));
602 0 : return 0;
603 : }
604 :
605 :
606 : typedef struct st_my_ftb_phrase_param
607 : {
608 : LIST *phrase;
609 : LIST *document;
610 : CHARSET_INFO *cs;
611 : uint phrase_length;
612 : uint document_length;
613 : uint match;
614 : } MY_FTB_PHRASE_PARAM;
615 :
616 :
617 : static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
618 : char *word, int word_len,
619 : MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
620 0 : {
621 0 : MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
622 0 : FT_WORD *w= (FT_WORD *)phrase_param->document->data;
623 : LIST *phrase, *document;
624 0 : w->pos= (uchar*) word;
625 0 : w->len= word_len;
626 0 : phrase_param->document= phrase_param->document->prev;
627 0 : if (phrase_param->phrase_length > phrase_param->document_length)
628 : {
629 0 : phrase_param->document_length++;
630 0 : return 0;
631 : }
632 : /* TODO: rewrite phrase search to avoid
633 : comparing the same word twice. */
634 0 : for (phrase= phrase_param->phrase, document= phrase_param->document->next;
635 0 : phrase; phrase= phrase->next, document= document->next)
636 : {
637 0 : FT_WORD *phrase_word= (FT_WORD *)phrase->data;
638 0 : FT_WORD *document_word= (FT_WORD *)document->data;
639 0 : if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
640 : phrase_word->len,
641 : (uchar*) document_word->pos, document_word->len))
642 0 : return 0;
643 : }
644 0 : phrase_param->match++;
645 0 : return 0;
646 : }
647 :
648 :
649 : static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
650 : char *document, int len)
651 0 : {
652 : FT_WORD word;
653 0 : MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
654 0 : const uchar *docend= (uchar*) document + len;
655 0 : while (maria_ft_simple_get_word(phrase_param->cs, (uchar**) &document,
656 : docend, &word, FALSE))
657 : {
658 0 : param->mysql_add_word(param, (char*) word.pos, word.len, 0);
659 0 : if (phrase_param->match)
660 0 : break;
661 : }
662 0 : return 0;
663 : }
664 :
665 :
666 : /*
667 : Checks if given buffer matches phrase list.
668 :
669 : SYNOPSIS
670 : _ftb_check_phrase()
671 : s0 start of buffer
672 : e0 end of buffer
673 : phrase broken into list phrase
674 : cs charset info
675 :
676 : RETURN VALUE
677 : 1 is returned if phrase found, 0 else.
678 : -1 is returned if error occurs.
679 : */
680 :
681 : static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
682 : FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
683 0 : {
684 : MY_FTB_PHRASE_PARAM ftb_param;
685 : MYSQL_FTPARSER_PARAM *param;
686 0 : DBUG_ENTER("_ftb_check_phrase");
687 0 : DBUG_ASSERT(parser);
688 :
689 0 : if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
690 0 : DBUG_RETURN(0);
691 0 : ftb_param.phrase= ftbe->phrase;
692 0 : ftb_param.document= ftbe->document;
693 0 : ftb_param.cs= ftb->charset;
694 0 : ftb_param.phrase_length= list_length(ftbe->phrase);
695 0 : ftb_param.document_length= 1;
696 0 : ftb_param.match= 0;
697 :
698 0 : param->mysql_parse= ftb_check_phrase_internal;
699 0 : param->mysql_add_word= ftb_phrase_add_word;
700 0 : param->mysql_ftparam= (void *)&ftb_param;
701 0 : param->cs= ftb->charset;
702 0 : param->doc= (char *) document;
703 0 : param->length= len;
704 0 : param->flags= 0;
705 0 : param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
706 0 : if (unlikely(parser->parse(param)))
707 0 : return -1;
708 0 : DBUG_RETURN(ftb_param.match ? 1 : 0);
709 : }
710 :
711 :
712 : static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
713 0 : {
714 : FT_SEG_ITERATOR ftsi;
715 : FTB_EXPR *ftbe;
716 0 : float weight=ftbw->weight;
717 0 : int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
718 0 : my_off_t curdoc=ftbw->docid[mode];
719 : struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
720 : &ft_default_parser :
721 0 : ftb->info->s->keyinfo[ftb->keynr].parser;
722 :
723 0 : for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
724 : {
725 0 : ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
726 0 : if (ftbe->docid[mode] != curdoc)
727 : {
728 0 : ftbe->cur_weight=0;
729 0 : ftbe->yesses=ftbe->nos=0;
730 0 : ftbe->docid[mode]=curdoc;
731 : }
732 0 : if (ftbe->nos)
733 0 : break;
734 0 : if (yn_flag & FTB_FLAG_YES)
735 : {
736 0 : weight /= ftbe->ythresh;
737 0 : ftbe->cur_weight += weight;
738 0 : if ((int) ++ftbe->yesses == ythresh)
739 : {
740 0 : yn_flag=ftbe->flags;
741 0 : weight=ftbe->cur_weight*ftbe->weight;
742 0 : if (mode && ftbe->phrase)
743 : {
744 0 : int found= 0;
745 :
746 0 : memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
747 0 : while (_ma_ft_segiterator(&ftsi) && !found)
748 : {
749 0 : if (!ftsi.pos)
750 0 : continue;
751 0 : found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
752 0 : if (unlikely(found < 0))
753 0 : return 1;
754 : }
755 0 : if (!found)
756 : break;
757 : } /* ftbe->quot */
758 : }
759 : else
760 0 : break;
761 : }
762 : else
763 0 : if (yn_flag & FTB_FLAG_NO)
764 : {
765 : /*
766 : NOTE: special sort function of queue assures that all
767 : (yn_flag & FTB_FLAG_NO) != 0
768 : events for every particular subexpression will
769 : "auto-magically" happen BEFORE all the
770 : (yn_flag & FTB_FLAG_YES) != 0 events. So no
771 : already matched expression can become not-matched again.
772 : */
773 0 : ++ftbe->nos;
774 0 : break;
775 : }
776 : else
777 : {
778 0 : if (ftbe->ythresh)
779 0 : weight/=3;
780 0 : ftbe->cur_weight += weight;
781 0 : if ((int) ftbe->yesses < ythresh)
782 0 : break;
783 0 : if (!(yn_flag & FTB_FLAG_WONLY))
784 0 : yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
785 0 : weight*= ftbe->weight;
786 : }
787 : }
788 0 : return 0;
789 : }
790 :
791 :
792 : int maria_ft_boolean_read_next(FT_INFO *ftb, char *record)
793 0 : {
794 : FTB_EXPR *ftbe;
795 : FTB_WORD *ftbw;
796 0 : MARIA_HA *info=ftb->info;
797 : my_off_t curdoc;
798 :
799 0 : if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
800 0 : return -1;
801 :
802 : /* black magic ON */
803 0 : if ((int) _ma_check_index(info, ftb->keynr) < 0)
804 0 : return my_errno;
805 0 : if (_ma_readinfo(info, F_RDLCK, 1))
806 0 : return my_errno;
807 : /* black magic OFF */
808 :
809 0 : if (!ftb->queue.elements)
810 0 : return my_errno=HA_ERR_END_OF_FILE;
811 :
812 : /* Attention!!! Address of a local variable is used here! See err: label */
813 0 : ftb->queue.first_cmp_arg=(void *)&curdoc;
814 :
815 0 : while (ftb->state == INDEX_SEARCH &&
816 : (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
817 : HA_OFFSET_ERROR)
818 : {
819 0 : while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
820 : {
821 0 : if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
822 : {
823 0 : my_errno= HA_ERR_OUT_OF_MEM;
824 0 : goto err;
825 : }
826 :
827 : /* update queue */
828 0 : _ft2_search(ftb, ftbw, 0);
829 0 : queue_replaced(& ftb->queue);
830 : }
831 :
832 0 : ftbe=ftb->root;
833 0 : if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
834 : ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
835 : {
836 : /* curdoc matched ! */
837 0 : if (is_tree_inited(&ftb->no_dupes) &&
838 : tree_insert(&ftb->no_dupes, &curdoc, 0,
839 : ftb->no_dupes.custom_arg)->count >1)
840 : /* but it managed already to get past this line once */
841 0 : continue;
842 :
843 0 : info->cur_row.lastpos= curdoc;
844 : /* Clear all states, except that the table was updated */
845 0 : info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
846 :
847 0 : if (!(*info->read_record)(info, (uchar *) record, curdoc))
848 : {
849 0 : info->update|= HA_STATE_AKTIV; /* Record is read */
850 0 : if (ftb->with_scan &&
851 : maria_ft_boolean_find_relevance(ftb, (uchar *) record, 0)==0)
852 0 : continue; /* no match */
853 0 : my_errno=0;
854 0 : goto err;
855 : }
856 : goto err;
857 : }
858 : }
859 0 : ftb->state=INDEX_DONE;
860 0 : my_errno=HA_ERR_END_OF_FILE;
861 0 : err:
862 0 : ftb->queue.first_cmp_arg=(void *)0;
863 0 : return my_errno;
864 : }
865 :
866 :
867 : typedef struct st_my_ftb_find_param
868 : {
869 : FT_INFO *ftb;
870 : FT_SEG_ITERATOR *ftsi;
871 : } MY_FTB_FIND_PARAM;
872 :
873 :
874 : static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
875 : char *word, int len,
876 : MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
877 0 : {
878 0 : MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
879 0 : FT_INFO *ftb= ftb_param->ftb;
880 : FTB_WORD *ftbw;
881 : int a, b, c;
882 : /*
883 : Find right-most element in the array of query words matching this
884 : word from a document.
885 : */
886 0 : for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
887 : {
888 0 : ftbw= ftb->list[c];
889 0 : if (ha_compare_text(ftb->charset, (uchar*)word, len,
890 : (uchar*)ftbw->word+1, ftbw->len-1,
891 : (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) < 0)
892 0 : b= c;
893 : else
894 0 : a= c;
895 : }
896 : /*
897 : If there were no words with truncation operator, we iterate to the
898 : beginning of an array until array element is equal to the word from
899 : a document. This is done mainly because the same word may be
900 : mentioned twice (or more) in the query.
901 :
902 : In case query has words with truncation operator we must iterate
903 : to the beginning of the array. There may be non-matching query words
904 : between matching word with truncation operator and the right-most
905 : matching element. E.g., if we're looking for 'aaa15' in an array of
906 : 'aaa1* aaa14 aaa15 aaa16'.
907 :
908 : Worse of that there still may be match even if the binary search
909 : above didn't find matching element. E.g., if we're looking for
910 : 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
911 : stop at 'aaa16'.
912 : */
913 0 : for (; c >= 0; c--)
914 : {
915 0 : ftbw= ftb->list[c];
916 0 : if (ha_compare_text(ftb->charset, (uchar*)word, len,
917 : (uchar*)ftbw->word + 1,ftbw->len - 1,
918 : (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
919 : {
920 0 : if (ftb->with_scan & FTB_FLAG_TRUNC)
921 : continue;
922 : else
923 0 : break;
924 : }
925 0 : if (ftbw->docid[1] == ftb->info->cur_row.lastpos)
926 0 : continue;
927 0 : ftbw->docid[1]= ftb->info->cur_row.lastpos;
928 0 : if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
929 0 : return 1;
930 : }
931 0 : return(0);
932 : }
933 :
934 :
935 : static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
936 : char *doc, int len)
937 0 : {
938 0 : MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
939 0 : FT_INFO *ftb= ftb_param->ftb;
940 0 : uchar *end= (uchar*) doc + len;
941 : FT_WORD w;
942 0 : while (maria_ft_simple_get_word(ftb->charset, (uchar**) &doc,
943 : end, &w, TRUE))
944 0 : param->mysql_add_word(param, (char *) w.pos, w.len, 0);
945 0 : return(0);
946 : }
947 :
948 :
949 : float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
950 0 : {
951 : FTB_EXPR *ftbe;
952 : FT_SEG_ITERATOR ftsi, ftsi2;
953 0 : MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos;
954 : MY_FTB_FIND_PARAM ftb_param;
955 : MYSQL_FTPARSER_PARAM *param;
956 : struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
957 : &ft_default_parser :
958 0 : ftb->info->s->keyinfo[ftb->keynr].parser;
959 :
960 0 : if (docid == HA_OFFSET_ERROR)
961 0 : return -2.0;
962 0 : if (!ftb->queue.elements)
963 0 : return 0;
964 0 : if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
965 0 : return 0;
966 :
967 0 : if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
968 : {
969 : FTB_EXPR *x;
970 : uint i;
971 :
972 0 : for (i=0; i < ftb->queue.elements; i++)
973 : {
974 0 : ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
975 0 : for (x=ftb->list[i]->up; x; x=x->up)
976 0 : x->docid[1]=HA_OFFSET_ERROR;
977 : }
978 : }
979 :
980 0 : ftb->lastpos=docid;
981 :
982 0 : if (ftb->keynr==NO_SUCH_KEY)
983 0 : _ma_ft_segiterator_dummy_init(record, length, &ftsi);
984 : else
985 0 : _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
986 0 : memcpy(&ftsi2, &ftsi, sizeof(ftsi));
987 :
988 0 : ftb_param.ftb= ftb;
989 0 : ftb_param.ftsi= &ftsi2;
990 0 : param->mysql_parse= ftb_find_relevance_parse;
991 0 : param->mysql_add_word= ftb_find_relevance_add_word;
992 0 : param->mysql_ftparam= (void *)&ftb_param;
993 0 : param->flags= 0;
994 0 : param->cs= ftb->charset;
995 0 : param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
996 :
997 0 : while (_ma_ft_segiterator(&ftsi))
998 : {
999 0 : if (!ftsi.pos)
1000 0 : continue;
1001 0 : param->doc= (char *)ftsi.pos;
1002 0 : param->length= ftsi.len;
1003 0 : if (unlikely(parser->parse(param)))
1004 0 : return 0;
1005 : }
1006 0 : ftbe=ftb->root;
1007 0 : if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
1008 : ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
1009 : { /* row matched ! */
1010 0 : return ftbe->cur_weight;
1011 : }
1012 : else
1013 : { /* match failed ! */
1014 0 : return 0.0;
1015 : }
1016 : }
1017 :
1018 :
1019 : void maria_ft_boolean_close_search(FT_INFO *ftb)
1020 0 : {
1021 0 : if (is_tree_inited(& ftb->no_dupes))
1022 : {
1023 0 : delete_tree(& ftb->no_dupes);
1024 : }
1025 0 : free_root(& ftb->mem_root, MYF(0));
1026 0 : my_free(ftb, MYF(0));
1027 : }
1028 :
1029 :
1030 : float maria_ft_boolean_get_relevance(FT_INFO *ftb)
1031 0 : {
1032 0 : return ftb->root->cur_weight;
1033 : }
1034 :
1035 :
1036 : void maria_ft_boolean_reinit_search(FT_INFO *ftb)
1037 0 : {
1038 0 : _ftb_init_index_search(ftb);
1039 : }
|