1 : /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 :
3 : This program is free software; you can redistribute it and/or modify
4 : it under the terms of the GNU General Public License as published by
5 : the Free Software Foundation; version 2 of the License.
6 :
7 : This program is distributed in the hope that it will be useful,
8 : but WITHOUT ANY WARRANTY; without even the implied warranty of
9 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 : GNU General Public License for more details.
11 :
12 : You should have received a copy of the GNU General Public License
13 : along with this program; if not, write to the Free Software
14 : Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
15 :
16 : /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
17 :
18 : #define FT_CORE
19 : #include "ma_ftdefs.h"
20 :
21 : /* search with natural language queries */
22 :
23 : typedef struct ft_doc_rec
24 : {
25 : my_off_t dpos;
26 : double weight;
27 : } FT_DOC;
28 :
29 : struct st_ft_info
30 : {
31 : struct _ft_vft *please;
32 : MARIA_HA *info;
33 : int ndocs;
34 : int curdoc;
35 : FT_DOC doc[1];
36 : };
37 :
38 : typedef struct st_all_in_one
39 : {
40 : MARIA_HA *info;
41 : uint keynr;
42 : CHARSET_INFO *charset;
43 : uchar *keybuff;
44 : TREE dtree;
45 : } ALL_IN_ONE;
46 :
47 : typedef struct st_ft_superdoc
48 : {
49 : FT_DOC doc;
50 : FT_WORD *word_ptr;
51 : double tmp_weight;
52 : } FT_SUPERDOC;
53 :
54 : static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
55 : FT_SUPERDOC *p1, FT_SUPERDOC *p2)
56 0 : {
57 0 : if (p1->doc.dpos < p2->doc.dpos)
58 0 : return -1;
59 0 : if (p1->doc.dpos == p2->doc.dpos)
60 0 : return 0;
61 0 : return 1;
62 : }
63 :
64 : static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
65 0 : {
66 : int subkeys, r;
67 : uint doc_cnt;
68 : FT_SUPERDOC sdoc, *sptr;
69 : TREE_ELEMENT *selem;
70 0 : double gweight=1;
71 0 : MARIA_HA *info= aio->info;
72 0 : uchar *keybuff= aio->keybuff;
73 0 : MARIA_KEYDEF *keyinfo= info->s->keyinfo+aio->keynr;
74 0 : my_off_t key_root=info->s->state.key_root[aio->keynr];
75 0 : uint extra=HA_FT_WLEN+info->s->base.rec_reflength;
76 : MARIA_KEY key;
77 : #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
78 : float tmp_weight;
79 : #else
80 : #error
81 : #endif
82 0 : DBUG_ENTER("walk_and_match");
83 :
84 0 : word->weight=LWS_FOR_QUERY;
85 :
86 0 : _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0);
87 0 : key.data_length-= HA_FT_WLEN;
88 0 : doc_cnt=0;
89 :
90 : /* Skip rows inserted by current inserted */
91 0 : for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ;
92 0 : !r &&
93 : (subkeys=ft_sintXkorr(info->last_key.data +
94 : info->last_key.data_length +
95 : info->last_key.ref_length - extra)) > 0 &&
96 : info->cur_row.lastpos >= info->state->data_file_length ;
97 0 : r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root))
98 : ;
99 :
100 0 : info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */
101 :
102 : /* The following should be safe, even if we compare doubles */
103 0 : while (!r && gweight)
104 : {
105 :
106 0 : if (key.data_length &&
107 : ha_compare_text(aio->charset,
108 : info->last_key.data+1,
109 : info->last_key.data_length +
110 : info->last_key.ref_length - extra - 1,
111 : key.data+1, key.data_length-1, 0, 0))
112 0 : break;
113 :
114 0 : if (subkeys<0)
115 : {
116 0 : if (doc_cnt)
117 0 : DBUG_RETURN(1); /* index is corrupted */
118 : /*
119 : TODO here: unsafe optimization, should this word
120 : be skipped (based on subkeys) ?
121 : */
122 0 : keybuff+= key.data_length;
123 0 : keyinfo= &info->s->ft2_keyinfo;
124 0 : key_root= info->cur_row.lastpos;
125 0 : key.data_length= 0;
126 0 : r= _ma_search_first(info, keyinfo, key_root);
127 0 : goto do_skip;
128 : }
129 : #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
130 0 : tmp_weight=*(float*)&subkeys;
131 : #else
132 : #error
133 : #endif
134 : /* The following should be safe, even if we compare doubles */
135 0 : if (tmp_weight==0)
136 0 : DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
137 :
138 0 : sdoc.doc.dpos= info->cur_row.lastpos;
139 :
140 : /* saving document matched into dtree */
141 0 : if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
142 0 : DBUG_RETURN(1);
143 :
144 0 : sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
145 :
146 0 : if (selem->count==1) /* document's first match */
147 0 : sptr->doc.weight=0;
148 : else
149 0 : sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
150 :
151 0 : sptr->word_ptr=word;
152 0 : sptr->tmp_weight=tmp_weight;
153 :
154 0 : doc_cnt++;
155 :
156 0 : gweight=word->weight*GWS_IN_USE;
157 0 : if (gweight < 0 || doc_cnt > 2000000)
158 0 : gweight=0;
159 :
160 0 : if (_ma_test_if_changed(info) == 0)
161 0 : r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
162 : else
163 0 : r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root);
164 : do_skip:
165 0 : while ((subkeys=ft_sintXkorr(info->last_key.data +
166 : info->last_key.data_length +
167 : info->last_key.ref_length - extra)) > 0 &&
168 : !r && info->cur_row.lastpos >= info->state->data_file_length)
169 0 : r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
170 :
171 : }
172 0 : word->weight=gweight;
173 :
174 0 : DBUG_RETURN(0);
175 : }
176 :
177 :
178 : static int walk_and_copy(FT_SUPERDOC *from,
179 : uint32 count __attribute__((unused)), FT_DOC **to)
180 0 : {
181 0 : DBUG_ENTER("walk_and_copy");
182 0 : from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
183 0 : (*to)->dpos=from->doc.dpos;
184 0 : (*to)->weight=from->doc.weight;
185 0 : (*to)++;
186 0 : DBUG_RETURN(0);
187 : }
188 :
189 : static int walk_and_push(FT_SUPERDOC *from,
190 : uint32 count __attribute__((unused)), QUEUE *best)
191 0 : {
192 0 : DBUG_ENTER("walk_and_copy");
193 0 : from->doc.weight+=from->tmp_weight*from->word_ptr->weight;
194 0 : set_if_smaller(best->elements, ft_query_expansion_limit-1);
195 0 : queue_insert(best, (uchar *)& from->doc);
196 0 : DBUG_RETURN(0);
197 : }
198 :
199 :
200 : static int FT_DOC_cmp(void *unused __attribute__((unused)),
201 : FT_DOC *a, FT_DOC *b)
202 0 : {
203 0 : return sgn(b->weight - a->weight);
204 : }
205 :
206 :
207 : FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query,
208 : uint query_len, uint flags, uchar *record)
209 0 : {
210 : TREE wtree;
211 : ALL_IN_ONE aio;
212 : FT_DOC *dptr;
213 0 : FT_INFO *dlist=NULL;
214 0 : MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos;
215 : struct st_mysql_ftparser *parser;
216 : MYSQL_FTPARSER_PARAM *ftparser_param;
217 0 : DBUG_ENTER("maria_ft_init_nlq_search");
218 :
219 : /* black magic ON */
220 0 : if ((int) (keynr = _ma_check_index(info,keynr)) < 0)
221 0 : DBUG_RETURN(NULL);
222 0 : if (_ma_readinfo(info,F_RDLCK,1))
223 0 : DBUG_RETURN(NULL);
224 : /* black magic OFF */
225 :
226 0 : aio.info=info;
227 0 : aio.keynr=keynr;
228 0 : aio.charset=info->s->keyinfo[keynr].seg->charset;
229 0 : aio.keybuff= info->lastkey_buff2;
230 0 : parser= info->s->keyinfo[keynr].parser;
231 0 : if (! (ftparser_param= maria_ftparser_call_initializer(info, keynr, 0)))
232 0 : goto err;
233 :
234 0 : bzero(&wtree,sizeof(wtree));
235 :
236 0 : init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
237 : NULL, NULL);
238 :
239 0 : maria_ft_parse_init(&wtree, aio.charset);
240 0 : ftparser_param->flags= 0;
241 0 : if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param,
242 : &wtree.mem_root))
243 0 : goto err;
244 :
245 0 : if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
246 : left_root_right))
247 0 : goto err;
248 :
249 0 : if (flags & FT_EXPAND && ft_query_expansion_limit)
250 : {
251 : QUEUE best;
252 0 : init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
253 : 0);
254 0 : tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
255 : &best, left_root_right);
256 0 : while (best.elements)
257 : {
258 0 : my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
259 0 : if (!(*info->read_record)(info, record, docid))
260 : {
261 0 : info->update|= HA_STATE_AKTIV;
262 0 : ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY;
263 0 : if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param,
264 : &wtree.mem_root)))
265 : {
266 0 : delete_queue(&best);
267 0 : goto err;
268 : }
269 : }
270 : }
271 0 : delete_queue(&best);
272 0 : reset_tree(&aio.dtree);
273 0 : if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
274 : left_root_right))
275 0 : goto err;
276 :
277 : }
278 :
279 : /*
280 : If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
281 : so if ndocs == 0, FT_INFO.doc[] must not be accessed.
282 : */
283 0 : dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
284 : sizeof(FT_DOC)*
285 : (int)(aio.dtree.elements_in_tree-1),
286 : MYF(0));
287 0 : if (!dlist)
288 0 : goto err;
289 :
290 0 : dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq;
291 0 : dlist->ndocs=aio.dtree.elements_in_tree;
292 0 : dlist->curdoc=-1;
293 0 : dlist->info=aio.info;
294 0 : dptr=dlist->doc;
295 :
296 0 : tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
297 : &dptr, left_root_right);
298 :
299 0 : if (flags & FT_SORTED)
300 0 : my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC),
301 : (qsort2_cmp)&FT_DOC_cmp, 0);
302 :
303 0 : err:
304 0 : delete_tree(&aio.dtree);
305 0 : delete_tree(&wtree);
306 0 : info->cur_row.lastpos= saved_lastpos;
307 0 : DBUG_RETURN(dlist);
308 : }
309 :
310 :
311 : int maria_ft_nlq_read_next(FT_INFO *handler, char *record)
312 0 : {
313 0 : MARIA_HA *info= (MARIA_HA *) handler->info;
314 :
315 0 : if (++handler->curdoc >= handler->ndocs)
316 : {
317 0 : --handler->curdoc;
318 0 : return HA_ERR_END_OF_FILE;
319 : }
320 :
321 0 : info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
322 :
323 0 : info->cur_row.lastpos= handler->doc[handler->curdoc].dpos;
324 0 : if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos))
325 : {
326 0 : info->update|= HA_STATE_AKTIV; /* Record is read */
327 0 : return 0;
328 : }
329 0 : return my_errno;
330 : }
331 :
332 :
333 : float maria_ft_nlq_find_relevance(FT_INFO *handler,
334 : uchar *record __attribute__((unused)),
335 : uint length __attribute__((unused)))
336 0 : {
337 : int a,b,c;
338 0 : FT_DOC *docs=handler->doc;
339 0 : MARIA_RECORD_POS docid= handler->info->cur_row.lastpos;
340 :
341 0 : if (docid == HA_POS_ERROR)
342 0 : return -5.0;
343 :
344 : /* Assuming docs[] is sorted by dpos... */
345 :
346 0 : for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2)
347 : {
348 0 : if (docs[c].dpos > docid)
349 0 : b=c;
350 : else
351 0 : a=c;
352 : }
353 : /* bounds check to avoid accessing unallocated handler->doc */
354 0 : if (a < handler->ndocs && docs[a].dpos == docid)
355 0 : return (float) docs[a].weight;
356 : else
357 0 : return 0.0;
358 : }
359 :
360 :
361 : void maria_ft_nlq_close_search(FT_INFO *handler)
362 0 : {
363 0 : my_free(handler, MYF(0));
364 : }
365 :
366 :
367 : float maria_ft_nlq_get_relevance(FT_INFO *handler)
368 0 : {
369 0 : return (float) handler->doc[handler->curdoc].weight;
370 : }
371 :
372 :
373 : void maria_ft_nlq_reinit_search(FT_INFO *handler)
374 0 : {
375 0 : handler->curdoc=-1;
376 : }
377 :
|