← Back to team overview

maria-developers team mailing list archive

bzr commit into Mariadb 5.2, with Maria 2.0:maria/5.2 branch (igor:2742)


#At lp:maria/5.2 based on revid:igor@xxxxxxxxxxxx-20091222151209-8ijjadlltdvmr1dy

 2742 Igor Babaev	2009-12-22 [merge]

=== modified file 'include/my_handler.h'
--- a/include/my_handler.h	2009-12-15 07:16:46 +0000
+++ b/include/my_handler.h	2009-12-22 12:33:21 +0000
@@ -138,11 +138,13 @@ extern void my_handler_error_unregister(
 typedef enum icp_result {
+  ICP_ERROR=-1,
 #ifdef	__cplusplus

=== modified file 'libmysqld/Makefile.am'
--- a/libmysqld/Makefile.am	2009-12-21 02:26:15 +0000
+++ b/libmysqld/Makefile.am	2009-12-22 12:49:15 +0000
@@ -45,7 +45,7 @@ libmysqlsources =	errmsg.c get_password.
 noinst_HEADERS =	embedded_priv.h emb_qcache.h
-sqlsources = ds_mrr.cc derror.cc field.cc field_conv.cc strfunc.cc filesort.cc \
+sqlsources = derror.cc field.cc field_conv.cc strfunc.cc filesort.cc \
 	     ha_ndbcluster.cc ha_ndbcluster_cond.cc \
 	ha_ndbcluster_binlog.cc ha_partition.cc \
 	handler.cc sql_handler.cc \
@@ -77,7 +77,8 @@ sqlsources = ds_mrr.cc derror.cc field.c
 	rpl_filter.cc sql_partition.cc sql_builtin.cc sql_plugin.cc \
 	sql_tablespace.cc \
 	rpl_injector.cc my_user.c partition_info.cc \
-	sql_servers.cc event_parse_data.cc opt_table_elimination.cc
+	sql_servers.cc event_parse_data.cc opt_table_elimination.cc \
+	multi_range_read.cc opt_index_cond_pushdown.cc
 libmysqld_int_a_SOURCES= $(libmysqld_sources)
 nodist_libmysqld_int_a_SOURCES= $(libmysqlsources) $(sqlsources)

=== modified file 'mysql-test/include/common-tests.inc'
--- a/mysql-test/include/common-tests.inc	2006-06-09 07:23:59 +0000
+++ b/mysql-test/include/common-tests.inc	2009-12-22 12:33:21 +0000
@@ -1332,7 +1332,7 @@ explain select fld1 from t2 where fld1=2
 # Search with a key with LIKE constant
 # If the like starts with a certain letter key will be used.
 select fld1,fld3 from t2 where companynr = 37 and fld3 like 'f%';
 select fld3 from t2 where fld3 like "L%" and fld3 = "ok";
 select fld3 from t2 where (fld3 like "C%" and fld3 = "Chantilly");

=== modified file 'mysql-test/r/ctype_cp1251.result'
--- a/mysql-test/r/ctype_cp1251.result	2009-12-15 07:16:46 +0000
+++ b/mysql-test/r/ctype_cp1251.result	2009-12-22 12:33:21 +0000
@@ -65,8 +65,8 @@ insert into t1 (a) values ('air'),
 select * from t1 where a like 'we_%';
 a	b
-we_toshko	NULL
-we_ivo	NULL
 we_iliyan	NULL
+we_ivo	NULL
 we_martin	NULL
+we_toshko	NULL
 drop table t1;

=== modified file 'mysql-test/r/index_merge_myisam.result'
--- a/mysql-test/r/index_merge_myisam.result	2009-12-21 02:26:15 +0000
+++ b/mysql-test/r/index_merge_myisam.result	2009-12-22 12:49:15 +0000
@@ -1419,19 +1419,19 @@ drop table t1;
 select @@optimizer_switch;
 set optimizer_switch='index_merge=off,index_merge_union=off';
 select @@optimizer_switch;
 set optimizer_switch='index_merge_union=on';
 select @@optimizer_switch;
 set optimizer_switch='default,index_merge_sort_union=off';
 select @@optimizer_switch;
 set optimizer_switch=4;
 ERROR 42000: Variable 'optimizer_switch' can't be set to the value of '4'
 set optimizer_switch=NULL;
@@ -1458,21 +1458,21 @@ set optimizer_switch=default;
 set optimizer_switch='index_merge=off,index_merge_union=off,default';
 select @@optimizer_switch;
 set optimizer_switch=default;
 select @@global.optimizer_switch;
 set @@global.optimizer_switch=default;
 select @@global.optimizer_switch;
 # Check index_merge's @@optimizer_switch flags
 select @@optimizer_switch;
 create table t0 (a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1 (a int, b int, c int, filler char(100), 
@@ -1582,5 +1582,5 @@ id	select_type	table	type	possible_keys	
 set optimizer_switch=default;
 show variables like 'optimizer_switch';
 Variable_name	Value
-optimizer_switch	index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on
+optimizer_switch	index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on
 drop table t0, t1;

=== modified file 'mysql-test/r/innodb_mrr.result'
--- a/mysql-test/r/innodb_mrr.result	2009-12-19 19:54:54 +0000
+++ b/mysql-test/r/innodb_mrr.result	2009-12-22 12:33:21 +0000
@@ -292,10 +292,10 @@ NULL	9	0
 NULL	9	0
 drop table t1, t2;
 set storage_engine= @save_storage_engine;
-set @read_rnd_buffer_size_save= @@read_rnd_buffer_size;
-set read_rnd_buffer_size=64;
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+set mrr_buffer_size=64;
-Warning	1292	Truncated incorrect read_rnd_buffer_size value: '64'
+Warning	1292	Truncated incorrect mrr_buffer_size value: '64'
 create table t1(a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t2(a char(8), b char(8), c char(8), filler char(100), key(a,b,c) ) engine=InnoDB;
@@ -318,10 +318,10 @@ filler char(10), key(d), primary key (a,
 insert into t2 select A.a, B.a, B.a, A.a, 'filler' from t1 A, t1 B;
 explain select * from t2 force index (d) where d < 10;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	d	d	5	NULL	53	Using index condition; Using MRR
+1	SIMPLE	t2	range	d	d	5	NULL	#	Using index condition; Using MRR
 drop table t2;
 drop table t1;
-set @@read_rnd_buffer_size= @read_rnd_buffer_size_save;
+set @@mrr_buffer_size= @mrr_buffer_size_save;
 create table t1 (f1 int not null, f2 int not null,f3 int not null, f4 char(1), primary key (f1,f2), key ix(f3))Engine=InnoDB;
 select * from t1 where (f3>=5 and f3<=10) or (f3>=1 and f3<=4);
 f1	f2	f3	f4

=== modified file 'mysql-test/r/myisam_mrr.result'
--- a/mysql-test/r/myisam_mrr.result	2009-12-15 17:23:55 +0000
+++ b/mysql-test/r/myisam_mrr.result	2009-12-22 14:43:00 +0000
@@ -1,8 +1,8 @@
 drop table if exists t1, t2, t3;
-set @read_rnd_buffer_size_save= @@read_rnd_buffer_size;
-set read_rnd_buffer_size=79;
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+set mrr_buffer_size=79;
-Warning	1292	Truncated incorrect read_rnd_buffer_size value: '79'
+Warning	1292	Truncated incorrect mrr_buffer_size value: '79'
 create table t1(a int);
 show create table t1;
 Table	Create Table
@@ -293,7 +293,7 @@ NULL	7	0
 NULL	9	0
 NULL	9	0
 drop table t1, t2;
-set @@read_rnd_buffer_size= @read_rnd_buffer_size_save;
+set @@mrr_buffer_size= @mrr_buffer_size_save;
 ID int(10) unsigned NOT NULL AUTO_INCREMENT,
 col1 int(10) unsigned DEFAULT NULL,
@@ -388,3 +388,29 @@ explain select * from t1 where a < 20  o
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	20	Using index condition
 drop table t0, t1;
+# Part of MWL#67: DS-MRR backport: add an @@optimizer_switch flag for
+# index_condition pushdown: 
+#   - engine_condition_pushdown does not affect ICP
+select @@optimizer_switch;
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (a int, b int, key(a));
+insert into t1 select A.a + 10 *(B.a + 10*C.a), A.a + 10 *(B.a + 10*C.a) from t0 A, t0 B, t0 C;
+A query that will use ICP: 
+explain select * from t1 where a < 20;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	20	Using index condition; Using MRR
+set @save_optimizer_switch=@@optimizer_switch;
+set optimizer_switch='index_condition_pushdown=off';
+explain select * from t1 where a < 20;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	20	Using where; Using MRR
+set optimizer_switch='index_condition_pushdown=on';
+explain select * from t1 where a < 20;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	5	NULL	20	Using index condition; Using MRR
+set optimizer_switch=@save_optimizer_switch;
+drop table t0, t1;

=== modified file 'mysql-test/t/ctype_cp1251.test'
--- a/mysql-test/t/ctype_cp1251.test	2005-10-13 14:16:19 +0000
+++ b/mysql-test/t/ctype_cp1251.test	2009-12-22 12:33:21 +0000
@@ -44,6 +44,7 @@ insert into t1 (a) values ('air'),
 select * from t1 where a like 'we_%';
 drop table t1;

=== modified file 'mysql-test/t/innodb_mrr.test'
--- a/mysql-test/t/innodb_mrr.test	2009-12-15 07:16:46 +0000
+++ b/mysql-test/t/innodb_mrr.test	2009-12-22 12:33:21 +0000
@@ -12,8 +12,8 @@ set storage_engine=InnoDB;
 set storage_engine= @save_storage_engine;
 # Try big rowid sizes
-set @read_rnd_buffer_size_save= @@read_rnd_buffer_size;
-set read_rnd_buffer_size=64;
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+set mrr_buffer_size=64;
 # By default InnoDB will fill values only for key parts used by the query,
 # which will cause DS-MRR to supply an invalid tuple on scan restoration. 
@@ -38,11 +38,12 @@ drop table t2;
 create table t2 (a char(100), b char(100), c char(100), d int, 
                  filler char(10), key(d), primary key (a,b,c)) engine= innodb;
 insert into t2 select A.a, B.a, B.a, A.a, 'filler' from t1 A, t1 B;
+--replace_column 9 #
 explain select * from t2 force index (d) where d < 10;
 drop table t2;
 drop table t1;
-set @@read_rnd_buffer_size= @read_rnd_buffer_size_save;
+set @@mrr_buffer_size= @mrr_buffer_size_save;
 # BUG#33033 "MySQL/InnoDB crashes with simple select range query"

=== modified file 'mysql-test/t/myisam_mrr.test'
--- a/mysql-test/t/myisam_mrr.test	2009-12-15 17:23:55 +0000
+++ b/mysql-test/t/myisam_mrr.test	2009-12-22 14:43:00 +0000
@@ -6,12 +6,12 @@
 drop table if exists t1, t2, t3;
-set @read_rnd_buffer_size_save= @@read_rnd_buffer_size;
-set read_rnd_buffer_size=79;
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+set mrr_buffer_size=79;
 -- source include/mrr_tests.inc
-set @@read_rnd_buffer_size= @read_rnd_buffer_size_save;
+set @@mrr_buffer_size= @mrr_buffer_size_save;
 # BUG#30622: Incorrect query results for MRR + filesort
@@ -96,3 +96,32 @@ insert into t1 select A.a + 10 *(B.a + 1
 explain select * from t1 where a < 20  order by a;
 drop table t0, t1;
+-- echo #
+-- echo # Part of MWL#67: DS-MRR backport: add an @@optimizer_switch flag for
+-- echo # index_condition pushdown: 
+-- echo #   - engine_condition_pushdown does not affect ICP
+# Check that optimizer_switch is present
+--replace_regex /,table_elimination=o[nf]*//
+select @@optimizer_switch;
+# Check if it affects ICP 
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (a int, b int, key(a));
+insert into t1 select A.a + 10 *(B.a + 10*C.a), A.a + 10 *(B.a + 10*C.a) from t0 A, t0 B, t0 C; 
+-- echo A query that will use ICP: 
+explain select * from t1 where a < 20;
+set @save_optimizer_switch=@@optimizer_switch;
+set optimizer_switch='index_condition_pushdown=off';
+explain select * from t1 where a < 20;
+set optimizer_switch='index_condition_pushdown=on';
+explain select * from t1 where a < 20;
+set optimizer_switch=@save_optimizer_switch;
+drop table t0, t1;

=== modified file 'sql/Makefile.am'
--- a/sql/Makefile.am	2009-12-21 02:26:15 +0000
+++ b/sql/Makefile.am	2009-12-22 14:43:00 +0000
@@ -47,7 +47,7 @@ mysqld_LDADD =		libndb.la \
 			$(yassl_libs) $(openssl_libs) @MYSQLD_EXTRA_LIBS@
-noinst_HEADERS =	ds_mrr.h item.h item_func.h item_sum.h item_cmpfunc.h \
+noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
 			item_strfunc.h item_timefunc.h \
 			item_xmlfunc.h \
 			item_create.h item_subselect.h item_row.h \
@@ -77,9 +77,10 @@ noinst_HEADERS =	ds_mrr.h item.h item_fu
 			sql_plugin.h authors.h event_parse_data.h \
 			event_data_objects.h event_scheduler.h \
 			sql_partition.h partition_info.h partition_element.h \
-			contributors.h sql_servers.h
+			contributors.h sql_servers.h \
+                        multi_range_read.h
-mysqld_SOURCES =	ds_mrr.cc sql_lex.cc sql_handler.cc sql_partition.cc \
+mysqld_SOURCES =	sql_lex.cc sql_handler.cc sql_partition.cc \
 			item.cc item_sum.cc item_buff.cc item_func.cc \
 			item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \
 			thr_malloc.cc item_create.cc item_subselect.cc \
@@ -123,7 +124,9 @@ mysqld_SOURCES =	ds_mrr.cc sql_lex.cc sq
 			sql_plugin.cc sql_binlog.cc \
 			sql_builtin.cc sql_tablespace.cc partition_info.cc \
 			sql_servers.cc event_parse_data.cc \
-                        opt_table_elimination.cc
+			opt_table_elimination.cc \
+			multi_range_read.cc \
+			opt_index_cond_pushdown.cc
 nodist_mysqld_SOURCES =	mini_client_errors.c pack.c client.c my_time.c my_user.c 
@@ -151,7 +154,7 @@ BUILT_SOURCES =		$(BUILT_MAINT_SRC) lex_
 EXTRA_DIST =		udf_example.c udf_example.def $(BUILT_MAINT_SRC) \
 			nt_servc.cc nt_servc.h \
 			message.mc  message.h message.rc MSG00001.bin \
-			CMakeLists.txt
+			CMakeLists.txt opt_range_mrr.cc
 CLEANFILES =        	lex_hash.h sql_yacc.output link_sources

=== removed file 'sql/ds_mrr.cc'
--- a/sql/ds_mrr.cc	2009-12-21 02:26:15 +0000
+++ b/sql/ds_mrr.cc	1970-01-01 00:00:00 +0000
@@ -1,1337 +0,0 @@
-#include "mysql_priv.h"
-#include "sql_select.h"
-/* **************************************************************************
- * DS-MRR implementation 
- ***************************************************************************/
-  DS-MRR: Initialize and start MRR scan
-  Initialize and start the MRR scan. Depending on the mode parameter, this
-  may use default or DS-MRR implementation.
-  @param h               Table handler to be used
-  @param key             Index to be used
-  @param seq_funcs       Interval sequence enumeration functions
-  @param seq_init_param  Interval sequence enumeration parameter
-  @param n_ranges        Number of ranges in the sequence.
-  @param mode            HA_MRR_* modes to use
-  @param buf             INOUT Buffer to use
-  @retval 0     Ok, Scan started.
-  @retval other Error
-int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
-                           void *seq_init_param, uint n_ranges, uint mode,
-                           HANDLER_BUFFER *buf)
-  uint elem_size;
-  Item *pushed_cond= NULL;
-  handler *new_h2= 0;
-  DBUG_ENTER("DsMrr_impl::dsmrr_init");
-  /*
-    index_merge may invoke a scan on an object for which dsmrr_info[_const]
-    has not been called, so set the owner handler here as well.
-  */
-  h= h_arg;
-  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
-  {
-    use_default_impl= TRUE;
-    const int retval=
-      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
-                                        n_ranges, mode, buf);
-    DBUG_RETURN(retval);
-  }
-  rowids_buf= buf->buffer;
-  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
-  if (is_mrr_assoc)
-    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
-  rowids_buf_end= buf->buffer_end;
-  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  rowids_buf_last= rowids_buf + 
-                      ((rowids_buf_end - rowids_buf)/ elem_size)*
-                      elem_size;
-  rowids_buf_end= rowids_buf_last;
-    /*
-    There can be two cases:
-    - This is the first call since index_init(), h2==NULL
-       Need to setup h2 then.
-    - This is not the first call, h2 is initalized and set up appropriately.
-       The caller might have called h->index_init(), need to switch h to
-       rnd_pos calls.
-  */
-  if (!h2)
-  {
-    /* Create a separate handler object to do rndpos() calls. */
-    THD *thd= current_thd;
-    /*
-      ::clone() takes up a lot of stack, especially on 64 bit platforms.
-      The constant 5 is an empiric result.
-    */
-    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
-      DBUG_RETURN(1);
-    DBUG_ASSERT(h->active_index != MAX_KEY);
-    uint mrr_keyno= h->active_index;
-    /* Create a separate handler object to do rndpos() calls. */
-    if (!(new_h2= h->clone(thd->mem_root)) || 
-        new_h2->ha_external_lock(thd, F_RDLCK))
-    {
-      delete new_h2;
-      DBUG_RETURN(1);
-    }
-    if (mrr_keyno == h->pushed_idx_cond_keyno)
-      pushed_cond= h->pushed_idx_cond;
-    /*
-      Caution: this call will invoke this->dsmrr_close(). Do not put the
-      created secondary table handler into this->h2 or it will delete it.
-    */
-    if (h->ha_index_end())
-    {
-      h2=new_h2;
-      goto error;
-    }
-    h2= new_h2; /* Ok, now can put it into h2 */
-    table->prepare_for_position();
-    h2->extra(HA_EXTRA_KEYREAD);
-    if (h2->ha_index_init(mrr_keyno, FALSE))
-      goto error;
-    use_default_impl= FALSE;
-    if (pushed_cond)
-      h2->idx_cond_push(mrr_keyno, pushed_cond);
-  }
-  else
-  {
-    /* 
-      We get here when the access alternates betwen MRR scan(s) and non-MRR
-      scans.
-      Calling h->index_end() will invoke dsmrr_close() for this object,
-      which will delete h2. We need to keep it, so save put it away and dont
-      let it be deleted:
-    */
-    handler *save_h2= h2;
-    h2= NULL;
-    int res= (h->inited == handler::INDEX && h->ha_index_end());
-    h2= save_h2;
-    use_default_impl= FALSE;
-    if (res)
-      goto error;
-  }
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
-                                          mode, buf) || 
-      dsmrr_fill_buffer())
-  {
-    goto error;
-  }
-  /*
-    If the above call has scanned through all intervals in *seq, then
-    adjust *buf to indicate that the remaining buffer space will not be used.
-  */
-  if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
-  /*
-     h->inited == INDEX may occur when 'range checked for each record' is
-     used.
-  */
-  if ((h->inited != handler::RND) && 
-      ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || 
-       (h->ha_rnd_init(FALSE))))
-      goto error;
-  use_default_impl= FALSE;
-  h->mrr_funcs= *seq_funcs;
-  h2->ha_index_or_rnd_end();
-  h2->ha_external_lock(current_thd, F_UNLCK);
-  h2->close();
-  delete h2;
-  h2= NULL;
-void DsMrr_impl::dsmrr_close()
-  DBUG_ENTER("DsMrr_impl::dsmrr_close");
-  if (h2)
-  {
-    h2->ha_index_or_rnd_end();
-    h2->ha_external_lock(current_thd, F_UNLCK);
-    h2->close();
-    delete h2;
-    h2= NULL;
-  }
-  use_default_impl= TRUE;
-static int rowid_cmp(void *h, uchar *a, uchar *b)
-  return ((handler*)h)->cmp_ref(a, b);
-  DS-MRR: Fill the buffer with rowids and sort it by rowid
-  {This is an internal function of DiskSweep MRR implementation}
-  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
-  buffer. When the buffer is full or scan is completed, sort the buffer by 
-  rowid and return.
-  The function assumes that rowids buffer is empty when it is invoked. 
-  @param h  Table handler
-  @retval 0      OK, the next portion of rowids is in the buffer,
-                 properly ordered
-  @retval other  Error
-int DsMrr_impl::dsmrr_fill_buffer()
-  char *range_info;
-  int res;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
-         !(res= h2->handler::multi_range_read_next(&range_info)))
-  {
-    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
-    if (h2->mrr_funcs.skip_index_tuple &&
-        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
-      continue;
-    /* Put rowid, or {rowid, range_id} pair into the buffer */
-    h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
-    if (is_mrr_assoc)
-    {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
-    }
-  }
-  if (res && res != HA_ERR_END_OF_FILE)
-    DBUG_RETURN(res); 
-  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
-  /* Sort the buffer contents by rowid */
-  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
-            (void*)h);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
-  DS-MRR implementation: multi_range_read_next() function
-int DsMrr_impl::dsmrr_next(char **range_info)
-  int res;
-  uchar *cur_range_info= 0;
-  uchar *rowid;
-  if (use_default_impl)
-    return h->handler::multi_range_read_next(range_info);
-  do
-  {
-    if (rowids_buf_cur == rowids_buf_last)
-    {
-      if (dsmrr_eof)
-      {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
-      }
-    res= dsmrr_fill_buffer();
-      if (res)
-        goto end;
-    }
-    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
-    {
-      res= HA_ERR_END_OF_FILE;
-      goto end;
-    }
-    rowid= rowids_buf_cur;
-    if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    if (h2->mrr_funcs.skip_record &&
-	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
-      continue;
-    res= h->rnd_pos(table->record[0], rowid);
-    break;
-  } while (true);
-  if (is_mrr_assoc)
-  {
-    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
-  }
-  return res;
-  DS-MRR implementation: multi_range_read_info() function
-ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
-                               uint *bufsz, uint *flags, COST_VECT *cost)
-  ha_rows res;
-  uint def_flags= *flags;
-  uint def_bufsz= *bufsz;
-  /* Get cost/flags/mem_usage of default MRR implementation */
-  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
-                                         &def_flags, cost);
-  DBUG_ASSERT(!res);
-  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
-      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
-  {
-    /* Default implementation is choosen */
-    DBUG_PRINT("info", ("Default MRR implementation choosen"));
-    *flags= def_flags;
-    *bufsz= def_bufsz;
-  }
-  else
-  {
-    /* *flags and *bufsz were set by choose_mrr_impl */
-    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
-  }
-  return 0;
-  DS-MRR Implementation: multi_range_read_info_const() function
-ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
-                                 void *seq_init_param, uint n_ranges, 
-                                 uint *bufsz, uint *flags, COST_VECT *cost)
-  ha_rows rows;
-  uint def_flags= *flags;
-  uint def_bufsz= *bufsz;
-  /* Get cost/flags/mem_usage of default MRR implementation */
-  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
-                                                n_ranges, &def_bufsz, 
-                                                &def_flags, cost);
-  if (rows == HA_POS_ERROR)
-  {
-    /* Default implementation can't perform MRR scan => we can't either */
-    return rows;
-  }
-  /*
-    If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
-    use the default MRR implementation (we need it for UPDATE/DELETE).
-    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
-  */
-  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
-      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
-  {
-    DBUG_PRINT("info", ("Default MRR implementation choosen"));
-    *flags= def_flags;
-    *bufsz= def_bufsz;
-  }
-  else
-  {
-    /* *flags and *bufsz were set by choose_mrr_impl */
-    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
-  }
-  return rows;
-  Check if key has partially-covered columns
-  We can't use DS-MRR to perform range scans when the ranges are over
-  partially-covered keys, because we'll not have full key part values
-  (we'll have their prefixes from the index) and will not be able to check
-  if we've reached the end the range.
-  @param keyno  Key to check
-  @todo
-    Allow use of DS-MRR in cases where the index has partially-covered
-    components but they are not used for scanning.
-  @retval TRUE   Yes
-  @retval FALSE  No
-bool key_uses_partial_cols(TABLE *table, uint keyno)
-  KEY_PART_INFO *kp= table->key_info[keyno].key_part;
-  KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
-  for (; kp != kp_end; kp++)
-  {
-    if (!kp->field->part_of_key.is_set(keyno))
-      return TRUE;
-  }
-  return FALSE;
-  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
-  Make the choice between using Default MRR implementation and DS-MRR.
-  This function contains common functionality factored out of dsmrr_info()
-  and dsmrr_info_const(). The function assumes that the default MRR
-  implementation's applicability requirements are satisfied.
-  @param keyno       Index number
-  @param rows        E(full rows to be retrieved)
-  @param flags  IN   MRR flags provided by the MRR user
-                OUT  If DS-MRR is choosen, flags of DS-MRR implementation
-                     else the value is not modified
-  @param bufsz  IN   If DS-MRR is choosen, buffer use of DS-MRR implementation
-                     else the value is not modified
-  @param cost   IN   Cost of default MRR implementation
-                OUT  If DS-MRR is choosen, cost of DS-MRR scan
-                     else the value is not modified
-  @retval TRUE   Default MRR implementation should be used
-  @retval FALSE  DS-MRR implementation should be used
-bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
-                                 uint *bufsz, COST_VECT *cost)
-  COST_VECT dsmrr_cost;
-  bool res;
-  THD *thd= current_thd;
-  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
-      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
-       key_uses_partial_cols(table, keyno))
-  {
-    /* Use the default implementation */
-    *flags |= HA_MRR_USE_DEFAULT_IMPL;
-    return TRUE;
-  }
-  uint add_len= table->key_info[keyno].key_length + h->ref_length; 
-  *bufsz -= add_len;
-  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
-    return TRUE;
-  *bufsz += add_len;
-  bool force_dsmrr;
-  /* 
-    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
-    DS-MRR and Default implementations cost. This allows one to force use of
-    DS-MRR whenever it is applicable without affecting other cost-based
-    choices.
-  */
-  if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
-      dsmrr_cost.total_cost() > cost->total_cost())
-    dsmrr_cost= *cost;
-  if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
-  {
-    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
-    *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
-    *cost= dsmrr_cost;
-    res= FALSE;
-  }
-  else
-  {
-    /* Use the default MRR implementation */
-    res= TRUE;
-  }
-  return res;
-static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
-  Get cost of DS-MRR scan
-  @param keynr              Index to be used
-  @param rows               E(Number of rows to be scanned)
-  @param flags              Scan parameters (HA_MRR_* flags)
-  @param buffer_size INOUT  Buffer size
-  @param cost        OUT    The cost
-  @retval FALSE  OK
-  @retval TRUE   Error, DS-MRR cannot be used (the buffer is too small
-                 for even 1 rowid)
-bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
-                                         uint *buffer_size, COST_VECT *cost)
-  ulong max_buff_entries, elem_size;
-  ha_rows rows_in_full_step, rows_in_last_step;
-  uint n_full_steps;
-  double index_read_cost;
-  elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
-  max_buff_entries = *buffer_size / elem_size;
-  if (!max_buff_entries)
-    return TRUE; /* Buffer has not enough space for even 1 rowid */
-  /* Number of iterations we'll make with full buffer */
-  n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
-  /* 
-    Get numbers of rows we'll be processing in 
-     - non-last sweep, with full buffer 
-     - last iteration, with non-full buffer
-  */
-  rows_in_full_step= max_buff_entries;
-  rows_in_last_step= rows % max_buff_entries;
-  /* Adjust buffer size if we expect to use only part of the buffer */
-  if (n_full_steps)
-  {
-    get_sort_and_sweep_cost(table, rows, cost);
-    cost->multiply(n_full_steps);
-  }
-  else
-  {
-    cost->zero();
-    *buffer_size= max(*buffer_size, 
-                      (size_t)(1.2*rows_in_last_step) * elem_size + 
-                      h->ref_length + table->key_info[keynr].key_length);
-  }
-  COST_VECT last_step_cost;
-  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
-  cost->add(&last_step_cost);
-  if (n_full_steps != 0)
-    cost->mem_cost= *buffer_size;
-  else
-    cost->mem_cost= (double)rows_in_last_step * elem_size;
-  /* Total cost of all index accesses */
-  index_read_cost= h->index_only_read_time(keynr, (double)rows);
-  cost->add_io(index_read_cost, 1 /* Random seeks */);
-  return FALSE;
-  Get cost of one sort-and-sweep step
-    get_sort_and_sweep_cost()
-      table       Table being accessed
-      nrows       Number of rows to be sorted and retrieved
-      cost   OUT  The cost
-    Get cost of these operations:
-     - sort an array of #nrows ROWIDs using qsort
-     - read #nrows records from table in a sweep.
-void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
-  if (nrows)
-  {
-    get_sweep_read_cost(table, nrows, FALSE, cost);
-    /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
-    double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
-    if (cmp_op < 3)
-      cmp_op= 3;
-    cost->cpu_cost += cmp_op * log2(cmp_op);
-  }
-  else
-    cost->zero();
-  Get cost of reading nrows table records in a "disk sweep"
-  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made
-  for an ordered sequence of rowids.
-  We assume hard disk IO. The read is performed as follows:
-   1. The disk head is moved to the needed cylinder
-   2. The controller waits for the plate to rotate
-   3. The data is transferred
-  Time to do #3 is insignificant compared to #2+#1.
-  Time to move the disk head is proportional to head travel distance.
-  Time to wait for the plate to rotate depends on whether the disk head
-  was moved or not. 
-  If disk head wasn't moved, the wait time is proportional to distance
-  between the previous block and the block we're reading.
-  If the head was moved, we don't know how much we'll need to wait for the
-  plate to rotate. We assume the wait time to be a variate with a mean of
-  0.5 of full rotation time.
-  Our cost units are "random disk seeks". The cost of random disk seek is
-  actually not a constant, it depends one range of cylinders we're going
-  to access. We make it constant by introducing a fuzzy concept of "typical 
-  datafile length" (it's fuzzy as it's hard to tell whether it should
-  include index file, temp.tables etc). Then random seek cost is:
-    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
-  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
-  @param table             Table to be accessed
-  @param nrows             Number of rows to retrieve
-  @param interrupted       TRUE <=> Assume that the disk sweep will be
-                           interrupted by other disk IO. FALSE - otherwise.
-  @param cost         OUT  The cost.
-void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
-                         COST_VECT *cost)
-  DBUG_ENTER("get_sweep_read_cost");
-  cost->zero();
-  if (table->file->primary_key_is_clustered())
-  {
-    cost->io_count= table->file->read_time(table->s->primary_key,
-                                           (uint) nrows, nrows);
-  }
-  else
-  {
-    double n_blocks=
-      ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
-    double busy_blocks=
-      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
-    if (busy_blocks < 1.0)
-      busy_blocks= 1.0;
-    DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
-                       busy_blocks));
-    cost->io_count= busy_blocks;
-    if (!interrupted)
-    {
-      /* Assume reading is done in one 'sweep' */
-      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
-                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
-    }
-  }
-  DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
-/* **************************************************************************
- * DS-MRR implementation ends
- ***************************************************************************/
-/* **************************************************************************
- * Index Condition Pushdown code starts
- ***************************************************************************/
-  Check if given expression uses only table fields covered by the given index
-    uses_index_fields_only()
-      item           Expression to check
-      tbl            The table having the index
-      keyno          The index number
-      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
-    Check if given expression only uses fields covered by index #keyno in the
-    table tbl. The expression can use any fields in any other tables.
-    The expression is guaranteed not to be AND or OR - those constructs are 
-    handled outside of this function.
-    TRUE   Yes
-    FALSE  No
-bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno, 
-                            bool other_tbls_ok)
-  if (item->const_item())
-    return TRUE;
-  /* 
-    Don't push down the triggered conditions. Nested outer joins execution 
-    code may need to evaluate a condition several times (both triggered and
-    untriggered), and there is no way to put thi
-    TODO: Consider cloning the triggered condition and using the copies for:
-      1. push the first copy down, to have most restrictive index condition
-         possible
-      2. Put the second copy into tab->select_cond. 
-  */
-  if (item->type() == Item::FUNC_ITEM && 
-      ((Item_func*)item)->functype() == Item_func::TRIG_COND_FUNC)
-    return FALSE;
-  if (!(item->used_tables() & tbl->map))
-    return other_tbls_ok;
-  Item::Type item_type= item->type();
-  switch (item_type) {
-  case Item::FUNC_ITEM:
-    {
-      /* This is a function, apply condition recursively to arguments */
-      Item_func *item_func= (Item_func*)item;
-      Item **child;
-      Item **item_end= (item_func->arguments()) + item_func->argument_count();
-      for (child= item_func->arguments(); child != item_end; child++)
-      {
-        if (!uses_index_fields_only(*child, tbl, keyno, other_tbls_ok))
-          return FALSE;
-      }
-      return TRUE;
-    }
-  case Item::COND_ITEM:
-    {
-      /*
-        This is an AND/OR condition. Regular AND/OR clauses are handled by
-        make_cond_for_index() which will chop off the part that can be
-        checked with index. This code is for handling non-top-level AND/ORs,
-        e.g. func(x AND y).
-      */
-      List_iterator<Item> li(*((Item_cond*)item)->argument_list());
-      Item *item;
-      while ((item=li++))
-      {
-        if (!uses_index_fields_only(item, tbl, keyno, other_tbls_ok))
-          return FALSE;
-      }
-      return TRUE;
-    }
-  case Item::FIELD_ITEM:
-    {
-      Item_field *item_field= (Item_field*)item;
-      if (item_field->field->table != tbl) 
-        return TRUE;
-      /*
-        The below is probably a repetition - the first part checks the
-        other two, but let's play it safe:
-      */
-      return item_field->field->part_of_key.is_set(keyno) &&
-             item_field->field->type() != MYSQL_TYPE_GEOMETRY &&
-             item_field->field->type() != MYSQL_TYPE_BLOB;
-    }
-  case Item::REF_ITEM:
-    return uses_index_fields_only(item->real_item(), tbl, keyno,
-                                  other_tbls_ok);
-  default:
-    return FALSE; /* Play it safe, don't push unknown non-const items */
-  }
-  Get a part of the condition that can be checked using only index fields
-    make_cond_for_index()
-      cond           The source condition
-      table          The table that is partially available
-      keyno          The index in the above table. Only fields covered by the index
-                     are available
-      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
-    Get the part of the condition that can be checked when, for the given
-    table, we only have the values of fields covered by some index. The
-    condition may refer to other tables; it is assumed that we have the
-    values of all of their fields.
-    Example:
-      make_cond_for_index(
-         "cond(t1.field) AND cond(t2.key1) AND cond(t2.non_key) AND cond(t2.key2)",
-          t2, keyno(t2.key1)) 
-      will return
-        "cond(t1.field) AND cond(t2.key2)"
-    Index condition, or NULL if no condition could be inferred.
-Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
-                          bool other_tbls_ok)
-  if (!cond)
-    return NULL;
-  if (cond->type() == Item::COND_ITEM)
-  {
-    uint n_marked= 0;
-    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
-    {
-      table_map used_tables= 0;
-      Item_cond_and *new_cond=new Item_cond_and;
-      if (!new_cond)
-	return (COND*) 0;
-      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
-      Item *item;
-      while ((item=li++))
-      {
-	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
-	if (fix)
-        {
-	  new_cond->argument_list()->push_back(fix);
-          used_tables|= fix->used_tables();
-        }
-        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
-      }
-      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
-        cond->marker= ICP_COND_USES_INDEX_ONLY;
-      switch (new_cond->argument_list()->elements) {
-      case 0:
-	return (COND*) 0;
-      case 1:
-        new_cond->used_tables_cache= used_tables;
-	return new_cond->argument_list()->head();
-      default:
-	new_cond->quick_fix_field();
-        new_cond->used_tables_cache= used_tables;
-	return new_cond;
-      }
-    }
-    else /* It's OR */
-    {
-      Item_cond_or *new_cond=new Item_cond_or;
-      if (!new_cond)
-	return (COND*) 0;
-      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
-      Item *item;
-      while ((item=li++))
-      {
-	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
-	if (!fix)
-	  return (COND*) 0;
-	new_cond->argument_list()->push_back(fix);
-        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
-      }
-      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
-        cond->marker= ICP_COND_USES_INDEX_ONLY;
-      new_cond->quick_fix_field();
-      new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
-      new_cond->top_level_item();
-      return new_cond;
-    }
-  }
-  if (!uses_index_fields_only(cond, table, keyno, other_tbls_ok))
-    return (COND*) 0;
-  cond->marker= ICP_COND_USES_INDEX_ONLY;
-  return cond;
-Item *make_cond_remainder(Item *cond, bool exclude_index)
-  if (exclude_index && cond->marker == ICP_COND_USES_INDEX_ONLY)
-    return 0; /* Already checked */
-  if (cond->type() == Item::COND_ITEM)
-  {
-    table_map tbl_map= 0;
-    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
-    {
-      /* Create new top level AND item */
-      Item_cond_and *new_cond=new Item_cond_and;
-      if (!new_cond)
-	return (COND*) 0;
-      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
-      Item *item;
-      while ((item=li++))
-      {
-	Item *fix= make_cond_remainder(item, exclude_index);
-	if (fix)
-        {
-	  new_cond->argument_list()->push_back(fix);
-          tbl_map |= fix->used_tables();
-        }
-      }
-      switch (new_cond->argument_list()->elements) {
-      case 0:
-	return (COND*) 0;
-      case 1:
-	return new_cond->argument_list()->head();
-      default:
-	new_cond->quick_fix_field();
-        ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
-	return new_cond;
-      }
-    }
-    else /* It's OR */
-    {
-      Item_cond_or *new_cond=new Item_cond_or;
-      if (!new_cond)
-	return (COND*) 0;
-      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
-      Item *item;
-      while ((item=li++))
-      {
-	Item *fix= make_cond_remainder(item, FALSE);
-	if (!fix)
-	  return (COND*) 0;
-	new_cond->argument_list()->push_back(fix);
-        tbl_map |= fix->used_tables();
-      }
-      new_cond->quick_fix_field();
-      ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
-      new_cond->top_level_item();
-      return new_cond;
-    }
-  }
-  return cond;
-  Try to extract and push the index condition
-    push_index_cond()
-      tab            A join tab that has tab->table->file and its condition
-                     in tab->select_cond
-      keyno          Index for which to extract and push the condition
-      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
-    Try to extract and push the index condition down to table handler
-void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
-  DBUG_ENTER("push_index_cond");
-  Item *idx_cond;
-  bool do_index_cond_pushdown=
-    ((tab->table->file->index_flags(keyno, 0, 1) &
-     tab->join->thd->variables.engine_condition_pushdown);
-  /*
-    Do not try index condition pushdown on indexes which have partially-covered
-    columns. Unpacking from a column prefix into index tuple is not a supported 
-    operation in some engines, see e.g. MySQL BUG#42991.
-    TODO: a better solution would be not to consider partially-covered columns
-    as parts of the index and still produce/check index condition for
-    fully-covered index columns.
-  */
-  KEY *key_info= tab->table->key_info + keyno;
-  for (uint kp= 0; kp < key_info->key_parts; kp++)
-  {
-    if ((key_info->key_part[kp].key_part_flag & HA_PART_KEY_SEG))
-    {
-      do_index_cond_pushdown= FALSE;
-      break;
-    }
-  }
-  /*
-    When WL#5116 is done this DBUG statement must be removed. It's just a
-    temporary hack to allow us to discriminate whether a test failure relates
-    to *Engine* or *Index* Condition Pushdown.
-  */
-  DBUG_EXECUTE_IF("optimizer_no_icp", do_index_cond_pushdown= false;);
-  if (do_index_cond_pushdown)
-  {
-    DBUG_EXECUTE("where",
-                 print_where(tab->select_cond, "full cond", QT_ORDINARY););
-    idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno,
-                                  other_tbls_ok);
-    DBUG_EXECUTE("where",
-                 print_where(idx_cond, "idx cond", QT_ORDINARY););
-    if (idx_cond)
-    {
-      Item *idx_remainder_cond= 0;
-      tab->pre_idx_push_select_cond= tab->select_cond;
-      /*
-        For BKA cache we store condition to special BKA cache field
-        because evaluation of the condition requires additional operations
-        before the evaluation. This condition is used in 
-        JOIN_CACHE_BKA[_UNIQUE]::skip_index_tuple() functions.
-      */
-      if (tab->use_join_cache &&
-          /*
-            if cache is used then the value is TRUE only 
-            for BKA[_UNIQUE] cache (see check_join_cache_usage func).
-            In this case other_tbls_ok is an equivalent of
-            cache->is_key_access().
-          */
-          other_tbls_ok &&
-          (idx_cond->used_tables() &
-           ~(tab->table->map | tab->join->const_table_map)))
-        tab->cache_idx_cond= idx_cond;
-      else
-        idx_remainder_cond= tab->table->file->idx_cond_push(keyno, idx_cond);
-      /*
-        Disable eq_ref's "lookup cache" if we've pushed down an index
-        condition. 
-        TODO: This check happens to work on current ICP implementations, but
-        there may exist a compliant implementation that will not work 
-        correctly with it. Sort this out when we stabilize the condition
-        pushdown APIs.
-      */
-      if (idx_remainder_cond != idx_cond)
-        tab->ref.disable_cache= TRUE;
-      Item *row_cond= make_cond_remainder(tab->select_cond, TRUE);
-      DBUG_EXECUTE("where",
-                   print_where(row_cond, "remainder cond", QT_ORDINARY););
-      if (row_cond)
-      {
-        if (!idx_remainder_cond)
-          tab->select_cond= row_cond;
-        else
-        {
-          COND *new_cond= new Item_cond_and(row_cond, idx_remainder_cond);
-          tab->select_cond= new_cond;
-	  tab->select_cond->quick_fix_field();
-          ((Item_cond_and*)tab->select_cond)->used_tables_cache= 
-            row_cond->used_tables() | idx_remainder_cond->used_tables();
-        }
-      }
-      else
-        tab->select_cond= idx_remainder_cond;
-      if (tab->select)
-      {
-        DBUG_EXECUTE("where",
-                     print_where(tab->select->cond,
-                                 "select_cond",
-                                 QT_ORDINARY););
-        tab->select->cond= tab->select_cond;
-      }
-    }
-  }
-/* **************************************************************************
- * Default MRR implementation starts
- ***************************************************************************/
- * Default MRR implementation (MRR to non-MRR converter)
- ***************************************************************************/
-  Get cost and other information about MRR scan over a known list of ranges
-  Calculate estimated cost and other information about an MRR scan for given
-  sequence of ranges.
-  @param keyno           Index number
-  @param seq             Range sequence to be traversed
-  @param seq_init_param  First parameter for seq->init()
-  @param n_ranges_arg    Number of ranges in the sequence, or 0 if the caller
-                         can't efficiently determine it
-  @param bufsz    INOUT  IN:  Size of the buffer available for use
-                         OUT: Size of the buffer that is expected to be actually
-                              used, or 0 if buffer is not needed.
-  @param flags    INOUT  A combination of HA_MRR_* flags
-  @param cost     OUT    Estimated cost of MRR access
-  @note
-    This method (or an overriding one in a derived class) must check for
-    thd->killed and return HA_POS_ERROR if it is not zero. This is required
-    for a user to be able to interrupt the calculation by killing the
-    connection/query.
-  @retval
-    HA_POS_ERROR  Error or the engine is unable to perform the requested
-                  scan. Values of OUT parameters are undefined.
-  @retval
-    other         OK, *cost contains cost of the scan, *bufsz and *flags
-                  contain scan parameters.
-handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
-                                     void *seq_init_param, uint n_ranges_arg,
-                                     uint *bufsz, uint *flags, COST_VECT *cost)
-  range_seq_t seq_it;
-  ha_rows rows, total_rows= 0;
-  uint n_ranges=0;
-  THD *thd= current_thd;
-  /* Default MRR implementation doesn't need buffer */
-  *bufsz= 0;
-  seq_it= seq->init(seq_init_param, n_ranges, *flags);
-  while (!seq->next(seq_it, &range))
-  {
-    if (unlikely(thd->killed != 0))
-      return HA_POS_ERROR;
-    n_ranges++;
-    key_range *min_endp, *max_endp;
-    if (range.range_flag & GEOM_FLAG)
-    {
-      /* In this case tmp_min_flag contains the handler-read-function */
-      range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG);
-      min_endp= &range.start_key;
-      max_endp= NULL;
-    }
-    else
-    {
-      min_endp= range.start_key.length? &range.start_key : NULL;
-      max_endp= range.end_key.length? &range.end_key : NULL;
-    }
-    if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
-      rows= 1; /* there can be at most one row */
-    else
-    {
-      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
-                                                        max_endp)))
-      {
-        /* Can't scan one range => can't do MRR scan at all */
-        total_rows= HA_POS_ERROR;
-        break;
-      }
-    }
-    total_rows += rows;
-  }
-  if (total_rows != HA_POS_ERROR)
-  {
-    /* The following calculation is the same as in multi_range_read_info(): */
-    *flags |= HA_MRR_USE_DEFAULT_IMPL;
-    cost->zero();
-    cost->avg_io_cost= 1; /* assume random seeks */
-    if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
-      cost->io_count= index_only_read_time(keyno, (uint)total_rows);
-    else
-      cost->io_count= read_time(keyno, n_ranges, total_rows);
-    cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
-  }
-  return total_rows;
-  Get cost and other information about MRR scan over some sequence of ranges
-  Calculate estimated cost and other information about an MRR scan for some
-  sequence of ranges.
-  The ranges themselves will be known only at execution phase. When this
-  function is called we only know number of ranges and a (rough) E(#records)
-  within those ranges.
-  Currently this function is only called for "n-keypart singlepoint" ranges,
-  i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
-  The flags parameter is a combination of those flags: HA_MRR_SORTED,
-  @param keyno           Index number
-  @param n_ranges        Estimated number of ranges (i.e. intervals) in the
-                         range sequence.
-  @param n_rows          Estimated total number of records contained within all
-                         of the ranges
-  @param bufsz    INOUT  IN:  Size of the buffer available for use
-                         OUT: Size of the buffer that will be actually used, or
-                              0 if buffer is not needed.
-  @param flags    INOUT  A combination of HA_MRR_* flags
-  @param cost     OUT    Estimated cost of MRR access
-  @retval
-    0     OK, *cost contains cost of the scan, *bufsz and *flags contain scan
-          parameters.
-  @retval
-    other Error or can't perform the requested scan
-ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
-                                       uint *bufsz, uint *flags, COST_VECT *cost)
-  *bufsz= 0; /* Default implementation doesn't need a buffer */
-  cost->zero();
-  cost->avg_io_cost= 1; /* assume random seeks */
-  /* Produce the same cost as non-MRR code does */
-  if (*flags & HA_MRR_INDEX_ONLY)
-    cost->io_count= index_only_read_time(keyno, n_rows);
-  else
-    cost->io_count= read_time(keyno, n_ranges, n_rows);
-  return 0;
-  Initialize the MRR scan
-  Initialize the MRR scan. This function may do heavyweight scan 
-  initialization like row prefetching/sorting/etc (NOTE: but better not do
-  it here as we may not need it, e.g. if we never satisfy WHERE clause on
-  previous tables. For many implementations it would be natural to do such
-  initializations in the first multi_range_read_next() call)
-  mode is a combination of the following flags: HA_MRR_SORTED,
-  @param seq             Range sequence to be traversed
-  @param seq_init_param  First parameter for seq->init()
-  @param n_ranges        Number of ranges in the sequence
-  @param mode            Flags, see the description section for the details
-  @param buf             INOUT: memory buffer to be used
-  @note
-    One must have called index_init() before calling this function. Several
-    multi_range_read_init() calls may be made in course of one query.
-    Until WL#2623 is done (see its text, section 3.2), the following will 
-    also hold:
-    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
-    then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
-    This property will only be used by NDB handler until WL#2623 is done.
-    Buffer memory management is done according to the following scenario:
-    The caller allocates the buffer and provides it to the callee by filling
-    the members of HANDLER_BUFFER structure.
-    The callee consumes all or some fraction of the provided buffer space, and
-    sets the HANDLER_BUFFER members accordingly.
-    The callee may use the buffer memory until the next multi_range_read_init()
-    call is made, all records have been read, or until index_end() call is
-    made, whichever comes first.
-  @retval 0  OK
-  @retval 1  Error
-handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
-                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
-  DBUG_ENTER("handler::multi_range_read_init");
-  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
-  mrr_funcs= *seq_funcs;
-  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
-  mrr_have_range= FALSE;
-  Get next record in MRR scan
-  Default MRR implementation: read the next record
-  @param range_info  OUT  Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
-                          Otherwise, the opaque value associated with the range
-                          that contains the returned record.
-  @retval 0      OK
-  @retval other  Error code
-int handler::multi_range_read_next(char **range_info)
-  int UNINIT_VAR(result);
-  int range_res;
-  DBUG_ENTER("handler::multi_range_read_next");
-  if (!mrr_have_range)
-  {
-    mrr_have_range= TRUE;
-    goto start;
-  }
-  do
-  {
-    /* Save a call if there can be only one row in range. */
-    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
-    {
-      result= read_range_next();
-      /* On success or non-EOF errors jump to the end. */
-      if (result != HA_ERR_END_OF_FILE)
-        break;
-    }
-    else
-    {
-      if (was_semi_consistent_read())
-        goto scan_it_again;
-      /*
-        We need to set this for the last range only, but checking this
-        condition is more expensive than just setting the result code.
-      */
-      result= HA_ERR_END_OF_FILE;
-    }
-    /* Try the next range(s) until one matches a record. */
-    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
-    {
-      result= read_range_first(mrr_cur_range.start_key.keypart_map ?
-                                 &mrr_cur_range.start_key : 0,
-                               mrr_cur_range.end_key.keypart_map ?
-                                 &mrr_cur_range.end_key : 0,
-                               test(mrr_cur_range.range_flag & EQ_RANGE),
-                               mrr_is_output_sorted);
-      if (result != HA_ERR_END_OF_FILE)
-        break;
-    }
-  }
-  while ((result == HA_ERR_END_OF_FILE) && !range_res);
-  *range_info= mrr_cur_range.ptr;
-  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
-  DBUG_RETURN(result);

=== removed file 'sql/ds_mrr.h'
--- a/sql/ds_mrr.h	2009-12-15 21:35:55 +0000
+++ b/sql/ds_mrr.h	1970-01-01 00:00:00 +0000
@@ -1,71 +0,0 @@
-  This file contains declarations for 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
-   - Index Condition Pushdown helper functions
-  A Disk-Sweep MRR interface implementation
-  This implementation makes range (and, in the future, 'ref') scans read
-  table rows in disk sweeps.
-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
-  any table handler that has non-clustered indexes and on-disk rows.
-class DsMrr_impl
-  typedef void (handler::*range_check_toggle_func_t)(bool on);
-  DsMrr_impl()
-    : h2(NULL) {};
-  /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions).
-    It is used to retrieve full table rows by calling rnd_pos().
-  */
-  handler *h;
-  TABLE *table; /* Always equal to h->table */
-  /* Secondary handler object.  It is used for scanning the index */
-  handler *h2;
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *rowids_buf;
-  uchar *rowids_buf_cur;   /* Current position when reading/writing */
-  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
-  uchar *rowids_buf_end;   /* End of the buffer */
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
-  bool is_mrr_assoc;
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
-  void init(handler *h_arg, TABLE *table_arg)
-  {
-    h= h_arg; 
-    table= table_arg;
-  }
-  int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
-                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
-  void dsmrr_close();
-  int dsmrr_fill_buffer();
-  int dsmrr_next(char **range_info);
-  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
-                     uint *flags, COST_VECT *cost);
-  ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
-                            void *seq_init_param, uint n_ranges, uint *bufsz,
-                            uint *flags, COST_VECT *cost);
-  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
-                       COST_VECT *cost);
-  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
-                               uint *buffer_size, COST_VECT *cost);

=== modified file 'sql/handler.h'
--- a/sql/handler.h	2009-12-15 17:23:55 +0000
+++ b/sql/handler.h	2009-12-22 12:33:21 +0000
@@ -2314,7 +2314,7 @@ private:
   friend class DsMrr_impl;
-#include "ds_mrr.h"
+#include "multi_range_read.h"
 	/* Some extern variables used with handlers */

=== added file 'sql/multi_range_read.cc'
--- a/sql/multi_range_read.cc	1970-01-01 00:00:00 +0000
+++ b/sql/multi_range_read.cc	2009-12-22 12:33:21 +0000
@@ -0,0 +1,944 @@
+#include "mysql_priv.h"
+#include "sql_select.h"
+ * Default MRR implementation (MRR to non-MRR converter)
+ ***************************************************************************/
+  Get cost and other information about MRR scan over a known list of ranges
+  Calculate estimated cost and other information about an MRR scan for given
+  sequence of ranges.
+  @param keyno           Index number
+  @param seq             Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges_arg    Number of ranges in the sequence, or 0 if the caller
+                         can't efficiently determine it
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that is expected to be actually
+                              used, or 0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+  @note
+    This method (or an overriding one in a derived class) must check for
+    thd->killed and return HA_POS_ERROR if it is not zero. This is required
+    for a user to be able to interrupt the calculation by killing the
+    connection/query.
+  @retval
+    HA_POS_ERROR  Error or the engine is unable to perform the requested
+                  scan. Values of OUT parameters are undefined.
+  @retval
+    other         OK, *cost contains cost of the scan, *bufsz and *flags
+                  contain scan parameters.
+handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                     void *seq_init_param, uint n_ranges_arg,
+                                     uint *bufsz, uint *flags, COST_VECT *cost)
+  range_seq_t seq_it;
+  ha_rows rows, total_rows= 0;
+  uint n_ranges=0;
+  THD *thd= current_thd;
+  /* Default MRR implementation doesn't need buffer */
+  *bufsz= 0;
+  seq_it= seq->init(seq_init_param, n_ranges, *flags);
+  while (!seq->next(seq_it, &range))
+  {
+    if (unlikely(thd->killed != 0))
+      return HA_POS_ERROR;
+    n_ranges++;
+    key_range *min_endp, *max_endp;
+    if (range.range_flag & GEOM_FLAG)
+    {
+      /* In this case tmp_min_flag contains the handler-read-function */
+      range.start_key.flag= (ha_rkey_function) (range.range_flag ^ GEOM_FLAG);
+      min_endp= &range.start_key;
+      max_endp= NULL;
+    }
+    else
+    {
+      min_endp= range.start_key.length? &range.start_key : NULL;
+      max_endp= range.end_key.length? &range.end_key : NULL;
+    }
+    if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
+      rows= 1; /* there can be at most one row */
+    else
+    {
+      if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, 
+                                                        max_endp)))
+      {
+        /* Can't scan one range => can't do MRR scan at all */
+        total_rows= HA_POS_ERROR;
+        break;
+      }
+    }
+    total_rows += rows;
+  }
+  if (total_rows != HA_POS_ERROR)
+  {
+    /* The following calculation is the same as in multi_range_read_info(): */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    cost->zero();
+    cost->avg_io_cost= 1; /* assume random seeks */
+    if ((*flags & HA_MRR_INDEX_ONLY) && total_rows > 2)
+      cost->io_count= index_only_read_time(keyno, (uint)total_rows);
+    else
+      cost->io_count= read_time(keyno, n_ranges, total_rows);
+    cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+  }
+  return total_rows;
+  Get cost and other information about MRR scan over some sequence of ranges
+  Calculate estimated cost and other information about an MRR scan for some
+  sequence of ranges.
+  The ranges themselves will be known only at execution phase. When this
+  function is called we only know number of ranges and a (rough) E(#records)
+  within those ranges.
+  Currently this function is only called for "n-keypart singlepoint" ranges,
+  i.e. each range is "keypart1=someconst1 AND ... AND keypartN=someconstN"
+  The flags parameter is a combination of those flags: HA_MRR_SORTED,
+  @param keyno           Index number
+  @param n_ranges        Estimated number of ranges (i.e. intervals) in the
+                         range sequence.
+  @param n_rows          Estimated total number of records contained within all
+                         of the ranges
+  @param bufsz    INOUT  IN:  Size of the buffer available for use
+                         OUT: Size of the buffer that will be actually used, or
+                              0 if buffer is not needed.
+  @param flags    INOUT  A combination of HA_MRR_* flags
+  @param cost     OUT    Estimated cost of MRR access
+  @retval
+    0     OK, *cost contains cost of the scan, *bufsz and *flags contain scan
+          parameters.
+  @retval
+    other Error or can't perform the requested scan
+ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
+                                       uint *bufsz, uint *flags, COST_VECT *cost)
+  *bufsz= 0; /* Default implementation doesn't need a buffer */
+  cost->zero();
+  cost->avg_io_cost= 1; /* assume random seeks */
+  /* Produce the same cost as non-MRR code does */
+  if (*flags & HA_MRR_INDEX_ONLY)
+    cost->io_count= index_only_read_time(keyno, n_rows);
+  else
+    cost->io_count= read_time(keyno, n_ranges, n_rows);
+  return 0;
+  Initialize the MRR scan
+  Initialize the MRR scan. This function may do heavyweight scan
+  initialization like row prefetching/sorting/etc. (NOTE: it is better not to
+  do it here, as we may not need it, e.g. if we never satisfy the WHERE clause
+  on previous tables. For many implementations it would be natural to do such
+  initializations in the first multi_range_read_next() call.)
+  mode is a combination of the following flags: HA_MRR_SORTED,
+  @param seq             Range sequence to be traversed
+  @param seq_init_param  First parameter for seq->init()
+  @param n_ranges        Number of ranges in the sequence
+  @param mode            Flags, see the description section for the details
+  @param buf             INOUT: memory buffer to be used
+  @note
+    One must have called index_init() before calling this function. Several
+    multi_range_read_init() calls may be made in course of one query.
+    Until WL#2623 is done (see its text, section 3.2), the following will 
+    also hold:
+    The caller will guarantee that if "seq->init == mrr_ranges_array_init"
+    then seq_init_param is an array of n_ranges KEY_MULTI_RANGE structures.
+    This property will only be used by NDB handler until WL#2623 is done.
+    Buffer memory management is done according to the following scenario:
+    The caller allocates the buffer and provides it to the callee by filling
+    the members of HANDLER_BUFFER structure.
+    The callee consumes all or some fraction of the provided buffer space, and
+    sets the HANDLER_BUFFER members accordingly.
+    The callee may use the buffer memory until the next multi_range_read_init()
+    call is made, all records have been read, or until index_end() call is
+    made, whichever comes first.
+  @retval 0  OK
+  @retval 1  Error
+handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
+                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+  DBUG_ENTER("handler::multi_range_read_init");
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  mrr_funcs= *seq_funcs;
+  mrr_is_output_sorted= test(mode & HA_MRR_SORTED);
+  mrr_have_range= FALSE;
+  Get next record in MRR scan
+  Default MRR implementation: read the next record
+  @param range_info  OUT  Undefined if HA_MRR_NO_ASSOCIATION flag is in effect
+                          Otherwise, the opaque value associated with the range
+                          that contains the returned record.
+  @retval 0      OK
+  @retval other  Error code
+int handler::multi_range_read_next(char **range_info)
+  int UNINIT_VAR(result);
+  int range_res;
+  DBUG_ENTER("handler::multi_range_read_next");
+  if (!mrr_have_range)
+  {
+    mrr_have_range= TRUE;
+    goto start;
+  }
+  do
+  {
+    /* Save a call if there can be only one row in range. */
+    if (mrr_cur_range.range_flag != (UNIQUE_RANGE | EQ_RANGE))
+    {
+      result= read_range_next();
+      /* On success or non-EOF errors jump to the end. */
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+    else
+    {
+      if (was_semi_consistent_read())
+        goto scan_it_again;
+      /*
+        We need to set this for the last range only, but checking this
+        condition is more expensive than just setting the result code.
+      */
+      result= HA_ERR_END_OF_FILE;
+    }
+    /* Try the next range(s) until one matches a record. */
+    while (!(range_res= mrr_funcs.next(mrr_iter, &mrr_cur_range)))
+    {
+      result= read_range_first(mrr_cur_range.start_key.keypart_map ?
+                                 &mrr_cur_range.start_key : 0,
+                               mrr_cur_range.end_key.keypart_map ?
+                                 &mrr_cur_range.end_key : 0,
+                               test(mrr_cur_range.range_flag & EQ_RANGE),
+                               mrr_is_output_sorted);
+      if (result != HA_ERR_END_OF_FILE)
+        break;
+    }
+  }
+  while ((result == HA_ERR_END_OF_FILE) && !range_res);
+  *range_info= mrr_cur_range.ptr;
+  DBUG_PRINT("exit",("handler::multi_range_read_next result %d", result));
+  DBUG_RETURN(result);
+ * DS-MRR implementation 
+ ***************************************************************************/
+  DS-MRR: Initialize and start MRR scan
+  Initialize and start the MRR scan. Depending on the mode parameter, this
+  may use default or DS-MRR implementation.
+  @param h               Table handler to be used
+  @param key             Index to be used
+  @param seq_funcs       Interval sequence enumeration functions
+  @param seq_init_param  Interval sequence enumeration parameter
+  @param n_ranges        Number of ranges in the sequence.
+  @param mode            HA_MRR_* modes to use
+  @param buf             INOUT Buffer to use
+  @retval 0     Ok, Scan started.
+  @retval other Error
+int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                           void *seq_init_param, uint n_ranges, uint mode,
+                           HANDLER_BUFFER *buf)
+  uint elem_size;
+  Item *pushed_cond= NULL;
+  handler *new_h2= 0;
+  DBUG_ENTER("DsMrr_impl::dsmrr_init");
+  /*
+    index_merge may invoke a scan on an object for which dsmrr_info[_const]
+    has not been called, so set the owner handler here as well.
+  */
+  h= h_arg;
+  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
+  {
+    use_default_impl= TRUE;
+    const int retval=
+      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
+                                        n_ranges, mode, buf);
+    DBUG_RETURN(retval);
+  }
+  rowids_buf= buf->buffer;
+  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+  if (is_mrr_assoc)
+    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
+  rowids_buf_end= buf->buffer_end;
+  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  rowids_buf_last= rowids_buf + 
+                      ((rowids_buf_end - rowids_buf)/ elem_size)*
+                      elem_size;
+  rowids_buf_end= rowids_buf_last;
+    /*
+    There can be two cases:
+    - This is the first call since index_init(), h2==NULL
+       Need to setup h2 then.
+    - This is not the first call, h2 is initialized and set up appropriately.
+       The caller might have called h->index_init(), need to switch h to
+       rnd_pos calls.
+  */
+  if (!h2)
+  {
+    /* Create a separate handler object to do rndpos() calls. */
+    THD *thd= current_thd;
+    /*
+      ::clone() takes up a lot of stack, especially on 64 bit platforms.
+      The constant 5 is an empiric result.
+    */
+    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+      DBUG_RETURN(1);
+    DBUG_ASSERT(h->active_index != MAX_KEY);
+    uint mrr_keyno= h->active_index;
+    /* Create a separate handler object to do rndpos() calls. */
+    if (!(new_h2= h->clone(thd->mem_root)) || 
+        new_h2->ha_external_lock(thd, F_RDLCK))
+    {
+      delete new_h2;
+      DBUG_RETURN(1);
+    }
+    if (mrr_keyno == h->pushed_idx_cond_keyno)
+      pushed_cond= h->pushed_idx_cond;
+    /*
+      Caution: this call will invoke this->dsmrr_close(). Do not put the
+      created secondary table handler into this->h2 or it will delete it.
+    */
+    if (h->ha_index_end())
+    {
+      h2=new_h2;
+      goto error;
+    }
+    h2= new_h2; /* Ok, now can put it into h2 */
+    table->prepare_for_position();
+    h2->extra(HA_EXTRA_KEYREAD);
+    if (h2->ha_index_init(mrr_keyno, FALSE))
+      goto error;
+    use_default_impl= FALSE;
+    if (pushed_cond)
+      h2->idx_cond_push(mrr_keyno, pushed_cond);
+  }
+  else
+  {
+    /* 
+      We get here when the access alternates between MRR scan(s) and non-MRR
+      scans.
+      Calling h->index_end() will invoke dsmrr_close() for this object,
+      which will delete h2. We need to keep it, so put it away and don't
+      let it be deleted:
+    */
+    handler *save_h2= h2;
+    h2= NULL;
+    int res= (h->inited == handler::INDEX && h->ha_index_end());
+    h2= save_h2;
+    use_default_impl= FALSE;
+    if (res)
+      goto error;
+  }
+  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
+                                          mode, buf) || 
+      dsmrr_fill_buffer())
+  {
+    goto error;
+  }
+  /*
+    If the above call has scanned through all intervals in *seq, then
+    adjust *buf to indicate that the remaining buffer space will not be used.
+  */
+  if (dsmrr_eof) 
+    buf->end_of_used_area= rowids_buf_last;
+  /*
+     h->inited == INDEX may occur when 'range checked for each record' is
+     used.
+  */
+  if ((h->inited != handler::RND) && 
+      ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || 
+       (h->ha_rnd_init(FALSE))))
+      goto error;
+  use_default_impl= FALSE;
+  h->mrr_funcs= *seq_funcs;
+  h2->ha_index_or_rnd_end();
+  h2->ha_external_lock(current_thd, F_UNLCK);
+  h2->close();
+  delete h2;
+  h2= NULL;
+void DsMrr_impl::dsmrr_close()
+  DBUG_ENTER("DsMrr_impl::dsmrr_close");
+  if (h2)
+  {
+    h2->ha_index_or_rnd_end();
+    h2->ha_external_lock(current_thd, F_UNLCK);
+    h2->close();
+    delete h2;
+    h2= NULL;
+  }
+  use_default_impl= TRUE;
+static int rowid_cmp(void *h, uchar *a, uchar *b)
+  return ((handler*)h)->cmp_ref(a, b);
+  DS-MRR: Fill the buffer with rowids and sort it by rowid
+  {This is an internal function of DiskSweep MRR implementation}
+  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
+  buffer. When the buffer is full or scan is completed, sort the buffer by 
+  rowid and return.
+  The function assumes that rowids buffer is empty when it is invoked. 
+  @param h  Table handler
+  @retval 0      OK, the next portion of rowids is in the buffer,
+                 properly ordered
+  @retval other  Error
+int DsMrr_impl::dsmrr_fill_buffer()
+  char *range_info;
+  int res;
+  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
+  rowids_buf_cur= rowids_buf;
+  while ((rowids_buf_cur < rowids_buf_end) && 
+         !(res= h2->handler::multi_range_read_next(&range_info)))
+  {
+    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
+    if (h2->mrr_funcs.skip_index_tuple &&
+        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
+      continue;
+    /* Put rowid, or {rowid, range_id} pair into the buffer */
+    h2->position(table->record[0]);
+    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
+    rowids_buf_cur += h2->ref_length;
+    if (is_mrr_assoc)
+    {
+      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
+      rowids_buf_cur += sizeof(void*);
+    }
+  }
+  if (res && res != HA_ERR_END_OF_FILE)
+    DBUG_RETURN(res); 
+  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
+  /* Sort the buffer contents by rowid */
+  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+            (void*)h);
+  rowids_buf_last= rowids_buf_cur;
+  rowids_buf_cur=  rowids_buf;
+  DS-MRR implementation: multi_range_read_next() function
+int DsMrr_impl::dsmrr_next(char **range_info)
+  int res;
+  uchar *cur_range_info= 0;
+  uchar *rowid;
+  if (use_default_impl)
+    return h->handler::multi_range_read_next(range_info);
+  do
+  {
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      if (dsmrr_eof)
+      {
+        res= HA_ERR_END_OF_FILE;
+        goto end;
+      }
+      res= dsmrr_fill_buffer();
+      if (res)
+        goto end;
+    }
+    /* return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      res= HA_ERR_END_OF_FILE;
+      goto end;
+    }
+    rowid= rowids_buf_cur;
+    if (is_mrr_assoc)
+      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    if (h2->mrr_funcs.skip_record &&
+	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
+      continue;
+    res= h->rnd_pos(table->record[0], rowid);
+    break;
+  } while (true);
+  if (is_mrr_assoc)
+  {
+    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
+  }
+  return res;
+  DS-MRR implementation: multi_range_read_info() function
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
+                               uint *bufsz, uint *flags, COST_VECT *cost)
+  ha_rows res;
+  uint def_flags= *flags;
+  uint def_bufsz= *bufsz;
+  /* Get cost/flags/mem_usage of default MRR implementation */
+  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
+                                         &def_flags, cost);
+  DBUG_ASSERT(!res);
+  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
+      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
+  {
+    /* Default implementation is chosen */
+    DBUG_PRINT("info", ("Default MRR implementation choosen"));
+    *flags= def_flags;
+    *bufsz= def_bufsz;
+  }
+  else
+  {
+    /* *flags and *bufsz were set by choose_mrr_impl */
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+  }
+  return 0;
+  DS-MRR Implementation: multi_range_read_info_const() function
+ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                 void *seq_init_param, uint n_ranges, 
+                                 uint *bufsz, uint *flags, COST_VECT *cost)
+  ha_rows rows;
+  uint def_flags= *flags;
+  uint def_bufsz= *bufsz;
+  /* Get cost/flags/mem_usage of default MRR implementation */
+  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
+                                                n_ranges, &def_bufsz, 
+                                                &def_flags, cost);
+  if (rows == HA_POS_ERROR)
+  {
+    /* Default implementation can't perform MRR scan => we can't either */
+    return rows;
+  }
+  /*
+    If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
+    use the default MRR implementation (we need it for UPDATE/DELETE).
+    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
+  */
+  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
+      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
+  {
+    DBUG_PRINT("info", ("Default MRR implementation choosen"));
+    *flags= def_flags;
+    *bufsz= def_bufsz;
+  }
+  else
+  {
+    /* *flags and *bufsz were set by choose_mrr_impl */
+    DBUG_PRINT("info", ("DS-MRR implementation choosen"));
+  }
+  return rows;
+  Check if key has partially-covered columns
+  We can't use DS-MRR to perform range scans when the ranges are over
+  partially-covered keys, because we'll not have full key part values
+  (we'll have their prefixes from the index) and will not be able to check
+  if we've reached the end of the range.
+  @param keyno  Key to check
+  @todo
+    Allow use of DS-MRR in cases where the index has partially-covered
+    components but they are not used for scanning.
+  @retval TRUE   Yes
+  @retval FALSE  No
+bool key_uses_partial_cols(TABLE *table, uint keyno)
+  KEY_PART_INFO *kp= table->key_info[keyno].key_part;
+  KEY_PART_INFO *kp_end= kp + table->key_info[keyno].key_parts;
+  for (; kp != kp_end; kp++)
+  {
+    if (!kp->field->part_of_key.is_set(keyno))
+      return TRUE;
+  }
+  return FALSE;
+  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
+  Make the choice between using Default MRR implementation and DS-MRR.
+  This function contains common functionality factored out of dsmrr_info()
+  and dsmrr_info_const(). The function assumes that the default MRR
+  implementation's applicability requirements are satisfied.
+  @param keyno       Index number
+  @param rows        E(full rows to be retrieved)
+  @param flags  IN   MRR flags provided by the MRR user
+                OUT  If DS-MRR is chosen, flags of DS-MRR implementation
+                     else the value is not modified
+  @param bufsz  IN   Size of the buffer available for use
+                OUT  If DS-MRR is chosen, buffer use of DS-MRR implementation
+                     else the value is not modified
+  @param cost   IN   Cost of default MRR implementation
+                OUT  If DS-MRR is chosen, cost of DS-MRR scan
+                     else the value is not modified
+  @retval TRUE   Default MRR implementation should be used
+  @retval FALSE  DS-MRR implementation should be used
+bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
+                                 uint *bufsz, COST_VECT *cost)
+  COST_VECT dsmrr_cost;
+  bool res;
+  THD *thd= current_thd;
+  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
+      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
+       key_uses_partial_cols(table, keyno))
+  {
+    /* Use the default implementation */
+    *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    return TRUE;
+  }
+  uint add_len= table->key_info[keyno].key_length + h->ref_length; 
+  *bufsz -= add_len;
+  if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
+    return TRUE;
+  *bufsz += add_len;
+  bool force_dsmrr;
+  /* 
+    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
+    DS-MRR and Default implementations cost. This allows one to force use of
+    DS-MRR whenever it is applicable without affecting other cost-based
+    choices.
+  */
+  if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
+      dsmrr_cost.total_cost() > cost->total_cost())
+    dsmrr_cost= *cost;
+  if (force_dsmrr || dsmrr_cost.total_cost() <= cost->total_cost())
+  {
+    *flags &= ~HA_MRR_USE_DEFAULT_IMPL;  /* Use the DS-MRR implementation */
+    *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
+    *cost= dsmrr_cost;
+    res= FALSE;
+  }
+  else
+  {
+    /* Use the default MRR implementation */
+    res= TRUE;
+  }
+  return res;
+static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
+  Get cost of DS-MRR scan
+  @param keynr              Index to be used
+  @param rows               E(Number of rows to be scanned)
+  @param flags              Scan parameters (HA_MRR_* flags)
+  @param buffer_size INOUT  Buffer size
+  @param cost        OUT    The cost
+  @retval FALSE  OK
+  @retval TRUE   Error, DS-MRR cannot be used (the buffer is too small
+                 for even 1 rowid)
+bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
+                                         uint *buffer_size, COST_VECT *cost)
+  ulong max_buff_entries, elem_size;
+  ha_rows rows_in_full_step, rows_in_last_step;
+  uint n_full_steps;
+  double index_read_cost;
+  elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
+  max_buff_entries = *buffer_size / elem_size;
+  if (!max_buff_entries)
+    return TRUE; /* Buffer has not enough space for even 1 rowid */
+  /* Number of iterations we'll make with full buffer */
+  n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
+  /* 
+    Get numbers of rows we'll be processing in 
+     - non-last sweep, with full buffer 
+     - last iteration, with non-full buffer
+  */
+  rows_in_full_step= max_buff_entries;
+  rows_in_last_step= rows % max_buff_entries;
+  /* Adjust buffer size if we expect to use only part of the buffer */
+  if (n_full_steps)
+  {
+    get_sort_and_sweep_cost(table, rows, cost);
+    cost->multiply(n_full_steps);
+  }
+  else
+  {
+    cost->zero();
+    *buffer_size= max(*buffer_size, 
+                      (size_t)(1.2*rows_in_last_step) * elem_size + 
+                      h->ref_length + table->key_info[keynr].key_length);
+  }
+  COST_VECT last_step_cost;
+  get_sort_and_sweep_cost(table, rows_in_last_step, &last_step_cost);
+  cost->add(&last_step_cost);
+  if (n_full_steps != 0)
+    cost->mem_cost= *buffer_size;
+  else
+    cost->mem_cost= (double)rows_in_last_step * elem_size;
+  /* Total cost of all index accesses */
+  index_read_cost= h->index_only_read_time(keynr, (double)rows);
+  cost->add_io(index_read_cost, 1 /* Random seeks */);
+  return FALSE;
+  Get cost of one sort-and-sweep step
+    get_sort_and_sweep_cost()
+      table       Table being accessed
+      nrows       Number of rows to be sorted and retrieved
+      cost   OUT  The cost
+    Get cost of these operations:
+     - sort an array of #nrows ROWIDs using qsort
+     - read #nrows records from table in a sweep.
+void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost)
+  if (nrows)
+  {
+    get_sweep_read_cost(table, nrows, FALSE, cost);
+    /* Add cost of qsort call: n * log2(n) * cost(rowid_comparison) */
+    double cmp_op= rows2double(nrows) * (1.0 / TIME_FOR_COMPARE_ROWID);
+    if (cmp_op < 3)
+      cmp_op= 3;
+    cost->cpu_cost += cmp_op * log2(cmp_op);
+  }
+  else
+    cost->zero();
+  Get cost of reading nrows table records in a "disk sweep"
+  A disk sweep read is a sequence of handler->rnd_pos(rowid) calls made
+  for an ordered sequence of rowids.
+  We assume hard disk IO. The read is performed as follows:
+   1. The disk head is moved to the needed cylinder
+   2. The controller waits for the plate to rotate
+   3. The data is transferred
+  Time to do #3 is insignificant compared to #2+#1.
+  Time to move the disk head is proportional to head travel distance.
+  Time to wait for the plate to rotate depends on whether the disk head
+  was moved or not. 
+  If disk head wasn't moved, the wait time is proportional to distance
+  between the previous block and the block we're reading.
+  If the head was moved, we don't know how much we'll need to wait for the
+  plate to rotate. We assume the wait time to be a variate with a mean of
+  0.5 of full rotation time.
+  Our cost units are "random disk seeks". The cost of random disk seek is
+  actually not a constant, it depends on the range of cylinders we're going
+  to access. We make it constant by introducing a fuzzy concept of "typical 
+  datafile length" (it's fuzzy as it's hard to tell whether it should
+  include index file, temp.tables etc). Then random seek cost is:
+    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
+  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
+  @param table             Table to be accessed
+  @param nrows             Number of rows to retrieve
+  @param interrupted       TRUE <=> Assume that the disk sweep will be
+                           interrupted by other disk IO. FALSE - otherwise.
+  @param cost         OUT  The cost.
+void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, 
+                         COST_VECT *cost)
+  DBUG_ENTER("get_sweep_read_cost");
+  cost->zero();
+  if (table->file->primary_key_is_clustered())
+  {
+    cost->io_count= table->file->read_time(table->s->primary_key,
+                                           (uint) nrows, nrows);
+  }
+  else
+  {
+    double n_blocks=
+      ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
+    double busy_blocks=
+      n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, rows2double(nrows)));
+    if (busy_blocks < 1.0)
+      busy_blocks= 1.0;
+    DBUG_PRINT("info",("sweep: nblocks=%g, busy_blocks=%g", n_blocks,
+                       busy_blocks));
+    cost->io_count= busy_blocks;
+    if (!interrupted)
+    {
+      /* Assume reading is done in one 'sweep' */
+      cost->avg_io_cost= (DISK_SEEK_BASE_COST +
+                          DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
+    }
+  }
+  DBUG_PRINT("info",("returning cost=%g", cost->total_cost()));
+/* **************************************************************************
+ * DS-MRR implementation ends
+ ***************************************************************************/

=== added file 'sql/multi_range_read.h'
--- a/sql/multi_range_read.h	1970-01-01 00:00:00 +0000
+++ b/sql/multi_range_read.h	2009-12-22 12:33:21 +0000
@@ -0,0 +1,70 @@
+  This file contains declarations for 
+   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+  A Disk-Sweep MRR interface implementation
+  This implementation makes range (and, in the future, 'ref') scans to read
+  table rows in disk sweeps. 
+  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
+  any table handler that has non-clustered indexes and on-disk rows.
+class DsMrr_impl
+  typedef void (handler::*range_check_toggle_func_t)(bool on);
+  DsMrr_impl()
+    : h2(NULL) {};
+  /*
+    The "owner" handler object (the one that calls dsmrr_XXX functions).
+    It is used to retrieve full table rows by calling rnd_pos().
+  */
+  handler *h;
+  TABLE *table; /* Always equal to h->table */
+  /* Secondary handler object.  It is used for scanning the index */
+  handler *h2;
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  uchar *rowids_buf;
+  uchar *rowids_buf_cur;   /* Current position when reading/writing */
+  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
+  uchar *rowids_buf_end;   /* End of the buffer */
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+  void init(handler *h_arg, TABLE *table_arg)
+  {
+    h= h_arg; 
+    table= table_arg;
+  }
+  int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
+                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  void dsmrr_close();
+  int dsmrr_fill_buffer();
+  int dsmrr_next(char **range_info);
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
+                     uint *flags, COST_VECT *cost);
+  ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
+                            void *seq_init_param, uint n_ranges, uint *bufsz,
+                            uint *flags, COST_VECT *cost);
+  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
+                       COST_VECT *cost);
+  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
+                               uint *buffer_size, COST_VECT *cost);

=== modified file 'sql/mysql_priv.h'
--- a/sql/mysql_priv.h	2009-12-15 07:16:46 +0000
+++ b/sql/mysql_priv.h	2009-12-22 12:33:21 +0000
@@ -540,12 +540,13 @@ protected:
 #ifdef DBUG_OFF
 #ifdef DBUG_OFF 
@@ -553,12 +554,14 @@ protected:
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION | \
-                                    OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT)
+                                    OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT | \
+                                    OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN)
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT | \
+                                    OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN | \

=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2009-12-21 02:26:15 +0000
+++ b/sql/mysqld.cc	2009-12-22 12:49:15 +0000
@@ -300,6 +300,7 @@ static const char *optimizer_switch_name
+  "index_condition_pushdown",
 #ifndef DBUG_OFF
@@ -313,6 +314,7 @@ static const unsigned int optimizer_swit
   sizeof("index_merge_union") - 1,
   sizeof("index_merge_sort_union") - 1,
   sizeof("index_merge_intersection") - 1,
+  sizeof("index_condition_pushdown") - 1,
 #ifndef DBUG_OFF
   sizeof("table_elimination") - 1,
@@ -391,7 +393,8 @@ static const char *sql_mode_str= "OFF";
 /* Text representation for OPTIMIZER_SWITCH_DEFAULT */
 static const char *optimizer_switch_str="index_merge=on,index_merge_union=on,"
-                                        "index_merge_intersection=on"
+                                        "index_merge_intersection=on,"
+                                        "index_condition_pushdown=on"
 #ifndef DBUG_OFF                                        
@@ -5767,7 +5770,7 @@ enum options_mysqld
@@ -6968,6 +6971,12 @@ The minimum value for this variable is 4
    (uchar**) &global_system_variables.min_examined_row_limit,
    (uchar**) &max_system_variables.min_examined_row_limit, 0, GET_ULONG,
   REQUIRED_ARG, 0, 0, (longlong) ULONG_MAX, 0, 1L, 0},
+  {"mrr_buffer_size", OPT_MRR_BUFFER_SIZE,
+   "Size of buffer to use when using MRR with range access",
+   (uchar**) &global_system_variables.mrr_buff_size,
+   (uchar**) &max_system_variables.mrr_buff_size, 0,
+   INT_MAX32, MALLOC_OVERHEAD, 1 /* Small to be able to do tests */ , 0},
   {"myisam_block_size", OPT_MYISAM_BLOCK_SIZE,
    "Block size to be used for MyISAM index pages.",
    (uchar**) &opt_myisam_block_size,
@@ -7047,7 +7056,8 @@ The minimum value for this variable is 4
    0, GET_ULONG, OPT_ARG, MAX_TABLES+1, 0, MAX_TABLES+2, 0, 1, 0},
   {"optimizer_switch", OPT_OPTIMIZER_SWITCH,
    "optimizer_switch=option=val[,option=val...], where option={index_merge, "
-   "index_merge_union, index_merge_sort_union, index_merge_intersection"
+   "index_merge_union, index_merge_sort_union, index_merge_intersection, "
+   "index_condition_pushdown"
 #ifndef DBUG_OFF
    ", table_elimination"
@@ -7131,7 +7141,7 @@ The minimum value for this variable is 4
    (uchar**) &global_system_variables.read_rnd_buff_size,
    (uchar**) &max_system_variables.read_rnd_buff_size, 0,
-   INT_MAX32, MALLOC_OVERHEAD, 1 /* Small overhead to be able to test MRR, was: IO_SIZE*/ , 0},
   {"record_buffer", OPT_RECORD_BUFFER,
    "Alias for read_buffer_size",
    (uchar**) &global_system_variables.read_buff_size,

=== added file 'sql/opt_index_cond_pushdown.cc'
--- a/sql/opt_index_cond_pushdown.cc	1970-01-01 00:00:00 +0000
+++ b/sql/opt_index_cond_pushdown.cc	2009-12-22 12:49:15 +0000
@@ -0,0 +1,387 @@
+#include "mysql_priv.h"
+#include "sql_select.h"
+ * Index Condition Pushdown code starts
+ ***************************************************************************/
+  Check if given expression uses only table fields covered by the given index
+    uses_index_fields_only()
+      item           Expression to check
+      tbl            The table having the index
+      keyno          The index number
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+    Check if given expression only uses fields covered by index #keyno in the
+    table tbl. The expression can use any fields in any other tables.
+    The expression is guaranteed not to be AND or OR - those constructs are 
+    handled outside of this function.
+    TRUE   Yes
+    FALSE  No
+bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno, 
+                            bool other_tbls_ok)
+  if (item->const_item())
+    return TRUE;
+  /* 
+    Don't push down the triggered conditions. Nested outer joins execution 
+    code may need to evaluate a condition several times (both triggered and
+    untriggered), and there is no way to put thi
+    TODO: Consider cloning the triggered condition and using the copies for:
+      1. push the first copy down, to have most restrictive index condition
+         possible
+      2. Put the second copy into tab->select_cond. 
+  */
+  if (item->type() == Item::FUNC_ITEM && 
+      ((Item_func*)item)->functype() == Item_func::TRIG_COND_FUNC)
+    return FALSE;
+  if (!(item->used_tables() & tbl->map))
+    return other_tbls_ok;
+  Item::Type item_type= item->type();
+  switch (item_type) {
+  case Item::FUNC_ITEM:
+    {
+      /* This is a function, apply condition recursively to arguments */
+      Item_func *item_func= (Item_func*)item;
+      Item **child;
+      Item **item_end= (item_func->arguments()) + item_func->argument_count();
+      for (child= item_func->arguments(); child != item_end; child++)
+      {
+        if (!uses_index_fields_only(*child, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::COND_ITEM:
+    {
+      /*
+        This is an AND/OR condition. Regular AND/OR clauses are handled by
+        make_cond_for_index() which will chop off the part that can be
+        checked with index. This code is for handling non-top-level AND/ORs,
+        e.g. func(x AND y).
+      */
+      List_iterator<Item> li(*((Item_cond*)item)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+        if (!uses_index_fields_only(item, tbl, keyno, other_tbls_ok))
+          return FALSE;
+      }
+      return TRUE;
+    }
+  case Item::FIELD_ITEM:
+    {
+      Item_field *item_field= (Item_field*)item;
+      if (item_field->field->table != tbl) 
+        return TRUE;
+      /*
+        The below is probably a repetition - the first part checks the
+        other two, but let's play it safe:
+      */
+      return item_field->field->part_of_key.is_set(keyno) &&
+             item_field->field->type() != MYSQL_TYPE_GEOMETRY &&
+             item_field->field->type() != MYSQL_TYPE_BLOB;
+    }
+  case Item::REF_ITEM:
+    return uses_index_fields_only(item->real_item(), tbl, keyno,
+                                  other_tbls_ok);
+  default:
+    return FALSE; /* Play it safe, don't push unknown non-const items */
+  }
+  Get a part of the condition that can be checked using only index fields
+    make_cond_for_index()
+      cond           The source condition
+      table          The table that is partially available
+      keyno          The index in the above table. Only fields covered by the index
+                     are available
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+    Get the part of the condition that can be checked when, for the given
+    table, only the values of fields covered by some index are available.
+    The condition may refer to other tables; it is assumed that the values
+    of all of their fields are available.
+    Example:
+      make_cond_for_index(
+         "cond(t1.field) AND cond(t2.key1) AND cond(t2.non_key) AND cond(t2.key2)",
+          t2, keyno(t2.key1)) 
+      will return
+        "cond(t1.field) AND cond(t2.key2)"
+    Index condition, or NULL if no condition could be inferred.
+Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
+                          bool other_tbls_ok)
+  if (!cond)
+    return NULL;
+  if (cond->type() == Item::COND_ITEM)
+  {
+    uint n_marked= 0;
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      table_map used_tables= 0;
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (fix)
+        {
+	  new_cond->argument_list()->push_back(fix);
+          used_tables|= fix->used_tables();
+        }
+        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:
+        new_cond->used_tables_cache= used_tables;
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        new_cond->used_tables_cache= used_tables;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_for_index(item, table, keyno, other_tbls_ok);
+	if (!fix)
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+      }
+      if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
+        cond->marker= ICP_COND_USES_INDEX_ONLY;
+      new_cond->quick_fix_field();
+      new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+  if (!uses_index_fields_only(cond, table, keyno, other_tbls_ok))
+    return (COND*) 0;
+  cond->marker= ICP_COND_USES_INDEX_ONLY;
+  return cond;
+Item *make_cond_remainder(Item *cond, bool exclude_index)
+  if (exclude_index && cond->marker == ICP_COND_USES_INDEX_ONLY)
+    return 0; /* Already checked */
+  if (cond->type() == Item::COND_ITEM)
+  {
+    table_map tbl_map= 0;
+    if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
+    {
+      /* Create new top level AND item */
+      Item_cond_and *new_cond=new Item_cond_and;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_remainder(item, exclude_index);
+	if (fix)
+        {
+	  new_cond->argument_list()->push_back(fix);
+          tbl_map |= fix->used_tables();
+        }
+      }
+      switch (new_cond->argument_list()->elements) {
+      case 0:
+	return (COND*) 0;
+      case 1:
+	return new_cond->argument_list()->head();
+      default:
+	new_cond->quick_fix_field();
+        ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+	return new_cond;
+      }
+    }
+    else /* It's OR */
+    {
+      Item_cond_or *new_cond=new Item_cond_or;
+      if (!new_cond)
+	return (COND*) 0;
+      List_iterator<Item> li(*((Item_cond*) cond)->argument_list());
+      Item *item;
+      while ((item=li++))
+      {
+	Item *fix= make_cond_remainder(item, FALSE);
+	if (!fix)
+	  return (COND*) 0;
+	new_cond->argument_list()->push_back(fix);
+        tbl_map |= fix->used_tables();
+      }
+      new_cond->quick_fix_field();
+      ((Item_cond*)new_cond)->used_tables_cache= tbl_map;
+      new_cond->top_level_item();
+      return new_cond;
+    }
+  }
+  return cond;
+  Try to extract and push the index condition
+    push_index_cond()
+      tab            A join tab that has tab->table->file and its condition
+                     in tab->select_cond
+      keyno          Index for which to extract and push the condition
+      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
+    Try to extract and push the index condition down to table handler
+void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
+  DBUG_ENTER("push_index_cond");
+  Item *idx_cond;
+  bool do_index_cond_pushdown=
+    ((tab->table->file->index_flags(keyno, 0, 1) &
+     optimizer_flag(tab->join->thd, OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN));
+  /*
+    Do not try index condition pushdown on indexes which have partially-covered
+    columns. Unpacking from a column prefix into index tuple is not a supported 
+    operation in some engines, see e.g. MySQL BUG#42991.
+    TODO: a better solution would be not to consider partially-covered columns
+    as parts of the index and still produce/check index condition for
+    fully-covered index columns.
+  */
+  KEY *key_info= tab->table->key_info + keyno;
+  for (uint kp= 0; kp < key_info->key_parts; kp++)
+  {
+    if ((key_info->key_part[kp].key_part_flag & HA_PART_KEY_SEG))
+    {
+      do_index_cond_pushdown= FALSE;
+      break;
+    }
+  }
+  if (do_index_cond_pushdown)
+  {
+    DBUG_EXECUTE("where",
+                 print_where(tab->select_cond, "full cond", QT_ORDINARY););
+    idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno,
+                                  other_tbls_ok);
+    DBUG_EXECUTE("where",
+                 print_where(idx_cond, "idx cond", QT_ORDINARY););
+    if (idx_cond)
+    {
+      Item *idx_remainder_cond= 0;
+      tab->pre_idx_push_select_cond= tab->select_cond;
+      /*
+        For the BKA cache we store the condition in a special BKA cache
+        field, because additional operations are required before the
+        condition can be evaluated. This condition is used in the
+        JOIN_CACHE_BKA[_UNIQUE]::skip_index_tuple() functions.
+      */
+      if (tab->use_join_cache &&
+          /*
+            If a join cache is used, other_tbls_ok is TRUE only
+            for the BKA[_UNIQUE] cache (see check_join_cache_usage func).
+            In this case other_tbls_ok is equivalent to
+            cache->is_key_access().
+          */
+          other_tbls_ok &&
+          (idx_cond->used_tables() &
+           ~(tab->table->map | tab->join->const_table_map)))
+        tab->cache_idx_cond= idx_cond;
+      else
+        idx_remainder_cond= tab->table->file->idx_cond_push(keyno, idx_cond);
+      /*
+        Disable eq_ref's "lookup cache" if we've pushed down an index
+        condition. 
+        TODO: This check happens to work on current ICP implementations, but
+        there may exist a compliant implementation that will not work 
+        correctly with it. Sort this out when we stabilize the condition
+        pushdown APIs.
+      */
+      if (idx_remainder_cond != idx_cond)
+        tab->ref.disable_cache= TRUE;
+      Item *row_cond= make_cond_remainder(tab->select_cond, TRUE);
+      DBUG_EXECUTE("where",
+                   print_where(row_cond, "remainder cond", QT_ORDINARY););
+      if (row_cond)
+      {
+        if (!idx_remainder_cond)
+          tab->select_cond= row_cond;
+        else
+        {
+          COND *new_cond= new Item_cond_and(row_cond, idx_remainder_cond);
+          tab->select_cond= new_cond;
+	  tab->select_cond->quick_fix_field();
+          ((Item_cond_and*)tab->select_cond)->used_tables_cache= 
+            row_cond->used_tables() | idx_remainder_cond->used_tables();
+        }
+      }
+      else
+        tab->select_cond= idx_remainder_cond;
+      if (tab->select)
+      {
+        DBUG_EXECUTE("where",
+                     print_where(tab->select->cond,
+                                 "select_cond",
+                                 QT_ORDINARY););
+        tab->select->cond= tab->select_cond;
+      }
+    }
+  }

=== modified file 'sql/opt_range.cc'
--- a/sql/opt_range.cc	2009-12-15 07:16:46 +0000
+++ b/sql/opt_range.cc	2009-12-22 12:33:21 +0000
@@ -720,6 +720,7 @@ public:
   uint8 first_null_comp; /* first null component if any, 0 - otherwise */
   class TRP_RANGE;
@@ -789,7 +790,9 @@ static SEL_ARG null_element(SEL_ARG::IMP
 static bool null_part_in_key(KEY_PART *key_part, const uchar *key,
                              uint length);
 bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2, RANGE_OPT_PARAM* param);
+static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
+#include "opt_range_mrr.cc"
   SEL_IMERGE is a list of possible ways to do index merge, i.e. it is
   my_init_dynamic_array(&ranges, sizeof(QUICK_RANGE*), 16, 16);
   /* 'thd' is not accessible in QUICK_RANGE_SELECT::reset(). */
-  mrr_buf_size= thd->variables.read_rnd_buff_size;
+  mrr_buf_size= thd->variables.mrr_buff_size;
   mrr_buf_desc= NULL;
   if (!no_alloc && !parent_alloc)
@@ -4875,7 +4878,6 @@ static TRP_RANGE *get_key_scans_params(P
   uint    UNINIT_VAR(best_mrr_flags),            /* protected by key_to_read */
           UNINIT_VAR(best_buf_size);             /* protected by key_to_read */
   TRP_RANGE* read_plan= NULL;
-  bool pk_is_clustered= param->table->file->primary_key_is_clustered();
     Note that there may be trees that have type SEL_TREE::KEY but contain no
@@ -7281,284 +7283,6 @@ void SEL_ARG::test_use_count(SEL_ARG *ro
-  MRR Range Sequence Interface implementation that walks a SEL_ARG* tree.
- ****************************************************************************/
-/* MRR range sequence, SEL_ARG* implementation: stack entry */
-typedef struct st_range_seq_entry 
-  /* 
-    Pointers in min and max keys. They point to right-after-end of key
-    images. The 0-th entry has these pointing to key tuple start.
-  */
-  uchar *min_key, *max_key;
-  /* 
-    Flags, for {keypart0, keypart1, ... this_keypart} subtuple.
-    min_key_flag may have NULL_RANGE set.
-  */
-  uint min_key_flag, max_key_flag;
-  /* Number of key parts */
-  uint min_key_parts, max_key_parts;
-  SEL_ARG *key_tree;
-  MRR range sequence, SEL_ARG* implementation: SEL_ARG graph traversal context
-typedef struct st_sel_arg_range_seq
-  uint keyno;      /* index of used tree in SEL_TREE structure */
-  uint real_keyno; /* Number of the index in tables */
-  PARAM *param;
-  SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */
-  int i; /* Index of last used element in the above array */
-  bool at_start; /* TRUE <=> The traversal has just started */
-  Range sequence interface, SEL_ARG* implementation: Initialize the traversal
-    init()
-      init_params  SEL_ARG tree traversal context
-      n_ranges     [ignored] The number of ranges obtained 
-      flags        [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
-    Value of init_param
-range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags)
-  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param;
-  seq->at_start= TRUE;
-  seq->stack[0].key_tree= NULL;
-  seq->stack[0].min_key= seq->param->min_key;
-  seq->stack[0].min_key_flag= 0;
-  seq->stack[0].min_key_parts= 0;
-  seq->stack[0].max_key= seq->param->max_key;
-  seq->stack[0].max_key_flag= 0;
-  seq->stack[0].max_key_parts= 0;
-  seq->i= 0;
-  return init_param;
-static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
-  RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1];
-  RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i];
-  cur->key_tree= key_tree;
-  cur->min_key= prev->min_key;
-  cur->max_key= prev->max_key;
-  cur->min_key_parts= prev->min_key_parts;
-  cur->max_key_parts= prev->max_key_parts;
-  uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length;
-  cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key,
-                                            prev->min_key_flag);
-  cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key,
-                                            prev->max_key_flag);
-  cur->min_key_flag= prev->min_key_flag | key_tree->min_flag;
-  cur->max_key_flag= prev->max_key_flag | key_tree->max_flag;
-  if (key_tree->is_null_interval())
-    cur->min_key_flag |= NULL_RANGE;
-  (arg->i)++;
-  Range sequence interface, SEL_ARG* implementation: get the next interval
-    sel_arg_range_seq_next()
-      rseq        Value returned from sel_arg_range_seq_init
-      range  OUT  Store information about the range here
-    This is "get_next" function for Range sequence interface implementation
-    for SEL_ARG* tree.
-    The traversal also updates those param members:
-      - is_ror_scan
-      - range_count
-      - max_key_part
-    0  Ok
-    1  No more ranges in the sequence
-//psergey-merge-todo: support check_quick_keys:max_keypart
-uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
-  SEL_ARG *key_tree;
-  if (seq->at_start)
-  {
-    key_tree= seq->start;
-    seq->at_start= FALSE;
-    goto walk_up_n_right;
-  }
-  key_tree= seq->stack[seq->i].key_tree;
-  /* Ok, we're at some "full tuple" position in the tree */
-  /* Step down if we can */
-  if (key_tree->next && key_tree->next != &null_element)
-  {
-    //step down; (update the tuple, we'll step right and stay there)
-    seq->i--;
-    step_down_to(seq, key_tree->next);
-    key_tree= key_tree->next;
-    seq->param->is_ror_scan= FALSE;
-    goto walk_right_n_up;
-  }
-  /* Ok, can't step down, walk left until we can step down */
-  while (1)
-  {
-    if (seq->i == 1) // can't step left
-      return 1;
-    /* Step left */
-    seq->i--;
-    key_tree= seq->stack[seq->i].key_tree;
-    /* Step down if we can */
-    if (key_tree->next && key_tree->next != &null_element)
-    {
-      // Step down; update the tuple
-      seq->i--;
-      step_down_to(seq, key_tree->next);
-      key_tree= key_tree->next;
-      break;
-    }
-  }
-  /*
-    Ok, we've stepped down from the path to previous tuple.
-    Walk right-up while we can
-  */
-  while (key_tree->next_key_part && key_tree->next_key_part != &null_element && 
-         key_tree->next_key_part->part == key_tree->part + 1 &&
-         key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
-  {
-    {
-      RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
-      uint min_key_length= cur->min_key - seq->param->min_key;
-      uint max_key_length= cur->max_key - seq->param->max_key;
-      uint len= cur->min_key - cur[-1].min_key;
-      if (!(min_key_length == max_key_length &&
-            !memcmp(cur[-1].min_key, cur[-1].max_key, len) &&
-            !key_tree->min_flag && !key_tree->max_flag))
-      {
-        seq->param->is_ror_scan= FALSE;
-        if (!key_tree->min_flag)
-          cur->min_key_parts += 
-            key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno],
-                                                   &cur->min_key,
-                                                   &cur->min_key_flag);
-        if (!key_tree->max_flag)
-          cur->max_key_parts += 
-            key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno],
-                                                   &cur->max_key,
-                                                   &cur->max_key_flag);
-        break;
-      }
-    }
-    /*
-      Ok, current atomic interval is in form "t.field=const" and there is
-      next_key_part interval. Step right, and walk up from there.
-    */
-    key_tree= key_tree->next_key_part;
-    while (key_tree->prev && key_tree->prev != &null_element)
-    {
-      /* Step up */
-      key_tree= key_tree->prev;
-    }
-    step_down_to(seq, key_tree);
-  }
-  /* Ok got a tuple */
-  RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
-  uint min_key_length= cur->min_key - seq->param->min_key;
-  range->ptr= (char*)(int)(key_tree->part);
-  if (cur->min_key_flag & GEOM_FLAG)
-  {
-    range->range_flag= cur->min_key_flag;
-    /* Here minimum contains also function code bits, and maximum is +inf */
-    range->start_key.key=    seq->param->min_key;
-    range->start_key.length= min_key_length;
-    range->start_key.flag=  (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG);
-  }
-  else
-  {
-    range->range_flag= cur->min_key_flag | cur->max_key_flag;
-    range->start_key.key=    seq->param->min_key;
-    range->start_key.length= cur->min_key - seq->param->min_key;
-    range->start_key.keypart_map= make_prev_keypart_map(cur->min_key_parts);
-    range->start_key.flag= (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : 
-                                                           HA_READ_KEY_EXACT);
-    range->end_key.key=    seq->param->max_key;
-    range->end_key.length= cur->max_key - seq->param->max_key;
-    range->end_key.flag= (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : 
-                                                         HA_READ_AFTER_KEY);
-    range->end_key.keypart_map= make_prev_keypart_map(cur->max_key_parts);
-    if (!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
-        (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts &&
-        (seq->param->table->key_info[seq->real_keyno].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
-        HA_NOSAME &&
-        range->start_key.length == range->end_key.length &&
-        !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length))
-      range->range_flag= UNIQUE_RANGE | (cur->min_key_flag & NULL_RANGE);
-    if (seq->param->is_ror_scan)
-    {
-      /*
-        If we get here, the condition on the key was converted to form
-        "(keyXpart1 = c1) AND ... AND (keyXpart{key_tree->part - 1} = cN) AND
-          somecond(keyXpart{key_tree->part})"
-        Check if
-          somecond is "keyXpart{key_tree->part} = const" and
-          uncovered "tail" of KeyX parts is either empty or is identical to
-          first members of clustered primary key.
-      */
-      if (!(!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
-            (range->start_key.length == range->end_key.length) &&
-            !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) &&
-            is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1)))
-        seq->param->is_ror_scan= FALSE;
-    }
-  }
-  seq->param->range_count++;
-  seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part);
-  return 0;
   Calculate cost and E(#rows) for a given index and intervals tree 
@@ -7633,7 +7357,7 @@ ha_rows check_quick_select(PARAM *param,
   if (current_thd->lex->sql_command != SQLCOM_SELECT)
     *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
-  *bufsize= param->thd->variables.read_rnd_buff_size;
+  *bufsize= param->thd->variables.mrr_buff_size;
   rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0,
                                           bufsize, mrr_flags, cost);
   if (rows != HA_POS_ERROR)
@@ -8148,7 +7872,7 @@ QUICK_RANGE_SELECT *get_quick_select_for
     quick->mrr_flags |= HA_MRR_NO_NULL_ENDPOINTS;
-  quick->mrr_buf_size= thd->variables.read_rnd_buff_size;
+  quick->mrr_buf_size= thd->variables.mrr_buff_size;
   if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
                                          &quick->mrr_flags, &cost))
@@ -8518,75 +8242,6 @@ int QUICK_RANGE_SELECT::reset()
-  Range sequence interface implementation for array<QUICK_RANGE>: initialize
-    quick_range_seq_init()
-      init_param  Caller-opaque paramenter: QUICK_RANGE_SELECT* pointer
-      n_ranges    Number of ranges in the sequence (ignored)
-      flags       MRR flags (currently not used) 
-    Opaque value to be passed to quick_range_seq_next
-range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
-  quick->qr_traversal_ctx.first=  (QUICK_RANGE**)quick->ranges.buffer;
-  quick->qr_traversal_ctx.cur=    (QUICK_RANGE**)quick->ranges.buffer;
-  quick->qr_traversal_ctx.last=   quick->qr_traversal_ctx.cur + 
-                                  quick->ranges.elements;
-  return &quick->qr_traversal_ctx;
-  Range sequence interface implementation for array<QUICK_RANGE>: get next
-    quick_range_seq_next()
-      rseq        Value returned from quick_range_seq_init
-      range  OUT  Store information about the range here
-    0  Ok
-    1  No more ranges in the sequence
-uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
-  if (ctx->cur == ctx->last)
-    return 1; /* no more ranges */
-  QUICK_RANGE *cur= *(ctx->cur);
-  key_range *start_key= &range->start_key;
-  key_range *end_key=   &range->end_key;
-  start_key->key=    cur->min_key;
-  start_key->length= cur->min_length;
-  start_key->keypart_map= cur->min_keypart_map;
-  start_key->flag=   ((cur->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
-                      (cur->flag & EQ_RANGE) ?
-                      HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
-  end_key->key=      cur->max_key;
-  end_key->length=   cur->max_length;
-  end_key->keypart_map= cur->max_keypart_map;
-  /*
-    We use HA_READ_AFTER_KEY here because if we are reading on a key
-    prefix. We want to find all keys with this prefix.
-  */
-  end_key->flag=     (cur->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
-                      HA_READ_AFTER_KEY);
-  range->range_flag= cur->flag;
-  ctx->cur++;
-  return 0;
   Get next possible record using quick-struct.
@@ -9658,7 +9313,7 @@ get_best_group_min_max(PARAM *param, SEL
       uint mrr_flags= HA_MRR_USE_DEFAULT_IMPL;
       uint mrr_bufsize=0;
       cur_quick_prefix_records= check_quick_select(param, cur_param_idx,
-                                                   FALSE /*don't care(*/,
+                                                   FALSE /*don't care*/,
                                                    cur_index_tree, TRUE,
                                                    &mrr_flags, &mrr_bufsize,

=== modified file 'sql/opt_range.h'
--- a/sql/opt_range.h	2009-12-15 07:16:46 +0000
+++ b/sql/opt_range.h	2009-12-22 12:33:21 +0000
@@ -317,7 +317,7 @@ protected:
   uint mrr_flags; /* Flags to be used with MRR interface */
-  uint mrr_buf_size; /* copy from thd->variables.read_rnd_buff_size */  
+  uint mrr_buf_size; /* copy from thd->variables.mrr_buff_size */  
   HANDLER_BUFFER *mrr_buf_desc; /* the handler buffer */
   /* Info about index we're scanning */

=== added file 'sql/opt_range_mrr.cc'
--- a/sql/opt_range_mrr.cc	1970-01-01 00:00:00 +0000
+++ b/sql/opt_range_mrr.cc	2009-12-22 12:33:21 +0000
@@ -0,0 +1,349 @@
+  MRR Range Sequence Interface implementation that walks a SEL_ARG* tree.
+ ****************************************************************************/
+/* MRR range sequence, SEL_ARG* implementation: stack entry */
+typedef struct st_range_seq_entry 
+  /* 
+    Pointers in min and max keys. They point to right-after-end of key
+    images. The 0-th entry has these pointing to key tuple start.
+  */
+  uchar *min_key, *max_key;
+  /* 
+    Flags, for {keypart0, keypart1, ... this_keypart} subtuple.
+    min_key_flag may have NULL_RANGE set.
+  */
+  uint min_key_flag, max_key_flag;
+  /* Number of key parts */
+  uint min_key_parts, max_key_parts;
+  SEL_ARG *key_tree;
+  MRR range sequence, SEL_ARG* implementation: SEL_ARG graph traversal context
+typedef struct st_sel_arg_range_seq
+  uint keyno;      /* index of used tree in SEL_TREE structure */
+  uint real_keyno; /* Number of the index in tables */
+  PARAM *param;
+  SEL_ARG *start; /* Root node of the traversed SEL_ARG* graph */
+  int i; /* Index of last used element in the above array */
+  bool at_start; /* TRUE <=> The traversal has just started */
+  Range sequence interface, SEL_ARG* implementation: Initialize the traversal
+    init()
+      init_param   SEL_ARG tree traversal context
+      n_ranges     [ignored] The number of ranges obtained 
+      flags        [ignored] HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+    Value of init_param
+range_seq_t sel_arg_range_seq_init(void *init_param, uint n_ranges, uint flags)
+  SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)init_param;
+  seq->at_start= TRUE;
+  seq->stack[0].key_tree= NULL;
+  seq->stack[0].min_key= seq->param->min_key;
+  seq->stack[0].min_key_flag= 0;
+  seq->stack[0].min_key_parts= 0;
+  seq->stack[0].max_key= seq->param->max_key;
+  seq->stack[0].max_key_flag= 0;
+  seq->stack[0].max_key_parts= 0;
+  seq->i= 0;
+  return init_param;
+static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
+  RANGE_SEQ_ENTRY *cur= &arg->stack[arg->i+1];
+  RANGE_SEQ_ENTRY *prev= &arg->stack[arg->i];
+  cur->key_tree= key_tree;
+  cur->min_key= prev->min_key;
+  cur->max_key= prev->max_key;
+  cur->min_key_parts= prev->min_key_parts;
+  cur->max_key_parts= prev->max_key_parts;
+  uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length;
+  cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key,
+                                            prev->min_key_flag);
+  cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key,
+                                            prev->max_key_flag);
+  cur->min_key_flag= prev->min_key_flag | key_tree->min_flag;
+  cur->max_key_flag= prev->max_key_flag | key_tree->max_flag;
+  if (key_tree->is_null_interval())
+    cur->min_key_flag |= NULL_RANGE;
+  (arg->i)++;
+  Range sequence interface, SEL_ARG* implementation: get the next interval
+    sel_arg_range_seq_next()
+      rseq        Value returned from sel_arg_range_seq_init
+      range  OUT  Store information about the range here
+    This is "get_next" function for Range sequence interface implementation
+    for SEL_ARG* tree.
+    The traversal also updates the following param members:
+      - is_ror_scan
+      - range_count
+      - max_key_part
+    0  Ok
+    1  No more ranges in the sequence
+uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+  SEL_ARG *key_tree;
+  if (seq->at_start)
+  {
+    key_tree= seq->start;
+    seq->at_start= FALSE;
+    goto walk_up_n_right;
+  }
+  key_tree= seq->stack[seq->i].key_tree;
+  /* Ok, we're at some "full tuple" position in the tree */
+  /* Step down if we can */
+  if (key_tree->next && key_tree->next != &null_element)
+  {
+    //step down; (update the tuple, we'll step right and stay there)
+    seq->i--;
+    step_down_to(seq, key_tree->next);
+    key_tree= key_tree->next;
+    seq->param->is_ror_scan= FALSE;
+    goto walk_right_n_up;
+  }
+  /* Ok, can't step down, walk left until we can step down */
+  while (1)
+  {
+    if (seq->i == 1) // can't step left
+      return 1;
+    /* Step left */
+    seq->i--;
+    key_tree= seq->stack[seq->i].key_tree;
+    /* Step down if we can */
+    if (key_tree->next && key_tree->next != &null_element)
+    {
+      // Step down; update the tuple
+      seq->i--;
+      step_down_to(seq, key_tree->next);
+      key_tree= key_tree->next;
+      break;
+    }
+  }
+  /*
+    Ok, we've stepped down from the path to previous tuple.
+    Walk right-up while we can
+  */
+  while (key_tree->next_key_part && key_tree->next_key_part != &null_element && 
+         key_tree->next_key_part->part == key_tree->part + 1 &&
+         key_tree->next_key_part->type == SEL_ARG::KEY_RANGE)
+  {
+    {
+      RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+      uint min_key_length= cur->min_key - seq->param->min_key;
+      uint max_key_length= cur->max_key - seq->param->max_key;
+      uint len= cur->min_key - cur[-1].min_key;
+      if (!(min_key_length == max_key_length &&
+            !memcmp(cur[-1].min_key, cur[-1].max_key, len) &&
+            !key_tree->min_flag && !key_tree->max_flag))
+      {
+        seq->param->is_ror_scan= FALSE;
+        if (!key_tree->min_flag)
+          cur->min_key_parts += 
+            key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno],
+                                                   &cur->min_key,
+                                                   &cur->min_key_flag);
+        if (!key_tree->max_flag)
+          cur->max_key_parts += 
+            key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno],
+                                                   &cur->max_key,
+                                                   &cur->max_key_flag);
+        break;
+      }
+    }
+    /*
+      Ok, current atomic interval is in form "t.field=const" and there is
+      next_key_part interval. Step right, and walk up from there.
+    */
+    key_tree= key_tree->next_key_part;
+    while (key_tree->prev && key_tree->prev != &null_element)
+    {
+      /* Step up */
+      key_tree= key_tree->prev;
+    }
+    step_down_to(seq, key_tree);
+  }
+  /* Ok got a tuple */
+  RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
+  uint min_key_length= cur->min_key - seq->param->min_key;
+  range->ptr= (char*)(int)(key_tree->part);
+  if (cur->min_key_flag & GEOM_FLAG)
+  {
+    range->range_flag= cur->min_key_flag;
+    /* Here minimum contains also function code bits, and maximum is +inf */
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= min_key_length;
+    range->start_key.flag=  (ha_rkey_function) (cur->min_key_flag ^ GEOM_FLAG);
+  }
+  else
+  {
+    range->range_flag= cur->min_key_flag | cur->max_key_flag;
+    range->start_key.key=    seq->param->min_key;
+    range->start_key.length= cur->min_key - seq->param->min_key;
+    range->start_key.keypart_map= make_prev_keypart_map(cur->min_key_parts);
+    range->start_key.flag= (cur->min_key_flag & NEAR_MIN ? HA_READ_AFTER_KEY : 
+                                                           HA_READ_KEY_EXACT);
+    range->end_key.key=    seq->param->max_key;
+    range->end_key.length= cur->max_key - seq->param->max_key;
+    range->end_key.flag= (cur->max_key_flag & NEAR_MAX ? HA_READ_BEFORE_KEY : 
+                                                         HA_READ_AFTER_KEY);
+    range->end_key.keypart_map= make_prev_keypart_map(cur->max_key_parts);
+    if (!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+        (uint)key_tree->part+1 == seq->param->table->key_info[seq->real_keyno].key_parts &&
+        (seq->param->table->key_info[seq->real_keyno].flags & (HA_NOSAME | HA_END_SPACE_KEY)) ==
+        HA_NOSAME &&
+        range->start_key.length == range->end_key.length &&
+        !memcmp(seq->param->min_key,seq->param->max_key,range->start_key.length))
+      range->range_flag= UNIQUE_RANGE | (cur->min_key_flag & NULL_RANGE);
+    if (seq->param->is_ror_scan)
+    {
+      /*
+        If we get here, the condition on the key was converted to form
+        "(keyXpart1 = c1) AND ... AND (keyXpart{key_tree->part - 1} = cN) AND
+          somecond(keyXpart{key_tree->part})"
+        Check if
+          somecond is "keyXpart{key_tree->part} = const" and
+          uncovered "tail" of KeyX parts is either empty or is identical to
+          first members of clustered primary key.
+      */
+      if (!(!(cur->min_key_flag & ~NULL_RANGE) && !cur->max_key_flag &&
+            (range->start_key.length == range->end_key.length) &&
+            !memcmp(range->start_key.key, range->end_key.key, range->start_key.length) &&
+            is_key_scan_ror(seq->param, seq->real_keyno, key_tree->part + 1)))
+        seq->param->is_ror_scan= FALSE;
+    }
+  }
+  seq->param->range_count++;
+  seq->param->max_key_part=max(seq->param->max_key_part,key_tree->part);
+  return 0;
+  MRR Range Sequence Interface implementation that walks array<QUICK_RANGE>
+ ****************************************************************************/
+  Range sequence interface implementation for array<QUICK_RANGE>: initialize
+    quick_range_seq_init()
+      init_param  Caller-opaque parameter: QUICK_RANGE_SELECT* pointer
+      n_ranges    Number of ranges in the sequence (ignored)
+      flags       MRR flags (currently not used) 
+    Opaque value to be passed to quick_range_seq_next
+range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
+  quick->qr_traversal_ctx.first=  (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.cur=    (QUICK_RANGE**)quick->ranges.buffer;
+  quick->qr_traversal_ctx.last=   quick->qr_traversal_ctx.cur + 
+                                  quick->ranges.elements;
+  return &quick->qr_traversal_ctx;
+  Range sequence interface implementation for array<QUICK_RANGE>: get next
+    quick_range_seq_next()
+      rseq        Value returned from quick_range_seq_init
+      range  OUT  Store information about the range here
+    0  Ok
+    1  No more ranges in the sequence
+uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+  if (ctx->cur == ctx->last)
+    return 1; /* no more ranges */
+  QUICK_RANGE *cur= *(ctx->cur);
+  key_range *start_key= &range->start_key;
+  key_range *end_key=   &range->end_key;
+  start_key->key=    cur->min_key;
+  start_key->length= cur->min_length;
+  start_key->keypart_map= cur->min_keypart_map;
+  start_key->flag=   ((cur->flag & NEAR_MIN) ? HA_READ_AFTER_KEY :
+                      (cur->flag & EQ_RANGE) ?
+                      HA_READ_KEY_EXACT : HA_READ_KEY_OR_NEXT);
+  end_key->key=      cur->max_key;
+  end_key->length=   cur->max_length;
+  end_key->keypart_map= cur->max_keypart_map;
+  /*
+    We use HA_READ_AFTER_KEY here because, when we are reading on a key
+    prefix, we want to find all keys with this prefix.
+  */
+  end_key->flag=     (cur->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
+                      HA_READ_AFTER_KEY);
+  range->range_flag= cur->flag;
+  ctx->cur++;
+  return 0;

=== modified file 'sql/set_var.cc'
--- a/sql/set_var.cc	2009-12-21 02:26:15 +0000
+++ b/sql/set_var.cc	2009-12-22 12:49:15 +0000
@@ -528,6 +528,8 @@ static sys_var_bool_ptr	        sys_user
 static sys_var_thd_ulong	sys_read_rnd_buff_size(&vars, "read_rnd_buffer_size",
+static sys_var_thd_ulong	sys_mrr_buff_size(&vars, "mrr_buffer_size",
+					          &SV::mrr_buff_size);
 static sys_var_thd_ulong	sys_div_precincrement(&vars, "div_precision_increment",
 static sys_var_long_ptr	sys_rpl_recovery_rank(&vars, "rpl_recovery_rank",

=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h	2009-12-21 02:26:15 +0000
+++ b/sql/sql_class.h	2009-12-22 12:49:15 +0000
@@ -340,6 +340,7 @@ struct system_variables
   ulong query_cache_type;
   ulong read_buff_size;
   ulong read_rnd_buff_size;
+  ulong mrr_buff_size;
   ulong div_precincrement;
   ulong sortbuff_size;
   ulong thread_handling;

=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc	2009-12-15 07:16:46 +0000
+++ b/storage/maria/ha_maria.cc	2009-12-22 12:33:21 +0000
@@ -2022,16 +2022,15 @@ int ha_maria::delete_row(const uchar * b
-my_bool index_cond_func_maria(void *arg)
+ICP_RESULT index_cond_func_maria(void *arg)
   ha_maria *h= (ha_maria*)arg;
-  /*if (h->in_range_read)*/
   if (h->end_range)
     if (h->compare_key2(h->end_range) > 0)
-      return 2; /* caller should return HA_ERR_END_OF_FILE already */
+      return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */
-  return (my_bool)h->pushed_idx_cond->val_int();
+  return h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;

=== modified file 'storage/maria/ha_maria.h'
--- a/storage/maria/ha_maria.h	2009-12-15 07:16:46 +0000
+++ b/storage/maria/ha_maria.h	2009-12-22 12:33:21 +0000
@@ -29,7 +29,7 @@
 #define HA_RECOVER_QUICK        8       /* Don't check rows in data file */
-my_bool index_cond_func_maria(void *arg);
+ICP_RESULT index_cond_func_maria(void *arg);
 extern ulong maria_sort_buffer_size;
@@ -187,5 +187,5 @@ public:
   Item *idx_cond_push(uint keyno, Item* idx_cond);
   DsMrr_impl ds_mrr;
-  friend my_bool index_cond_func_maria(void *arg);
+  friend ICP_RESULT index_cond_func_maria(void *arg);

=== modified file 'storage/maria/ma_key.c'
--- a/storage/maria/ma_key.c	2009-12-15 07:16:46 +0000
+++ b/storage/maria/ma_key.c	2009-12-22 12:33:21 +0000
@@ -669,10 +669,10 @@ int _ma_read_key_record(MARIA_HA *info, 
               will look for column values there)
-    -1  Error 
-    0   Index condition is not satisfied, continue scanning
-    1   Index condition is satisfied
-    2   Index condition is not satisfied, end the scan. 
+    ICP_ERROR         Error 
+    ICP_NO_MATCH      Index condition is not satisfied, continue scanning
+    ICP_MATCH         Index condition is satisfied
+    ICP_OUT_OF_RANGE  Index condition is not satisfied, end the scan. 
 int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record)

=== modified file 'storage/maria/maria_def.h'
--- a/storage/maria/maria_def.h	2009-12-15 07:16:46 +0000
+++ b/storage/maria/maria_def.h	2009-12-22 12:33:21 +0000
@@ -477,8 +477,7 @@ typedef struct st_maria_block_scan
   MARIA_RECORD_POS row_base_page;
-/*psergey-todo: do really need to have copies of this all over the place?*/
-typedef my_bool (*index_cond_func_t)(void *param);
+typedef ICP_RESULT (*index_cond_func_t)(void *param);
 struct st_maria_handler

=== modified file 'storage/myisam/mi_key.c'
--- a/storage/myisam/mi_key.c	2009-12-15 07:16:46 +0000
+++ b/storage/myisam/mi_key.c	2009-12-22 12:33:21 +0000
@@ -504,10 +504,10 @@ int _mi_read_key_record(MI_INFO *info, m
               will look for column values there)
-    -1  Error 
-    0   Index condition is not satisfied, continue scanning
-    1   Index condition is satisfied
-    2   Index condition is not satisfied, end the scan. 
+    ICP_ERROR         Error 
+    ICP_NO_MATCH      Index condition is not satisfied, continue scanning
+    ICP_MATCH         Index condition is satisfied
+    ICP_OUT_OF_RANGE  Index condition is not satisfied, end the scan. 
 int mi_check_index_cond(register MI_INFO *info, uint keynr, uchar *record)
@@ -516,7 +516,7 @@ int mi_check_index_cond(register MI_INFO
     mi_print_error(info->s, HA_ERR_CRASHED);
-    return -1;
+    return ICP_ERROR;
   return info->index_cond_func(info->index_cond_func_arg);

=== modified file 'storage/myisam/mi_rkey.c'
--- a/storage/myisam/mi_rkey.c	2009-12-15 07:16:46 +0000
+++ b/storage/myisam/mi_rkey.c	2009-12-22 12:33:21 +0000
@@ -29,7 +29,7 @@ int mi_rkey(MI_INFO *info, uchar *buf, i
   MI_KEYDEF *keyinfo;
   HA_KEYSEG *last_used_keyseg;
   uint pack_key_length, use_key_length, nextflag;
-  int res= 0;
   DBUG_PRINT("enter", ("base: 0x%lx  buf: 0x%lx  inx: %d  search_flag: %d",
                        (long) info, (long) buf, inx, search_flag));
@@ -118,7 +118,7 @@ int mi_rkey(MI_INFO *info, uchar *buf, i
               (search_flag != HA_READ_KEY_EXACT ||
               last_used_keyseg != keyinfo->seg + keyinfo->keysegs)) ||
              (info->index_cond_func && 
-              !(res= mi_check_index_cond(info, inx, buf))))
+              (res= mi_check_index_cond(info, inx, buf)) == ICP_NO_MATCH))
         uint not_used[2];
@@ -146,7 +146,7 @@ int mi_rkey(MI_INFO *info, uchar *buf, i
-      if (res == 2)
+      if (res == ICP_OUT_OF_RANGE)
         info->lastpos= HA_OFFSET_ERROR;
         if (share->concurrent_insert)

=== modified file 'storage/myisam/mi_rnext.c'
--- a/storage/myisam/mi_rnext.c	2009-12-15 07:16:46 +0000
+++ b/storage/myisam/mi_rnext.c	2009-12-22 12:33:21 +0000
@@ -28,7 +28,7 @@ int mi_rnext(MI_INFO *info, uchar *buf, 
   int error,changed;
   uint flag;
-  int res= 0;
+  ICP_RESULT res= 0;
   if ((inx = _mi_check_index(info,inx)) < 0)
@@ -87,7 +87,7 @@ int mi_rnext(MI_INFO *info, uchar *buf, 
     while ((info->s->concurrent_insert &&
             info->lastpos >= info->state->data_file_length) ||
            (info->index_cond_func &&
-           !(res= mi_check_index_cond(info, inx, buf))))
+           (res= mi_check_index_cond(info, inx, buf)) == ICP_NO_MATCH))
          Skip rows that are either inserted by other threads since
@@ -100,7 +100,7 @@ int mi_rnext(MI_INFO *info, uchar *buf, 
-    if (!error && res == 2)
+    if (!error && res == ICP_OUT_OF_RANGE)
       if (info->s->concurrent_insert)

=== modified file 'storage/myisam/mi_rnext_same.c'
--- a/storage/myisam/mi_rnext_same.c	2009-12-15 07:16:46 +0000
+++ b/storage/myisam/mi_rnext_same.c	2009-12-22 12:33:21 +0000
@@ -75,9 +75,13 @@ int mi_rnext_same(MI_INFO *info, uchar *
           info->lastpos= HA_OFFSET_ERROR;
-        /* Skip rows that are inserted by other threads since we got a lock */
+        /* 
+          Skip 
+           - rows that are inserted by other threads since we got a lock 
+           - rows that don't match index condition */
         if (info->lastpos < info->state->data_file_length && 
-            (!info->index_cond_func || mi_check_index_cond(info, inx, buf)))
+            (!info->index_cond_func || 
+              mi_check_index_cond(info, inx, buf) != ICP_NO_MATCH))

=== modified file 'storage/xtradb/handler/ha_innodb.cc'
--- a/storage/xtradb/handler/ha_innodb.cc	2009-12-15 07:16:46 +0000
+++ b/storage/xtradb/handler/ha_innodb.cc	2009-12-22 12:33:21 +0000
@@ -114,7 +114,7 @@ static pthread_mutex_t commit_cond_m;
 static bool innodb_inited = 0;
-static uint index_cond_func_innodb(void *arg);
+static int index_cond_func_innodb(void *arg);
@@ -10765,24 +10765,12 @@ ha_rows ha_innobase::multi_range_read_in
   /* See comments in ha_myisam::multi_range_read_info_const */
   ds_mrr.init(this, table);
-  //psergey-mrr-fix:
   if (prebuilt->select_lock_type != LOCK_NONE)
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
-  uint orig_flags= *flags;
   ha_rows res= ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges,
                                        bufsz, flags, cost);
-  bool disable_ds_mrr= true;
-  disable_ds_mrr= false;
-//  DBUG_EXECUTE_IF("optimizer_innodb_ds_mrr", disable_ds_mrr= false;);
-  if (!disable_ds_mrr)
-    return res;
-  /* Disable DS-MRR: enable MS-MRR only after critical bugs are fixed */
-  *bufsz= 0;
-  *flags = orig_flags | HA_MRR_USE_DEFAULT_IMPL;
   return res;
@@ -10791,17 +10779,7 @@ ha_rows ha_innobase::multi_range_read_in
                                            uint *flags, COST_VECT *cost)
   ds_mrr.init(this, table);
-  uint orig_flags= *flags;
   ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
-  bool disable_ds_mrr= false;
- // DBUG_EXECUTE_IF("optimizer_innodb_ds_mrr", disable_ds_mrr= false;);
-  if (!disable_ds_mrr)
-    return res;
-  /* Disable DS-MRR: enable MS-MRR only after critical bugs are fixed */
-  *bufsz= 0;
-  *flags = orig_flags | HA_MRR_USE_DEFAULT_IMPL;
   return res;
@@ -10818,15 +10796,15 @@ C_MODE_START
   See note on ICP_RESULT for return values description.
-static uint index_cond_func_innodb(void *arg)
+static int index_cond_func_innodb(void *arg)
   ha_innobase *h= (ha_innobase*)arg;
   if (h->end_range)
     if (h->compare_key2(h->end_range) > 0)
-      return 2; /* caller should return HA_ERR_END_OF_FILE already */
+      return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */
-  return test(h->pushed_idx_cond->val_int());
+  return h->pushed_idx_cond->val_int()? ICP_MATCH : ICP_NO_MATCH;
@@ -10834,8 +10812,7 @@ C_MODE_END
 Item *ha_innobase::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
-  //                              V :psergey-mrrr-merge: V
-  if (keyno_arg != primary_key && (prebuilt->select_lock_type == LOCK_NONE))
+  if ((keyno_arg != primary_key) && (prebuilt->select_lock_type == LOCK_NONE))
     pushed_idx_cond_keyno= keyno_arg;
     pushed_idx_cond= idx_cond_arg;

=== modified file 'storage/xtradb/include/row0mysql.h'
--- a/storage/xtradb/include/row0mysql.h	2009-12-15 07:16:46 +0000
+++ b/storage/xtradb/include/row0mysql.h	2009-12-22 12:33:21 +0000
@@ -564,7 +564,7 @@ struct mysql_row_templ_struct {
 #define ROW_PREBUILT_ALLOCATED	78540783
 #define ROW_PREBUILT_FREED	26423527
-typedef uint (*index_cond_func_t)(void *param);
+typedef int (*index_cond_func_t)(void *param);
 /* A struct for (sometimes lazily) prebuilt structures in an Innobase table
 handle used within MySQL; these are used to save CPU time. */

=== modified file 'storage/xtradb/row/row0sel.c'
--- a/storage/xtradb/row/row0sel.c	2009-12-15 07:16:46 +0000
+++ b/storage/xtradb/row/row0sel.c	2009-12-22 12:33:21 +0000
@@ -3116,10 +3116,14 @@ row_sel_pop_cached_row_for_mysql(
 			/* Copy NULL bit of the current field from cached_rec
 			to buf */
 			if (templ->mysql_null_bit_mask) {
-				buf[templ->mysql_null_byte_offset]
+				/*buf[templ->mysql_null_byte_offset]
 					^= (buf[templ->mysql_null_byte_offset]
 					    ^ cached_rec[templ->mysql_null_byte_offset])
-					& (byte)templ->mysql_null_bit_mask;
+					& (byte)templ->mysql_null_bit_mask;*/
+                                byte *null_byte= buf + templ->mysql_null_byte_offset;
+                                (*null_byte)&= ~templ->mysql_null_bit_mask;
+                                (*null_byte)|= cached_rec[templ->mysql_null_byte_offset] & 
+                                               templ->mysql_null_bit_mask;
@@ -3354,10 +3358,8 @@ row_search_for_mysql(
 	mem_heap_t*	heap				= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets				= offsets_;
-        /*psergey-mrr:*/
 	ibool		some_fields_in_buffer;
 	ibool		get_clust_rec			= 0;
-        /*:psergey-mrr*/
@@ -4210,11 +4212,8 @@ no_gap_lock:
 			information via the clustered index record. */
 			ut_ad(index != clust_index);
-			/*psergey-mrr:*/
                         get_clust_rec = TRUE;
 			goto idx_cond_check;
-			/**goto requires_clust_rec;**/
-			/*:psergey-mrr*/
@@ -4260,22 +4259,20 @@ no_gap_lock:
-        if (prebuilt->idx_cond_func)
-        {
-          int res;
-          ut_ad(prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE);
-          offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
-          row_sel_store_mysql_rec(buf, prebuilt, rec,
-                                  offsets, 0, prebuilt->n_index_fields);
-          res= prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
-          if (res == 0)
-            goto next_rec;
-          if (res == 2)
-          {
-            err = DB_RECORD_NOT_FOUND;
-            goto idx_cond_failed;
-          }
-        }
+	if (prebuilt->idx_cond_func) {
+		int res;
+		ut_ad(prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE);
+		offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+		row_sel_store_mysql_rec(buf, prebuilt, rec,
+		                        offsets, 0, prebuilt->n_index_fields);
+		res= prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
+		if (res == 0)
+			goto next_rec;
+		if (res == 2) {
+			goto idx_cond_failed;
+		}
+	}
 	/* Get the clustered index record if needed, if we did not do the
 	search using the clustered index. */