← Back to team overview

maria-developers team mailing list archive

bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (knielsen:2722)

 

#At lp:maria

 2722 knielsen@xxxxxxxxxxxxxxx	2009-08-31 [merge]
      Merge Paul's maria-pbxt-rc2 branch into MariaDB.
      
      Includes disabling PBMS (per Paul's recommendation) and test file fixes.
      
      Signed-off-by: Kristian Nielsen <knielsen@xxxxxxxxxxxxxxx>
      removed:
        storage/pbxt/src/streaming_xt.cc
        storage/pbxt/src/streaming_xt.h
      added:
        mysql-test/suite/pbxt/t/lowercase_table_grant-master.opt
        mysql-test/suite/pbxt/t/lowercase_table_qcache-master.opt
        mysql-test/suite/pbxt/t/lowercase_view-master.opt
        mysql-test/suite/pbxt/t/udf-master.opt
        storage/pbxt/src/pbms_enabled.cc
        storage/pbxt/src/pbms_enabled.h
      modified:
        mysql-test/suite/pbxt/r/alter_table.result
        mysql-test/suite/pbxt/r/analyze.result
        mysql-test/suite/pbxt/r/auto_increment.result
        mysql-test/suite/pbxt/r/delete.result
        mysql-test/suite/pbxt/r/distinct.result
        mysql-test/suite/pbxt/r/func_group.result
        mysql-test/suite/pbxt/r/func_math.result
        mysql-test/suite/pbxt/r/func_str.result
        mysql-test/suite/pbxt/r/grant.result
        mysql-test/suite/pbxt/r/group_min_max.result
        mysql-test/suite/pbxt/r/join.result
        mysql-test/suite/pbxt/r/join_nested.result
        mysql-test/suite/pbxt/r/key.result
        mysql-test/suite/pbxt/r/key_cache.result
        mysql-test/suite/pbxt/r/key_diff.result
        mysql-test/suite/pbxt/r/lowercase_view.result
        mysql-test/suite/pbxt/r/mysqlshow.result
        mysql-test/suite/pbxt/r/null.result
        mysql-test/suite/pbxt/r/null_key.result
        mysql-test/suite/pbxt/r/partition_pruning.result
        mysql-test/suite/pbxt/r/pbxt_bugs.result
        mysql-test/suite/pbxt/r/pbxt_ref_int.result
        mysql-test/suite/pbxt/r/preload.result
        mysql-test/suite/pbxt/r/ps_1general.result
        mysql-test/suite/pbxt/r/range.result
        mysql-test/suite/pbxt/r/schema.result
        mysql-test/suite/pbxt/r/select.result
        mysql-test/suite/pbxt/r/select_safe.result
        mysql-test/suite/pbxt/r/subselect.result
        mysql-test/suite/pbxt/r/type_enum.result
        mysql-test/suite/pbxt/r/type_ranges.result
        mysql-test/suite/pbxt/r/type_timestamp.result
        mysql-test/suite/pbxt/r/union.result
        mysql-test/suite/pbxt/r/view_grant.result
        mysql-test/suite/pbxt/t/auto_increment.test
        mysql-test/suite/pbxt/t/delete.test
        mysql-test/suite/pbxt/t/join_nested.test
        mysql-test/suite/pbxt/t/null.test
        mysql-test/suite/pbxt/t/pbxt_bugs.test
        mysql-test/suite/pbxt/t/rename.test
        mysql-test/suite/pbxt/t/schema.test
        mysql-test/suite/pbxt/t/type_enum.test
        mysql-test/suite/pbxt/t/union.test
        storage/pbxt/ChangeLog
        storage/pbxt/src/Makefile.am
        storage/pbxt/src/cache_xt.cc
        storage/pbxt/src/cache_xt.h
        storage/pbxt/src/ccutils_xt.cc
        storage/pbxt/src/database_xt.cc
        storage/pbxt/src/datadic_xt.cc
        storage/pbxt/src/datadic_xt.h
        storage/pbxt/src/datalog_xt.cc
        storage/pbxt/src/datalog_xt.h
        storage/pbxt/src/discover_xt.cc
        storage/pbxt/src/filesys_xt.cc
        storage/pbxt/src/filesys_xt.h
        storage/pbxt/src/ha_pbxt.cc
        storage/pbxt/src/ha_pbxt.h
        storage/pbxt/src/ha_xtsys.cc
        storage/pbxt/src/ha_xtsys.h
        storage/pbxt/src/hashtab_xt.cc
        storage/pbxt/src/heap_xt.cc
        storage/pbxt/src/heap_xt.h
        storage/pbxt/src/index_xt.cc
        storage/pbxt/src/index_xt.h
        storage/pbxt/src/lock_xt.cc
        storage/pbxt/src/lock_xt.h
        storage/pbxt/src/locklist_xt.cc
        storage/pbxt/src/locklist_xt.h
        storage/pbxt/src/memory_xt.cc
        storage/pbxt/src/memory_xt.h
        storage/pbxt/src/myxt_xt.cc
        storage/pbxt/src/myxt_xt.h
        storage/pbxt/src/pbms.h
        storage/pbxt/src/pthread_xt.cc
        storage/pbxt/src/restart_xt.cc
        storage/pbxt/src/restart_xt.h
        storage/pbxt/src/sortedlist_xt.cc
        storage/pbxt/src/strutil_xt.cc
        storage/pbxt/src/systab_xt.cc
        storage/pbxt/src/systab_xt.h
        storage/pbxt/src/tabcache_xt.cc
        storage/pbxt/src/tabcache_xt.h
        storage/pbxt/src/table_xt.cc
        storage/pbxt/src/table_xt.h
        storage/pbxt/src/thread_xt.cc
        storage/pbxt/src/thread_xt.h
        storage/pbxt/src/trace_xt.cc
        storage/pbxt/src/trace_xt.h
        storage/pbxt/src/util_xt.cc
        storage/pbxt/src/xaction_xt.cc
        storage/pbxt/src/xaction_xt.h
        storage/pbxt/src/xactlog_xt.cc
        storage/pbxt/src/xactlog_xt.h
        storage/pbxt/src/xt_config.h
        storage/pbxt/src/xt_defs.h

=== modified file 'mysql-test/suite/pbxt/r/alter_table.result'
--- a/mysql-test/suite/pbxt/r/alter_table.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/alter_table.result	2009-08-17 11:12:36 +0000
@@ -126,23 +126,23 @@ key (n4, n1, n2, n3) );
 alter table t1 disable keys;
 show keys from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	n1	1	n1	NULL	0	NULL	NULL		BTREE	
-t1	1	n1_2	1	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n1_2	2	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n1_2	3	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n1_2	4	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	1	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	2	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	3	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	4	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n3	1	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n3	2	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n3	3	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n3	4	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	1	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	2	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n4	3	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	4	n3	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	0	n1	1	n1	A	0	NULL	NULL		BTREE	
+t1	1	n1_2	1	n1	A	0	NULL	NULL		BTREE	
+t1	1	n1_2	2	n2	A	0	NULL	NULL	YES	BTREE	
+t1	1	n1_2	3	n3	A	0	NULL	NULL	YES	BTREE	
+t1	1	n1_2	4	n4	A	0	NULL	NULL	YES	BTREE	
+t1	1	n2	1	n2	A	0	NULL	NULL	YES	BTREE	
+t1	1	n2	2	n3	A	0	NULL	NULL	YES	BTREE	
+t1	1	n2	3	n4	A	0	NULL	NULL	YES	BTREE	
+t1	1	n2	4	n1	A	0	NULL	NULL		BTREE	
+t1	1	n3	1	n3	A	0	NULL	NULL	YES	BTREE	
+t1	1	n3	2	n4	A	0	NULL	NULL	YES	BTREE	
+t1	1	n3	3	n1	A	0	NULL	NULL		BTREE	
+t1	1	n3	4	n2	A	0	NULL	NULL	YES	BTREE	
+t1	1	n4	1	n4	A	0	NULL	NULL	YES	BTREE	
+t1	1	n4	2	n1	A	0	NULL	NULL		BTREE	
+t1	1	n4	3	n2	A	0	NULL	NULL	YES	BTREE	
+t1	1	n4	4	n3	A	0	NULL	NULL	YES	BTREE	
 insert into t1 values(10,RAND()*1000,RAND()*1000,RAND());
 insert into t1 values(9,RAND()*1000,RAND()*1000,RAND());
 insert into t1 values(8,RAND()*1000,RAND()*1000,RAND());
@@ -156,23 +156,23 @@ insert into t1 values(1,RAND()*1000,RAND
 alter table t1 enable keys;
 show keys from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	n1	1	n1	NULL	10	NULL	NULL		BTREE	
-t1	1	n1_2	1	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n1_2	2	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n1_2	3	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n1_2	4	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	1	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	2	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	3	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n2	4	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n3	1	n3	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n3	2	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n3	3	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n3	4	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	1	n4	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	2	n1	NULL	NULL	NULL	NULL		BTREE	
-t1	1	n4	3	n2	NULL	NULL	NULL	NULL	YES	BTREE	
-t1	1	n4	4	n3	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	0	n1	1	n1	A	10	NULL	NULL		BTREE	
+t1	1	n1_2	1	n1	A	10	NULL	NULL		BTREE	
+t1	1	n1_2	2	n2	A	10	NULL	NULL	YES	BTREE	
+t1	1	n1_2	3	n3	A	10	NULL	NULL	YES	BTREE	
+t1	1	n1_2	4	n4	A	10	NULL	NULL	YES	BTREE	
+t1	1	n2	1	n2	A	10	NULL	NULL	YES	BTREE	
+t1	1	n2	2	n3	A	10	NULL	NULL	YES	BTREE	
+t1	1	n2	3	n4	A	10	NULL	NULL	YES	BTREE	
+t1	1	n2	4	n1	A	10	NULL	NULL		BTREE	
+t1	1	n3	1	n3	A	10	NULL	NULL	YES	BTREE	
+t1	1	n3	2	n4	A	10	NULL	NULL	YES	BTREE	
+t1	1	n3	3	n1	A	10	NULL	NULL		BTREE	
+t1	1	n3	4	n2	A	10	NULL	NULL	YES	BTREE	
+t1	1	n4	1	n4	A	10	NULL	NULL	YES	BTREE	
+t1	1	n4	2	n1	A	10	NULL	NULL		BTREE	
+t1	1	n4	3	n2	A	10	NULL	NULL	YES	BTREE	
+t1	1	n4	4	n3	A	10	NULL	NULL	YES	BTREE	
 drop table t1;
 create table t1 (i int unsigned not null auto_increment primary key);
 alter table t1 rename t2;
@@ -286,17 +286,17 @@ insert into t1 values(1,1), (2,1), (3, 1
 alter table t1 add unique (a,b), add key (b);
 show keys from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
-t1	0	a	2	b	A	NULL	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	0	a	1	a	A	300	NULL	NULL	YES	BTREE	
+t1	0	a	2	b	A	300	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	300	NULL	NULL	YES	BTREE	
 analyze table t1;
 Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	OK
 show keys from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
-t1	0	a	2	b	A	NULL	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	0	a	1	a	A	300	NULL	NULL	YES	BTREE	
+t1	0	a	2	b	A	300	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	300	NULL	NULL	YES	BTREE	
 drop table t1;
 CREATE TABLE t1 (i int(10), index(i) );
 ALTER TABLE t1 DISABLE KEYS;
@@ -545,37 +545,37 @@ drop table if exists t1;
 create table t1 (a int, key(a));
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 "this used not to disable the index"
 alter table t1 modify a int, disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 enable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 modify a bigint, disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 enable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 add b char(10), disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 add c decimal(10,2), enable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 "this however did"
 alter table t1 disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 desc t1;
 Field	Type	Null	Key	Default	Extra
 a	bigint(20)	YES	MUL	NULL	
@@ -585,7 +585,7 @@ alter table t1 add d decimal(15,5);
 "The key should still be disabled"
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 drop table t1;
 "Now will test with one unique index"
 create table t1(a int, b char(10), unique(a));
@@ -595,7 +595,7 @@ t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	a	1	a	NULL	0	NULL	NULL	YES	BTREE	
+t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
 alter table t1 enable keys;
 "If no copy on noop change, this won't touch the data file"
 "Unique index, no change"
@@ -623,12 +623,12 @@ create table t1(a int, b char(10), uniqu
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 alter table t1 disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	a	1	a	NULL	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 alter table t1 enable keys;
 "If no copy on noop change, this won't touch the data file"
 "The non-unique index will be disabled"
@@ -636,31 +636,31 @@ alter table t1 modify a int, disable key
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 alter table t1 enable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	0	a	1	a	NULL	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	NULL	NULL	NULL	NULL	YES	BTREE	
+t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 "Change the type implying data copy"
 "The non-unique index will be disabled"
 alter table t1 modify a bigint, disable keys;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 "Change again the type, but leave the indexes as_is"
 alter table t1 modify a int;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 "Try the same. When data is no copied on similar tables, this is noop"
 alter table t1 modify a int;
 show indexes from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	a	1	a	A	0	NULL	NULL	YES	BTREE	
-t1	1	b	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	b	1	b	A	0	NULL	NULL	YES	BTREE	
 drop table t1;
 create database mysqltest;
 create table t1 (c1 int);
@@ -697,11 +697,11 @@ DROP TABLE IF EXISTS bug24219_2;
 CREATE TABLE bug24219 (a INT, INDEX(a));
 SHOW INDEX FROM bug24219;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-bug24219	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+bug24219	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 ALTER TABLE bug24219 RENAME TO bug24219_2, DISABLE KEYS;
 SHOW INDEX FROM bug24219_2;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-bug24219_2	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+bug24219_2	1	a	1	a	A	0	NULL	NULL	YES	BTREE	
 DROP TABLE bug24219_2;
 create table t1 (mycol int(10) not null);
 alter table t1 alter column mycol set default 0;
@@ -882,7 +882,7 @@ int_field	int(10) unsigned	NO	MUL	NULL	
 char_field	char(10)	YES		NULL	
 SHOW INDEXES FROM t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	int_field	1	int_field	A	NULL	NULL	NULL		BTREE	
+t1	1	int_field	1	int_field	A	0	NULL	NULL		BTREE	
 INSERT INTO t1 VALUES (1, "edno"), (1, "edno"), (2, "dve"), (3, "tri"), (5, "pet");
 "Non-copy data change - new frm, but old data and index files"
 ALTER TABLE t1

=== modified file 'mysql-test/suite/pbxt/r/analyze.result'
--- a/mysql-test/suite/pbxt/r/analyze.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/analyze.result	2009-08-17 15:57:58 +0000
@@ -56,5 +56,5 @@ Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	OK
 show index from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
-t1	1	a	1	a	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	a	1	a	A	5	NULL	NULL	YES	BTREE	
 drop table t1;

=== modified file 'mysql-test/suite/pbxt/r/auto_increment.result'
--- a/mysql-test/suite/pbxt/r/auto_increment.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/auto_increment.result	2009-08-17 15:57:58 +0000
@@ -229,7 +229,8 @@ a	b
 204	7
 delete from t1 where a=0;
 update t1 set a=NULL where b=6;
-ERROR 23000: Column 'a' cannot be null
+Warnings:
+Warning	1048	Column 'a' cannot be null
 update t1 set a=300 where b=7;
 SET SQL_MODE='';
 insert into t1(a,b)values(NULL,8);
@@ -244,7 +245,7 @@ a	b
 1	1
 200	2
 201	4
-203	6
+0	6
 300	7
 301	8
 400	9
@@ -260,7 +261,6 @@ a	b
 1	1
 200	2
 201	4
-203	6
 300	7
 301	8
 400	9
@@ -271,20 +271,20 @@ a	b
 405	14
 delete from t1 where a=0;
 update t1 set a=NULL where b=13;
-ERROR 23000: Column 'a' cannot be null
+Warnings:
+Warning	1048	Column 'a' cannot be null
 update t1 set a=500 where b=14;
 select * from t1 order by b;
 a	b
 1	1
 200	2
 201	4
-203	6
 300	7
 301	8
 400	9
 401	10
 402	11
-404	13
+0	13
 500	14
 drop table t1;
 create table t1 (a bigint);

=== modified file 'mysql-test/suite/pbxt/r/delete.result'
--- a/mysql-test/suite/pbxt/r/delete.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/delete.result	2009-08-17 15:57:58 +0000
@@ -125,18 +125,19 @@ a	b
 0	11
 2	12
 delete ignore t11.*, t12.* from t11,t12 where t11.a = t12.a and t11.b <> (select b from t2 where t11.a < t2.a);
-Warnings:
-Error	1242	Subquery returns more than 1 row
-Error	1242	Subquery returns more than 1 row
+ERROR 21000: Subquery returns more than 1 row
 select * from t11;
 a	b
 0	10
 1	11
+2	12
 select * from t12;
 a	b
 33	10
 0	11
+2	12
 insert into t11 values (2, 12);
+ERROR 23000: Duplicate entry '2' for key 'PRIMARY'
 delete from t11 where t11.b <> (select b from t2 where t11.a < t2.a);
 ERROR 21000: Subquery returns more than 1 row
 select * from t11;
@@ -145,13 +146,12 @@ a	b
 1	11
 2	12
 delete ignore from t11 where t11.b <> (select b from t2 where t11.a < t2.a);
-Warnings:
-Error	1242	Subquery returns more than 1 row
-Error	1242	Subquery returns more than 1 row
+ERROR 21000: Subquery returns more than 1 row
 select * from t11;
 a	b
 0	10
 1	11
+2	12
 drop table t11, t12, t2;
 create table t1 (a int, b int, unique key (a), key (b));
 insert into t1 values (3, 3), (7, 7);

=== modified file 'mysql-test/suite/pbxt/r/distinct.result'
--- a/mysql-test/suite/pbxt/r/distinct.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/distinct.result	2009-08-17 15:57:58 +0000
@@ -174,8 +174,8 @@ INSERT INTO t3 VALUES (1,'1'),(2,'2'),(1
 explain SELECT distinct t3.a FROM t3,t2,t1 WHERE t3.a=t1.b AND t1.a=t2.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	PRIMARY	NULL	NULL	NULL	4	Using temporary
-1	SIMPLE	t3	ref	a	a	5	test.t1.b	2	Using where; Using index
-1	SIMPLE	t2	index	a	a	4	NULL	5	Using where; Using index; Distinct; Using join buffer
+1	SIMPLE	t2	ref	a	a	4	test.t1.a	1	Using index
+1	SIMPLE	t3	ref	a	a	5	test.t1.b	1	Using where; Using index
 SELECT distinct t3.a FROM t3,t2,t1 WHERE t3.a=t1.b AND t1.a=t2.a;
 a
 1
@@ -190,7 +190,7 @@ insert into t3 select * from t4;
 explain select distinct t1.a from t1,t3 where t1.a=t3.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	PRIMARY	PRIMARY	4	NULL	4	Using index; Using temporary
-1	SIMPLE	t3	ref	a	a	5	test.t1.a	11	Using where; Using index; Distinct
+1	SIMPLE	t3	ref	a	a	5	test.t1.a	1	Using where; Using index; Distinct
 select distinct t1.a from t1,t3 where t1.a=t3.a;
 a
 1
@@ -212,7 +212,7 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t1	index	NULL	PRIMARY	4	NULL	1	Using index
 explain SELECT distinct a from t3 order by a desc limit 2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t3	index	NULL	a	5	NULL	40	Using index
+1	SIMPLE	t3	index	NULL	a	5	NULL	2	Using index
 explain SELECT distinct a,b from t3 order by a+1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	204	Using temporary; Using filesort

=== modified file 'mysql-test/suite/pbxt/r/func_group.result'
--- a/mysql-test/suite/pbxt/r/func_group.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/func_group.result	2009-08-17 15:57:58 +0000
@@ -61,7 +61,7 @@ grp	sum
 NULL	NULL
 1	7
 2	20.25
-3	45.483163247594
+3	45.4831632475944
 create table t2 (grp int, a bigint unsigned, c char(10));
 insert into t2 select grp,max(a)+max(grp),max(c) from t1 group by grp;
 replace into t2 select grp, a, c from t1 limit 2,1;
@@ -613,8 +613,8 @@ id	select_type	table	type	possible_keys	
 explain
 select max(t1.a3), min(t2.a2) from t1, t2 where t1.a2 = 2 and t1.a3 < 'MIN' and t2.a3 > 'CA';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	k1	k1	3	NULL	1	Using where; Using index
-1	SIMPLE	t1	range	k1	k1	7	NULL	1	Using where; Using index; Using join buffer
+1	SIMPLE	t1	range	k1	k1	7	NULL	1	Using where; Using index
+1	SIMPLE	t2	range	k1	k1	3	NULL	1	Using where; Using index; Using join buffer
 explain
 select min(a4 - 0.01) from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -1186,7 +1186,7 @@ std(s1/s2)
 0.21325764
 select std(o1/o2) from bug22555;
 std(o1/o2)
-0.21325763586649
+0.213257635866493
 select std(e1/e2) from bug22555;
 std(e1/e2)
 0.21325764
@@ -1212,7 +1212,7 @@ round(std(s1/s2), 17)
 0.21325763586649341
 select std(o1/o2) from bug22555;
 std(o1/o2)
-0.21325763586649
+0.213257635866493
 select round(std(e1/e2), 17) from bug22555;
 round(std(e1/e2), 17)
 0.21325763586649341
@@ -1237,7 +1237,7 @@ round(std(s1/s2), 17)
 0.21325763586649341
 select std(o1/o2) from bug22555;
 std(o1/o2)
-0.21325763586649
+0.213257635866493
 select round(std(e1/e2), 17) from bug22555;
 round(std(e1/e2), 17)
 0.21325763586649341

=== modified file 'mysql-test/suite/pbxt/r/func_math.result'
--- a/mysql-test/suite/pbxt/r/func_math.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/func_math.result	2009-08-17 15:57:58 +0000
@@ -60,7 +60,7 @@ Warnings:
 Note	1003	select ln(exp(10)) AS `ln(exp(10))`,exp((ln(sqrt(10)) * 2)) AS `exp(ln(sqrt(10))*2)`,ln(-(1)) AS `ln(-1)`,ln(0) AS `ln(0)`,ln(NULL) AS `ln(NULL)`
 select log2(8),log2(15),log2(-2),log2(0),log2(NULL);
 log2(8)	log2(15)	log2(-2)	log2(0)	log2(NULL)
-3	3.9068905956085	NULL	NULL	NULL
+3	3.90689059560852	NULL	NULL	NULL
 explain extended select log2(8),log2(15),log2(-2),log2(0),log2(NULL);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	No tables used
@@ -68,7 +68,7 @@ Warnings:
 Note	1003	select log2(8) AS `log2(8)`,log2(15) AS `log2(15)`,log2(-(2)) AS `log2(-2)`,log2(0) AS `log2(0)`,log2(NULL) AS `log2(NULL)`
 select log10(100),log10(18),log10(-4),log10(0),log10(NULL);
 log10(100)	log10(18)	log10(-4)	log10(0)	log10(NULL)
-2	1.2552725051033	NULL	NULL	NULL
+2	1.25527250510331	NULL	NULL	NULL
 explain extended select log10(100),log10(18),log10(-4),log10(0),log10(NULL);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	No tables used
@@ -85,7 +85,7 @@ Note	1003	select pow(10,log10(10)) AS `p
 set @@rand_seed1=10000000,@@rand_seed2=1000000;
 select rand(999999),rand();
 rand(999999)	rand()
-0.014231365187309	0.028870999839968
+0.0142313651873091	0.028870999839968
 explain extended select rand(999999),rand();
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	No tables used
@@ -101,7 +101,7 @@ Warnings:
 Note	1003	select pi() AS `pi()`,format(sin((pi() / 2)),6) AS `format(sin(pi()/2),6)`,format(cos((pi() / 2)),6) AS `format(cos(pi()/2),6)`,format(abs(tan(pi())),6) AS `format(abs(tan(pi())),6)`,format((1 / tan(1)),6) AS `format(cot(1),6)`,format(asin(1),6) AS `format(asin(1),6)`,format(acos(0),6) AS `format(acos(0),6)`,format(atan(1),6) AS `format(atan(1),6)`
 select degrees(pi()),radians(360);
 degrees(pi())	radians(360)
-180	6.2831853071796
+180	6.28318530717959
 select format(atan(-2, 2), 6);
 format(atan(-2, 2), 6)
 -0.785398

=== modified file 'mysql-test/suite/pbxt/r/func_str.result'
--- a/mysql-test/suite/pbxt/r/func_str.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/func_str.result	2009-08-17 15:57:58 +0000
@@ -1327,10 +1327,10 @@ cast(rtrim(ltrim('  20.06 ')) as decimal
 20.06
 select conv("18383815659218730760",10,10) + 0;
 conv("18383815659218730760",10,10) + 0
-1.8383815659219e+19
+1.83838156592187e+19
 select "18383815659218730760" + 0;
 "18383815659218730760" + 0
-1.8383815659219e+19
+1.83838156592187e+19
 CREATE TABLE t1 (code varchar(10));
 INSERT INTO t1 VALUES ('a12'), ('A12'), ('a13');
 SELECT ASCII(code), code FROM t1 WHERE code='A12';

=== modified file 'mysql-test/suite/pbxt/r/grant.result'
--- a/mysql-test/suite/pbxt/r/grant.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/grant.result	2009-08-17 15:57:58 +0000
@@ -457,7 +457,7 @@ Privilege	Context	Comment
 Alter	Tables	To alter the table
 Alter routine	Functions,Procedures	To alter or drop stored functions/procedures
 Create	Databases,Tables,Indexes	To create new databases and tables
-Create routine	Functions,Procedures	To use CREATE FUNCTION/PROCEDURE
+Create routine	Databases	To use CREATE FUNCTION/PROCEDURE
 Create temporary tables	Databases	To use CREATE TEMPORARY TABLE
 Create view	Tables	To create new views
 Create user	Server Admin	To create new users

=== modified file 'mysql-test/suite/pbxt/r/group_min_max.result'
--- a/mysql-test/suite/pbxt/r/group_min_max.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/group_min_max.result	2009-08-17 15:57:58 +0000
@@ -133,34 +133,34 @@ Table	Op	Msg_type	Msg_text
 test.t3	analyze	status	OK
 explain select a1, min(a2) from t1 group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using index for group-by
 explain select a1, max(a2) from t1 group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	65	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	65	NULL	129	Using index for group-by
 explain select a1, min(a2), max(a2) from t1 group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using index for group-by
 explain select a1, a2, b, min(c), max(c) from t1 group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 explain select a1,a2,b,max(c),min(c) from t1 group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 explain select a1,a2,b,max(c),min(c) from t2 group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	idx_t2_1	#	NULL	#	Using index for group-by
 explain select min(a2), a1, max(a2), min(a2), a1 from t1 group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using index for group-by
 explain select a1, b, min(c), a1, max(c), b, a2, max(c), max(c) from t1 group by a1, a2, b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 explain select min(a2) from t1 group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using index for group-by
 explain select a2, min(c), max(c) from t1 group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 select a1, min(a2) from t1 group by a1;
 a1	min(a2)
 a	a
@@ -293,13 +293,13 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	65	NULL	1	Using where
 explain select a1,a2,b,       max(c) from t1 where a1 >= 'c' or a1 < 'b' group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	1	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	2	Using where; Using index for group-by
 explain select a1, max(c)            from t1 where a1 >= 'c' or a1 < 'b' group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	1	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	2	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where a1 >= 'c' or a2 < 'b' group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where a1 = 'z' or a1 = 'b' or a1 = 'd' group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	65	NULL	3	Using where; Using index
@@ -669,40 +669,40 @@ d	l421
 d	p422
 explain select a1,a2,b,max(c),min(c) from t1 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,max(c),min(c)      from t1 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2, max(c)         from t1 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,max(c),min(c) from t2 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,max(c),min(c)      from t2 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t2 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t2 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,a2, max(c)         from t2 where (b = 'b') group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,max(c),min(c) from t3 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t3	range	NULL	idx_t3_1	6	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t3	range	NULL	idx_t3_1	6	NULL	193	Using where; Using index for group-by
 explain select a1,max(c),min(c)      from t3 where (a2 = 'a') and (b = 'b') group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t3	range	NULL	idx_t3_1	6	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t3	range	NULL	idx_t3_1	6	NULL	193	Using where; Using index for group-by
 select a1,a2,b,max(c),min(c) from t1 where (a2 = 'a') and (b = 'b') group by a1;
 a1	a2	b	max(c)	min(c)
 a	a	b	h112	e112
@@ -804,22 +804,22 @@ b	h212	e212
 c	h312	e312
 explain select a1,a2,b,min(c) from t2 where (a2 = 'a') and b is NULL group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,max(c) from t2 where (a2 = 'a') and b is NULL group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,min(c) from t2 where b is NULL group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,max(c) from t2 where b is NULL group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t2 where b is NULL group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t2 where b is NULL group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	Using where; Using index for group-by
 select a1,a2,b,min(c) from t2 where (a2 = 'a') and b is NULL group by a1;
 a1	a2	b	min(c)
 a	a	NULL	a777
@@ -849,49 +849,49 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	#	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c > 'b1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (c > 'f123') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c > 'f123') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (c < 'a0') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c < 'a0') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (c < 'k321') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c < 'k321') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (c < 'a0') or (c > 'b1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c < 'a0') or (c > 'b1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t1 where (c > 'b1') or (c <= 'g1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c > 'b1') or (c <= 'g1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c > 'b111') and (c <= 'g112') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c < 'c5') or (c = 'g412') or (c = 'k421') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where ((c > 'b111') and (c <= 'g112')) or ((c > 'd000') and (c <= 'i110')) group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (c between 'b111' and 'g112') or (c between 'd000' and 'i110') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,       max(c) from t2 where (c > 'b1') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	idx_t2_1	146	NULL	#	Using where; Using index for group-by
@@ -1364,29 +1364,29 @@ explain select a1,a2,b,min(c),max(c) fro
 where exists ( select * from t2 where t2.c > 'b1' )
 group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	PRIMARY	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 2	SUBQUERY	t2	index	NULL	idx_t2_1	163	NULL	164	Using where; Using index
 explain select a1,a2,b,min(c),max(c) from t1 where (a1 >= 'c' or a2 < 'b') and (b > 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (a1 >= 'c' or a2 < 'b') and (c > 'b111') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t1 where (a2 >= 'b') and (b = 'a') and (c > 'b111') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c) from t1 where ((a1 > 'a') or (a1 < '9'))  and ((a2 >= 'b') and (a2 < 'z')) and (b = 'a') and ((c < 'h112') or (c = 'j121') or (c > 'k121' and c < 'm122') or (c > 'o122')) group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	1	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	2	Using where; Using index for group-by
 explain select a1,a2,b,min(c) from t1 where ((a1 > 'a') or (a1 < '9'))  and ((a2 >= 'b') and (a2 < 'z')) and (b = 'a') and ((c = 'j121') or (c > 'k121' and c < 'm122') or (c > 'o122') or (c < 'h112') or (c = 'c111')) group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	1	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	163	NULL	2	Using where; Using index for group-by
 explain select a1,a2,b,min(c) from t1 where (a1 > 'a') and (a2 > 'a') and (b = 'c') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	65	NULL	1	Using where
 explain select a1,a2,b,min(c) from t1 where (ord(a1) > 97) and (ord(a2) + ord(a1) > 194) and (b = 'c') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,min(c),max(c) from t2 where (a1 >= 'c' or a2 < 'b') and (b > 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	idx_t2_0,idx_t2_1,idx_t2_2	idx_t2_1	163	NULL	#	Using where; Using index for group-by
@@ -1491,13 +1491,13 @@ select a1,a2,b,min(c) from t2 where (a1 
 a1	a2	b	min(c)
 explain select a1,a2,b from t1 where (a1 >= 'c' or a2 < 'b') and (b > 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b from t1 where (a2 >= 'b') and (b = 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b,c from t1 where (a2 >= 'b') and (b = 'a') and (c = 'i121') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select a1,a2,b from t1 where (a1 > 'a') and (a2 > 'a') and (b = 'c') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	147	NULL	1	Using where
@@ -1554,13 +1554,13 @@ select a1,a2,b from t2 where (a1 > 'a') 
 a1	a2	b
 explain select distinct a1,a2,b from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 explain select distinct a1,a2,b from t1 where (a2 >= 'b') and (b = 'a');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain extended select distinct a1,a2,b,c from t1 where (a2 >= 'b') and (b = 'a') and (c = 'i121');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	100.00	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	99.22	Using where; Using index for group-by
 Warnings:
 Note	1003	select distinct `test`.`t1`.`a1` AS `a1`,`test`.`t1`.`a2` AS `a2`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c` from `test`.`t1` where ((`test`.`t1`.`c` = 'i121') and (`test`.`t1`.`b` = 'a') and (`test`.`t1`.`a2` >= 'b'))
 explain select distinct a1,a2,b from t1 where (a1 > 'a') and (a2 > 'a') and (b = 'c');
@@ -1577,7 +1577,7 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t2	range	NULL	idx_t2_2	146	NULL	#	Using where; Using index for group-by
 explain extended select distinct a1,a2,b,c from t2 where (a2 >= 'b') and (b = 'a') and (c = 'i121');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	10	100.00	Using where; Using index for group-by
+1	SIMPLE	t2	range	NULL	idx_t2_1	163	NULL	165	99.39	Using where; Using index for group-by
 Warnings:
 Note	1003	select distinct `test`.`t2`.`a1` AS `a1`,`test`.`t2`.`a2` AS `a2`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`c` AS `c` from `test`.`t2` where ((`test`.`t2`.`c` = 'i121') and (`test`.`t2`.`b` = 'a') and (`test`.`t2`.`a2` >= 'b'))
 explain select distinct a1,a2,b from t2 where (a1 > 'a') and (a2 > 'a') and (b = 'c');
@@ -1702,19 +1702,19 @@ c	e
 d	e
 explain select distinct a1,a2,b from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 explain select distinct a1,a2,b from t1 where (a2 >= 'b') and (b = 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by
 explain select distinct a1,a2,b,c from t1 where (a2 >= 'b') and (b = 'a') and (c = 'i121') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	163	NULL	129	Using where; Using index for group-by
 explain select distinct a1,a2,b from t1 where (a1 > 'a') and (a2 > 'a') and (b = 'c') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	147	NULL	1	Using where
 explain select distinct b from t1 where (a2 >= 'b') and (b = 'a') group by a1,a2,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using where; Using index for group-by; Using temporary; Using filesort
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using where; Using index for group-by; Using temporary; Using filesort
 explain select distinct a1,a2,b from t2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	idx_t2_2	146	NULL	#	Using index for group-by
@@ -1846,7 +1846,7 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t1	range	idx_t1_0,idx_t1_1,idx_t1_2	idx_t1_2	65	NULL	1	Using where
 explain select concat(ord(min(b)),ord(max(b))),min(b),max(b) from t1 group by a1,a2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	10	Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	147	NULL	129	Using index for group-by
 select a1,a2,b, concat(min(c), max(c)) from t1 where a1 < 'd' group by a1,a2,b;
 a1	a2	b	concat(min(c), max(c))
 a	a	a	a111d111
@@ -1985,7 +1985,7 @@ c
 d
 explain select a1 from t1 where a2 = 'b' group by a1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using where; Using index for group-by
 select a1 from t1 where a2 = 'b' group by a1;
 a1
 a
@@ -1994,7 +1994,7 @@ c
 d
 explain select distinct a1 from t1 where a2 = 'b';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	10	Using where; Using index for group-by
+1	SIMPLE	t1	range	NULL	idx_t1_1	130	NULL	129	Using where; Using index for group-by
 select distinct a1 from t1 where a2 = 'b';
 a1
 a
@@ -2188,7 +2188,7 @@ INSERT INTO t1 (a, b) VALUES (1,1), (1,2
 (2,2), (2,3), (2,1), (3,1), (4,1), (4,2), (4,3), (4,4), (4,5), (4,6);
 EXPLAIN SELECT max(b), a FROM t1 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	a	5	NULL	8	Using index for group-by
+1	SIMPLE	t1	index	NULL	a	10	NULL	15	Using index
 FLUSH STATUS;
 SELECT max(b), a FROM t1 GROUP BY a;
 max(b)	a
@@ -2202,7 +2202,7 @@ Handler_read_key	0
 Handler_read_next	0
 EXPLAIN SELECT max(b), a FROM t1 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	a	5	NULL	8	Using index for group-by
+1	SIMPLE	t1	index	NULL	a	10	NULL	15	Using index
 FLUSH STATUS;
 CREATE TABLE t2 SELECT max(b), a FROM t1 GROUP BY a;
 SHOW STATUS LIKE 'handler_read__e%';
@@ -2235,14 +2235,14 @@ Handler_read_next	0
 EXPLAIN (SELECT max(b), a FROM t1 GROUP BY a) UNION 
 (SELECT max(b), a FROM t1 GROUP BY a);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	range	NULL	a	5	NULL	8	Using index for group-by
-2	UNION	t1	range	NULL	a	5	NULL	8	Using index for group-by
+1	PRIMARY	t1	index	NULL	a	10	NULL	15	Using index
+2	UNION	t1	index	NULL	a	10	NULL	15	Using index
 NULL	UNION RESULT	<union1,2>	ALL	NULL	NULL	NULL	NULL	NULL	
 EXPLAIN SELECT (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2) x
 FROM t1 AS t1_outer;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t1_outer	index	NULL	a	10	NULL	15	Using index
-2	SUBQUERY	t1	range	NULL	a	5	NULL	8	Using index for group-by
+2	SUBQUERY	t1	index	NULL	a	10	NULL	15	Using index
 EXPLAIN SELECT 1 FROM t1 AS t1_outer WHERE EXISTS 
 (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -2252,7 +2252,7 @@ EXPLAIN SELECT 1 FROM t1 AS t1_outer WHE
 (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2) > 12;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	NULL	NULL	NULL	NULL	NULL	NULL	NULL	Impossible WHERE
-2	SUBQUERY	t1	range	NULL	a	5	NULL	8	Using index for group-by
+2	SUBQUERY	t1	index	NULL	a	10	NULL	15	Using index
 EXPLAIN SELECT 1 FROM t1 AS t1_outer WHERE 
 a IN (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -2261,21 +2261,21 @@ id	select_type	table	type	possible_keys	
 EXPLAIN SELECT 1 FROM t1 AS t1_outer GROUP BY a HAVING 
 a > (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1_outer	range	NULL	a	5	NULL	8	Using index for group-by
-2	SUBQUERY	t1	range	NULL	a	5	NULL	8	Using index for group-by
+1	PRIMARY	t1_outer	index	NULL	a	10	NULL	15	Using index
+2	SUBQUERY	t1	index	NULL	a	10	NULL	15	Using index
 EXPLAIN SELECT 1 FROM t1 AS t1_outer1 JOIN t1 AS t1_outer2 
 ON t1_outer1.a = (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2) 
 AND t1_outer1.b = t1_outer2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t1_outer1	ref	a	a	5	const	1	Using where; Using index
 1	PRIMARY	t1_outer2	index	NULL	a	10	NULL	15	Using where; Using index; Using join buffer
-2	SUBQUERY	t1	range	NULL	a	5	NULL	8	Using index for group-by
+2	SUBQUERY	t1	index	NULL	a	10	NULL	15	Using index
 EXPLAIN SELECT (SELECT (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2) x
 FROM t1 AS t1_outer) x2 FROM t1 AS t1_outer2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t1_outer2	index	NULL	a	10	NULL	15	Using index
 2	SUBQUERY	t1_outer	index	NULL	a	10	NULL	15	Using index
-3	SUBQUERY	t1	range	NULL	a	5	NULL	8	Using index for group-by
+3	SUBQUERY	t1	index	NULL	a	10	NULL	15	Using index
 CREATE TABLE t3 LIKE t1;
 FLUSH STATUS;
 INSERT INTO t3 SELECT a,MAX(b) FROM t1 GROUP BY a;
@@ -2312,7 +2312,7 @@ INSERT INTO t1 VALUES
 (4), (2), (1), (2), (2), (4), (1), (4);
 EXPLAIN SELECT DISTINCT(a) FROM t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx	5	NULL	9	Using index for group-by
+1	SIMPLE	t1	index	NULL	idx	5	NULL	16	Using index
 SELECT DISTINCT(a) FROM t1;
 a
 1
@@ -2320,7 +2320,7 @@ a
 4
 EXPLAIN SELECT SQL_BIG_RESULT DISTINCT(a) FROM t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	idx	5	NULL	9	Using index for group-by
+1	SIMPLE	t1	index	NULL	idx	5	NULL	16	Using index
 SELECT SQL_BIG_RESULT DISTINCT(a) FROM t1;
 a
 1
@@ -2345,7 +2345,7 @@ CREATE INDEX break_it ON t1 (a, b);
 EXPLAIN
 SELECT a, MIN(b), MAX(b) FROM t1 GROUP BY a ORDER BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	break_it	10	NULL	7	Using index for group-by
+1	SIMPLE	t1	index	NULL	break_it	10	NULL	12	Using index
 SELECT a, MIN(b), MAX(b) FROM t1 GROUP BY a ORDER BY a;
 a	MIN(b)	MAX(b)
 1	1	3
@@ -2355,7 +2355,7 @@ a	MIN(b)	MAX(b)
 EXPLAIN
 SELECT a, MIN(b), MAX(b) FROM t1 GROUP BY a ORDER BY a DESC;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	NULL	break_it	10	NULL	7	Using index for group-by; Using temporary; Using filesort
+1	SIMPLE	t1	index	NULL	break_it	10	NULL	12	Using index
 SELECT a, MIN(b), MAX(b) FROM t1 GROUP BY a ORDER BY a DESC;
 a	MIN(b)	MAX(b)
 4	1	3

=== modified file 'mysql-test/suite/pbxt/r/join.result'
--- a/mysql-test/suite/pbxt/r/join.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/join.result	2009-08-17 15:57:58 +0000
@@ -774,7 +774,7 @@ insert into t3 select * from t2 where a 
 explain select * from t2,t3 where t2.a < 200 and t2.b=t3.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	a,b	a	5	NULL	1	Using where
-1	SIMPLE	t3	ref	b	b	5	test.t2.b	11	Using where
+1	SIMPLE	t3	ref	b	b	5	test.t2.b	1	Using where
 drop table t1, t2, t3;
 create table t1 (a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

=== modified file 'mysql-test/suite/pbxt/r/join_nested.result'
--- a/mysql-test/suite/pbxt/r/join_nested.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/join_nested.result	2009-08-17 15:57:58 +0000
@@ -851,7 +851,7 @@ ON t3.a=1 AND t3.b=t2.b AND t2.b=t4.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	2	100.00	
 1	SIMPLE	t4	ALL	NULL	NULL	NULL	NULL	2	100.00	Using join buffer
-1	SIMPLE	t2	ref	idx_b	idx_b	5	test.t3.b	2	100.00	
+1	SIMPLE	t2	ref	idx_b	idx_b	5	test.t3.b	1	100.00	
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b` from `test`.`t3` join `test`.`t4` left join (`test`.`t1` join `test`.`t2`) on(((`test`.`t3`.`a` = 1) and (`test`.`t3`.`b` = `test`.`t2`.`b`) and (`test`.`t2`.`b` = `test`.`t4`.`b`))) where 1
@@ -958,15 +958,15 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where; Using join buffer
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	1	100.00	
 1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
-1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	2	100.00	Using where
 1	SIMPLE	t5	ALL	idx_b	NULL	NULL	NULL	3	100.00	Using where
 1	SIMPLE	t7	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 1	SIMPLE	t6	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
 1	SIMPLE	t8	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 1	SIMPLE	t9	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where; Using join buffer
 Warnings:
-Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t4`.`b` = `test`.`t3`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t3`.`b` = `test`.`t4`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
 CREATE INDEX idx_b ON t8(b);
 EXPLAIN EXTENDED
 SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.a,t3.b,t4.a,t4.b,
@@ -1008,14 +1008,14 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where; Using join buffer
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
 1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
-1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	2	100.00	Using where
+1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	1	100.00	
 1	SIMPLE	t5	ALL	idx_b	NULL	NULL	NULL	3	100.00	Using where
-1	SIMPLE	t7	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 1	SIMPLE	t6	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
-1	SIMPLE	t8	ref	idx_b	idx_b	5	test.t5.b	2	100.00	Using where
+1	SIMPLE	t7	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
+1	SIMPLE	t8	ref	idx_b	idx_b	5	test.t5.b	1	100.00	Using where
 1	SIMPLE	t9	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where; Using join buffer
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t3`.`b` = `test`.`t4`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
 Warnings:
-Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t4`.`b` = `test`.`t3`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
 CREATE INDEX idx_b ON t1(b);
 CREATE INDEX idx_a ON t0(a);
 EXPLAIN EXTENDED
@@ -1055,17 +1055,17 @@ t0.b=t1.b AND          
 (t9.a=1);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t0	ref	idx_a	idx_a	5	const	1	100.00	Using where
-1	SIMPLE	t1	ref	idx_b	idx_b	5	test.t0.b	2	100.00	Using where
+1	SIMPLE	t1	ref	idx_b	idx_b	5	test.t0.b	1	100.00	Using where
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
 1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
-1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	2	100.00	Using where
+1	SIMPLE	t4	ref	idx_b	idx_b	5	test.t2.b	1	100.00	
 1	SIMPLE	t5	ALL	idx_b	NULL	NULL	NULL	3	100.00	Using where
-1	SIMPLE	t7	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 1	SIMPLE	t6	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
-1	SIMPLE	t8	ref	idx_b	idx_b	5	test.t5.b	2	100.00	Using where
+1	SIMPLE	t7	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
+1	SIMPLE	t8	ref	idx_b	idx_b	5	test.t5.b	1	100.00	Using where
 1	SIMPLE	t9	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where; Using join buffer
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t3`.`b` = `test`.`t4`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
 Warnings:
-Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t0`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t5`.`a` AS `a`,`test`.`t5`.`b` AS `b`,`test`.`t6`.`a` AS `a`,`test`.`t6`.`b` AS `b`,`test`.`t7`.`a` AS `a`,`test`.`t7`.`b` AS `b`,`test`.`t8`.`a` AS `a`,`test`.`t8`.`b` AS `b`,`test`.`t9`.`a` AS `a`,`test`.`t9`.`b` AS `b` from `test`.`t0` join `test`.`t1` left join (`test`.`t2` left join (`test`.`t3` join `test`.`t4`) on(((`test`.`t4`.`b` = `test`.`t2`.`b`) and (`test`.`t3`.`a` = 1))) join `test`.`t5` left join (`test`.`t6` join `test`.`t7` left join `test`.`t8` on(((`test`.`t8`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` < 10)))) on(((`test`.`t7`.`b` = `test`.`t5`.`b`) and (`test`.`t6`.`b` >= 2)))) on((((`test`.`t3`.`b` = 2) or isnull(`test`.`t3`.`c`)) and ((`test`.`t6`.`b` = 2) or isnull(`test`.`t6`.`c`)) and ((`test`.`t5`.`b` = `test`.`t0`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t6`.`c`) or isnull(`test`.`t8`.`c`)) and (`test`.`t1`.`a` <> 2))) join `test`.`t9` where ((`test`.`t9`.`a` = 1) and (`test`.`t1`.`b` = `test`.`t0`.`b`) and (`test`.`t0`.`a` = 1) and ((`test`.`t2`.`a` >= 4) or isnull(`test`.`t2`.`c`)) and ((`test`.`t3`.`a` < 5) or isnull(`test`.`t3`.`c`)) and ((`test`.`t4`.`b` = `test`.`t3`.`b`) or isnull(`test`.`t3`.`c`) or isnull(`test`.`t4`.`c`)) and ((`test`.`t5`.`a` >= 2) or isnull(`test`.`t5`.`c`)) and ((`test`.`t6`.`a` >= 4) or isnull(`test`.`t6`.`c`)) and ((`test`.`t7`.`a` <= 2) or isnull(`test`.`t7`.`c`)) and ((`test`.`t8`.`a` < 1) or isnull(`test`.`t8`.`c`)) and ((`test`.`t9`.`b` = `test`.`t8`.`b`) or isnull(`test`.`t8`.`c`)))
 SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.a,t3.b,t4.a,t4.b,
 t5.a,t5.b,t6.a,t6.b,t7.a,t7.b,t8.a,t8.b,t9.a,t9.b
 FROM t0,t1
@@ -1102,21 +1102,21 @@ t0.b=t1.b AND          
 (t9.a=1);
 a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b
 1	2	2	2	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	1	1
-1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	1
-1	2	3	2	4	2	1	2	3	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
-1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	1
-1	2	3	2	4	2	1	2	4	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
-1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	1	6	2	1	1	NULL	NULL	1	1
-1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
 1	2	2	2	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	NULL	1	2
-1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	2
 1	2	3	2	4	2	1	2	3	2	2	2	6	2	2	2	0	2	1	2
+1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	1
+1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	2
+1	2	3	2	4	2	1	2	3	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	3	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	2
-1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	2
 1	2	3	2	4	2	1	2	4	2	2	2	6	2	2	2	0	2	1	2
+1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	1
+1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	2
+1	2	3	2	4	2	1	2	4	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	4	2	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	2
-1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	1	6	2	1	1	NULL	NULL	1	2
 1	2	3	2	5	3	NULL	NULL	NULL	NULL	2	2	6	2	2	2	0	2	1	2
+1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	1	6	2	1	1	NULL	NULL	1	1
+1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	1	6	2	1	1	NULL	NULL	1	2
+1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	1
 1	2	3	2	5	3	NULL	NULL	NULL	NULL	3	3	NULL	NULL	NULL	NULL	NULL	NULL	1	2
 SELECT t2.a,t2.b
 FROM t2;
@@ -1203,7 +1203,7 @@ EXPLAIN SELECT a, b, c FROM t1 LEFT JOIN
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	NULL	a	5	NULL	21	Using index
 1	SIMPLE	t3	index	c	c	5	NULL	6	Using index
-1	SIMPLE	t2	ref	b	b	5	test.t3.c	2	Using index
+1	SIMPLE	t2	ref	b	b	5	test.t3.c	1	Using index
 EXPLAIN SELECT a, b, c FROM t1 LEFT JOIN (t2, t3) ON b < 3 and b = c;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	NULL	a	5	NULL	#	Using index
@@ -1484,8 +1484,8 @@ explain select * from t1 left join 
 on (t1.a = t2.a);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10	
-1	SIMPLE	t2	ref	a	a	5	test.t1.a	11	
-1	SIMPLE	t3	ref	a	a	5	test.t2.a	11	
+1	SIMPLE	t2	ref	a	a	5	test.t1.a	1	
+1	SIMPLE	t3	ref	a	a	5	test.t2.a	1	
 drop table t1, t2, t3;
 CREATE TABLE t1 (id int NOT NULL PRIMARY KEY, type varchar(10));
 CREATE TABLE t2 (pid int NOT NULL PRIMARY KEY, type varchar(10));

=== modified file 'mysql-test/suite/pbxt/r/key.result'
--- a/mysql-test/suite/pbxt/r/key.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/key.result	2009-08-17 15:57:58 +0000
@@ -153,7 +153,7 @@ t1	0	PRIMARY	1	d	A	0	NULL	NULL		BTREE	
 t1	0	a	1	a	A	0	NULL	NULL		BTREE	
 t1	0	e	1	e	A	0	NULL	NULL		BTREE	
 t1	0	b	1	b	A	0	NULL	NULL	YES	BTREE	
-t1	1	c	1	c	A	NULL	NULL	NULL	YES	BTREE	
+t1	1	c	1	c	A	0	NULL	NULL	YES	BTREE	
 drop table t1;
 CREATE TABLE t1 (c CHAR(10) NOT NULL,i INT NOT NULL AUTO_INCREMENT,
 UNIQUE (c,i));

=== modified file 'mysql-test/suite/pbxt/r/key_cache.result'
--- a/mysql-test/suite/pbxt/r/key_cache.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/key_cache.result	2009-08-17 15:57:58 +0000
@@ -122,7 +122,7 @@ i
 explain select count(*) from t1, t2 where t1.p = t2.i;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	index	PRIMARY	PRIMARY	4	NULL	2	Using index
-1	SIMPLE	t2	ref	k1	k1	5	test.t1.p	2	Using where; Using index
+1	SIMPLE	t2	ref	k1	k1	5	test.t1.p	1	Using where; Using index
 select count(*) from t1, t2 where t1.p = t2.i;
 count(*)
 3
@@ -257,8 +257,6 @@ test.t2	assign_to_keycache	note	The stor
 drop table t1,t2,t3;
 set global keycache2.key_buffer_size=0;
 set global keycache3.key_buffer_size=100;
-Warnings:
-Warning	1292	Truncated incorrect key_buffer_size value: '100'
 set global keycache3.key_buffer_size=0;
 create table t1 (mytext text, FULLTEXT (mytext)) engine=myisam;
 insert t1 values ('aaabbb');

=== modified file 'mysql-test/suite/pbxt/r/key_diff.result'
--- a/mysql-test/suite/pbxt/r/key_diff.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/key_diff.result	2009-08-17 15:57:58 +0000
@@ -36,7 +36,7 @@ a	a	a	a
 explain select t1.*,t2.* from t1,t1 as t2 where t1.A=t2.B;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	a	NULL	NULL	NULL	5	
-1	SIMPLE	t2	ALL	b	NULL	NULL	NULL	5	Using where; Using join buffer
+1	SIMPLE	t2	ref	b	b	4	test.t1.a	1	Using where
 select t1.*,t2.* from t1,t1 as t2 where t1.A=t2.B order by binary t1.a,t2.a;
 a	b	a	b
 A	B	a	a

=== modified file 'mysql-test/suite/pbxt/r/lowercase_view.result'
--- a/mysql-test/suite/pbxt/r/lowercase_view.result	2009-04-02 20:36:52 +0000
+++ b/mysql-test/suite/pbxt/r/lowercase_view.result	2009-08-31 11:07:44 +0000
@@ -119,7 +119,7 @@ create table t1Aa (col1 int);
 create view v1Aa as select col1 from t1Aa as AaA;
 show create view v1AA;
 View	Create View	character_set_client	collation_connection
-v1aa	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1aa` AS select `AaA`.`col1` AS `col1` from `t1aa` `AaA`	latin1	latin1_swedish_ci
+v1aa	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1aa` AS select `aaa`.`col1` AS `col1` from `t1aa` `aaa`	latin1	latin1_swedish_ci
 drop view v1AA;
 select Aaa.col1 from t1Aa as AaA;
 col1
@@ -128,7 +128,7 @@ drop view v1AA;
 create view v1Aa as select AaA.col1 from t1Aa as AaA;
 show create view v1AA;
 View	Create View	character_set_client	collation_connection
-v1aa	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1aa` AS select `AaA`.`col1` AS `col1` from `t1aa` `AaA`	latin1	latin1_swedish_ci
+v1aa	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1aa` AS select `aaa`.`col1` AS `col1` from `t1aa` `aaa`	latin1	latin1_swedish_ci
 drop view v1AA;
 drop table t1Aa;
 CREATE TABLE  t1 (a int, b int);
@@ -142,7 +142,7 @@ CREATE OR REPLACE VIEW v1 AS
 select X.a from t1 AS X group by X.b having (X.a = 1);
 SHOW CREATE VIEW v1;
 View	Create View	character_set_client	collation_connection
-v1	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select `X`.`a` AS `a` from `t1` `X` group by `X`.`b` having (`X`.`a` = 1)	latin1	latin1_swedish_ci
+v1	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select `x`.`a` AS `a` from `t1` `x` group by `x`.`b` having (`x`.`a` = 1)	latin1	latin1_swedish_ci
 SELECT * FROM v1;
 a
 DROP VIEW v1;

=== modified file 'mysql-test/suite/pbxt/r/mysqlshow.result'
--- a/mysql-test/suite/pbxt/r/mysqlshow.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/mysqlshow.result	2009-08-17 15:57:58 +0000
@@ -107,7 +107,21 @@ Database: information_schema
 | TRIGGERS                              |
 | USER_PRIVILEGES                       |
 | VIEWS                                 |
+| INNODB_BUFFER_POOL_PAGES              |
 | PBXT_STATISTICS                       |
+| INNODB_CMP                            |
+| INNODB_RSEG                           |
+| XTRADB_ENHANCEMENTS                   |
+| INNODB_BUFFER_POOL_PAGES_INDEX        |
+| INNODB_INDEX_STATS                    |
+| INNODB_TRX                            |
+| INNODB_CMP_RESET                      |
+| INNODB_LOCK_WAITS                     |
+| INNODB_CMPMEM_RESET                   |
+| INNODB_LOCKS                          |
+| INNODB_CMPMEM                         |
+| INNODB_TABLE_STATS                    |
+| INNODB_BUFFER_POOL_PAGES_BLOB         |
 +---------------------------------------+
 Database: INFORMATION_SCHEMA
 +---------------------------------------+
@@ -141,7 +155,21 @@ Database: INFORMATION_SCHEMA
 | TRIGGERS                              |
 | USER_PRIVILEGES                       |
 | VIEWS                                 |
+| INNODB_BUFFER_POOL_PAGES              |
 | PBXT_STATISTICS                       |
+| INNODB_CMP                            |
+| INNODB_RSEG                           |
+| XTRADB_ENHANCEMENTS                   |
+| INNODB_BUFFER_POOL_PAGES_INDEX        |
+| INNODB_INDEX_STATS                    |
+| INNODB_TRX                            |
+| INNODB_CMP_RESET                      |
+| INNODB_LOCK_WAITS                     |
+| INNODB_CMPMEM_RESET                   |
+| INNODB_LOCKS                          |
+| INNODB_CMPMEM                         |
+| INNODB_TABLE_STATS                    |
+| INNODB_BUFFER_POOL_PAGES_BLOB         |
 +---------------------------------------+
 Wildcard: inf_rmation_schema
 +--------------------+

=== modified file 'mysql-test/suite/pbxt/r/null.result'
--- a/mysql-test/suite/pbxt/r/null.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/null.result	2009-08-17 15:57:58 +0000
@@ -93,9 +93,11 @@ INSERT INTO t1 SET a = "", d= "2003-01-1
 Warnings:
 Warning	1265	Data truncated for column 'd' at row 1
 UPDATE t1 SET d=1/NULL;
-ERROR 23000: Column 'd' cannot be null
+Warnings:
+Warning	1265	Data truncated for column 'd' at row 1
 UPDATE t1 SET d=NULL;
-ERROR 23000: Column 'd' cannot be null
+Warnings:
+Warning	1048	Column 'd' cannot be null
 INSERT INTO t1 (a) values (null);
 ERROR 23000: Column 'a' cannot be null
 INSERT INTO t1 (a) values (1/null);
@@ -130,7 +132,7 @@ Warning	1048	Column 'd' cannot be null
 Warning	1048	Column 'd' cannot be null
 select * from t1;
 a	b	c	d
-	0	0000-00-00 00:00:00	2003
+	0	0000-00-00 00:00:00	0
 	0	0000-00-00 00:00:00	0
 	0	0000-00-00 00:00:00	0
 	0	0000-00-00 00:00:00	0

=== modified file 'mysql-test/suite/pbxt/r/null_key.result'
--- a/mysql-test/suite/pbxt/r/null_key.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/null_key.result	2009-08-17 15:57:58 +0000
@@ -407,8 +407,8 @@ EXPLAIN SELECT SQL_CALC_FOUND_ROWS * FRO
 LEFT JOIN t3 ON t2.b=t3.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	4	
-1	SIMPLE	t2	ref	idx	idx	5	test.t1.a	2	
-1	SIMPLE	t3	ref	idx	idx	5	test.t2.b	186	Using index
+1	SIMPLE	t2	ref	idx	idx	5	test.t1.a	1	
+1	SIMPLE	t3	ref	idx	idx	5	test.t2.b	1	Using index
 FLUSH STATUS ;
 SELECT SQL_CALC_FOUND_ROWS * FROM t1 LEFT JOIN t2 ON t1.a=t2.a
 LEFT JOIN t3 ON t2.b=t3.b;

=== modified file 'mysql-test/suite/pbxt/r/partition_pruning.result'
--- a/mysql-test/suite/pbxt/r/partition_pruning.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/partition_pruning.result	2009-08-17 15:57:58 +0000
@@ -338,12 +338,12 @@ select * from t1 X, t1 Y 
 where X.b = Y.b and (X.a=1 or X.a=2) and (Y.a=2 or Y.a=3);
 id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	X	p1,p2	ALL	a,b	NULL	NULL	NULL	2	Using where
-1	SIMPLE	Y	p2,p3	ref	a,b	b	4	test.X.b	2	Using where
+1	SIMPLE	Y	p2,p3	ref	a,b	b	4	test.X.b	1	Using where
 explain partitions
 select * from t1 X, t1 Y where X.a = Y.a and (X.a=1 or X.a=2);
 id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	X	p1,p2	ALL	a	NULL	NULL	NULL	4	Using where
-1	SIMPLE	Y	p1,p2	ref	a	a	4	test.X.a	2	
+1	SIMPLE	Y	p1,p2	ref	a	a	4	test.X.a	1	
 drop table t1;
 create table t1 (a int) partition by hash(a) partitions 20;
 insert into t1 values (1),(2),(3);

=== modified file 'mysql-test/suite/pbxt/r/pbxt_bugs.result'
--- a/mysql-test/suite/pbxt/r/pbxt_bugs.result	2009-04-02 20:36:52 +0000
+++ b/mysql-test/suite/pbxt/r/pbxt_bugs.result	2009-08-17 15:57:58 +0000
@@ -1218,3 +1218,59 @@ c1	c2
 0	opq
 1	jkl
 DROP TABLE t1;
+create table parent (id int primary key);
+create table child (id int PRIMARY KEY, FOREIGN KEY (id) REFERENCES parent(id));
+insert into parent values (2), (3), (4);
+insert into child values (3), (4);
+delete ignore from parent;
+ERROR 23000: Cannot delete or update a parent row: a foreign key constraint fails (Constraint: `FOREIGN_1`)
+select * from parent;
+id
+2
+3
+4
+drop table child, parent;
+create schema test378222;
+use test378222;
+create table t1 (id int primary key);
+create table t2 (id int primary key);
+alter table t1 add constraint foreign key (id) references t2 (id);
+alter table t2 add constraint foreign key (id) references t1 (id);
+drop schema test378222;
+create schema test378222a;
+create schema test378222b;
+create table test378222a.t1 (id int primary key);
+create table test378222b.t2 (id int primary key);
+alter table test378222a.t1 add constraint foreign key (id) references test378222b.t2 (id);
+alter table test378222b.t2 add constraint foreign key (id) references test378222a.t1 (id);
+set foreign_key_checks = 1;
+drop schema test378222a;
+ERROR 23000: Cannot delete or update a parent row: a foreign key constraint fails
+drop schema test378222b;
+ERROR 23000: Cannot delete or update a parent row: a foreign key constraint fails
+set foreign_key_checks = 0;
+drop schema test378222a;
+drop schema test378222b;
+set foreign_key_checks = 1;
+use test;
+CREATE TABLE t1(c1 TINYINT AUTO_INCREMENT NULL KEY ) AUTO_INCREMENT=10;
+SHOW CREATE TABLE t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `c1` tinyint(4) NOT NULL AUTO_INCREMENT,
+  PRIMARY KEY (`c1`)
+) ENGINE=PBXT AUTO_INCREMENT=10 DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES(null);
+INSERT INTO t1 VALUES(null);
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+c1
+10
+11
+12
+TRUNCATE TABLE t1;
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+c1
+1
+DROP TABLE t1;

=== modified file 'mysql-test/suite/pbxt/r/pbxt_ref_int.result'
--- a/mysql-test/suite/pbxt/r/pbxt_ref_int.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/pbxt_ref_int.result	2009-08-17 15:57:58 +0000
@@ -166,7 +166,7 @@ child	CREATE TABLE `child` (
   `parent_id` int(11) DEFAULT NULL,
   KEY `par_ind` (`parent_id`),
   KEY `child_ind` (`id`),
-  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `parent` (`id`)
+  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `test`.`parent` (`id`)
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 drop index child_ind on child;
 show create table child;
@@ -175,7 +175,7 @@ child	CREATE TABLE `child` (
   `id` int(11) DEFAULT NULL,
   `parent_id` int(11) DEFAULT NULL,
   KEY `par_ind` (`parent_id`),
-  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `parent` (`id`)
+  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `test`.`parent` (`id`)
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 alter table parent add column c1 varchar(40);
 insert child values(2000, 2);
@@ -243,7 +243,7 @@ child	CREATE TABLE `child` (
   `id` int(11) DEFAULT NULL,
   `parent_id` int(11) DEFAULT NULL,
   KEY `par_ind` (`parent_id`),
-  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `parent` (`id`)
+  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `test`.`parent` (`id`)
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 alter table child add column c1 varchar(40);
 insert child values(400, 1, "asd");
@@ -284,7 +284,7 @@ child	CREATE TABLE `child` (
   `id` int(11) DEFAULT NULL,
   `parent_id` int(11) DEFAULT NULL,
   KEY `par_ind` (`parent_id`),
-  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `parent` (`id`) ON DELETE CASCADE
+  CONSTRAINT `FOREIGN_1` FOREIGN KEY (`parent_id`) REFERENCES `test`.`parent` (`id`) ON DELETE CASCADE
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 insert parent values(1);
 insert child values(100, 1);

=== modified file 'mysql-test/suite/pbxt/r/preload.result'
--- a/mysql-test/suite/pbxt/r/preload.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/preload.result	2009-08-17 15:57:58 +0000
@@ -144,7 +144,7 @@ Key_reads	0
 load index into cache t3, t2 key (primary,b) ;
 Table	Op	Msg_type	Msg_text
 test.t3	preload_keys	Error	Table 'test.t3' doesn't exist
-test.t3	preload_keys	error	Corrupt
+test.t3	preload_keys	status	Operation failed
 test.t2	preload_keys	note	The storage engine for the table doesn't support preload_keys
 show status like "key_read%";
 Variable_name	Value
@@ -159,7 +159,7 @@ Key_reads	0
 load index into cache t3 key (b), t2 key (c) ;
 Table	Op	Msg_type	Msg_text
 test.t3	preload_keys	Error	Table 'test.t3' doesn't exist
-test.t3	preload_keys	error	Corrupt
+test.t3	preload_keys	status	Operation failed
 test.t2	preload_keys	note	The storage engine for the table doesn't support preload_keys
 show status like "key_read%";
 Variable_name	Value

=== modified file 'mysql-test/suite/pbxt/r/ps_1general.result'
--- a/mysql-test/suite/pbxt/r/ps_1general.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/ps_1general.result	2009-08-31 11:07:44 +0000
@@ -289,11 +289,11 @@ prepare stmt4 from ' show index from t2 
 execute stmt4;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t2	0	PRIMARY	1	a	A	0	NULL	NULL		BTREE	
-t2	1	t2_idx	1	b	A	NULL	NULL	NULL	YES	BTREE	
+t2	1	t2_idx	1	b	A	0	NULL	NULL	YES	BTREE	
 prepare stmt4 from ' show table status from test like ''t2%'' ';
 execute stmt4;
 Name	Engine	Version	Row_format	Rows	Avg_row_length	Data_length	Max_data_length	Index_length	Data_free	Auto_increment	Create_time	Update_time	Check_time	Collation	Checksum	Create_options	Comment
-t2	PBXT	10	Fixed	0	29	1	#	4096	0	NULL	#	#	#	latin1_swedish_ci	NULL		
+t2	PBXT	10	Fixed	0	29	1024	#	4096	0	NULL	#	#	#	latin1_swedish_ci	NULL		
 prepare stmt4 from ' show table status from test like ''t9%'' ';
 execute stmt4;
 Name	Engine	Version	Row_format	Rows	Avg_row_length	Data_length	Max_data_length	Index_length	Data_free	Auto_increment	Create_time	Update_time	Check_time	Collation	Checksum	Create_options	Comment
@@ -447,7 +447,7 @@ def					type	253	10	3	Y	0	31	8
 def					possible_keys	253	4096	0	Y	0	31	8
 def					key	253	64	0	Y	0	31	8
 def					key_len	253	4096	0	Y	0	31	8
-def					ref	253	1024	0	Y	0	31	8
+def					ref	253	2048	0	Y	0	31	8
 def					rows	8	10	1	Y	32928	0	63
 def					Extra	253	255	14	N	1	31	8
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@@ -463,7 +463,7 @@ def					type	253	10	5	Y	0	31	8
 def					possible_keys	253	4096	7	Y	0	31	8
 def					key	253	64	7	Y	0	31	8
 def					key_len	253	4096	1	Y	0	31	8
-def					ref	253	1024	0	Y	0	31	8
+def					ref	253	2048	0	Y	0	31	8
 def					rows	8	10	1	Y	32928	0	63
 def					Extra	253	255	27	N	1	31	8
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra

=== modified file 'mysql-test/suite/pbxt/r/range.result'
--- a/mysql-test/suite/pbxt/r/range.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/range.result	2009-08-17 15:57:58 +0000
@@ -423,19 +423,19 @@ test.t2	analyze	status	OK
 explain select * from t1, t2  where t1.uid=t2.uid AND t1.uid > 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	uid_index	uid_index	4	NULL	1	Using where
-1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	12	
+1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	1	
 explain select * from t1, t2  where t1.uid=t2.uid AND t2.uid > 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	uid_index	uid_index	4	NULL	1	Using where
-1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	12	
+1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	1	
 explain select * from t1, t2  where t1.uid=t2.uid AND t1.uid != 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	uid_index	uid_index	4	NULL	2	Using where
-1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	12	
+1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	1	
 explain select * from t1, t2  where t1.uid=t2.uid AND t2.uid != 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	uid_index	uid_index	4	NULL	2	Using where
-1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	12	
+1	SIMPLE	t2	ref	uid_index	uid_index	4	test.t1.uid	1	
 select * from t1, t2  where t1.uid=t2.uid AND t1.uid > 0;
 id	name	uid	id	name	uid
 1001	A	1	1001	A	1

=== modified file 'mysql-test/suite/pbxt/r/schema.result'
--- a/mysql-test/suite/pbxt/r/schema.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/schema.result	2009-08-17 15:57:58 +0000
@@ -3,11 +3,13 @@ create schema foo;
 show create schema foo;
 Database	Create Database
 foo	CREATE DATABASE `foo` /*!40100 DEFAULT CHARACTER SET latin1 */
+create table t1 (id int) engine=pbxt;
 show schemas;
 Database
 information_schema
 foo
 mtr
 mysql
+pbxt
 test
 drop schema foo;

=== modified file 'mysql-test/suite/pbxt/r/select.result'
--- a/mysql-test/suite/pbxt/r/select.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/select.result	2009-08-17 15:57:58 +0000
@@ -604,15 +604,15 @@ id	select_type	table	type	possible_keys	
 explain select * from t3 as t1,t3 where t1.period=t3.period order by t3.period;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	period	NULL	NULL	NULL	41810	Using temporary; Using filesort
-1	SIMPLE	t3	ref	period	period	4	test.t1.period	18	
+1	SIMPLE	t3	ref	period	period	4	test.t1.period	1	
 explain select * from t3 as t1,t3 where t1.period=t3.period order by t3.period limit 10;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t3	index	period	period	4	NULL	1	
-1	SIMPLE	t1	ref	period	period	4	test.t3.period	18	
+1	SIMPLE	t3	index	period	period	4	NULL	10	
+1	SIMPLE	t1	ref	period	period	4	test.t3.period	1	
 explain select * from t3 as t1,t3 where t1.period=t3.period order by t1.period limit 10;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	period	period	4	NULL	1	
-1	SIMPLE	t3	ref	period	period	4	test.t1.period	18	
+1	SIMPLE	t1	index	period	period	4	NULL	10	
+1	SIMPLE	t3	ref	period	period	4	test.t1.period	1	
 select period from t1;
 period
 9410
@@ -2095,7 +2095,7 @@ show keys from t2;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t2	0	PRIMARY	1	auto	A	1199	NULL	NULL		BTREE	
 t2	0	fld1	1	fld1	A	1199	NULL	NULL		BTREE	
-t2	1	fld3	1	fld3	A	NULL	NULL	NULL		BTREE	
+t2	1	fld3	1	fld3	A	1199	NULL	NULL		BTREE	
 drop table t4, t3, t2, t1;
 DO 1;
 DO benchmark(100,1+1),1,1;
@@ -2369,7 +2369,7 @@ insert into t2 values (1,3), (2,3), (3,4
 explain select * from t1 left join t2 on a=c where d in (4);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ref	c,d	d	5	const	1	Using where
-1	SIMPLE	t1	ref	a	a	5	test.t2.c	2	Using where
+1	SIMPLE	t1	ref	a	a	5	test.t2.c	1	Using where
 select * from t1 left join t2 on a=c where d in (4);
 a	b	c	d
 3	2	3	4
@@ -2377,7 +2377,7 @@ a	b	c	d
 explain select * from t1 left join t2 on a=c where d = 4;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ref	c,d	d	5	const	1	Using where
-1	SIMPLE	t1	ref	a	a	5	test.t2.c	2	Using where
+1	SIMPLE	t1	ref	a	a	5	test.t2.c	1	Using where
 select * from t1 left join t2 on a=c where d = 4;
 a	b	c	d
 3	2	3	4
@@ -2403,11 +2403,11 @@ INSERT INTO t2 VALUES ('one'),('two'),('
 EXPLAIN SELECT * FROM t1 LEFT JOIN t2 USE INDEX (a) ON t1.a=t2.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	5	
-1	SIMPLE	t2	ref	a	a	23	test.t1.a	2	
+1	SIMPLE	t2	ref	a	a	23	test.t1.a	1	
 EXPLAIN SELECT * FROM t1 LEFT JOIN t2 FORCE INDEX (a) ON t1.a=t2.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	5	
-1	SIMPLE	t2	ref	a	a	23	test.t1.a	2	
+1	SIMPLE	t2	ref	a	a	23	test.t1.a	1	
 DROP TABLE t1, t2;
 CREATE TABLE t1 ( city char(30) );
 INSERT INTO t1 VALUES ('London');
@@ -2792,26 +2792,26 @@ id	select_type	table	type	possible_keys	
 1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	Select tables optimized away
 select max(key1) from t1 where key1 <= 0.6158;
 max(key1)
-0.61580002307892
+0.615800023078918
 select max(key2) from t2 where key2 <= 1.6158;
 max(key2)
-1.6158000230789
+1.61580002307892
 select min(key1) from t1 where key1 >= 0.3762;
 min(key1)
-0.37619999051094
+0.376199990510941
 select min(key2) from t2 where key2 >= 1.3762;
 min(key2)
-1.3761999607086
+1.37619996070862
 select max(key1), min(key2) from t1, t2
 where key1 <= 0.6158 and key2 >= 1.3762;
 max(key1)	min(key2)
-0.61580002307892	1.3761999607086
+0.615800023078918	1.37619996070862
 select max(key1) from t1 where key1 <= 0.6158 and rand() + 0.5 >= 0.5;
 max(key1)
-0.61580002307892
+0.615800023078918
 select min(key1) from t1 where key1 >= 0.3762 and rand() + 0.5 >= 0.5;
 min(key1)
-0.37619999051094
+0.376199990510941
 DROP TABLE t1,t2;
 CREATE TABLE t1 (i BIGINT UNSIGNED NOT NULL);
 INSERT INTO t1 VALUES (10);
@@ -3454,7 +3454,7 @@ explain select * from t2 A, t2 B where A
 and B.a=5 and B.b=A.e and (B.b =1 or B.b = 3 or B.b=5);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	A	range	PRIMARY	PRIMARY	12	NULL	1	Using where
-1	SIMPLE	B	ref	PRIMARY	PRIMARY	8	const,test.A.e	11	
+1	SIMPLE	B	ref	PRIMARY	PRIMARY	8	const,test.A.e	1	
 drop table t1, t2;
 CREATE TABLE t1 (a int PRIMARY KEY, b int, INDEX(b));
 INSERT INTO t1 VALUES (1, 3), (9,4), (7,5), (4,5), (6,2),
@@ -3468,12 +3468,12 @@ EXPLAIN
 SELECT a, c, d, f FROM t1,t2 WHERE a=c AND b BETWEEN 4 AND 6;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	PRIMARY,b	b	5	NULL	1	Using where
-1	SIMPLE	t2	ref	c	c	5	test.t1.a	2	Using where
+1	SIMPLE	t2	ref	c	c	5	test.t1.a	1	Using where
 EXPLAIN
 SELECT a, c, d, f FROM t1,t2 WHERE a=c AND b BETWEEN 4 AND 6 AND a > 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	PRIMARY,b	PRIMARY	4	NULL	1	Using where
-1	SIMPLE	t2	ref	c	c	5	test.t1.a	2	Using where
+1	SIMPLE	t2	ref	c	c	5	test.t1.a	1	Using where
 DROP TABLE t1, t2;
 create table t1 (
 a int unsigned    not null auto_increment primary key,

=== modified file 'mysql-test/suite/pbxt/r/select_safe.result'
--- a/mysql-test/suite/pbxt/r/select_safe.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/select_safe.result	2009-08-17 15:57:58 +0000
@@ -70,12 +70,12 @@ insert into t1 values (null,"a"),(null,"
 explain select STRAIGHT_JOIN * from t1,t1 as t2 where t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	b	NULL	NULL	NULL	21	
-1	SIMPLE	t2	ref	b	b	21	test.t1.b	2	Using where
+1	SIMPLE	t2	ref	b	b	21	test.t1.b	1	Using where
 set MAX_SEEKS_FOR_KEY=1;
 explain select STRAIGHT_JOIN * from t1,t1 as t2 where t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	b	NULL	NULL	NULL	21	
-1	SIMPLE	t2	ref	b	b	21	test.t1.b	2	Using where
+1	SIMPLE	t2	ref	b	b	21	test.t1.b	1	Using where
 SET MAX_SEEKS_FOR_KEY=DEFAULT;
 drop table t1;
 create table t1 (a int);

=== modified file 'mysql-test/suite/pbxt/r/subselect.result'
--- a/mysql-test/suite/pbxt/r/subselect.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/subselect.result	2009-08-17 15:57:58 +0000
@@ -1333,7 +1333,7 @@ a
 explain extended select * from t2 where t2.a in (select a from t1);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	PRIMARY	t2	index	NULL	a	5	NULL	4	100.00	Using where; Using index
-2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1001	100.00	Using index; Using where
+2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1	100.00	Using index; Using where
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`a`,<exists>(<index_lookup>(<cache>(`test`.`t2`.`a`) in t1 on a where (<cache>(`test`.`t2`.`a`) = `test`.`t1`.`a`))))
 select * from t2 where t2.a in (select a from t1 where t1.b <> 30);
@@ -1343,7 +1343,7 @@ a
 explain extended select * from t2 where t2.a in (select a from t1 where t1.b <> 30);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	PRIMARY	t2	index	NULL	a	5	NULL	4	100.00	Using where; Using index
-2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1001	100.00	Using index; Using where
+2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1	100.00	Using index; Using where
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`a`,<exists>(<index_lookup>(<cache>(`test`.`t2`.`a`) in t1 on a where ((`test`.`t1`.`b` <> 30) and (<cache>(`test`.`t2`.`a`) = `test`.`t1`.`a`)))))
 select * from t2 where t2.a in (select t1.a from t1,t3 where t1.b=t3.a);
@@ -1353,8 +1353,8 @@ a
 explain extended select * from t2 where t2.a in (select t1.a from t1,t3 where t1.b=t3.a);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	PRIMARY	t2	index	NULL	a	5	NULL	4	100.00	Using where; Using index
-2	DEPENDENT SUBQUERY	t1	ref	a	a	5	func	1001	100.00	Using where; Using index
-2	DEPENDENT SUBQUERY	t3	index	a	a	5	NULL	3	100.00	Using where; Using index; Using join buffer
+2	DEPENDENT SUBQUERY	t1	ref	a	a	5	func	1	100.00	Using where; Using index
+2	DEPENDENT SUBQUERY	t3	ref	a	a	5	test.t1.b	1	100.00	Using where; Using index
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`a`,<exists>(select 1 AS `Not_used` from `test`.`t1` join `test`.`t3` where ((`test`.`t3`.`a` = `test`.`t1`.`b`) and (<cache>(`test`.`t2`.`a`) = `test`.`t1`.`a`))))
 insert into t1 values (3,31);
@@ -1370,7 +1370,7 @@ a
 explain extended select * from t2 where t2.a in (select a from t1 where t1.b <> 30);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	PRIMARY	t2	index	NULL	a	5	NULL	4	100.00	Using where; Using index
-2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1001	100.00	Using index; Using where
+2	DEPENDENT SUBQUERY	t1	index_subquery	a	a	5	func	1	100.00	Using index; Using where
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`a`,<exists>(<index_lookup>(<cache>(`test`.`t2`.`a`) in t1 on a where ((`test`.`t1`.`b` <> 30) and (<cache>(`test`.`t2`.`a`) = `test`.`t1`.`a`)))))
 drop table t1, t2, t3;
@@ -3546,7 +3546,7 @@ ORDER BY t1.t DESC LIMIT 1);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t2	ALL	NULL	NULL	NULL	NULL	1	
 1	PRIMARY	t1	index	NULL	PRIMARY	16	NULL	11	Using where; Using index; Using join buffer
-2	DEPENDENT SUBQUERY	t1	ref	PRIMARY	PRIMARY	8	test.t2.i1,const	2	Using where; Using index; Using filesort
+2	DEPENDENT SUBQUERY	t1	ref	PRIMARY	PRIMARY	8	test.t2.i1,const	1	Using where; Using index; Using filesort
 SELECT * FROM t1,t2
 WHERE t1.t = (SELECT t1.t FROM t1 
 WHERE t1.t < t2.t  AND t1.i2=1 AND t2.i1=t1.i1
@@ -4214,7 +4214,7 @@ CREATE INDEX I2 ON t1 (b);
 EXPLAIN SELECT a,b FROM t1 WHERE b IN (SELECT a FROM t1);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	2	Using where
-2	DEPENDENT SUBQUERY	t1	index_subquery	I1	I1	2	func	2	Using index; Using where
+2	DEPENDENT SUBQUERY	t1	index_subquery	I1	I1	2	func	1	Using index; Using where
 SELECT a,b FROM t1 WHERE b IN (SELECT a FROM t1);
 a	b
 CREATE TABLE t2 (a VARCHAR(1), b VARCHAR(10));
@@ -4224,14 +4224,14 @@ CREATE INDEX I2 ON t2 (b);
 EXPLAIN SELECT a,b FROM t2 WHERE b IN (SELECT a FROM t2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t2	ALL	NULL	NULL	NULL	NULL	2	Using where
-2	DEPENDENT SUBQUERY	t2	index_subquery	I1	I1	4	func	2	Using index; Using where
+2	DEPENDENT SUBQUERY	t2	index_subquery	I1	I1	4	func	1	Using index; Using where
 SELECT a,b FROM t2 WHERE b IN (SELECT a FROM t2);
 a	b
 EXPLAIN
 SELECT a,b FROM t1 WHERE b IN (SELECT a FROM t1 WHERE LENGTH(a)<500);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	t1	ALL	NULL	NULL	NULL	NULL	2	Using where
-2	DEPENDENT SUBQUERY	t1	index_subquery	I1	I1	2	func	2	Using index; Using where
+2	DEPENDENT SUBQUERY	t1	index_subquery	I1	I1	2	func	1	Using index; Using where
 SELECT a,b FROM t1 WHERE b IN (SELECT a FROM t1 WHERE LENGTH(a)<500);
 a	b
 DROP TABLE t1,t2;

=== modified file 'mysql-test/suite/pbxt/r/type_enum.result'
--- a/mysql-test/suite/pbxt/r/type_enum.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/type_enum.result	2009-08-17 15:57:58 +0000
@@ -1776,8 +1776,14 @@ t1	CREATE TABLE `t1` (
   `russian_deviant` enum('E','F','E�F','F,E') NOT NULL DEFAULT 'E'
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 drop table t1;
+select @@SESSION.sql_mode;
+@@SESSION.sql_mode
+
+select @@GLOBAL.sql_mode;
+@@GLOBAL.sql_mode
+
 create table t1(exhausting_charset enum('ABCDEFGHIJKLMNOPQRSTUVWXYZ','	
 
  !"','#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~','xx\','yy\�','zz�����������������������������������������������������������������������������������������'));
-ERROR HY000: Can't create table 'test.t1' (errno: -1)
+drop table t1;
 End of 5.1 tests

=== modified file 'mysql-test/suite/pbxt/r/type_ranges.result'
--- a/mysql-test/suite/pbxt/r/type_ranges.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/type_ranges.result	2009-08-17 15:57:58 +0000
@@ -70,19 +70,19 @@ flags	set('one','two','tree')	latin1_swe
 show keys from t1;
 Table	Non_unique	Key_name	Seq_in_index	Column_name	Collation	Cardinality	Sub_part	Packed	Null	Index_type	Comment
 t1	0	PRIMARY	1	auto	A	0	NULL	NULL		BTREE	
-t1	1	utiny	1	utiny	A	NULL	NULL	NULL		BTREE	
-t1	1	tiny	1	tiny	A	NULL	NULL	NULL		BTREE	
-t1	1	short	1	short	A	NULL	NULL	NULL		BTREE	
-t1	1	any_name	1	medium	A	NULL	NULL	NULL		BTREE	
-t1	1	longlong	1	longlong	A	NULL	NULL	NULL		BTREE	
-t1	1	real_float	1	real_float	A	NULL	NULL	NULL		BTREE	
-t1	1	ushort	1	ushort	A	NULL	NULL	NULL		BTREE	
-t1	1	umedium	1	umedium	A	NULL	NULL	NULL		BTREE	
-t1	1	ulong	1	ulong	A	NULL	NULL	NULL		BTREE	
-t1	1	ulonglong	1	ulonglong	A	NULL	NULL	NULL		BTREE	
-t1	1	ulonglong	2	ulong	A	NULL	NULL	NULL		BTREE	
-t1	1	options	1	options	A	NULL	NULL	NULL		BTREE	
-t1	1	options	2	flags	A	NULL	NULL	NULL		BTREE	
+t1	1	utiny	1	utiny	A	0	NULL	NULL		BTREE	
+t1	1	tiny	1	tiny	A	0	NULL	NULL		BTREE	
+t1	1	short	1	short	A	0	NULL	NULL		BTREE	
+t1	1	any_name	1	medium	A	0	NULL	NULL		BTREE	
+t1	1	longlong	1	longlong	A	0	NULL	NULL		BTREE	
+t1	1	real_float	1	real_float	A	0	NULL	NULL		BTREE	
+t1	1	ushort	1	ushort	A	0	NULL	NULL		BTREE	
+t1	1	umedium	1	umedium	A	0	NULL	NULL		BTREE	
+t1	1	ulong	1	ulong	A	0	NULL	NULL		BTREE	
+t1	1	ulonglong	1	ulonglong	A	0	NULL	NULL		BTREE	
+t1	1	ulonglong	2	ulong	A	0	NULL	NULL		BTREE	
+t1	1	options	1	options	A	0	NULL	NULL		BTREE	
+t1	1	options	2	flags	A	0	NULL	NULL		BTREE	
 CREATE UNIQUE INDEX test on t1 ( auto ) ;
 CREATE INDEX test2 on t1 ( ulonglong,ulong) ;
 CREATE INDEX test3 on t1 ( medium ) ;

=== modified file 'mysql-test/suite/pbxt/r/type_timestamp.result'
--- a/mysql-test/suite/pbxt/r/type_timestamp.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/type_timestamp.result	2009-08-17 15:57:58 +0000
@@ -101,13 +101,13 @@ create table t1 (t2 timestamp(2), t4 tim
 t8 timestamp(8), t10 timestamp(10), t12 timestamp(12),
 t14 timestamp(14));
 Warnings:
-Warning	1287	The syntax 'TIMESTAMP(2)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(4)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(6)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(8)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(10)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(12)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
-Warning	1287	The syntax 'TIMESTAMP(14)' is deprecated and will be removed in MySQL 5.2. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(2)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(4)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(6)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(8)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(10)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(12)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
+Warning	1287	The syntax 'TIMESTAMP(14)' is deprecated and will be removed in MySQL 6.0. Please use 'TIMESTAMP' instead
 insert t1 values (0,0,0,0,0,0,0),
 ("1997-12-31 23:47:59", "1997-12-31 23:47:59", "1997-12-31 23:47:59",
 "1997-12-31 23:47:59", "1997-12-31 23:47:59", "1997-12-31 23:47:59",

=== modified file 'mysql-test/suite/pbxt/r/union.result'
--- a/mysql-test/suite/pbxt/r/union.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/union.result	2009-08-17 15:57:58 +0000
@@ -1301,12 +1301,14 @@ t3	CREATE TABLE `t3` (
   `left(a,100000000)` longtext
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 drop tables t1,t2,t3;
+SELECT @tmp_max:= @@global.max_allowed_packet;
+@tmp_max:= @@global.max_allowed_packet
+1048576
+SET @@global.max_allowed_packet=25000000;
 CREATE TABLE t1 (a mediumtext);
 CREATE TABLE t2 (b varchar(20));
 INSERT INTO t1 VALUES ('a');
 CREATE TABLE t3 SELECT REPEAT(a,20000000) AS a FROM t1 UNION SELECT b FROM t2;
-Warnings:
-Warning	1301	Result of repeat() was larger than max_allowed_packet (1048576) - truncated
 SHOW CREATE TABLE t3;
 Table	Create Table
 t3	CREATE TABLE `t3` (
@@ -1340,6 +1342,7 @@ t3	CREATE TABLE `t3` (
   `a` varbinary(510) DEFAULT NULL
 ) ENGINE=PBXT DEFAULT CHARSET=latin1
 DROP TABLES t1,t2,t3;
+SET @@global.max_allowed_packet:= @tmp_max;
 create table t1 ( id int not null auto_increment, primary key (id), col1 int);
 insert into t1 (col1) values (2),(3),(4),(5),(6);
 select 99 union all select id from t1 order by 1;

=== modified file 'mysql-test/suite/pbxt/r/view_grant.result'
--- a/mysql-test/suite/pbxt/r/view_grant.result	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/r/view_grant.result	2009-08-17 15:57:58 +0000
@@ -28,7 +28,7 @@ create view v2 as select * from mysqltes
 ERROR 42000: ANY command denied to user 'mysqltest_1'@'localhost' for table 't2'
 show create view v1;
 View	Create View	character_set_client	collation_connection
-v1	CREATE ALGORITHM=UNDEFINED DEFINER=`mysqltest_1`@`localhost` SQL SECURITY DEFINER VIEW `test`.`v1` AS select `mysqltest`.`t1`.`a` AS `a`,`mysqltest`.`t1`.`b` AS `b` from `mysqltest`.`t1`	latin1	latin1_swedish_ci
+v1	CREATE ALGORITHM=UNDEFINED DEFINER=`mysqltest_1`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select `mysqltest`.`t1`.`a` AS `a`,`mysqltest`.`t1`.`b` AS `b` from `mysqltest`.`t1`	latin1	latin1_swedish_ci
 grant create view,drop,select on test.* to mysqltest_1@localhost;
 use test;
 alter view v1 as select * from mysqltest.t1;
@@ -309,7 +309,7 @@ grant create view,select on test.* to my
 create view v1 as select * from mysqltest.t1;
 show create view v1;
 View	Create View	character_set_client	collation_connection
-v1	CREATE ALGORITHM=UNDEFINED DEFINER=`mysqltest_1`@`localhost` SQL SECURITY DEFINER VIEW `test`.`v1` AS select `mysqltest`.`t1`.`a` AS `a`,`mysqltest`.`t1`.`b` AS `b` from `mysqltest`.`t1`	latin1	latin1_swedish_ci
+v1	CREATE ALGORITHM=UNDEFINED DEFINER=`mysqltest_1`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select `mysqltest`.`t1`.`a` AS `a`,`mysqltest`.`t1`.`b` AS `b` from `mysqltest`.`t1`	latin1	latin1_swedish_ci
 revoke select on mysqltest.t1 from mysqltest_1@localhost;
 select * from v1;
 ERROR HY000: View 'test.v1' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them

=== modified file 'mysql-test/suite/pbxt/t/auto_increment.test'
--- a/mysql-test/suite/pbxt/t/auto_increment.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/auto_increment.test	2009-08-17 15:57:58 +0000
@@ -150,7 +150,6 @@ delete from t1 where a=0;
 update t1 set a=0 where b=5;
 select * from t1 order by b;
 delete from t1 where a=0;
---error 1048
 update t1 set a=NULL where b=6;
 update t1 set a=300 where b=7;
 SET SQL_MODE='';
@@ -166,7 +165,6 @@ delete from t1 where a=0;
 update t1 set a=0 where b=12;
 select * from t1 order by b;
 delete from t1 where a=0;
---error 1048
 update t1 set a=NULL where b=13;
 update t1 set a=500 where b=14;
 select * from t1 order by b;

=== modified file 'mysql-test/suite/pbxt/t/delete.test'
--- a/mysql-test/suite/pbxt/t/delete.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/delete.test	2009-08-17 15:57:58 +0000
@@ -120,13 +120,16 @@ select * from t2;
 delete t11.*, t12.* from t11,t12 where t11.a = t12.a and t11.b <> (select b from t2 where t11.a < t2.a);
 select * from t11;
 select * from t12;
+--error 1242
 delete ignore t11.*, t12.* from t11,t12 where t11.a = t12.a and t11.b <> (select b from t2 where t11.a < t2.a);
 select * from t11;
 select * from t12;
+--error 1062
 insert into t11 values (2, 12);
 -- error 1242
 delete from t11 where t11.b <> (select b from t2 where t11.a < t2.a);
 select * from t11;
+--error 1242
 delete ignore from t11 where t11.b <> (select b from t2 where t11.a < t2.a);
 select * from t11;
 drop table t11, t12, t2;

=== modified file 'mysql-test/suite/pbxt/t/join_nested.test'
--- a/mysql-test/suite/pbxt/t/join_nested.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/join_nested.test	2009-08-17 15:57:58 +0000
@@ -546,6 +546,7 @@ SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.
 
 CREATE INDEX idx_b ON t8(b);
 
+--sorted_result
 EXPLAIN EXTENDED
 SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.a,t3.b,t4.a,t4.b,
        t5.a,t5.b,t6.a,t6.b,t7.a,t7.b,t8.a,t8.b,t9.a,t9.b
@@ -585,6 +586,7 @@ SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.
 CREATE INDEX idx_b ON t1(b);
 CREATE INDEX idx_a ON t0(a);
 
+--sorted_result
 EXPLAIN EXTENDED
 SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.a,t3.b,t4.a,t4.b,
        t5.a,t5.b,t6.a,t6.b,t7.a,t7.b,t8.a,t8.b,t9.a,t9.b
@@ -621,6 +623,7 @@ SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.
            (t8.b=t9.b OR t8.c IS NULL) AND
            (t9.a=1); 
 
+--sorted_result
 SELECT t0.a,t0.b,t1.a,t1.b,t2.a,t2.b,t3.a,t3.b,t4.a,t4.b,
        t5.a,t5.b,t6.a,t6.b,t7.a,t7.b,t8.a,t8.b,t9.a,t9.b
   FROM t0,t1

=== added file 'mysql-test/suite/pbxt/t/lowercase_table_grant-master.opt'
--- a/mysql-test/suite/pbxt/t/lowercase_table_grant-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/pbxt/t/lowercase_table_grant-master.opt	2009-08-31 11:07:44 +0000
@@ -0,0 +1 @@
+--lower_case_table_names

=== added file 'mysql-test/suite/pbxt/t/lowercase_table_qcache-master.opt'
--- a/mysql-test/suite/pbxt/t/lowercase_table_qcache-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/pbxt/t/lowercase_table_qcache-master.opt	2009-08-31 11:07:44 +0000
@@ -0,0 +1 @@
+--lower_case_table_names

=== added file 'mysql-test/suite/pbxt/t/lowercase_view-master.opt'
--- a/mysql-test/suite/pbxt/t/lowercase_view-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/pbxt/t/lowercase_view-master.opt	2009-08-31 11:07:44 +0000
@@ -0,0 +1 @@
+--lower_case_table_names=1

=== modified file 'mysql-test/suite/pbxt/t/null.test'
--- a/mysql-test/suite/pbxt/t/null.test	2009-04-02 20:36:52 +0000
+++ b/mysql-test/suite/pbxt/t/null.test	2009-08-17 15:57:58 +0000
@@ -61,9 +61,7 @@ drop table t1;
 #
 CREATE TABLE t1 (a varchar(16) NOT NULL default '', b smallint(6) NOT NULL default 0, c datetime NOT NULL default '0000-00-00 00:00:00', d smallint(6) NOT NULL default 0);
 INSERT INTO t1 SET a = "", d= "2003-01-14 03:54:55";
---error 1048
 UPDATE t1 SET d=1/NULL;
---error 1048
 UPDATE t1 SET d=NULL;
 --error 1048
 INSERT INTO t1 (a) values (null);

=== modified file 'mysql-test/suite/pbxt/t/pbxt_bugs.test'
--- a/mysql-test/suite/pbxt/t/pbxt_bugs.test	2009-04-02 20:36:52 +0000
+++ b/mysql-test/suite/pbxt/t/pbxt_bugs.test	2009-08-17 15:57:58 +0000
@@ -926,7 +926,59 @@ LOAD DATA LOCAL INFILE 'suite/pbxt/t/loa
 SELECT * FROM t1 ORDER BY c1;
 DROP TABLE t1;
 
+create table parent (id int primary key);
+create table child (id int PRIMARY KEY, FOREIGN KEY (id) REFERENCES parent(id));
+insert into parent values (2), (3), (4);
+insert into child values (3), (4);
+
+--error 1451
+delete ignore from parent;
+--sorted_result
+select * from parent;
+
+drop table child, parent;
+
+# bug 378222: Drop sakila causes error: Cannot delete or update a parent row: a foreign key constraint fails
+
+create schema test378222;
+use test378222;
+create table t1 (id int primary key);
+create table t2 (id int primary key);
+alter table t1 add constraint foreign key (id) references t2 (id);
+alter table t2 add constraint foreign key (id) references t1 (id);
+drop schema test378222;
+
+create schema test378222a;
+create schema test378222b;
+create table test378222a.t1 (id int primary key);
+create table test378222b.t2 (id int primary key);
+alter table test378222a.t1 add constraint foreign key (id) references test378222b.t2 (id);
+alter table test378222b.t2 add constraint foreign key (id) references test378222a.t1 (id);
+set foreign_key_checks = 1;
+--error 1217
+drop schema test378222a;
+--error 1217
+drop schema test378222b;
+set foreign_key_checks = 0;
+drop schema test378222a;
+drop schema test378222b;
+set foreign_key_checks = 1;
+use test;
+
+# bug 369086: Incosistent/Incorrect Truncate behavior
+CREATE TABLE t1(c1 TINYINT AUTO_INCREMENT NULL KEY ) AUTO_INCREMENT=10;
+SHOW CREATE TABLE t1;
+INSERT INTO t1 VALUES(null);
+INSERT INTO t1 VALUES(null);
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+TRUNCATE TABLE t1;
+INSERT INTO t1 VALUES(null);
+SELECT * FROM t1;
+DROP TABLE t1;
+
 --disable_query_log
+
 DROP TABLE t2, t5;
 drop database pbxt;
 --enable_query_log

=== modified file 'mysql-test/suite/pbxt/t/rename.test'
--- a/mysql-test/suite/pbxt/t/rename.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/rename.test	2009-08-17 15:57:58 +0000
@@ -63,7 +63,28 @@ connection con2;
 # Wait for the the tables to be renamed
 # i.e the query below succeds
 let $query= select * from t2, t4;
-source include/wait_for_query_to_suceed.inc;
+# source include/wait_for_query_to_suceed.inc;
+let $counter= 100;
+
+disable_abort_on_error;
+disable_query_log;
+disable_result_log;
+eval $query;
+while ($mysql_errno)
+{
+  eval $query;
+  sleep 0.1;
+  dec $counter;
+
+  if (!$counter)
+  {
+    die("Waited too long for query to suceed");
+  }
+}
+enable_abort_on_error;
+enable_query_log;
+enable_result_log;
+
 
 show tables;
 

=== modified file 'mysql-test/suite/pbxt/t/schema.test'
--- a/mysql-test/suite/pbxt/t/schema.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/schema.test	2009-08-17 15:57:58 +0000
@@ -10,5 +10,12 @@ drop database if exists mysqltest1;
 
 create schema foo;
 show create schema foo;
+# force PBXT schema to be created
+create table t1 (id int) engine=pbxt;
 show schemas;
 drop schema foo;
+
+--disable_query_log
+drop table if exists t1;
+drop database pbxt;
+--enable_query_log

=== modified file 'mysql-test/suite/pbxt/t/type_enum.test'
--- a/mysql-test/suite/pbxt/t/type_enum.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/type_enum.test	2009-08-17 15:57:58 +0000
@@ -153,12 +153,19 @@ create table t1(russian_deviant enum('E'
 show create table t1;
 drop table t1;
 
-# ER_WRONG_FIELD_TERMINATORS
---error 1005
+# the following create statement sometimes fails like it would if NO_BACKSLASH_ESCAPES sql mode was on,
+# we check sql mode here
+select @@SESSION.sql_mode;
+select @@GLOBAL.sql_mode;
+
+## ER_WRONG_FIELD_TERMINATORS
+#--error 1005
 create table t1(exhausting_charset enum('ABCDEFGHIJKLMNOPQRSTUVWXYZ','	
 
  !"','#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~','xx\','yy\�','zz�����������������������������������������������������������������������������������������'));
 
+drop table t1;
+
 --disable_query_log
 drop database pbxt;
 --enable_query_log

=== added file 'mysql-test/suite/pbxt/t/udf-master.opt'
--- a/mysql-test/suite/pbxt/t/udf-master.opt	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/pbxt/t/udf-master.opt	2009-08-31 11:07:44 +0000
@@ -0,0 +1 @@
+$UDF_EXAMPLE_LIB_OPT

=== modified file 'mysql-test/suite/pbxt/t/union.test'
--- a/mysql-test/suite/pbxt/t/union.test	2009-04-02 10:03:14 +0000
+++ b/mysql-test/suite/pbxt/t/union.test	2009-08-17 15:57:58 +0000
@@ -802,6 +802,10 @@ drop tables t1,t2,t3;
 # exceeds mediumtext maximum length
 #
 
+SELECT @tmp_max:= @@global.max_allowed_packet;
+SET @@global.max_allowed_packet=25000000;
+# switching connection to allow the new max_allowed_packet to take effect
+--connect (newconn, localhost, root,,)
 CREATE TABLE t1 (a mediumtext);
 CREATE TABLE t2 (b varchar(20));
 INSERT INTO t1 VALUES ('a');
@@ -823,6 +827,9 @@ INSERT INTO t1 VALUES ('a');
 CREATE TABLE t3 SELECT REPEAT(a,2) AS a FROM t1 UNION SELECT b FROM t2;
 SHOW CREATE TABLE t3;
 DROP TABLES t1,t2,t3;
+--connection default
+SET @@global.max_allowed_packet:= @tmp_max;
+--disconnect newconn
 
 #
 # Bug #10032 Bug in parsing UNION with ORDER BY when one node does not use FROM

=== modified file 'storage/pbxt/ChangeLog'
--- a/storage/pbxt/ChangeLog	2009-04-02 11:49:57 +0000
+++ b/storage/pbxt/ChangeLog	2009-08-18 07:46:53 +0000
@@ -1,7 +1,75 @@
 PBXT Release Notes
 ==================
 
-------- 1.0.08 RC - Not yet released
+------- 1.0.08c RC2 - 2009-08-18
+
+RN266: Updated BLOB streaming glue, used with the PBMS engine. The glue code is now identical to the version of "1.0.08-rc-pbms" version of PBXT available from http://blobstreaming.org/download.
+
+RN265: Changes the sequential reading of data log files to skip gaps, instead of returning EOF. This ensures that extended data records are preserved even when something goes wrong with the way the file is written.
+
+RN264: Fixed a bug that caused a "Data log not found" error after an out of disk space error on a log file. This bug is similar to RN262 in that it allows "gaps" to appear in the data logs.
+
+RN263: Updated xtstat to compile on Windows/MS Visual C++.
+
+RN262: Merged changes for PBMS version 0.5.09.
+
+RN261: Concerning bug #377788: Cannot find index for FK. Fixed buffer overflow which occurred when the error was reported.
+
+RN260: Fixed bug #377788: Cannot find index for FK. PBXT now correctly uses prefix of an index to support FK references (e.g. if key = (c1, c2) then an index on (c1, c2, c3) will work). Also fixed buffer overflow, which occurred when reporting the error.
+
+RN259: Fixed bug #309424: xtstat doesn't use my.cnf. You can now add an [xtstat] section to my.cnf, for use with xtstat.
+
+RN258: updated xt_p_join implementation for Windows to check if a thread has already exited or has not yet started
+
+RN257: Removed false assertion that could fail during restore if a transaction log page was zero-filled
+
+RN256: Update datalog eof pointer only if write operations were successful
+
+RN255: Added re-allocation of the filemap if allocating the new map failed. This often happens if there's not enough space on disk.
+
+RN254: When a table with a corrupted index is detected, PBXT creates a file called 'repair-pending' in the pbxt directory, with the name of the table in it. Each table in the file is listed on a line by itself (the last line has no trailing \n). When the table is repaired (using the REPAIR TABLE command), this entry is removed from the file.
+
+RN253: Use fcntl(F_FULLFSYNC) instead of fsync on platforms that support it. Improper fsync operation was presumably the reason of index corruption on Mac OS X.
+
+RN252: Fixed bug #368692: PBXT not reporting data size correctly in information_schema.
+
+------- 1.0.08 RC2 - 2009-06-30
+
+RN251: A Windows-specific test update, also removed false assertion that failed on Windows.
+
+RN250: Fixed a bug that caused recovery to fail when the transaction log ID exceeded 255. The problem was a checksum failed in the log record.
+
+RN249: Fixed bug #313176: Test case timeout. This happened because record cache pages were not properly freed and as soon as cache filled up the performance degraded.
+
+RN248: PBXT now compiles and runs with MySQL 5.1.35. All tests pass.
+
+RN247: Fixed bug #369086: Incosistent/Incorrect Truncate behavior
+
+RN246: Fixed bug #378222: Drop sakila causes error: Cannot delete or update a parent row: a foreign key constraint fails
+
+RN245: Fixed bug #379315: Inconsistent behavior of DELETE IGNORE and FK constraint.
+
+RN244: Fixed a recovery problem: during the recovery of "record modified" action the table was updated before the old index entries were removed; then the xres_remove_index_entries was supplied the new record which led to an incorrect index update.
+
+RN243: Fixed a bug that caused a recovery failure if partitioned pbxt tables were present. This happened because the recovery used a MySQL function to open tables and the PBXT handler was not yet registered
+
+RN242: Fixed a bug that caused a deadlock if pbxt initialization failed. This happened because pbxt cleanup was done from pbxt_init() with PLUGIN_lock being held by MySQL which led to a deadlock in the freeer thread
+
+RN241: Fixed a heap corruption bug (writing to a freed memory location). It happened only when memory mapped files were used leading to heap inconsistency and program crash or termination by heap checker. Likely to happen right after or during DROP TABLE but possible in other cases too.
+
+RN240: Load the record cache on read when not using memory mapped files.
+
+RN239: Added PBXT variable pbxt_max_threads. This is the maximum number of threads that can be created by PBXT. By default this value is set to 0 which means the number of threads is derived from the MySQL variable max_connections. The value used is max_connections+7. Under Drizzle the default value is 500.
+
+RN238: Added an option to wait for the sweeper to clean up old transactions on a particular connection. This prevents the sweeper from getting too far behind.
+
+RN237: Added an option to lazy delete fixed length index entries. This means the index entries are just marked for deletion, instead of removing the items from the index page. This has the advantage that an exclusive lock is not always required for deletion.
+
+RN236: Fixed bug #349177: a bug in configure.in script.
+
+RN235: Fixed bug 349176: a compiler warning.
+
+RN234: Completed Drizzle integration. All Drizzle tests now run with PBXT.
 
 RN233: Fixed bugs which occur when PBXT is used together with PBMS (BLOB Streaming engine).
 

=== modified file 'storage/pbxt/src/Makefile.am'
--- a/storage/pbxt/src/Makefile.am	2009-05-09 04:01:53 +0000
+++ b/storage/pbxt/src/Makefile.am	2009-08-31 11:07:44 +0000
@@ -19,7 +19,7 @@ noinst_HEADERS =		bsearch_xt.h cache_xt.
 						datadic_xt.h datalog_xt.h filesys_xt.h hashtab_xt.h \
 						ha_pbxt.h heap_xt.h index_xt.h linklist_xt.h \
 						memory_xt.h myxt_xt.h pthread_xt.h restart_xt.h \
-						streaming_xt.h sortedlist_xt.h strutil_xt.h \
+						pbms_enabled.h sortedlist_xt.h strutil_xt.h \
 						tabcache_xt.h table_xt.h trace_xt.h thread_xt.h \
 						util_xt.h xaction_xt.h xactlog_xt.h lock_xt.h \
 						systab_xt.h ha_xtsys.h discover_xt.h \
@@ -30,7 +30,7 @@ libpbxt_la_SOURCES =	bsearch_xt.cc cache
 						datadic_xt.cc datalog_xt.cc filesys_xt.cc hashtab_xt.cc \
 						ha_pbxt.cc heap_xt.cc index_xt.cc linklist_xt.cc \
 						memory_xt.cc myxt_xt.cc pthread_xt.cc restart_xt.cc \
-						streaming_xt.cc sortedlist_xt.cc strutil_xt.cc \
+						pbms_enabled.cc sortedlist_xt.cc strutil_xt.cc \
 						tabcache_xt.cc table_xt.cc trace_xt.cc thread_xt.cc \
 						systab_xt.cc ha_xtsys.cc discover_xt.cc \
 						util_xt.cc xaction_xt.cc xactlog_xt.cc lock_xt.cc locklist_xt.cc

=== modified file 'storage/pbxt/src/cache_xt.cc'
--- a/storage/pbxt/src/cache_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/cache_xt.cc	2009-08-17 11:12:36 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #ifndef XT_WIN
 #include <unistd.h>
 #endif
@@ -51,17 +55,22 @@
 #define IDX_CAC_SEGMENT_COUNT		((off_t) 1 << XT_INDEX_CACHE_SEGMENT_SHIFTS)
 #define IDX_CAC_SEGMENT_MASK		(IDX_CAC_SEGMENT_COUNT - 1)
 
-//#define IDX_USE_SPINRWLOCK
-#define IDX_USE_RWMUTEX
+#ifdef XT_NO_ATOMICS
+#define IDX_CAC_USE_PTHREAD_RW
+#else
+//#define IDX_CAC_USE_RWMUTEX
 //#define IDX_CAC_USE_PTHREAD_RW
+//#define IDX_USE_SPINXSLOCK
+#define IDX_CAC_USE_XSMUTEX
+#endif
 
-#ifdef IDX_CAC_USE_FASTWRLOCK
-#define IDX_CAC_LOCK_TYPE				XTFastRWLockRec
-#define IDX_CAC_INIT_LOCK(s, i)			xt_fastrwlock_init(s, &(i)->cs_lock)
-#define IDX_CAC_FREE_LOCK(s, i)			xt_fastrwlock_free(s, &(i)->cs_lock)	
-#define IDX_CAC_READ_LOCK(i, o)			xt_fastrwlock_slock(&(i)->cs_lock, (o))
-#define IDX_CAC_WRITE_LOCK(i, o)		xt_fastrwlock_xlock(&(i)->cs_lock, (o))
-#define IDX_CAC_UNLOCK(i, o)			xt_fastrwlock_unlock(&(i)->cs_lock, (o))
+#ifdef IDX_CAC_USE_XSMUTEX
+#define IDX_CAC_LOCK_TYPE				XTXSMutexRec
+#define IDX_CAC_INIT_LOCK(s, i)			xt_xsmutex_init_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_xsmutex_free(s, &(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, o)			xt_xsmutex_slock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_WRITE_LOCK(i, o)		xt_xsmutex_xlock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_UNLOCK(i, o)			xt_xsmutex_unlock(&(i)->cs_lock, (o)->t_id)
 #elif defined(IDX_CAC_USE_PTHREAD_RW)
 #define IDX_CAC_LOCK_TYPE				xt_rwlock_type
 #define IDX_CAC_INIT_LOCK(s, i)			xt_init_rwlock(s, &(i)->cs_lock)
@@ -69,13 +78,20 @@
 #define IDX_CAC_READ_LOCK(i, o)			xt_slock_rwlock_ns(&(i)->cs_lock)
 #define IDX_CAC_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(&(i)->cs_lock)
 #define IDX_CAC_UNLOCK(i, o)			xt_unlock_rwlock_ns(&(i)->cs_lock)
-#elif defined(IDX_USE_RWMUTEX)
+#elif defined(IDX_CAC_USE_RWMUTEX)
 #define IDX_CAC_LOCK_TYPE				XTRWMutexRec
 #define IDX_CAC_INIT_LOCK(s, i)			xt_rwmutex_init_with_autoname(s, &(i)->cs_lock)
 #define IDX_CAC_FREE_LOCK(s, i)			xt_rwmutex_free(s, &(i)->cs_lock)	
 #define IDX_CAC_READ_LOCK(i, o)			xt_rwmutex_slock(&(i)->cs_lock, (o)->t_id)
 #define IDX_CAC_WRITE_LOCK(i, o)		xt_rwmutex_xlock(&(i)->cs_lock, (o)->t_id)
 #define IDX_CAC_UNLOCK(i, o)			xt_rwmutex_unlock(&(i)->cs_lock, (o)->t_id)
+#elif defined(IDX_CAC_USE_SPINXSLOCK)
+#define IDX_CAC_LOCK_TYPE				XTSpinXSLockRec
+#define IDX_CAC_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_spinxslock_free(s, &(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, s)			xt_spinxslock_slock(&(i)->cs_lock, (s)->t_id)
+#define IDX_CAC_WRITE_LOCK(i, s)		xt_spinxslock_xlock(&(i)->cs_lock, (s)->t_id)
+#define IDX_CAC_UNLOCK(i, s)			xt_spinxslock_unlock(&(i)->cs_lock, (s)->t_id)
 #endif
 
 #define ID_HANDLE_USE_SPINLOCK
@@ -308,7 +324,8 @@ xtPublic XTIndHandlePtr xt_ind_get_handl
 
 	hs = &ind_cac_globals.cg_handle_slot[iref->ir_block->cb_address % XT_HANDLE_SLOTS];
 
-	ASSERT_NS(iref->ir_ulock == XT_UNLOCK_READ);
+	ASSERT_NS(iref->ir_xlock == FALSE);
+	ASSERT_NS(iref->ir_updated == FALSE);
 	ID_HANDLE_LOCK(&hs->hs_handles_lock);
 #ifdef CHECK_HANDLE_STRUCTS
 	ic_check_handle_structs();
@@ -337,10 +354,10 @@ xtPublic XTIndHandlePtr xt_ind_get_handl
 	 * at least an Slock on the index.
 	 * So this excludes anyone who is reading 
 	 * cb_handle_count in the index.
-	 * (all cache block writers, and a freeer).
+	 * (all cache block writers, and the freeer).
 	 *
 	 * The increment is safe because I have the list
-	 * lock, which is required by anyone else
+	 * lock (hs_handles_lock), which is required by anyone else
 	 * who increments or decrements this value.
 	 */
 	iref->ir_block->cb_handle_count++;
@@ -396,8 +413,11 @@ xtPublic void xt_ind_release_handle(XTIn
 		xblock = seg->cs_hash_table[hash_idx];
 		while (xblock) {
 			if (block == xblock) {
-				/* Found the block... */
-				xt_atomicrwlock_xlock(&block->cb_lock, thread->t_id);
+				/* Found the block... 
+				 * {HANDLE-COUNT-SLOCK}
+				 * 04.05.2009, changed to slock.
+				 */
+				XT_IPAGE_READ_LOCK(&block->cb_lock);
 				goto block_found;
 			}
 			xblock = xblock->cb_next;
@@ -431,7 +451,18 @@ xtPublic void xt_ind_release_handle(XTIn
 		/* {HANDLE-COUNT-USAGE}
 		 * This is safe here because I have excluded
 		 * all readers by taking an Xlock on the
-		 * cache block.
+		 * cache block (CHANGED - see below).
+		 *
+		 * {HANDLE-COUNT-SLOCK}
+		 * 04.05.2009, changed to slock.
+		 * Should be OK, because:
+		 * I have a lock on the list lock (hs_handles_lock),
+		 * which prevents concurrent updates to cb_handle_count.
+		 *
+		 * I also have a read lock on the cache block
+		 * but not a lock on the index. As a result, we cannot
+		 * exclude all index writers (and readers of
+		 * cb_handle_count).
 		 */
 		block->cb_handle_count--;
 	}
@@ -466,7 +497,7 @@ xtPublic void xt_ind_release_handle(XTIn
 	ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
 
 	if (block)
-		xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+		XT_IPAGE_UNLOCK(&block->cb_lock, FALSE);
 }
 
 /* Call this function before a referenced cache block is modified!
@@ -482,17 +513,28 @@ xtPublic xtBool xt_ind_copy_on_write(XTI
 
 	hs = &ind_cac_globals.cg_handle_slot[iref->ir_block->cb_address % XT_HANDLE_SLOTS];
 
+	ID_HANDLE_LOCK(&hs->hs_handles_lock);
+
 	/* {HANDLE-COUNT-USAGE}
 	 * This is only called by updaters of this index block, or
 	 * the free which holds an Xlock on the index block.
-	 *
 	 * These are all mutually exclusive for the index block.
+	 *
+	 * {HANDLE-COUNT-SLOCK}
+	 * Do this check again, after we have the list lock (hs_handles_lock).
+	 * There is a small chance that the count has changed, since we last
+	 * checked because xt_ind_release_handle() only holds
+	 * an slock on the index page.
+	 *
+	 * An updater can sometimes have a XLOCK on the index and an slock
+	 * on the cache block. In this case xt_ind_release_handle()
+	 * could have run through.
 	 */
-	ASSERT_NS(iref->ir_block->cb_handle_count);
-	if (!iref->ir_block->cb_handle_count)
+	if (!iref->ir_block->cb_handle_count) {
+		ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
 		return OK;
+	}
 
-	ID_HANDLE_LOCK(&hs->hs_handles_lock);
 #ifdef CHECK_HANDLE_STRUCTS
 	ic_check_handle_structs();
 #endif
@@ -609,7 +651,7 @@ xtPublic void xt_ind_init(XTThreadPtr se
 #endif
 
 		for (u_int i=0; i<ind_cac_globals.cg_block_count; i++) {
-			xt_atomicrwlock_init_with_autoname(self, &block->cb_lock);
+			XT_IPAGE_INIT_LOCK(self, &block->cb_lock);
 			block->cb_state = IDX_CAC_BLOCK_FREE;
 			block->cb_next = ind_cac_globals.cg_free_list;
 #ifdef XT_USE_DIRECT_IO_ON_INDEX
@@ -836,10 +878,10 @@ static xtBool ind_free_block(XTOpenTable
 	while (xblock) {
 		if (block == xblock) {
 			/* Found the block... */
-			xt_atomicrwlock_xlock(&block->cb_lock, ot->ot_thread->t_id);
+			XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 			if (block->cb_state != IDX_CAC_BLOCK_CLEAN) {
 				/* This block cannot be freeed: */
-				xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+				XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 				IDX_CAC_UNLOCK(seg, ot->ot_thread);
 #ifdef DEBUG_CHECK_IND_CACHE
 				xt_ind_check_cache(NULL);
@@ -878,11 +920,12 @@ static xtBool ind_free_block(XTOpenTable
 	if (block->cb_handle_count) {
 		XTIndReferenceRec	iref;
 		
-		iref.ir_ulock = XT_UNLOCK_WRITE;
+		iref.ir_xlock = TRUE;
+		iref.ir_updated = FALSE;
 		iref.ir_block = block;
 		iref.ir_branch = (XTIdxBranchDPtr) block->cb_data;
 		if (!xt_ind_copy_on_write(&iref)) {
-			xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+			XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 			return FALSE;
 		}
 	}
@@ -918,7 +961,7 @@ static xtBool ind_free_block(XTOpenTable
 	IDX_TRACE("%d- f%x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(block->cb_data));
 
 	/* Unlock BEFORE the block is reused! */
-	xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+	XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 
 	xt_unlock_mutex_ns(&ind_cac_globals.cg_lock);
 
@@ -1001,7 +1044,7 @@ static u_int ind_cac_free_lru_blocks(XTO
  * Fetch the block. Note, if we are about to write the block
  * then there is no need to read it from disk!
  */
-static XTIndBlockPtr ind_cac_fetch(XTOpenTablePtr ot, xtIndexNodeID address, DcSegmentPtr *ret_seg, xtBool read_data)
+static XTIndBlockPtr ind_cac_fetch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, DcSegmentPtr *ret_seg, xtBool read_data)
 {
 	register XTOpenFilePtr	file = ot->ot_ind_file;
 	register XTIndBlockPtr	block, new_block;
@@ -1110,6 +1153,7 @@ static XTIndBlockPtr ind_cac_fetch(XTOpe
 	new_block->cb_state = IDX_CAC_BLOCK_CLEAN;
 	new_block->cb_handle_count = 0;
 	new_block->cp_flush_seq = 0;
+	new_block->cp_del_count = 0;
 	new_block->cb_dirty_next = NULL;
 	new_block->cb_dirty_prev = NULL;
 
@@ -1172,6 +1216,13 @@ static XTIndBlockPtr ind_cac_fetch(XTOpe
 #endif
 	xt_unlock_mutex_ns(&dcg->cg_lock);
 
+	/* {LAZY-DEL-INDEX-ITEMS}
+	 * Conditionally count the number of deleted entries in the index:
+	 * We do this before other threads can read the block.
+	 */
+	if (ind->mi_lazy_delete && read_data)
+		xt_ind_count_deleted_items(ot->ot_table, ind, block);
+
 	/* Add to the hash table: */
 	block->cb_next = seg->cs_hash_table[hash_idx];
 	seg->cs_hash_table[hash_idx] = block;
@@ -1221,10 +1272,10 @@ xtPublic xtBool xt_ind_write(XTOpenTable
 	XTIndBlockPtr	block;
 	DcSegmentPtr	seg;
 
-	if (!(block = ind_cac_fetch(ot, address, &seg, FALSE)))
+	if (!(block = ind_cac_fetch(ot, ind, address, &seg, FALSE)))
 		return FAILED;
 
-	xt_atomicrwlock_xlock(&block->cb_lock, ot->ot_thread->t_id);
+	XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 	ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
 	memcpy(block->cb_data, data, size);
 	block->cp_flush_seq = ot->ot_table->tab_ind_flush_seq;
@@ -1239,7 +1290,7 @@ xtPublic xtBool xt_ind_write(XTOpenTable
 		xt_spinlock_unlock(&ind->mi_dirty_lock);
 		block->cb_state = IDX_CAC_BLOCK_DIRTY;
 	}
-	xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+	XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 	IDX_CAC_UNLOCK(seg, ot->ot_thread);
 #ifdef XT_TRACK_INDEX_UPDATES
 	ot->ot_ind_changed++;
@@ -1259,10 +1310,10 @@ xtPublic xtBool xt_ind_write_cache(XTOpe
 		return FAILED;
 
 	if (block) {
-		xt_atomicrwlock_xlock(&block->cb_lock, ot->ot_thread->t_id);
+		XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 		ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
 		memcpy(block->cb_data, data, size);
-		xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+		XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 		IDX_CAC_UNLOCK(seg, ot->ot_thread);
 	}
 
@@ -1277,7 +1328,7 @@ xtPublic xtBool xt_ind_clean(XTOpenTable
 	if (!ind_cac_get(ot, address, &seg, &block))
 		return FAILED;
 	if (block) {
-		xt_atomicrwlock_xlock(&block->cb_lock, ot->ot_thread->t_id);
+		XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
 		ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
 
 		if (block->cb_state == IDX_CAC_BLOCK_DIRTY) {
@@ -1293,7 +1344,7 @@ xtPublic xtBool xt_ind_clean(XTOpenTable
 			xt_spinlock_unlock(&ind->mi_dirty_lock);
 			block->cb_state = IDX_CAC_BLOCK_CLEAN;
 		}
-		xt_atomicrwlock_unlock(&block->cb_lock, TRUE);
+		XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
 
 		IDX_CAC_UNLOCK(seg, ot->ot_thread);
 	}
@@ -1301,29 +1352,33 @@ xtPublic xtBool xt_ind_clean(XTOpenTable
 	return OK;
 }
 
-xtPublic xtBool xt_ind_read_bytes(XTOpenTablePtr ot, xtIndexNodeID address, size_t size, xtWord1 *data)
+xtPublic xtBool xt_ind_read_bytes(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, size_t size, xtWord1 *data)
 {
 	XTIndBlockPtr	block;
 	DcSegmentPtr	seg;
 
-	if (!(block = ind_cac_fetch(ot, address, &seg, TRUE)))
+	if (!(block = ind_cac_fetch(ot, ind, address, &seg, TRUE)))
 		return FAILED;
 
-	xt_atomicrwlock_slock(&block->cb_lock);
+	XT_IPAGE_READ_LOCK(&block->cb_lock);
 	memcpy(data, block->cb_data, size);
-	xt_atomicrwlock_unlock(&block->cb_lock, FALSE);
+	XT_IPAGE_UNLOCK(&block->cb_lock, FALSE);
 	IDX_CAC_UNLOCK(seg, ot->ot_thread);
 	return OK;
 }
 
-xtPublic xtBool xt_ind_fetch(XTOpenTablePtr ot, xtIndexNodeID address, XTPageLockType ltype, XTIndReferencePtr iref)
+xtPublic xtBool xt_ind_fetch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, XTPageLockType ltype, XTIndReferencePtr iref)
 {
 	register XTIndBlockPtr	block;
 	DcSegmentPtr			seg;
 	xtWord2					branch_size;
+	xtBool					xlock = FALSE;
 
-	ASSERT_NS(iref->ir_ulock == XT_UNLOCK_NONE);
-	if (!(block = ind_cac_fetch(ot, address, &seg, TRUE)))
+#ifdef DEBUG
+	ASSERT_NS(iref->ir_xlock == 2);
+	ASSERT_NS(iref->ir_xlock == 2);
+#endif
+	if (!(block = ind_cac_fetch(ot, ind, address, &seg, TRUE)))
 		return NULL;
 
 	branch_size = XT_GET_DISK_2(((XTIdxBranchDPtr) block->cb_data)->tb_size_2);
@@ -1333,21 +1388,50 @@ xtPublic xtBool xt_ind_fetch(XTOpenTable
 		return FAILED;
 	}
 
-	if (ltype == XT_XLOCK_LEAF) {
-		if (XT_IS_NODE(branch_size))
-			ltype = XT_LOCK_READ;
-		else
-			ltype = XT_LOCK_WRITE;
+	switch (ltype) {
+		case XT_LOCK_READ:
+			break;
+		case XT_LOCK_WRITE:
+			xlock = TRUE;
+			break;
+		case XT_XLOCK_LEAF:
+			if (!XT_IS_NODE(branch_size))
+				xlock = TRUE;
+			break;
+		case XT_XLOCK_DEL_LEAF:
+			if (!XT_IS_NODE(branch_size)) {
+				if (ot->ot_table->tab_dic.dic_no_lazy_delete)
+					xlock = TRUE;
+				else {
+					/*
+					 * {LAZY-DEL-INDEX-ITEMS}
+					 *
+					 * We are fetching a page for the purpose of deletion.
+					 * We decide here whether we plan to do a lazy delete,
+					 * or whether we plan to compact the node.
+					 *
+					 * A lazy delete just requires a shared lock.
+					 *
+					 */
+					if (ind->mi_lazy_delete) {
+						/* If the number of deleted items is greater than
+						 * half of the number of items that can fit in the
+						 * page, then we will compact the node.
+						 */
+						if (!xt_idx_lazy_delete_on_leaf(ind, block, XT_GET_INDEX_BLOCK_LEN(branch_size)))
+							xlock = TRUE;
+					}
+					else
+						xlock = TRUE;
+				}
+			}
+			break;
 	}
 
-	if (ltype == XT_LOCK_WRITE) {
-		xt_atomicrwlock_xlock(&block->cb_lock, ot->ot_thread->t_id);
-		iref->ir_ulock = XT_UNLOCK_WRITE;
-	}
-	else {
-		xt_atomicrwlock_slock(&block->cb_lock);
-		iref->ir_ulock = XT_UNLOCK_READ;
-	}
+	if ((iref->ir_xlock = xlock))
+		XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
+	else
+		XT_IPAGE_READ_LOCK(&block->cb_lock);
 
 	IDX_CAC_UNLOCK(seg, ot->ot_thread);
 
@@ -1358,18 +1442,31 @@ xtPublic xtBool xt_ind_fetch(XTOpenTable
 	 * As a result, we need to pass a pointer to both the
 	 * cache block and the cache block data:
 	 */
+	iref->ir_updated = FALSE;
 	iref->ir_block = block;
 	iref->ir_branch = (XTIdxBranchDPtr) block->cb_data;
 	return OK;
 }
 
-xtPublic xtBool xt_ind_release(XTOpenTablePtr ot, XTIndexPtr ind, XTPageUnlockType XT_UNUSED(utype), XTIndReferencePtr iref)
+xtPublic xtBool xt_ind_release(XTOpenTablePtr ot, XTIndexPtr ind, XTPageUnlockType XT_NDEBUG_UNUSED(utype), XTIndReferencePtr iref)
 {
 	register XTIndBlockPtr	block;
 
 	block = iref->ir_block;
 
-	if (utype == XT_UNLOCK_R_UPDATE || utype == XT_UNLOCK_W_UPDATE) {
+#ifdef DEBUG
+	ASSERT_NS(iref->ir_xlock != 2);
+	ASSERT_NS(iref->ir_updated != 2);
+	if (iref->ir_updated)
+		ASSERT_NS(utype == XT_UNLOCK_R_UPDATE || utype == XT_UNLOCK_W_UPDATE);
+	else
+		ASSERT_NS(utype == XT_UNLOCK_READ || utype == XT_UNLOCK_WRITE);
+	if (iref->ir_xlock)
+		ASSERT_NS(utype == XT_UNLOCK_WRITE || utype == XT_UNLOCK_W_UPDATE);
+	else
+		ASSERT_NS(utype == XT_UNLOCK_READ || utype == XT_UNLOCK_R_UPDATE);
+#endif
+	if (iref->ir_updated) {
 		/* The page was update: */
 		ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
 		block->cp_flush_seq = ot->ot_table->tab_ind_flush_seq;
@@ -1386,16 +1483,10 @@ xtPublic xtBool xt_ind_release(XTOpenTab
 		}
 	}
 
+	XT_IPAGE_UNLOCK(&block->cb_lock, iref->ir_xlock);
 #ifdef DEBUG
-	if (utype == XT_UNLOCK_W_UPDATE)
-		utype = XT_UNLOCK_WRITE;
-	else if (utype == XT_UNLOCK_R_UPDATE)
-		utype = XT_UNLOCK_READ;
-	ASSERT_NS(iref->ir_ulock == utype);
-#endif
-	xt_atomicrwlock_unlock(&block->cb_lock, iref->ir_ulock == XT_UNLOCK_WRITE ? TRUE : FALSE);
-#ifdef DEBUG
-	iref->ir_ulock = XT_UNLOCK_NONE;
+	iref->ir_xlock = 2;
+	iref->ir_updated = 2;
 #endif
 	return OK;
 }
@@ -1484,24 +1575,3 @@ xtPublic void xt_ind_unreserve(XTOpenTab
 		xt_ind_free_reserved(ot);
 }
 
-xtPublic void xt_load_indices(XTThreadPtr self, XTOpenTablePtr ot)
-{
-	register XTTableHPtr	tab = ot->ot_table;
-	register XTIndBlockPtr	block;
-	DcSegmentPtr			seg;
-	xtIndexNodeID			id;
-
-	xt_lock_mutex_ns(&tab->tab_ind_flush_lock);
-
-	for (id=1; id < XT_NODE_ID(tab->tab_ind_eof); id++) {
-		if (!(block = ind_cac_fetch(ot, id, &seg, TRUE))) {
-			xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
-			xt_throw(self);
-		}
-		IDX_CAC_UNLOCK(seg, ot->ot_thread);
-	}
-
-	xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
-}
-
-

=== modified file 'storage/pbxt/src/cache_xt.h'
--- a/storage/pbxt/src/cache_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/cache_xt.h	2009-08-17 11:12:36 +0000
@@ -45,8 +45,46 @@ struct XTIdxReadBuffer;
 #define IDX_CAC_BLOCK_CLEAN				1
 #define IDX_CAC_BLOCK_DIRTY				2
 
-typedef enum XTPageLockType { XT_LOCK_READ, XT_LOCK_WRITE, XT_XLOCK_LEAF };
-typedef enum XTPageUnlockType { XT_UNLOCK_NONE, XT_UNLOCK_READ, XT_UNLOCK_WRITE, XT_UNLOCK_R_UPDATE, XT_UNLOCK_W_UPDATE };
+#ifdef XT_NO_ATOMICS
+#define XT_IPAGE_USE_PTHREAD_RW
+#else
+//#define XT_IPAGE_USE_ATOMIC_RW
+#define XT_IPAGE_USE_SPINXSLOCK
+//#define XT_IPAGE_USE_SKEW_RW
+#endif
+
+#ifdef XT_IPAGE_USE_ATOMIC_RW
+#define XT_IPAGE_LOCK_TYPE				XTAtomicRWLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_atomicrwlock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_atomicrwlock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_atomicrwlock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_atomicrwlock_xlock(i, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_atomicrwlock_unlock(i, x)
+#elif defined(XT_IPAGE_USE_PTHREAD_RW)
+#define XT_IPAGE_LOCK_TYPE				xt_rwlock_type
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_init_rwlock(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_free_rwlock(i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_slock_rwlock_ns(i)
+#define XT_IPAGE_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
+#define XT_IPAGE_UNLOCK(i, x)			xt_unlock_rwlock_ns(i)
+#elif defined(XT_IPAGE_USE_SPINXSLOCK)
+#define XT_IPAGE_LOCK_TYPE				XTSpinXSLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_spinxslock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_spinxslock_unlock(i, x)
+#else // XT_IPAGE_USE_SKEW_RW
+#define XT_IPAGE_LOCK_TYPE				XTSkewRWLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_skewrwlock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_skewrwlock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_skewrwlock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_skewrwlock_xlock(i, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_skewrwlock_unlock(i, x)
+#endif
+
+enum XTPageLockType { XT_LOCK_READ, XT_LOCK_WRITE, XT_XLOCK_LEAF, XT_XLOCK_DEL_LEAF };
+enum XTPageUnlockType { XT_UNLOCK_NONE, XT_UNLOCK_READ, XT_UNLOCK_WRITE, XT_UNLOCK_R_UPDATE, XT_UNLOCK_W_UPDATE };
 
 /* A block is X locked if it is being changed or freed.
  * A block is S locked if it is being read.
@@ -64,10 +102,11 @@ typedef struct XTIndBlock {
 	struct XTIndBlock	*cb_mr_used;					/* More recently used blocks. */
 	struct XTIndBlock	*cb_lr_used;					/* Less recently used blocks. */
 	/* Protected by cb_lock: */
-	XTAtomicRWLockRec	cb_lock;
+	XT_IPAGE_LOCK_TYPE	cb_lock;
 	xtWord1				cb_state;						/* Block status. */
 	xtWord2				cb_handle_count;				/* TRUE if this page is referenced by a handle. */
 	xtWord2				cp_flush_seq;
+	xtWord2				cp_del_count;					/* Number of deleted entries. */
 #ifdef XT_USE_DIRECT_IO_ON_INDEX
 	xtWord1				*cb_data;
 #else
@@ -76,16 +115,18 @@ typedef struct XTIndBlock {
 } XTIndBlockRec, *XTIndBlockPtr;
 
 typedef struct XTIndReference {
-	XTPageUnlockType		ir_ulock;
+	xtBool					ir_xlock;					/* Set to TRUE if the cache block is X locked. */
+	xtBool					ir_updated;					/* Set to TRUE if the cache block is updated. */
 	XTIndBlockPtr			ir_block;
 	XTIdxBranchDPtr			ir_branch;
 } XTIndReferenceRec, *XTIndReferencePtr;
 
 typedef struct XTIndFreeBlock {
+	XTDiskValue1			if_zero1_1;					/* Must be set to zero. */
+	XTDiskValue1			if_zero2_1;					/* Must be set to zero. */
 	XTDiskValue1			if_status_1;
 	XTDiskValue1			if_unused1_1;
-	XTDiskValue2			if_unused2_2;
-	XTDiskValue4			if_unused3_4;
+	XTDiskValue4			if_unused2_4;
 	XTDiskValue8			if_next_block_8;
 } XTIndFreeBlockRec, *XTIndFreeBlockPtr;
 
@@ -116,14 +157,13 @@ xtInt8			xt_ind_get_size();
 xtBool			xt_ind_write(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset, size_t size, xtWord1 *data);
 xtBool			xt_ind_write_cache(struct XTOpenTable *ot, xtIndexNodeID offset, size_t size, xtWord1 *data);
 xtBool			xt_ind_clean(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset);
-xtBool			xt_ind_read_bytes(struct XTOpenTable *ot, xtIndexNodeID offset, size_t size, xtWord1 *data);
+xtBool			xt_ind_read_bytes(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset, size_t size, xtWord1 *data);
 void			xt_ind_check_cache(XTIndexPtr ind);
 xtBool			xt_ind_reserve(struct XTOpenTable *ot, u_int count, XTIdxBranchDPtr not_this);
 void			xt_ind_free_reserved(struct XTOpenTable *ot);
 void			xt_ind_unreserve(struct XTOpenTable *ot);
-void			xt_load_indices(XTThreadPtr self, struct XTOpenTable *ot);
 
-xtBool			xt_ind_fetch(struct XTOpenTable *ot, xtIndexNodeID node, XTPageLockType ltype, XTIndReferencePtr iref);
+xtBool			xt_ind_fetch(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID node, XTPageLockType ltype, XTIndReferencePtr iref);
 xtBool			xt_ind_release(struct XTOpenTable *ot, XTIndexPtr ind, XTPageUnlockType utype, XTIndReferencePtr iref);
 
 void			xt_ind_lock_handle(XTIndHandlePtr handle);

=== modified file 'storage/pbxt/src/ccutils_xt.cc'
--- a/storage/pbxt/src/ccutils_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/ccutils_xt.cc	2009-08-17 11:12:36 +0000
@@ -29,7 +29,7 @@
 #include "ccutils_xt.h"
 #include "bsearch_xt.h"
 
-static int ccu_compare_object(XTThreadPtr XT_UNUSED(self), register const void XT_UNUSED(*thunk), register const void *a, register const void *b)
+static int ccu_compare_object(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	XTObject *obj_ptr = (XTObject *) b;
 

=== modified file 'storage/pbxt/src/database_xt.cc'
--- a/storage/pbxt/src/database_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/database_xt.cc	2009-08-17 11:12:36 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <string.h>
 #include <stdio.h>
 
@@ -240,7 +244,7 @@ static void db_hash_free(XTThreadPtr sel
 	xt_heap_release(self, (XTDatabaseHPtr) data);
 }
 
-static int db_cmp_db_id(struct XTThread XT_UNUSED(*self), register const void XT_UNUSED(*thunk), register const void *a, register const void *b)
+static int db_cmp_db_id(struct XTThread *XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtDatabaseID	db_id = *((xtDatabaseID *) a);
 	XTDatabaseHPtr	*db_ptr = (XTDatabaseHPtr *) b;
@@ -346,7 +350,7 @@ static void db_finalize(XTThreadPtr self
 	}
 }
 
-static void db_onrelease(XTThreadPtr self, void XT_UNUSED(*x))
+static void db_onrelease(XTThreadPtr self, void *XT_UNUSED(x))
 {
 	/* Signal threads waiting for exclusive use of the database: */
 	if (xt_db_open_databases)	// The database may already be closed.
@@ -612,7 +616,7 @@ xtPublic void xt_drop_database(XTThreadP
 xtPublic void xt_open_database(XTThreadPtr self, char *path, xtBool multi_path)
 {
 	XTDatabaseHPtr db;
-	
+
 	/* We cannot get a database, without unusing the current
 	 * first. The reason is that the restart process will
 	 * partially set the current database!
@@ -621,7 +625,7 @@ xtPublic void xt_open_database(XTThreadP
 	db = xt_get_database(self, path, multi_path);
 	pushr_(xt_heap_release, db);
 	xt_use_database(self, db, XT_FOR_USER);
-	freer_(); // xt_heap_release(self, db);	
+	freer_();	// xt_heap_release(self, db);	
 }
 
 /* This function can only be called if you do not already have a database in
@@ -638,6 +642,12 @@ xtPublic void xt_use_database(XTThreadPt
 
 	xt_heap_reference(self, db);
 	self->st_database = db;
+#ifdef XT_WAIT_FOR_CLEANUP
+	self->st_last_xact = 0;
+	for (int i=0; i<XT_MAX_XACT_BEHIND; i++) {
+		self->st_prev_xact[i] = db->db_xn_curr_id;
+	}
+#endif
 	xt_xn_init_thread(self, what_for);
 }
 
@@ -1117,15 +1127,18 @@ xtPublic void xt_db_return_table_to_pool
 	XTDatabaseHPtr		db = ot->ot_table->tab_db;
 	xtBool				flush_table = TRUE;
 
+	/* No open table returned to the pool should still
+	 * have a cache handle!
+	 */
+	ASSERT_NS(!ot->ot_ind_rhandle);
 	xt_lock_mutex_ns(&db->db_ot_pool.opt_lock);
 
 	if (!(table_pool = db_get_open_table_pool(db, ot->ot_table->tab_id)))
 		goto failed;
 
 	if (table_pool->opt_locked && !table_pool->opt_flushing) {
-		table_pool->opt_total_open--;
 		/* Table will be closed below: */
-		if (table_pool->opt_total_open > 0)
+		if (table_pool->opt_total_open > 1)
 			flush_table = FALSE;
 	}
 	else {
@@ -1151,14 +1164,21 @@ xtPublic void xt_db_return_table_to_pool
 		ot = NULL;
 	}
 
+	if (ot) {
+		xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+		xt_close_table(ot, flush_table, FALSE);
+
+		/* assume that table_pool cannot be invalidated in between as we have table_pool->opt_total_open > 0 */
+		xt_lock_mutex_ns(&db->db_ot_pool.opt_lock);
+		table_pool->opt_total_open--;
+	}
+
 	db_free_open_table_pool(NULL, table_pool);
 
 	if (!xt_broadcast_cond_ns(&db->db_ot_pool.opt_cond))
 		goto failed;
 	xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
-	if (ot)
-		xt_close_table(ot, flush_table, FALSE);
-
+	
 	return;
 
 	failed:

=== modified file 'storage/pbxt/src/datadic_xt.cc'
--- a/storage/pbxt/src/datadic_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/datadic_xt.cc	2009-08-18 07:46:53 +0000
@@ -26,6 +26,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <ctype.h>
 #include <errno.h>
 
@@ -433,7 +437,7 @@ class XTTokenizer {
 	XTToken *nextToken(XTThreadPtr self, c_char *keyword, XTToken *tk);
 };
 
-void ri_free_token(XTThreadPtr self __attribute__((unused)), XTToken *tk)
+void ri_free_token(XTThreadPtr XT_UNUSED(self), XTToken *tk)
 {
 	delete tk;
 }
@@ -524,6 +528,13 @@ XTToken *XTTokenizer::nextToken(XTThread
 						break;
 					tkn_curr_pos++;
 				}
+				/* TODO: Unless sql_mode == 'NO_BACKSLASH_ESCAPES'!!! */
+				if (*tkn_curr_pos == '\\') {
+					if (*(tkn_curr_pos+1) == quote) {
+						if (quote == '"' || quote == '\'')
+							tkn_curr_pos++;
+					}
+				}
 				tkn_curr_pos++;
 			}
 			
@@ -639,7 +650,7 @@ class XTParseTable : public XTObject {
 	int parseKeyAction(XTThreadPtr self);	
 	void parseCreateTable(XTThreadPtr self);
 	void parseAddTableItem(XTThreadPtr self);
-	void parseQualifiedName(XTThreadPtr self, char *name);
+	void parseQualifiedName(XTThreadPtr self, char *parent_name, char *name);
 	void parseTableName(XTThreadPtr self, bool alterTable);
 	void parseExpression(XTThreadPtr self, bool allow_reserved);
 	void parseBrackets(XTThreadPtr self);
@@ -667,53 +678,53 @@ class XTParseTable : public XTObject {
 		memset(&pt_sbuffer, 0, sizeof(XTStringBufferRec));
 	}
 
-	virtual void finalize(XTThreadPtr self __attribute__((unused))) {
+	virtual void finalize(XTThreadPtr XT_UNUSED(self)) {
 		if (pt_tokenizer)
 			delete pt_tokenizer;
 		xt_sb_set_size(NULL, &pt_sbuffer, 0);
 	}
 
 	// Hooks to receive output from the parser:
-	virtual void setTableName(XTThreadPtr self __attribute__((unused)), char *name __attribute__((unused)), bool alterTable __attribute__((unused))) {
+	virtual void setTableName(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), bool XT_UNUSED(alterTable)) {
 	}
-	virtual void addColumn(XTThreadPtr self __attribute__((unused)), char *col_name __attribute__((unused)), char *old_col_name __attribute__((unused))) {
+	virtual void addColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name), char *XT_UNUSED(old_col_name)) {
 	}
 	virtual void setDataType(XTThreadPtr self, char *cstring) {
 		if (cstring) 
 			xt_free(self, cstring);
 	}
-	virtual void setNull(XTThreadPtr self __attribute__((unused)), bool nullOK __attribute__((unused))) {
+	virtual void setNull(XTThreadPtr XT_UNUSED(self), bool XT_UNUSED(nullOK)) {
 	}
-	virtual void setAutoInc(XTThreadPtr self __attribute__((unused)), bool autoInc __attribute__((unused))) {
+	virtual void setAutoInc(XTThreadPtr XT_UNUSED(self), bool XT_UNUSED(autoInc)) {
 	}
 	
 	/* Add a contraint. If lastColumn is TRUE then add the contraint 
 	 * to the last column. If not, expect addListedColumn() to be called.
 	 */
-	virtual void addConstraint(XTThreadPtr self __attribute__((unused)), char *name __attribute__((unused)), u_int type __attribute__((unused)), bool lastColumn __attribute__((unused))) {
+	virtual void addConstraint(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), u_int XT_UNUSED(type), bool XT_UNUSED(lastColumn)) {
 	}
 	
 	/* Move the last column created. If symbol is NULL then move the column to the
 	 * first position, else move it to the position just after the given column.
 	 */
-	virtual void moveColumn(XTThreadPtr self __attribute__((unused)), char *col_name __attribute__((unused))) {
+	virtual void moveColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name)) {
 	}
 
-	virtual void dropColumn(XTThreadPtr self __attribute__((unused)), char *col_name __attribute__((unused))) {
+	virtual void dropColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name)) {
 	}
 
-	virtual void dropConstraint(XTThreadPtr self __attribute__((unused)), char *name __attribute__((unused)), u_int type __attribute__((unused))) {
+	virtual void dropConstraint(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), u_int XT_UNUSED(type)) {
 	}
 
-	virtual void setIndexName(XTThreadPtr self __attribute__((unused)), char *name __attribute__((unused))) {
+	virtual void setIndexName(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name)) {
 	}
-	virtual void addListedColumn(XTThreadPtr self __attribute__((unused)), char *index_col_name __attribute__((unused))) {
+	virtual void addListedColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(index_col_name)) {
 	}
-	virtual void setReferencedTable(XTThreadPtr self __attribute__((unused)), char *ref_table __attribute__((unused))) {
+	virtual void setReferencedTable(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(ref_schema), char *XT_UNUSED(ref_table)) {
 	}
-	virtual void addReferencedColumn(XTThreadPtr self __attribute__((unused)), char *index_col_name __attribute__((unused))) {
+	virtual void addReferencedColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(index_col_name)) {
 	}
-	virtual void setActions(XTThreadPtr self __attribute__((unused)), int on_delete __attribute__((unused)), int on_update __attribute__((unused))) {
+	virtual void setActions(XTThreadPtr XT_UNUSED(self), int XT_UNUSED(on_delete), int XT_UNUSED(on_update)) {
 	}
 
 	virtual void parseTable(XTThreadPtr self, bool convert, char *sql);	
@@ -859,7 +870,7 @@ void XTParseTable::parseAddTableItem(XTT
 	if (pt_current->isKeyWord("CONSTRAINT")) {
 		pt_current = pt_tokenizer->nextToken(self);
 		if (pt_current->isIdentifier())
-			parseQualifiedName(self, name);
+			parseQualifiedName(self, NULL, name);
 	}
 
 	if (pt_current->isReservedWord(XT_TK_PRIMARY)) {
@@ -974,13 +985,15 @@ void XTParseTable::parseMoveColumn(XTThr
 		char	name[XT_IDENTIFIER_NAME_SIZE];
 
 		pt_current = pt_tokenizer->nextToken(self);
-		parseQualifiedName(self, name);
+		parseQualifiedName(self, NULL, name);
 		moveColumn(self, name);
 	}
 }
 
-void XTParseTable::parseQualifiedName(XTThreadPtr self, char *name)
+void XTParseTable::parseQualifiedName(XTThreadPtr self, char *parent_name, char *name)
 {
+	if (parent_name)
+		parent_name[0] = '\0';
 	/* Should be an identifier by I have this example:
 	 * CREATE TABLE t1 ( comment CHAR(32) ASCII NOT NULL, koi8_ru_f CHAR(32) CHARACTER SET koi8r NOT NULL default '' ) CHARSET=latin5;
 	 *
@@ -990,6 +1003,8 @@ void XTParseTable::parseQualifiedName(XT
 		raiseError(self, pt_current, XT_ERR_ID_TOO_LONG);
 	pt_current = pt_tokenizer->nextToken(self);
 	while (pt_current->isKeyWord(".")) {
+		if (parent_name)
+			xt_strcpy(XT_IDENTIFIER_NAME_SIZE,parent_name, name);
 		pt_current = pt_tokenizer->nextToken(self);
 		/* Accept anything after the DOT! */
 		if (pt_current->getString(name, XT_IDENTIFIER_NAME_SIZE) >= XT_IDENTIFIER_NAME_SIZE)
@@ -1002,7 +1017,7 @@ void XTParseTable::parseTableName(XTThre
 {
 	char name[XT_IDENTIFIER_NAME_SIZE];
 
-	parseQualifiedName(self, name);
+	parseQualifiedName(self, NULL, name);
 	setTableName(self, name, alterTable);
 }
 
@@ -1011,7 +1026,7 @@ void XTParseTable::parseColumnDefinition
 	char col_name[XT_IDENTIFIER_NAME_SIZE];
 
 	// column_definition
-	parseQualifiedName(self, col_name);
+	parseQualifiedName(self, NULL, col_name);
 	addColumn(self, col_name, old_col_name);
 	parseDataType(self);
 
@@ -1111,7 +1126,7 @@ u_int XTParseTable::columnList(XTThreadP
 	pt_current->expectKeyWord(self, "(");
 	do {
 		pt_current = pt_tokenizer->nextToken(self);
-		parseQualifiedName(self, name);
+		parseQualifiedName(self, NULL, name);
 		addListedColumn(self, name);
 		cols++;
 		if (index_cols) {
@@ -1135,19 +1150,20 @@ void XTParseTable::parseReferenceDefinit
 	int		on_delete = XT_KEY_ACTION_DEFAULT;
 	int		on_update = XT_KEY_ACTION_DEFAULT;
 	char	name[XT_IDENTIFIER_NAME_SIZE];
+	char	parent_name[XT_IDENTIFIER_NAME_SIZE];
 	u_int	cols = 0;
 
 	// REFERENCES tbl_name
 	pt_current = pt_tokenizer->nextToken(self, "REFERENCES", pt_current);
-	parseQualifiedName(self, name);
-	setReferencedTable(self, name);
+	parseQualifiedName(self, parent_name, name);
+	setReferencedTable(self, parent_name[0] ? parent_name : NULL, name);
 
 	// [ (index_col_name,...) ]
 	if (pt_current->isKeyWord("(")) {
 		pt_current->expectKeyWord(self, "(");
 		do {
 			pt_current = pt_tokenizer->nextToken(self);
-			parseQualifiedName(self, name);
+			parseQualifiedName(self, NULL, name);
 			addReferencedColumn(self, name);
 			cols++;
 			if (cols > req_cols)
@@ -1219,7 +1235,7 @@ void XTParseTable::parseAlterTable(XTThr
 			if (pt_current->isReservedWord(XT_TK_COLUMN))
 				pt_current = pt_tokenizer->nextToken(self);
 
-			parseQualifiedName(self, old_col_name);
+			parseQualifiedName(self, NULL, old_col_name);
 			parseColumnDefinition(self, old_col_name);
 			parseMoveColumn(self);
 		}
@@ -1251,7 +1267,7 @@ void XTParseTable::parseAlterTable(XTThr
 			else {
 				if (pt_current->isReservedWord(XT_TK_COLUMN))
 					pt_current = pt_tokenizer->nextToken(self);
-				parseQualifiedName(self, name);
+				parseQualifiedName(self, NULL, name);
 				dropColumn(self, name);
 			}
 		}
@@ -1259,7 +1275,7 @@ void XTParseTable::parseAlterTable(XTThr
 			pt_current = pt_tokenizer->nextToken(self);
 			if (pt_current->isKeyWord("TO"))
 				pt_current = pt_tokenizer->nextToken(self);
-			parseQualifiedName(self, name);
+			parseQualifiedName(self, NULL, name);
 		}
 		else
 			/* Just ignore the syntax until the next , */
@@ -1284,7 +1300,7 @@ void XTParseTable::parseCreateIndex(XTTh
 	else if (pt_current->isKeyWord("SPACIAL"))
 		pt_current = pt_tokenizer->nextToken(self);
 	pt_current = pt_tokenizer->nextToken(self, "INDEX", pt_current);
-	parseQualifiedName(self, name);
+	parseQualifiedName(self, NULL, name);
 	optionalIndexType(self);
 	pt_current = pt_tokenizer->nextToken(self, "ON", pt_current);
 	parseTableName(self, true);
@@ -1299,7 +1315,7 @@ void XTParseTable::parseDropIndex(XTThre
 
 	pt_current = pt_tokenizer->nextToken(self, "DROP", pt_current);
 	pt_current = pt_tokenizer->nextToken(self, "INDEX", pt_current);
-	parseQualifiedName(self, name);
+	parseQualifiedName(self, NULL, name);
 	pt_current = pt_tokenizer->nextToken(self, "ON", pt_current);
 	parseTableName(self, true);
 	dropConstraint(self, name, XT_DD_INDEX);
@@ -1340,7 +1356,7 @@ class XTCreateTable : public XTParseTabl
 	virtual void addConstraint(XTThreadPtr self, char *name, u_int type, bool lastColumn);
 	virtual void dropConstraint(XTThreadPtr self, char *name, u_int type);
 	virtual void addListedColumn(XTThreadPtr self, char *index_col_name);
-	virtual void setReferencedTable(XTThreadPtr self, char *ref_table);
+	virtual void setReferencedTable(XTThreadPtr self, char *ref_schema, char *ref_table);
 	virtual void addReferencedColumn(XTThreadPtr self, char *index_col_name);
 	virtual void setActions(XTThreadPtr self, int on_delete, int on_update);
 
@@ -1535,23 +1551,31 @@ void XTCreateTable::addListedColumn(XTTh
 	}
 }
 
-void XTCreateTable::setReferencedTable(XTThreadPtr self, char *ref_table)
+void XTCreateTable::setReferencedTable(XTThreadPtr self, char *ref_schema, char *ref_table)
 {
 	XTDDForeignKey	*fk = (XTDDForeignKey *) ct_curr_constraint;
 	char			path[PATH_MAX];
 
-	xt_strcpy(PATH_MAX, path, ct_tab_path->ps_path);
-	xt_remove_last_name_of_path(path);
-	if (ct_convert) {
-		char	buffer[XT_IDENTIFIER_NAME_SIZE];
-		size_t	len;
-
-		myxt_static_convert_identifier(self, ct_charset, ref_table, buffer, XT_IDENTIFIER_NAME_SIZE);
-		len = strlen(path);
-		myxt_static_convert_table_name(self, buffer, &path[len], PATH_MAX - len);
-	}
-	else
+	if (ref_schema) {
+		xt_strcpy(PATH_MAX,path, ".");
+		xt_add_dir_char(PATH_MAX, path);
+		xt_strcat(PATH_MAX, path, ref_schema);
+		xt_add_dir_char(PATH_MAX, path);
 		xt_strcat(PATH_MAX, path, ref_table);
+	} else {
+		xt_strcpy(PATH_MAX, path, ct_tab_path->ps_path);
+		xt_remove_last_name_of_path(path);
+		if (ct_convert) {
+			char	buffer[XT_IDENTIFIER_NAME_SIZE];
+			size_t	len;
+
+			myxt_static_convert_identifier(self, ct_charset, ref_table, buffer, XT_IDENTIFIER_NAME_SIZE);
+			len = strlen(path);
+			myxt_static_convert_table_name(self, buffer, &path[len], PATH_MAX - len);
+		}
+		else
+			xt_strcat(PATH_MAX, path, ref_table);
+	}
 
 	fk->fk_ref_tab_name = (XTPathStrPtr) xt_dup_string(self, path);
 }
@@ -1578,7 +1602,7 @@ void XTCreateTable::addReferencedColumn(
 		fk->fk_ref_cols.clone(self, &fk->co_cols);
 }
 
-void XTCreateTable::setActions(XTThreadPtr self __attribute__((unused)), int on_delete, int on_update)
+void XTCreateTable::setActions(XTThreadPtr XT_UNUSED(self), int on_delete, int on_update)
 {
 	XTDDForeignKey	*fk = (XTDDForeignKey *) ct_curr_constraint;
 
@@ -1711,8 +1735,8 @@ void XTDDConstraint::alterColumnName(XTT
 void XTDDConstraint::getColumnList(char *buffer, size_t size)
 {
 	if (co_table->dt_table) {
-		xt_strcat(size, buffer, "`");
-		xt_strcpy(size, buffer, co_table->dt_table->tab_name->ps_path);
+		xt_strcpy(size, buffer, "`");
+		xt_strcat(size, buffer, co_table->dt_table->tab_name->ps_path);
 		xt_strcat(size, buffer, "` (`");
 	}
 	else
@@ -1739,6 +1763,20 @@ bool XTDDConstraint::sameColumns(XTDDCon
 	return OK;
 }
 
+bool XTDDConstraint::samePrefixColumns(XTDDConstraint *co)
+{
+	u_int i = 0;
+
+	if (co_cols.size() > co->co_cols.size())
+		return false;
+	while (i<co_cols.size()) {
+		if (myxt_strcasecmp(co_cols.itemAt(i)->cr_col_name, co->co_cols.itemAt(i)->cr_col_name) != 0)
+			return false;
+		i++;
+	}
+	return OK;
+}
+
 bool XTDDConstraint::attachColumns()
 {
 	XTDDColumn		*col;
@@ -1773,6 +1811,7 @@ bool XTDDTableRef::checkReference(xtWord
 	XTIdxSearchKeyRec	search_key;
 	xtXactID			xn_id;
 	XTXactWaitRec		xw;
+	bool				ok = false;
 
 	if (!(loc_ind = tr_fkey->getReferenceIndexPtr()))
 		return false;
@@ -1792,40 +1831,42 @@ bool XTDDTableRef::checkReference(xtWord
 
 	/* Search for the key in the child (referencing) table: */
 	if (!(ot = xt_db_open_table_using_tab(tr_fkey->co_table->dt_table, thread)))
-		goto failed;
+		return false;
 
 	retry:
 	if (!xt_idx_search(ot, ind, &search_key))
-		goto failed;
+		goto done;
 		
 	while (ot->ot_curr_rec_id && search_key.sk_on_key) {
 		switch (xt_tab_maybe_committed(ot, ot->ot_curr_rec_id, &xn_id, &ot->ot_curr_row_id, &ot->ot_curr_updated)) {
 			case XT_MAYBE:				
 				xw.xw_xn_id = xn_id;
 				if (!xt_xn_wait_for_xact(thread, &xw, NULL))
-					goto failed;
+					goto done;
 				goto retry;
 			case XT_ERR:
-				goto failed;
+				goto done;
 			case TRUE:
 				/* We found a matching child: */
 				xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_ROW_IS_REFERENCED, tr_fkey->co_name);
-				goto failed;
-				break;
+				goto done;
 			case FALSE:
 				if (!xt_idx_next(ot, ind, &search_key))
-					goto failed;
+					goto done;
 				break;
 		}
 	}
 
 	/* No matching children, all OK: */
-	xt_db_return_table_to_pool_ns(ot);
-	return true;
+	ok = true;
 
-	failed:
+	done:
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
 	xt_db_return_table_to_pool_ns(ot);
-	return false;
+	return ok;
 }
 
 /*
@@ -1962,6 +2003,10 @@ bool XTDDTableRef::modifyRow(XTOpenTable
 	}
 
 	/* No matching children, all OK: */
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
 	xt_db_return_table_to_pool_ns(ot);
 
 	success:
@@ -1971,6 +2016,10 @@ bool XTDDTableRef::modifyRow(XTOpenTable
 	return true;
 
 	failed_2:
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
 	xt_db_return_table_to_pool_ns(ot);
 
 	failed:
@@ -2055,8 +2104,13 @@ void XTDDForeignKey::finalize(XTThreadPt
 
 void XTDDForeignKey::loadString(XTThreadPtr self, XTStringBufferPtr sb)
 {
+	char schema_name[XT_IDENTIFIER_NAME_SIZE];
+	
 	XTDDConstraint::loadString(self, sb);
 	xt_sb_concat(self, sb, " REFERENCES `");
+	xt_2nd_last_name_of_path(XT_IDENTIFIER_NAME_SIZE, schema_name, fk_ref_tab_name->ps_path);
+	xt_sb_concat(self, sb, schema_name);
+	xt_sb_concat(self, sb, "`.`");
 	xt_sb_concat(self, sb, xt_last_name_of_path(fk_ref_tab_name->ps_path));
 	xt_sb_concat(self, sb, "` ");
 
@@ -2136,6 +2190,20 @@ bool XTDDForeignKey::sameReferenceColumn
 	return OK;
 }
 
+bool XTDDForeignKey::samePrefixReferenceColumns(XTDDConstraint *co)
+{
+	u_int i = 0;
+
+	if (fk_ref_cols.size() > co->co_cols.size())
+		return false;
+	while (i<fk_ref_cols.size()) {
+		if (myxt_strcasecmp(fk_ref_cols.itemAt(i)->cr_col_name, co->co_cols.itemAt(i)->cr_col_name) != 0)
+			return false;
+		i++;
+	}
+	return OK;
+}
+
 bool XTDDForeignKey::checkReferencedTypes(XTDDTable *dt)
 {
 	XTDDColumn *col, *ref_col;
@@ -2288,6 +2356,10 @@ bool XTDDForeignKey::insertRow(xtWord1 *
 				goto failed_2;
 			case TRUE:
 				/* We found a matching parent: */
+				if (ot->ot_ind_rhandle) {
+					xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+					ot->ot_ind_rhandle = NULL;
+				}
 				xt_db_return_table_to_pool_ns(ot);
 				goto success;
 			case FALSE:
@@ -2300,6 +2372,10 @@ bool XTDDForeignKey::insertRow(xtWord1 *
 	xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_NO_REFERENCED_ROW, co_name);
 
 	failed_2:
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
 	xt_db_return_table_to_pool_ns(ot);
 
 	failed:
@@ -2672,16 +2748,24 @@ void XTDDTable::checkForeignKeys(XTThrea
 
 XTDDIndex *XTDDTable::findIndex(XTDDConstraint *co)
 {
-	XTDDIndex *ind;
+	XTDDIndex *ind = NULL;
+	XTDDIndex *cur_ind;
+	u_int index_size = UINT_MAX;
 
 	for (u_int i=0; i<dt_indexes.size(); i++) {
-		ind = dt_indexes.itemAt(i);
-		if (co->sameColumns(ind))
-			return ind;
+		cur_ind = dt_indexes.itemAt(i);
+		u_int sz = cur_ind->getIndexPtr()->mi_key_size;
+		if (sz < index_size && co->samePrefixColumns(cur_ind)) {
+			ind = cur_ind;
+			index_size = sz;
+		}
 	}
+
+	if (ind) 
+		return ind;
+	
 	{
 		char buffer[XT_ERR_MSG_SIZE - 200];
-
 		co->getColumnList(buffer, XT_ERR_MSG_SIZE - 200);
 		xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_NO_MATCHING_INDEX, buffer);
 	}
@@ -2690,16 +2774,24 @@ XTDDIndex *XTDDTable::findIndex(XTDDCons
 
 XTDDIndex *XTDDTable::findReferenceIndex(XTDDForeignKey *fk)
 {
-	XTDDIndex		*ind;
+	XTDDIndex		*ind = NULL;
+	XTDDIndex		*cur_ind;
 	XTDDColumnRef	*cr;
 	u_int			i;
+	u_int			index_size = UINT_MAX;
 
 	for (i=0; i<dt_indexes.size(); i++) {
-		ind = dt_indexes.itemAt(i);
-		if (fk->sameReferenceColumns(ind))
-			return ind;
+		cur_ind = dt_indexes.itemAt(i);
+		u_int sz = cur_ind->getIndexPtr()->mi_key_size;
+		if (sz < index_size && fk->samePrefixReferenceColumns(cur_ind)) {
+			ind = cur_ind;
+			index_size = sz;
+		}
 	}
 
+	if (ind)
+		return ind;
+
 	/* If the index does not exist, maybe the columns do not exist?! */
 	for (i=0; i<fk->fk_ref_cols.size(); i++) {
 		cr = fk->fk_ref_cols.itemAt(i);
@@ -2867,9 +2959,33 @@ bool XTDDTable::updateRow(XTOpenTablePtr
 	return ok;
 }
 
-xtBool XTDDTable::checkCanDrop()
+/*
+ * The drop_db parameter is TRUE if we are dropping the schema of this table. In this
+ * case we return TRUE if the table is only referenced by tables from its own schema.
+ */
+xtBool XTDDTable::checkCanDrop(xtBool drop_db)
 {
 	/* no refs or references only itself */
-	return (dt_trefs == NULL) || 
-		(dt_trefs->tr_next == NULL) && (dt_trefs->tr_fkey->co_table == this);
+	if ((dt_trefs == NULL) || ((dt_trefs->tr_next == NULL) && (dt_trefs->tr_fkey->co_table == this)))
+		return TRUE;
+
+	if (!drop_db) 
+		return FALSE;
+	
+	const char *this_schema = xt_last_2_names_of_path(dt_table->tab_name->ps_path);
+	size_t this_schema_sz = xt_last_name_of_path(dt_table->tab_name->ps_path) - this_schema;
+	XTDDTableRef *tr = dt_trefs;
+
+	while (tr) {
+		const char *tab_path = tr->tr_fkey->co_table->dt_table->tab_name->ps_path;
+		const char *tab_schema = xt_last_2_names_of_path(tab_path);
+		size_t tab_schema_sz = xt_last_name_of_path(tab_path) - tab_schema;
+
+		if (this_schema_sz != tab_schema_sz || strncmp(this_schema, tab_schema, tab_schema_sz))
+			return FALSE;
+		
+		tr = tr->tr_next;
+	}
+
+	return TRUE;
 }

=== modified file 'storage/pbxt/src/datadic_xt.h'
--- a/storage/pbxt/src/datadic_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/datadic_xt.h	2009-08-18 07:46:53 +0000
@@ -137,6 +137,7 @@ class XTDDColumnRef : public XTObject {
 		return new_obj;
 	}
 
+	virtual void init(XTThreadPtr self) { XTObject::init(self); }
 	virtual void init(XTThreadPtr self, XTObject *obj);
 	virtual void finalize(XTThreadPtr self);
 };
@@ -156,6 +157,7 @@ class XTDDConstraint : public XTObject {
 		co_ind_name(NULL) {
 	}
 
+	virtual void init(XTThreadPtr self) { XTObject::init(self); }
 	virtual void init(XTThreadPtr self, XTObject *obj);
 	virtual void finalize(XTThreadPtr self) {
 		if (co_name)
@@ -169,6 +171,7 @@ class XTDDConstraint : public XTObject {
 	virtual void alterColumnName(XTThreadPtr self, char *from_name, char *to_name);
 	void getColumnList(char *buffer, size_t size);
 	bool sameColumns(XTDDConstraint *co);
+	bool samePrefixColumns(XTDDConstraint *co);
 	bool attachColumns();
 };
 
@@ -198,6 +201,7 @@ class XTDDIndex : public XTDDConstraint 
 		return new_obj;
 	}
 
+        virtual void init(XTThreadPtr self) { XTDDConstraint::init(self); };
 	virtual void init(XTThreadPtr self, XTObject *obj);
 	struct XTIndex *getIndexPtr();
 };
@@ -230,12 +234,14 @@ class XTDDForeignKey : public XTDDIndex 
 		return new_obj;
 	}
 
+        virtual void init(XTThreadPtr self) { XTDDIndex::init(self); }
 	virtual void init(XTThreadPtr self, XTObject *obj);
 	virtual void finalize(XTThreadPtr self);
 	virtual void loadString(XTThreadPtr self, XTStringBufferPtr sb);
 	void getReferenceList(char *buffer, size_t size);
 	struct XTIndex *getReferenceIndexPtr();
 	bool sameReferenceColumns(XTDDConstraint *co);
+	bool samePrefixReferenceColumns(XTDDConstraint *co);
 	bool checkReferencedTypes(XTDDTable *dt);
 	void removeReference(XTThreadPtr self);
 	bool insertRow(xtWord1 *before, xtWord1 *after, XTThreadPtr thread);
@@ -284,7 +290,7 @@ class XTDDTable : public XTObject {
 	XTDDIndex *findReferenceIndex(XTDDForeignKey *fk);
 	bool insertRow(struct XTOpenTable *rec_ot, xtWord1 *buffer);
 	bool checkNoAction(struct XTOpenTable *ot, xtRecordID rec_id);
-	xtBool checkCanDrop();
+	xtBool checkCanDrop(xtBool drop_db);
 	bool deleteRow(struct XTOpenTable *rec_ot, xtWord1 *buffer);
 	void deleteAllRows(XTThreadPtr self);
 	bool updateRow(struct XTOpenTable *rec_ot, xtWord1 *before, xtWord1 *after);

=== modified file 'storage/pbxt/src/datalog_xt.cc'
--- a/storage/pbxt/src/datalog_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/datalog_xt.cc	2009-08-18 07:46:53 +0000
@@ -69,6 +69,7 @@ xtBool XTDataSeqRead::sl_seq_init(struct
 	sl_rec_log_id = 0;
 	sl_rec_log_offset = 0;
 	sl_record_len = 0;
+	sl_extra_garbage = 0;
 
 	return sl_buffer != NULL;
 }
@@ -130,8 +131,25 @@ xtBool XTDataSeqRead::sl_rnd_read(xtLogO
 /*
  * Unlike the transaction log sequential reader, this function only returns
  * the header of a record.
+ *
+ * {SKIP-GAPS}
+ * This function now skips gaps. This should not be required, because in normal
+ * operation, no gaps should be created.
+ *
+ * However, if this happens there is a danger that a valid record after the
+ * gap will be lost.
+ *
+ * So, if we find an invalid record, we scan through the log to find the next
+ * valid record. Note that there is still a danger that we will find
+ * data that looks like a valid record, but is not.
+ *
+ * In this case, this "pseudo record" may cause the function to actually skip
+ * valid records.
+ *
+ * Note, any such malfunction will eventually cause the record to be lost forever
+ * after the garbage collector has run.
  */
-xtBool XTDataSeqRead::sl_seq_next(XTXactLogBufferDPtr *ret_entry, xtBool verify, struct XTThread *thread)
+xtBool XTDataSeqRead::sl_seq_next(XTXactLogBufferDPtr *ret_entry, struct XTThread *thread)
 {
 	XTXactLogBufferDPtr	record;
 	size_t				tfer;
@@ -140,10 +158,12 @@ xtBool XTDataSeqRead::sl_seq_next(XTXact
 	size_t				max_rec_len;
 	xtBool				reread_from_buffer;
 	xtWord4				size;
+	xtLogOffset			gap_start = 0;
 
 	/* Go to the next record (xseq_record_len must be initialized
 	 * to 0 for this to work.
 	 */
+	retry:
 	sl_rec_log_offset += sl_record_len;
 	sl_record_len = 0;
 
@@ -174,6 +194,8 @@ xtBool XTDataSeqRead::sl_seq_next(XTXact
 	record = (XTXactLogBufferDPtr) (sl_buffer + rec_offset);
 	switch (record->xl.xl_status_1) {
 		case XT_LOG_ENT_HEADER:
+			if (sl_rec_log_offset != 0)
+				goto scan_to_next_record;
 			if (offsetof(XTXactLogHeaderDRec, xh_size_4) + 4 > max_rec_len) {
 				reread_from_buffer = TRUE;
 				goto read_more;
@@ -183,33 +205,42 @@ xtBool XTDataSeqRead::sl_seq_next(XTXact
 				reread_from_buffer = TRUE;
 				goto read_more;
 			}
-			if (verify) {
-				if (record->xh.xh_checksum_1 != XT_CHECKSUM_1(sl_rec_log_id))
-					goto return_empty;
-				if (XT_LOG_HEAD_MAGIC(record, len) != XT_LOG_FILE_MAGIC)
+
+			if (record->xh.xh_checksum_1 != XT_CHECKSUM_1(sl_rec_log_id))
+				goto return_empty;
+			if (XT_LOG_HEAD_MAGIC(record, len) != XT_LOG_FILE_MAGIC)
+				goto return_empty;
+			if (len > offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4) {
+				if (XT_GET_DISK_4(record->xh.xh_log_id_4) != sl_rec_log_id)
 					goto return_empty;
-				if (len > offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4) {
-					if (XT_GET_DISK_4(record->xh.xh_log_id_4) != sl_rec_log_id)
-						goto return_empty;
-				}
 			}
 			break;
 		case XT_LOG_ENT_EXT_REC_OK:
 		case XT_LOG_ENT_EXT_REC_DEL:
+			if (gap_start) {
+				xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap in data log %lu, start: %llu, size: %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start, (u_llong) (sl_rec_log_offset - gap_start));
+				gap_start = 0;
+			}
 			len = offsetof(XTactExtRecEntryDRec, er_data);
 			if (len > max_rec_len) {
 				reread_from_buffer = TRUE;
 				goto read_more;
 			}
 			size = XT_GET_DISK_4(record->er.er_data_size_4);
-			if (verify) {
-				if (sl_rec_log_offset + (xtLogOffset) offsetof(XTactExtRecEntryDRec, er_data) + size > sl_log_eof)
-					goto return_empty;
-			}
+			/* Verify the record as well as we can! */
+			if (!size)
+				goto scan_to_next_record;
+			if (sl_rec_log_offset + (xtLogOffset) offsetof(XTactExtRecEntryDRec, er_data) + size > sl_log_eof)
+				goto scan_to_next_record;
+			if (!XT_GET_DISK_4(record->er.er_tab_id_4))
+				goto scan_to_next_record;
+			if (!XT_GET_DISK_4(record->er.er_rec_id_4))
+				goto scan_to_next_record;
 			break;
 		default:
-			ASSERT_NS(FALSE);
-			goto return_empty;
+			/* Note, we no longer assume EOF.
+			 * Instead, we skip to the next valid record. */
+			goto scan_to_next_record;
 	}
 
 	if (len <= max_rec_len) {
@@ -243,7 +274,20 @@ xtBool XTDataSeqRead::sl_seq_next(XTXact
 	*ret_entry = (XTXactLogBufferDPtr) sl_buffer;
 	return OK;
 
+	scan_to_next_record:
+	if (!gap_start) {
+		gap_start = sl_rec_log_offset;
+		xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap found in data log %lu, starting at offset %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start);
+	}
+	sl_record_len = 1;
+	sl_extra_garbage++;
+	goto retry;
+
 	return_empty:
+	if (gap_start) {
+		xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap in data log %lu, start: %llu, size: %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start, (u_llong) (sl_rec_log_offset - gap_start));
+		gap_start = 0;
+	}
 	*ret_entry = NULL;
 	return OK;
 }
@@ -285,22 +329,54 @@ static xtBool dl_create_log_header(XTDat
 	return OK;
 }
 
-static xtBool dl_write_log_header(XTDataLogFilePtr data_log, XTOpenFilePtr of, xtBool flush, XTThreadPtr thread)
+static xtBool dl_write_garbage_level(XTDataLogFilePtr data_log, XTOpenFilePtr of, xtBool flush, XTThreadPtr thread)
 {
 	XTXactLogHeaderDRec	header;
 
 	/* The header was not completely written, so write a new one: */
 	XT_SET_DISK_8(header.xh_free_space_8, data_log->dlf_garbage_count);
-	XT_SET_DISK_8(header.xh_file_len_8, data_log->dlf_log_eof);
-	XT_SET_DISK_8(header.xh_comp_pos_8, data_log->dlf_start_offset);
-
-	if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_free_space_8), 24, (xtWord1 *) &header.xh_free_space_8, &thread->st_statistics.st_data, thread))
+	if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_free_space_8), 8, (xtWord1 *) &header.xh_free_space_8, &thread->st_statistics.st_data, thread))
 		return FAILED;
 	if (flush && !xt_flush_file(of, &thread->st_statistics.st_data, thread))
 		return FAILED;
 	return OK;
 }
 
+/*
+ * {SKIP-GAPS}
+ * Extra garbage is the amount of space skipped during recovery of the data
+ * log file. We assume this space has not been counted as garbage,
+ * and add it to the garbage count.
+ *
+ * This may mean that our estimate of garbage is higher than it should
+ * be, but that is better than the other way around.
+ *
+ * The fact is, there should not be any gaps in the data log files, so
+ * this is actually an exception which should not occur.
+ */
+static xtBool dl_write_log_header(XTDataLogFilePtr data_log, XTOpenFilePtr of, xtLogOffset extra_garbage, XTThreadPtr thread)
+{
+	XTXactLogHeaderDRec	header;
+
+	XT_SET_DISK_8(header.xh_file_len_8, data_log->dlf_log_eof);
+
+	if (extra_garbage) {
+		data_log->dlf_garbage_count += extra_garbage;
+		if (data_log->dlf_garbage_count > data_log->dlf_log_eof)
+			data_log->dlf_garbage_count = data_log->dlf_log_eof;
+		XT_SET_DISK_8(header.xh_free_space_8, data_log->dlf_garbage_count);
+		if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_free_space_8), 16, (xtWord1 *) &header.xh_free_space_8, &thread->st_statistics.st_data, thread))
+			return FAILED;
+	}
+	else {
+		if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_file_len_8), 8, (xtWord1 *) &header.xh_file_len_8, &thread->st_statistics.st_data, thread))
+			return FAILED;
+	}
+	if (!xt_flush_file(of, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	return OK;
+}
+
 static void dl_free_seq_read(XTThreadPtr self __attribute__((unused)), XTDataSeqReadPtr seq_read)
 {
 	seq_read->sl_seq_exit();
@@ -318,7 +394,7 @@ static void dl_recover_log(XTThreadPtr s
 	seq_read.sl_seq_start(data_log->dlf_log_id, 0, FALSE);
 
 	for (;;) {
-		if (!seq_read.sl_seq_next(&record, TRUE, self))
+		if (!seq_read.sl_seq_next(&record, self))
 			xt_throw(self);
 		if (!record)
 			break;
@@ -331,13 +407,18 @@ static void dl_recover_log(XTThreadPtr s
 		}
 	}
 
-	if (!(data_log->dlf_log_eof = seq_read.sl_rec_log_offset)) {
+	ASSERT_NS(seq_read.sl_log_eof == seq_read.sl_rec_log_offset);
+	data_log->dlf_log_eof = seq_read.sl_rec_log_offset;
+
+	if (data_log->dlf_log_eof < sizeof(XTXactLogHeaderDRec)) {
 		data_log->dlf_log_eof = sizeof(XTXactLogHeaderDRec);
 		if (!dl_create_log_header(data_log, seq_read.sl_log_file, self))
 			xt_throw(self);
 	}
-	if (!dl_write_log_header(data_log, seq_read.sl_log_file, TRUE, self))
-		xt_throw(self);
+	else {
+		if (!dl_write_log_header(data_log, seq_read.sl_log_file, seq_read.sl_extra_garbage, self))
+			xt_throw(self);
+	}
 
 	freer_(); // dl_free_seq_read(&seq_read)
 }
@@ -452,7 +533,7 @@ xtBool XTDataLogCache::dls_set_log_state
 	return FAILED;
 }
 
-static int dl_cmp_log_id(XTThreadPtr XT_UNUSED(self), register const void XT_UNUSED(*thunk), register const void *a, register const void *b)
+static int dl_cmp_log_id(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtLogID			log_id_a = *((xtLogID *) a);
 	xtLogID			log_id_b = *((xtLogID *) b);
@@ -1110,7 +1191,6 @@ xtBool XTDataLogBuffer::dlb_get_log_offs
 
 	*log_id = dlb_data_log->dlf_log_id;
 	*out_offset = dlb_data_log->dlf_log_eof;
-	dlb_data_log->dlf_log_eof += req_size;
 	return OK;
 }
 
@@ -1149,7 +1229,7 @@ xtBool XTDataLogBuffer::dlb_flush_log(xt
 	return OK;
 }
 
-xtBool XTDataLogBuffer::dlb_write_thru_log(xtLogID log_id __attribute__((unused)), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+xtBool XTDataLogBuffer::dlb_write_thru_log(xtLogID XT_NDEBUG_UNUSED(log_id), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
 {
 	ASSERT_NS(log_id == dlb_data_log->dlf_log_id);
 
@@ -1158,6 +1238,11 @@ xtBool XTDataLogBuffer::dlb_write_thru_l
 
 	if (!xt_pwrite_file(dlb_data_log->dlf_log_file, log_offset, size, data, &thread->st_statistics.st_data, thread))
 		return FAILED;
+	/* The increment of dlb_data_log->dlf_log_eof was moved here from dlb_get_log_offset()
+	 * to ensure it is done after a successful update of the log; otherwise a
+	 * gap occurs in the log, which causes EOF to be detected in the middle of the log.
+	 */
+	dlb_data_log->dlf_log_eof += size;
 #ifdef DEBUG
 	if (log_offset + size > dlb_max_write_offset)
 		dlb_max_write_offset = log_offset + size;
@@ -1166,7 +1251,7 @@ xtBool XTDataLogBuffer::dlb_write_thru_l
 	return OK;
 }
 
-xtBool XTDataLogBuffer::dlb_append_log(xtLogID log_id __attribute__((unused)), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+xtBool XTDataLogBuffer::dlb_append_log(xtLogID XT_NDEBUG_UNUSED(log_id), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
 {
 	ASSERT_NS(log_id == dlb_data_log->dlf_log_id);
 
@@ -1179,10 +1264,12 @@ xtBool XTDataLogBuffer::dlb_append_log(x
 			if (dlb_buffer_size >= dlb_buffer_len + size) {
 				memcpy(dlb_log_buffer + dlb_buffer_len, data, size);
 				dlb_buffer_len += size;
+				dlb_data_log->dlf_log_eof += size;
 				return OK;
 			}
 		}
-		dlb_flush_log(FALSE, thread);
+		if (dlb_flush_log(FALSE, thread) != OK)
+			return FAILED;
 	}
 	
 	ASSERT_NS(dlb_buffer_len == 0);
@@ -1191,6 +1278,7 @@ xtBool XTDataLogBuffer::dlb_append_log(x
 		dlb_buffer_offset = log_offset;
 		dlb_buffer_len = size;
 		memcpy(dlb_log_buffer, data, size);
+		dlb_data_log->dlf_log_eof += size;
 		return OK;
 	}
 
@@ -1202,6 +1290,7 @@ xtBool XTDataLogBuffer::dlb_append_log(x
 		dlb_max_write_offset = log_offset + size;
 #endif
 	dlb_flush_required = TRUE;
+	dlb_data_log->dlf_log_eof += size;
 	return OK;
 }
 
@@ -1306,7 +1395,7 @@ xtBool XTDataLogBuffer::dlb_delete_log(x
 		xt_lock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
 		dlb_data_log->dlf_garbage_count += offsetof(XTactExtRecEntryDRec, er_data) + size;
 		ASSERT_NS(dlb_data_log->dlf_garbage_count < dlb_data_log->dlf_log_eof);
-		if (!dl_write_log_header(dlb_data_log, dlb_data_log->dlf_log_file, FALSE, thread)) {
+		if (!dl_write_garbage_level(dlb_data_log, dlb_data_log->dlf_log_file, FALSE, thread)) {
 			xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
 			return FAILED;
 		}
@@ -1329,7 +1418,7 @@ xtBool XTDataLogBuffer::dlb_delete_log(x
 	xt_lock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
 	data_log->dlf_garbage_count += offsetof(XTactExtRecEntryDRec, er_data) + size;
 	ASSERT_NS(data_log->dlf_garbage_count < data_log->dlf_log_eof);
-	if (!dl_write_log_header(data_log, open_log->odl_log_file, FALSE, thread)) {
+	if (!dl_write_garbage_level(data_log, open_log->odl_log_file, FALSE, thread)) {
 		xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
 		goto failed;
 	}
@@ -1357,7 +1446,7 @@ xtBool XTDataLogBuffer::dlb_delete_log(x
  * Delete all the extended data belonging to a particular
  * table.
  */
-xtPublic void xt_dl_delete_ext_data(XTThreadPtr self, XTTableHPtr tab, xtBool missing_ok __attribute__((unused)), xtBool have_table_lock)
+xtPublic void xt_dl_delete_ext_data(XTThreadPtr self, XTTableHPtr tab, xtBool XT_UNUSED(missing_ok), xtBool have_table_lock)
 {
 	XTOpenTablePtr	ot;
 	xtRecordID		page_rec_id, offs_rec_id;
@@ -1674,7 +1763,7 @@ static xtBool dl_collect_garbage(XTThrea
 			xt_lock_mutex_ns(&db->db_datalogs.dlc_head_lock);
 			data_log->dlf_garbage_count += garbage_count;
 			ASSERT(data_log->dlf_garbage_count < data_log->dlf_log_eof);
-			if (!dl_write_log_header(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
+			if (!dl_write_garbage_level(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
 				xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
 				xt_throw(self);
 			}
@@ -1683,7 +1772,7 @@ static xtBool dl_collect_garbage(XTThrea
 			freer_(); // dl_free_compactor_state(&cs)
 			return FAILED;
 		}
-		if (!cs.cs_seqread->sl_seq_next(&record, TRUE, self))
+		if (!cs.cs_seqread->sl_seq_next(&record, self))
 			xt_throw(self);
 		cs.cs_seqread->sl_seq_pos(&curr_log_id, &curr_log_offset);
 		if (!record) {
@@ -1809,7 +1898,7 @@ static xtBool dl_collect_garbage(XTThrea
 	xt_lock_mutex_ns(&db->db_datalogs.dlc_head_lock);
 	data_log->dlf_garbage_count += garbage_count;
 	ASSERT(data_log->dlf_garbage_count < data_log->dlf_log_eof);
-	if (!dl_write_log_header(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
+	if (!dl_write_garbage_level(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
 		xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
 		xt_throw(self);
 	}

=== modified file 'storage/pbxt/src/datalog_xt.h'
--- a/storage/pbxt/src/datalog_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/datalog_xt.h	2009-08-18 07:46:53 +0000
@@ -183,8 +183,8 @@ typedef struct XTSeqLogRead {
 	virtual xtBool			sl_rnd_read(xtLogOffset log_offset, size_t size, xtWord1 *data, size_t *read, struct XTThread *thread) {
 		(void) log_offset; (void) size; (void) data; (void) read; (void) thread; return OK;
 	};
-	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, xtBool verify, struct XTThread *thread) {
-		(void) entry; (void) verify; (void) thread; return OK;
+	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, struct XTThread *thread) {
+		(void) entry; (void) thread; return OK;
 	};
 	virtual void			sl_seq_skip(size_t size) { (void) size; }
 } XTSeqLogReadRec, *XTSeqLogReadPtr;
@@ -195,6 +195,7 @@ typedef struct XTDataSeqRead : public XT
 	xtLogOffset				sl_rec_log_offset;	/* The current log read position. */
 	size_t					sl_record_len;		/* The length of the current record. */
 	xtLogOffset				sl_log_eof;
+	xtLogOffset				sl_extra_garbage;	/* Garbage found during a scan. */
 
 	size_t					sl_buffer_size;		/* Size of the buffer. */
 	xtLogOffset				sl_buf_log_offset;	/* File offset of the buffer. */
@@ -208,7 +209,7 @@ typedef struct XTDataSeqRead : public XT
 	virtual void			sl_seq_pos(xtLogID *log_id, xtLogOffset *log_offset);
 	virtual xtBool			sl_seq_start(xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok);
 	virtual xtBool			sl_rnd_read(xtLogOffset log_offset, size_t size, xtWord1 *data, size_t *read, struct XTThread *thread);
-	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, xtBool verify, struct XTThread *thread);
+	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, struct XTThread *thread);
 	virtual void			sl_seq_skip(size_t size);
 	virtual void			sl_seq_skip_to(off_t offset);
 } XTDataSeqReadRec, *XTDataSeqReadPtr;

=== modified file 'storage/pbxt/src/discover_xt.cc'
--- a/storage/pbxt/src/discover_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/discover_xt.cc	2009-08-17 11:12:36 +0000
@@ -1282,9 +1282,11 @@ warn:
 #endif // LOCK_OPEN_HACK_REQUIRED
 
 //------------------------------
-int xt_create_table_frm(handlerton *hton, THD* thd, const char *db, const char *name, DT_FIELD_INFO *info, DT_KEY_INFO *keys __attribute__((unused)), xtBool skip_existing)
+int xt_create_table_frm(handlerton *hton, THD* thd, const char *db, const char *name, DT_FIELD_INFO *info, DT_KEY_INFO *XT_UNUSED(keys), xtBool skip_existing)
 {
 #ifdef DRIZZLED
+    drizzled::message::Table table_proto;
+
 	static const char *ext = ".dfe";
 	static const int ext_len = 4;
 #else
@@ -1329,8 +1331,7 @@ int xt_create_table_frm(handlerton *hton
 			info->field_flags,
             COLUMN_FORMAT_TYPE_FIXED,
 		    NULL /*default_value*/, NULL /*on_update_value*/, &comment, NULL /*change*/,
-            NULL /*interval_list*/, info->field_charset,
-			NULL /*vcol_info*/))
+            NULL /*interval_list*/, info->field_charset))
 #else
 		if (add_field_to_list(thd, &field_name, info->field_type, field_length_ptr, info->field_decimal_length,
 			info->field_flags,
@@ -1365,7 +1366,10 @@ int xt_create_table_frm(handlerton *hton
 	
 	/* Create an internal temp table */
 #ifdef DRIZZLED
-	if (mysql_create_table_no_lock(thd, db, name, &mylex.create_info, &mylex.alter_info, 1, 0, false)) 
+    table_proto.set_name(name);
+    table_proto.set_type(drizzled::message::Table::STANDARD);
+
+	if (mysql_create_table_no_lock(thd, db, name, &mylex.create_info, &table_proto, &mylex.alter_info, 1, 0, false)) 
 		goto error;
 #else
 	if (mysql_create_table_no_lock(thd, db, name, &mylex.create_info, &mylex.alter_info, 1, 0)) 

=== modified file 'storage/pbxt/src/filesys_xt.cc'
--- a/storage/pbxt/src/filesys_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/filesys_xt.cc	2009-08-18 07:46:53 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #ifndef XT_WIN
 #include <unistd.h>
 #include <dirent.h>
@@ -50,6 +54,7 @@
 //#define DEBUG_TRACE_IO
 //#define DEBUG_TRACE_MAP_IO
 //#define DEBUG_TRACE_FILES
+//#define INJECT_WRITE_REMAP_ERROR
 #endif
 
 #ifdef DEBUG_TRACE_FILES
@@ -57,6 +62,11 @@
 #define PRINTF		xt_trace
 #endif
 
+#ifdef INJECT_WRITE_REMAP_ERROR
+#define INJECT_REMAP_FILE_SIZE			1000000
+#define INJECT_REMAP_FILE_TYPE			"xtd"
+#endif
+
 /* ----------------------------------------------------------------------
  * Globals
  */
@@ -127,11 +137,11 @@ static void fs_close_fmap(XTThreadPtr se
 		mm->mm_start = NULL;
 	}
 #endif
-	xt_rwmutex_free(self, &mm->mm_lock);
+	FILE_MAP_FREE_LOCK(self, &mm->mm_lock);
 	xt_free(self, mm);
 }
 
-static void fs_free_file(XTThreadPtr self, void *thunk __attribute__((unused)), void *item)
+static void fs_free_file(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
 {
 	XTFilePtr	file_ptr = *((XTFilePtr *) item);
 
@@ -148,17 +158,13 @@ static void fs_free_file(XTThreadPtr sel
 		file_ptr->fil_filedes = XT_NULL_FD;
 	}
 
-	if (file_ptr->fil_memmap) {
-		fs_close_fmap(self, file_ptr->fil_memmap);
-		file_ptr->fil_memmap = NULL;
-	}
-
 #ifdef DEBUG_TRACE_FILES
 	PRINTF("%s: free file: (%d) %s\n", self->t_name, (int) file_ptr->fil_id, 
 		file_ptr->fil_path ? xt_last_2_names_of_path(file_ptr->fil_path) : "?");
 #endif
 
 	if (!file_ptr->fil_ref_count) {
+		ASSERT_NS(!file_ptr->fil_handle_count);
 		/* Flush any cache before this file is invalid: */
 		if (file_ptr->fil_path) {
 			xt_free(self, file_ptr->fil_path);
@@ -169,7 +175,7 @@ static void fs_free_file(XTThreadPtr sel
 	}
 }
 
-static int fs_comp_file(XTThreadPtr self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int fs_comp_file(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	char		*file_name = (char *) a;
 	XTFilePtr	file_ptr = *((XTFilePtr *) b);
@@ -177,7 +183,7 @@ static int fs_comp_file(XTThreadPtr self
 	return strcmp(file_name, file_ptr->fil_path);
 }
 
-static int fs_comp_file_ci(XTThreadPtr self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int fs_comp_file_ci(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	char		*file_name = (char *) a;
 	XTFilePtr	file_ptr = *((XTFilePtr *) b);
@@ -868,11 +874,22 @@ xtPublic xtBool xt_flush_file(XTOpenFile
 		goto failed;
 	}
 #else
+	/* Mac OS X has problems with fsync. We had several cases of index corruption presumably because
+	 * fsync didn't really flush index pages to disk. fcntl(F_FULLFSYNC) is considered more effective 
+	 * in such cases.
+	 */
+#ifdef F_FULLFSYNC
+	if (fcntl(of->of_filedes, F_FULLFSYNC, 0) == -1) {
+		xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(of));
+		goto failed;
+	}
+#else
 	if (fsync(of->of_filedes) == -1) {
 		xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(of));
 		goto failed;
 	}
 #endif
+#endif
 #ifdef DEBUG_TRACE_IO
 	xt_trace("/* %s */ pbxt_file_sync(\"%s\");\n", xt_trace_clock_diff(timef, start), of->fr_file->fil_path);
 #endif
@@ -938,6 +955,29 @@ xtBool xt_pread_file(XTOpenFilePtr of, o
 	return OK;
 }
 
+xtPublic xtBool xt_lock_file_ptr(XTOpenFilePtr of, xtWord1 **data, off_t offset, size_t size, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	size_t red_size;
+
+	if (!*data) {
+		if (!(*data = (xtWord1 *) xt_malloc_ns(size)))
+			return FAILED;
+	}
+
+	if (!xt_pread_file(of, offset, size, 0, *data, &red_size, stat, thread))
+		return FAILED;
+	
+	//if (red_size < size)
+	//	memset();
+	return OK;
+}
+
+xtPublic void xt_unlock_file_ptr(XTOpenFilePtr XT_UNUSED(of), xtWord1 *data, XTThreadPtr XT_UNUSED(thread))
+{
+	if (data)
+		xt_free_ns(data);
+}
+
 /* ----------------------------------------------------------------------
  * Directory operations
  */
@@ -949,7 +989,13 @@ XTOpenDirPtr xt_dir_open(XTThreadPtr sel
 {
 	XTOpenDirPtr	od;
 
-	pushsr_(od, xt_dir_close, (XTOpenDirPtr) xt_calloc(self, sizeof(XTOpenDirRec)));
+#ifdef XT_SOLARIS
+	/* see the comment in filesys_xt.h */
+	size_t sz = pathconf(path, _PC_NAME_MAX) + sizeof(XTOpenDirRec) + 1;
+#else
+	size_t sz = sizeof(XTOpenDirRec);
+#endif
+	pushsr_(od, xt_dir_close, (XTOpenDirPtr) xt_calloc(self, sz));
 
 #ifdef XT_WIN
 	size_t			len;
@@ -976,7 +1022,6 @@ XTOpenDirPtr xt_dir_open(XTThreadPtr sel
 	if (!od->od_dir)
 		xt_throw_ferrno(XT_CONTEXT, errno, path);
 #endif
-
 	popr_(); // Discard xt_dir_close(od)
 	return od;
 }
@@ -1097,7 +1142,7 @@ xtBool xt_dir_next(XTThreadPtr self, XTO
 }
 #endif
 
-char *xt_dir_name(XTThreadPtr self __attribute__((unused)), XTOpenDirPtr od)
+char *xt_dir_name(XTThreadPtr XT_UNUSED(self), XTOpenDirPtr od)
 {
 #ifdef XT_WIN
 	return od->od_data.cFileName;
@@ -1106,8 +1151,9 @@ char *xt_dir_name(XTThreadPtr self __att
 #endif
 }
 
-xtBool xt_dir_is_file(XTThreadPtr self __attribute__((unused)), XTOpenDirPtr od)
+xtBool xt_dir_is_file(XTThreadPtr self, XTOpenDirPtr od)
 {
+	(void) self;
 #ifdef XT_WIN
 	if (od->od_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
 		return FALSE;
@@ -1156,6 +1202,15 @@ off_t xt_dir_file_size(XTThreadPtr self,
 
 static xtBool fs_map_file(XTFileMemMapPtr mm, XTFilePtr file, xtBool grow)
 {
+#ifdef INJECT_WRITE_REMAP_ERROR
+	if (xt_is_extension(file->fil_path, INJECT_REMAP_FILE_TYPE)) {
+		if (mm->mm_length > INJECT_REMAP_FILE_SIZE) {
+			xt_register_ferrno(XT_REG_CONTEXT, 30, file->fil_path);
+			return FAILED;
+		}
+	}
+#endif
+
 	ASSERT_NS(!mm->mm_start);
 #ifdef XT_WIN
 	/* This will grow the file to the given size: */
@@ -1228,7 +1283,7 @@ xtPublic XTMapFilePtr xt_open_fmap(XTThr
 			/* NULL is the value returned on error! */
 			mm->mm_mapdes = NULL;
 #endif
-			xt_rwmutex_init_with_autoname(self, &mm->mm_lock);
+			FILE_MAP_INIT_LOCK(self, &mm->mm_lock);
 			mm->mm_length = fs_seek_eof(self, map->fr_file->fil_filedes, map->fr_file);
 			if (sizeof(size_t) == 4 && mm->mm_length >= (off_t) 0xFFFFFFFF)
 				xt_throw_ixterr(XT_CONTEXT, XT_ERR_FILE_TOO_LONG, map->fr_file->fil_path);
@@ -1257,21 +1312,19 @@ xtPublic XTMapFilePtr xt_open_fmap(XTThr
 
 xtPublic void xt_close_fmap(XTThreadPtr self, XTMapFilePtr map)
 {
+	ASSERT_NS(!map->mf_slock_count);
 	if (map->fr_file) {
-		xt_fs_release_file(self, map->fr_file);
-
 		xt_sl_lock(self, fs_globals.fsg_open_files);
-		pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
-		
+		pushr_(xt_sl_unlock, fs_globals.fsg_open_files);		
 		map->fr_file->fil_handle_count--;
-		if (!map->fr_file->fil_handle_count)
-			fs_free_file(self, NULL, &map->fr_file);
-
+		if (!map->fr_file->fil_handle_count) {
+			fs_close_fmap(self, map->fr_file->fil_memmap);
+			map->fr_file->fil_memmap = NULL;
+		}
 		freer_();
-
+		
+		xt_fs_release_file(self, map->fr_file);
 		map->fr_file = NULL;
-
-
 	}
 	map->mf_memmap = NULL;
 	xt_free(self, map);
@@ -1346,14 +1399,23 @@ static xtBool fs_remap_file(XTMapFilePtr
 		}
 		mm->mm_start = NULL;
 #ifdef XT_WIN
-		if (!CloseHandle(mm->mm_mapdes))
+		/* It is possible that a previous remap attempt has failed: the map was closed
+		 * but the new map was not allocated (e.g. because of insufficient disk space). 
+		 * In this case mm->mm_mapdes will be NULL.
+		 */
+		if (mm->mm_mapdes && !CloseHandle(mm->mm_mapdes))
 			return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(map));
 		mm->mm_mapdes = NULL;
 #endif
+		off_t old_size = mm->mm_length;
 		mm->mm_length = new_size;
 
-		if (!fs_map_file(mm, map->fr_file, TRUE))
+		if (!fs_map_file(mm, map->fr_file, TRUE)) {
+			/* Try to restore old mapping */
+			mm->mm_length = old_size;
+			fs_map_file(mm, map->fr_file, FALSE);
 			return FAILED;
+		}
 	}
 	return OK;
 	
@@ -1367,16 +1429,19 @@ static xtBool fs_remap_file(XTMapFilePtr
 xtPublic xtBool xt_pwrite_fmap(XTMapFilePtr map, off_t offset, size_t size, void *data, XTIOStatsPtr stat, XTThreadPtr thread)
 {
 	XTFileMemMapPtr mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
 	xtThreadID		thd_id = thread->t_id;
+#endif
 
 #ifdef DEBUG_TRACE_MAP_IO
 	xt_trace("/* %s */ pbxt_fmap_writ(\"%s\", %lu, %lu);\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path, (u_long) offset, (u_long) size);
 #endif
-	xt_rwmutex_slock(&mm->mm_lock, thd_id);
+	ASSERT_NS(!map->mf_slock_count);
+	FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
 	if (!mm->mm_start || offset + (off_t) size > mm->mm_length) {
-		xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 
-		xt_rwmutex_xlock(&mm->mm_lock, thd_id);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
 		if (!fs_remap_file(map, offset, size, stat))
 			goto failed;
 	}
@@ -1396,29 +1461,32 @@ xtPublic xtBool xt_pwrite_fmap(XTMapFile
 	memcpy(mm->mm_start + offset, data, size);
 #endif
 
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	stat->ts_write += size;
 	return OK;
 
 	failed:
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	return FAILED;
 }
 
 xtPublic xtBool xt_pread_fmap_4(XTMapFilePtr map, off_t offset, xtWord4 *value, XTIOStatsPtr stat, XTThreadPtr thread)
 {
 	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
 	xtThreadID		thd_id = thread->t_id;
+#endif
 
 #ifdef DEBUG_TRACE_MAP_IO
 	xt_trace("/* %s */ pbxt_fmap_read_4(\"%s\", %lu, 4);\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path, (u_long) offset);
 #endif
-	xt_rwmutex_slock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
 	if (!mm->mm_start) {
-		xt_rwmutex_unlock(&mm->mm_lock, thd_id);
-		xt_rwmutex_xlock(&mm->mm_lock, thd_id);
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
 		if (!fs_remap_file(map, 0, 0, stat)) {
-			xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+			FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 			return FAILED;
 		}
 	}
@@ -1436,7 +1504,7 @@ xtPublic xtBool xt_pread_fmap_4(XTMapFil
 		}
 		__except(EXCEPTION_EXECUTE_HANDLER)
 		{
-			xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+			FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 			return xt_register_ferrno(XT_REG_CONTEXT, GetExceptionCode(), xt_file_path(map));
 		}
 #else
@@ -1444,7 +1512,8 @@ xtPublic xtBool xt_pread_fmap_4(XTMapFil
 #endif
 	}
 
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	stat->ts_read += 4;
 	return OK;
 }
@@ -1452,7 +1521,9 @@ xtPublic xtBool xt_pread_fmap_4(XTMapFil
 xtPublic xtBool xt_pread_fmap(XTMapFilePtr map, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, XTIOStatsPtr stat, XTThreadPtr thread)
 {
 	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
 	xtThreadID		thd_id = thread->t_id;
+#endif
 	size_t			tfer;
 
 #ifdef DEBUG_TRACE_MAP_IO
@@ -1461,6 +1532,8 @@ xtPublic xtBool xt_pread_fmap(XTMapFileP
 	/* NOTE!! The file map may already be locked,
 	 * by a call to xt_lock_fmap_ptr()!
 	 *
+	 * 20.05.2009: This problem should be fixed now with mf_slock_count!
+	 *
 	 * This can occur during a sequential scan:
 	 * xt_pread_fmap()  Line 1330
 	 * XTTabCache::tc_read_direct()  Line 361
@@ -1491,13 +1564,16 @@ xtPublic xtBool xt_pread_fmap(XTMapFileP
 	 * As a result, the slock must be able to handle
 	 * nested calls to lock/unlock.
 	 */
-	xt_rwmutex_slock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
 	tfer = size;
 	if (!mm->mm_start) {
-		xt_rwmutex_unlock(&mm->mm_lock, thd_id);
-		xt_rwmutex_xlock(&mm->mm_lock, thd_id);
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		ASSERT_NS(!map->mf_slock_count);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
 		if (!fs_remap_file(map, 0, 0, stat)) {
-			xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 			return FAILED;
 		}
 	}
@@ -1514,7 +1590,8 @@ xtPublic xtBool xt_pread_fmap(XTMapFileP
 		}
 		__except(EXCEPTION_EXECUTE_HANDLER)
 		{
-			xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 			return xt_register_ferrno(XT_REG_CONTEXT, GetExceptionCode(), xt_file_path(map));
 		}
 #else
@@ -1522,7 +1599,8 @@ xtPublic xtBool xt_pread_fmap(XTMapFileP
 #endif
 	}
 
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	if (tfer < min_size)
 		return xt_register_ferrno(XT_REG_CONTEXT, ESPIPE, xt_file_path(map));
 
@@ -1535,18 +1613,23 @@ xtPublic xtBool xt_pread_fmap(XTMapFileP
 xtPublic xtBool xt_flush_fmap(XTMapFilePtr map, XTIOStatsPtr stat, XTThreadPtr thread)
 {
 	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
 	xtThreadID		thd_id = thread->t_id;
+#endif
 	xtWord8			s;
 
 #ifdef DEBUG_TRACE_MAP_IO
 	xt_trace("/* %s */ pbxt_fmap_sync(\"%s\");\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path);
 #endif
-	xt_rwmutex_slock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
 	if (!mm->mm_start) {
-		xt_rwmutex_unlock(&mm->mm_lock, thd_id);
-		xt_rwmutex_xlock(&mm->mm_lock, thd_id);
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		ASSERT_NS(!map->mf_slock_count);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
 		if (!fs_remap_file(map, 0, 0, stat)) {
-			xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 			return FAILED;
 		}
 	}
@@ -1562,7 +1645,8 @@ xtPublic xtBool xt_flush_fmap(XTMapFileP
 		goto failed;
 	}
 #endif
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	s = stat->ts_flush_start;
 	stat->ts_flush_start = 0;
 	stat->ts_flush_time += xt_trace_clock() - s;
@@ -1570,22 +1654,27 @@ xtPublic xtBool xt_flush_fmap(XTMapFileP
 	return OK;
 
 	failed:
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	s = stat->ts_flush_start;
 	stat->ts_flush_start = 0;
 	stat->ts_flush_time += xt_trace_clock() - s;
 	return FAILED;
 }
 
-xtPublic xtWord1 *xt_lock_fmap_ptr(XTMapFilePtr map, off_t offset, size_t size, XTIOStatsPtr stat, XTThreadPtr XT_UNUSED(thread))
+xtPublic xtWord1 *xt_lock_fmap_ptr(XTMapFilePtr map, off_t offset, size_t size, XTIOStatsPtr stat, XTThreadPtr thread)
 {
 	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
 	xtThreadID		thd_id = thread->t_id;
+#endif
 
-	xt_rwmutex_slock(&mm->mm_lock, thd_id);
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	map->mf_slock_count++;
 	if (!mm->mm_start) {
-		xt_rwmutex_unlock(&mm->mm_lock, thd_id);
-		xt_rwmutex_xlock(&mm->mm_lock, thd_id);
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
 		if (!fs_remap_file(map, 0, 0, stat))
 			goto failed;
 	}
@@ -1599,13 +1688,17 @@ xtPublic xtWord1 *xt_lock_fmap_ptr(XTMap
 	return mm->mm_start + offset;
 
 	failed:
-	xt_rwmutex_unlock(&mm->mm_lock, thd_id);
+	map->mf_slock_count--;
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
 	return NULL;
 }
 
 xtPublic void xt_unlock_fmap_ptr(XTMapFilePtr map, XTThreadPtr thread)
 {
-	xt_rwmutex_unlock(&map->mf_memmap->mm_lock, thread->t_id);
+	map->mf_slock_count--;
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&map->mf_memmap->mm_lock, thread->t_id);
 }
 
 /* ----------------------------------------------------------------------

=== modified file 'storage/pbxt/src/filesys_xt.h'
--- a/storage/pbxt/src/filesys_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/filesys_xt.h	2009-08-17 11:12:36 +0000
@@ -76,13 +76,60 @@ xtBool			xt_fs_rename(struct XTThread *s
 #define XT_NULL_FD	(-1)
 #endif
 
+/* Note, this lock must be re-entrant,
+ * The only lock that satisfies this is
+ * FILE_MAP_USE_RWMUTEX!
+ *
+ * 20.05.2009: This problem should be fixed now with mf_slock_count!
+ *
+ * The lock need no longer be re-entrant
+ */
+#ifdef XT_NO_ATOMICS
+#define FILE_MAP_USE_PTHREAD_RW
+#else
+//#define FILE_MAP_USE_RWMUTEX
+//#define FILE_MAP_USE_PTHREAD_RW
+//#define FILE_MAP_USE_SPINXSLOCK
+#define FILE_MAP_USE_XSMUTEX
+#endif
+
+#ifdef FILE_MAP_USE_XSMUTEX
+#define FILE_MAP_LOCK_TYPE				XTXSMutexRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_xsmutex_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_xsmutex_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_xsmutex_xlock(i, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_xsmutex_unlock(i, o)
+#elif defined(FILE_MAP_USE_PTHREAD_RW)
+#define FILE_MAP_LOCK_TYPE				xt_rwlock_type
+#define FILE_MAP_INIT_LOCK(s, i)		xt_init_rwlock(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_free_rwlock(i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_slock_rwlock_ns(i)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(i)
+#define FILE_MAP_UNLOCK(i, o)			xt_unlock_rwlock_ns(i)
+#elif defined(FILE_MAP_USE_RWMUTEX)
+#define FILE_MAP_LOCK_TYPE				XTRWMutexRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_rwmutex_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_rwmutex_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_rwmutex_xlock(i, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_rwmutex_unlock(i, o)
+#elif defined(FILE_MAP_USE_SPINXSLOCK)
+#define FILE_MAP_LOCK_TYPE				XTSpinXSLockRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_spinxslock_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_spinxslock_unlock(i, o)
+#endif
+
 typedef struct XTFileMemMap {
 	xtWord1				*mm_start;			/* The in-memory start of the map. */
 #ifdef XT_WIN
 	HANDLE				mm_mapdes;
 #endif
 	off_t				mm_length;			/* The length of the file map. */
-	XTRWMutexRec		mm_lock;			/* The file map R/W lock. */
+	FILE_MAP_LOCK_TYPE	mm_lock;			/* The file map R/W lock. */
 	size_t				mm_grow_size;		/* The amount by which the map file is increased. */
 } XTFileMemMapRec, *XTFileMemMapPtr;
 
@@ -127,6 +174,9 @@ xtBool			xt_pwrite_file(XTOpenFilePtr of
 xtBool			xt_pread_file(XTOpenFilePtr of, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, struct XTIOStats *timer, struct XTThread *thread);
 xtBool			xt_flush_file(XTOpenFilePtr of, struct XTIOStats *timer, struct XTThread *thread);
 
+xtBool			xt_lock_file_ptr(XTOpenFilePtr of, xtWord1 **data, off_t offset, size_t size, struct XTIOStats *timer, struct XTThread *thread);
+void			xt_unlock_file_ptr(XTOpenFilePtr of, xtWord1 *data, struct XTThread *thread);
+
 typedef struct XTOpenDir {
 	char				*od_path;
 #ifdef XT_WIN
@@ -134,8 +184,14 @@ typedef struct XTOpenDir {
 	WIN32_FIND_DATA		od_data;
 #else
 	char				*od_filter;
-	struct dirent		od_entry;
 	DIR					*od_dir;
+	/* WARNING: Solaris requires od_entry.d_name member to have size at least as returned
+	 * by pathconf() function on per-directory basis. This makes it impossible to statically
+	 * pre-set the size. So xt_dir_open on Solaris dynamically allocates space as needed. 
+	 *
+	 * This also means that the od_entry member should always be last in the XTOpenDir structure.
+	 */
+	struct dirent		od_entry;
 #endif
 } XTOpenDirRec, *XTOpenDirPtr;
 
@@ -147,6 +203,7 @@ xtBool			xt_dir_is_file(struct XTThread 
 off_t			xt_dir_file_size(struct XTThread *self, XTOpenDirPtr od);
 
 typedef struct XTMapFile : public XTFileRef {
+	u_int				mf_slock_count;
 	XTFileMemMapPtr		mf_memmap;
 } XTMapFileRec, *XTMapFilePtr;
 

=== modified file 'storage/pbxt/src/ha_pbxt.cc'
--- a/storage/pbxt/src/ha_pbxt.cc	2009-04-02 11:49:57 +0000
+++ b/storage/pbxt/src/ha_pbxt.cc	2009-08-18 07:46:53 +0000
@@ -65,8 +65,8 @@ extern "C" char **session_query(Session 
 #include "heap_xt.h"
 #include "myxt_xt.h"
 #include "datadic_xt.h"
-#ifdef XT_STREAMING
-#include "streaming_xt.h"
+#ifdef PBMS_ENABLED
+#include "pbms_enabled.h"
 #endif
 #include "tabcache_xt.h"
 #include "systab_xt.h"
@@ -91,20 +91,22 @@ extern "C" char **session_query(Session 
 //#define PRINT_STATEMENTS
 #endif
 
+#ifndef DRIZZLED
 static handler	*pbxt_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root);
 static int		pbxt_init(void *p);
 static int		pbxt_end(void *p);
-#ifndef DRIZZLED
 static int		pbxt_panic(handlerton *hton, enum ha_panic_function flag);
-#endif
 static void		pbxt_drop_database(handlerton *hton, char *path);
 static int		pbxt_close_connection(handlerton *hton, THD* thd);
 static int		pbxt_commit(handlerton *hton, THD *thd, bool all);
 static int		pbxt_rollback(handlerton *hton, THD *thd, bool all);
+#endif
 static void		ha_aquire_exclusive_use(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine);
 static void		ha_release_exclusive_use(XTThreadPtr self, XTSharePtr share);
 static void		ha_close_open_tables(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine);
 
+extern void		xt_xres_start_database_recovery(XTThreadPtr self);
+
 #ifdef TRACE_STATEMENTS
 
 #ifdef PRINT_STATEMENTS
@@ -167,7 +169,7 @@ xtBool					pbxt_crash_debug = FALSE;
 /* Variables for pbxt share methods */
 static xt_mutex_type	pbxt_database_mutex;		// Prevent a database from being opened while it is being dropped
 static XTHashTabPtr		pbxt_share_tables;			// Hash used to track open tables
-static XTDatabaseHPtr	pbxt_database = NULL;		// The global open database
+XTDatabaseHPtr			pbxt_database = NULL;		// The global open database
 static char				*pbxt_index_cache_size;
 static char				*pbxt_record_cache_size;
 static char				*pbxt_log_cache_size;
@@ -178,6 +180,7 @@ static char				*pbxt_checkpoint_frequenc
 static char				*pbxt_data_log_threshold;
 static char				*pbxt_data_file_grow_size;
 static char				*pbxt_row_file_grow_size;
+static int				pbxt_max_threads;
 
 #ifdef DEBUG
 #define XT_SHARE_LOCK_WAIT		5000
@@ -454,7 +457,7 @@ xtPublic void xt_ha_close_global_databas
  * operation to make sure it does not occur while
  * some other thread is doing a "closeall".
  */
-xtPublic void xt_ha_open_database_of_table(XTThreadPtr self, XTPathStrPtr table_path __attribute__((unused)))
+xtPublic void xt_ha_open_database_of_table(XTThreadPtr self, XTPathStrPtr XT_UNUSED(table_path))
 {
 #ifdef XT_USE_GLOBAL_DB
 	if (!self->st_database) {
@@ -574,7 +577,7 @@ xtPublic XTThreadPtr xt_ha_thd_to_self(T
 }
 
 /* The first bit is 1. */
-static u_int ha_get_max_bit(MY_BITMAP *map)
+static u_int ha_get_max_bit(MX_BITMAP *map)
 {
 	my_bitmap_map	*data_ptr = map->bitmap;
 	my_bitmap_map	*end_ptr = map->last_word_ptr;
@@ -676,7 +679,7 @@ xtPublic int xt_ha_pbxt_to_mysql_error(i
 	return(-1);			// Unknown error
 }
 
-xtPublic int xt_ha_pbxt_thread_error_for_mysql(THD *thd __attribute__((unused)), const XTThreadPtr self, int ignore_dup_key)
+xtPublic int xt_ha_pbxt_thread_error_for_mysql(THD *XT_UNUSED(thd), const XTThreadPtr self, int ignore_dup_key)
 {
 	int xt_err = self->t_exception.e_xt_err;
 
@@ -965,8 +968,8 @@ static void ha_exit(XTThreadPtr self)
 	/* This may cause the streaming engine to cleanup connections and 
 	 * tables belonging to this engine. This in turn may require some of
 	 * the stuff below (like xt_create_thread() called from pbxt_close_table()! */
-#ifdef XT_STREAMING
-	xt_exit_streaming();
+#ifdef PBMS_ENABLED
+	pbms_finalize();
 #endif
 	pbxt_call_exit(self);
 	xt_exit_threading(self);
@@ -979,9 +982,13 @@ static void ha_exit(XTThreadPtr self)
 /*
  * Outout the PBXT status. Return FALSE on error.
  */
-static bool pbxt_show_status(handlerton *hton __attribute__((unused)), THD* thd, 
+#ifdef DRIZZLED
+bool PBXTStorageEngine::show_status(Session *thd, stat_print_fn *stat_print, enum ha_stat_type)
+#else
+static bool pbxt_show_status(handlerton *XT_UNUSED(hton), THD* thd, 
                           stat_print_fn* stat_print,
-                          enum ha_stat_type stat_type __attribute__((unused)))
+                          enum ha_stat_type XT_UNUSED(stat_type))
+#endif
 {
 	XTThreadPtr			self;	
 	int					err = 0;
@@ -997,6 +1004,9 @@ static bool pbxt_show_status(handlerton 
 	xt_trace("// %s - dump\n", xt_trace_clock_diff(NULL));
 	xt_dump_trace();
 #endif
+#ifdef XT_TRACK_CONNECTIONS
+	xt_dump_conn_tracking();
+#endif
 
 	try_(a) {
 		myxt_get_status(self, &strbuf);
@@ -1020,14 +1030,18 @@ static bool pbxt_show_status(handlerton 
  *
  * return 1 on error, else 0.
  */
+#ifdef DRIZZLED
+static int pbxt_init(PluginRegistry &registry)
+#else
 static int pbxt_init(void *p)
+#endif
 {
 	int init_err = 0;
 
 	XT_TRACE_CALL();
 
 	if (sizeof(xtWordPS) != sizeof(void *)) {
-		printf("PBXT: This won't work, I require that sizeof(xtWordPS) != sizeof(void *)!\n");
+		printf("PBXT: This won't work, I require that sizeof(xtWordPS) == sizeof(void *)!\n");
 		XT_RETURN(1);
 	}
 
@@ -1045,28 +1059,31 @@ static int pbxt_init(void *p)
 
  		xt_p_mutex_init_with_autoname(&pbxt_database_mutex, NULL);
 
+#ifdef DRIZZLED
+		pbxt_hton= new PBXTStorageEngine(std::string("PBXT"));
+		registry.add(pbxt_hton);
+#else
 		pbxt_hton = (handlerton *) p;
 		pbxt_hton->state = SHOW_OPTION_YES;
-#ifndef DRIZZLED
 		pbxt_hton->db_type = DB_TYPE_PBXT; // Wow! I have my own!
-#endif
 		pbxt_hton->close_connection = pbxt_close_connection; /* close_connection, cleanup thread related data. */
 		pbxt_hton->commit = pbxt_commit; /* commit */
 		pbxt_hton->rollback = pbxt_rollback; /* rollback */
 		pbxt_hton->create = pbxt_create_handler; /* Create a new handler */
 		pbxt_hton->drop_database = pbxt_drop_database; /* Drop a database */
-#ifndef DRIZZLED
 		pbxt_hton->panic = pbxt_panic; /* Panic call */
-#endif
 		pbxt_hton->show_status = pbxt_show_status;
 		pbxt_hton->flags = HTON_NO_FLAGS; /* HTON_CAN_RECREATE - Without this flags TRUNCATE uses delete_all_rows() */
-
+#endif
 		if (!xt_init_logging())					/* Initialize logging */
 			goto error_1;
 
-#ifdef XT_STREAMING
-		if (!xt_init_streaming())
+#ifdef PBMS_ENABLED
+		PBMSResultRec result;
+		if (!pbms_initialize("PBXT", false, &result)) {
+			xt_logf(XT_NT_ERROR, "pbms_initialize() Error: %s", result.mr_message);
 			goto error_2;
+		}
 #endif
 
 		if (!xt_init_memory())					/* Initialize memory */
@@ -1082,9 +1099,13 @@ static int pbxt_init(void *p)
 		 * +1 Free'er thread
 		 * +1 Temporary thread (e.g. TempForClose, TempForEnd)
 		 */
-		self = xt_init_threading(max_connections + 7);				/* Create the main self: */
+#ifndef DRIZZLED
+		if (pbxt_max_threads == 0)
+			pbxt_max_threads = max_connections + 7;
+#endif
+		self = xt_init_threading(pbxt_max_threads);				/* Create the main self: */
 		if (!self)
-			goto error_4;
+			goto error_3;
 
  		pbxt_inited = true;
 
@@ -1111,7 +1132,7 @@ static int pbxt_init(void *p)
 			ASSERT(!pbxt_database);
 			{
 				THD *curr_thd = current_thd;
-				THD *thd = curr_thd;
+				THD *thd = NULL;
 
 #ifndef DRIZZLED
 				extern myxt_mutex_t LOCK_plugin;
@@ -1148,21 +1169,23 @@ static int pbxt_init(void *p)
 							xt_throw(self);
 					}
 
-					xt_open_database(self, mysql_real_data_home, TRUE);
-					pbxt_database = self->st_database;
-					xt_heap_reference(self, pbxt_database);
+					xt_xres_start_database_recovery(self);
 				}
 				catch_(b) {
-					if (!curr_thd && thd)
-						myxt_destroy_thread(thd, FALSE);
-#ifndef DRIZZLED
-					myxt_mutex_lock(&LOCK_plugin);
-#endif
-					xt_throw(self);
+					/* It is possible that the error was reset by cleanup code.
+					 * Set a generic error code in that case.
+					 */
+					/* PMC - This is not necessary because exceptions are
+					 * now preserved in exception handler cleanup.
+					*/
+					if (!self->t_exception.e_xt_err)
+						xt_register_error(XT_REG_CONTEXT, XT_SYSTEM_ERROR, 0, "Initialization failed"); 
+					xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);
+					init_err = 1;
 				}
 				cont_(b);
 
-				if (!curr_thd)
+				if (thd)
 					myxt_destroy_thread(thd, FALSE);
 #ifndef DRIZZLED
 				myxt_mutex_lock(&LOCK_plugin);
@@ -1205,32 +1228,37 @@ static int pbxt_init(void *p)
 			 * I have to stop the freeer here because it was
 			 * started before opening the database.
 			 */
-			pbxt_call_exit(self);
-			pbxt_inited = FALSE;
-			xt_exit_threading(self);
-			goto error_4;
+
+			/* {FREEER-HANG-ON-INIT-ERROR}
+			 * pbxt_init is called with LOCK_plugin held. If it fails and tries to exit
+			 * the freeer here, it hangs because the freeer calls THD::~THD, which tries
+			 * to acquire the same lock. OTOH MySQL calls pbxt_end() after
+			 * an unsuccessful call to pbxt_init, so we defer cleanup, except for
+			 * releasing 'self'.
+			 */
+			xt_free_thread(self);
+			goto error_3;
 		}
 		xt_free_thread(self);
  	}
 	XT_RETURN(init_err);
 
-	error_4:
-	xt_exit_memory();
-
 	error_3:
-#ifdef XT_STREAMING
-	xt_exit_streaming();
+#ifdef PBMS_ENABLED
+	pbms_finalize();
 
 	error_2:
 #endif
-	xt_exit_logging();
 
 	error_1:
-	xt_p_mutex_destroy(&pbxt_database_mutex);		
 	XT_RETURN(1);
 }
 
-static int pbxt_end(void *p __attribute__((unused)))
+#ifdef DRIZZLED
+static int pbxt_end(PluginRegistry &registry)
+#else
+static int pbxt_end(void *)
+#endif
 {
 	XTThreadPtr		self;
 	int				err = 0;
@@ -1241,7 +1269,7 @@ static int pbxt_end(void *p __attribute_
 		XTExceptionRec	e;
 
 		/* This flag also means "shutting down". */
-		pbxt_inited = FALSE;
+		pbxt_inited = FALSE; 
 		self = xt_create_thread("TempForEnd", FALSE, TRUE, &e);
 		if (self) {
 			self->t_main = TRUE;
@@ -1249,6 +1277,9 @@ static int pbxt_end(void *p __attribute_
 		}
 	}
 
+#ifdef DRIZZLED
+	registry.remove(pbxt_hton);
+#endif
 	XT_RETURN(err);
 }
 
@@ -1262,12 +1293,15 @@ static int pbxt_panic(handlerton *hton, 
 /*
  * Kill the PBXT thread associated with the MySQL thread.
  */
+#ifdef DRIZZLED
+int PBXTStorageEngine::close_connection(Session *thd)
+{
+	PBXTStorageEngine * const hton = this;
+#else
 static int pbxt_close_connection(handlerton *hton, THD* thd)
 {
-	XTThreadPtr		self;
-#ifdef XT_STREAMING
-	XTExceptionRec	e;
 #endif
+	XTThreadPtr		self;
 
 	XT_TRACE_CALL();
 	if ((self = (XTThreadPtr) *thd_ha_data(thd, hton))) {
@@ -1278,10 +1312,6 @@ static int pbxt_close_connection(handler
 		xt_set_self(self);
 		xt_free_thread(self);
 	}
-#ifdef XT_STREAMING
-	if (!xt_pbms_close_connection((void *) thd, &e))
-		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
-#endif
 	return 0;
 }
 
@@ -1290,7 +1320,11 @@ static int pbxt_close_connection(handler
  * when the last PBXT table was removed from the 
  * database.
  */
-static void pbxt_drop_database(handlerton *hton __attribute__((unused)), char *path __attribute__((unused)))
+#ifdef DRIZZLED
+void PBXTStorageEngine::drop_database(char *)
+#else
+static void pbxt_drop_database(handlerton *XT_UNUSED(hton), char *XT_UNUSED(path))
+#endif
 {
 	XT_TRACE_CALL();
 }
@@ -1317,8 +1351,14 @@ static void pbxt_drop_database(handlerto
  * pbxt_thr is a pointer the the PBXT thread structure.
  *
  */
+#ifdef DRIZZLED
+int PBXTStorageEngine::commit(Session *thd, bool all)
+{
+	PBXTStorageEngine * const hton = this;
+#else
 static int pbxt_commit(handlerton *hton, THD *thd, bool all)
 {
+#endif
 	int			err = 0;
 	XTThreadPtr	self;
 
@@ -1343,8 +1383,14 @@ static int pbxt_commit(handlerton *hton,
 	return err;
 }
 
+#ifdef DRIZZLED
+int PBXTStorageEngine::rollback(Session *thd, bool all)
+{
+	PBXTStorageEngine * const hton = this;
+#else
 static int pbxt_rollback(handlerton *hton, THD *thd, bool all)
 {
+#endif
 	int			err = 0;
 	XTThreadPtr	self;
 
@@ -1377,8 +1423,14 @@ static int pbxt_rollback(handlerton *hto
 	return 0;
 }
 
+#ifdef DRIZZLED
+handler *PBXTStorageEngine::create(TABLE_SHARE *table, MEM_ROOT *mem_root)
+{
+	PBXTStorageEngine * const hton = this;
+#else
 static handler *pbxt_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root)
 {
+#endif
 	if (table && XTSystemTableShare::isSystemTable(table->path.str))
 		return new (mem_root) ha_xtsys(hton, table);
 	else
@@ -1513,7 +1565,11 @@ static void ha_close_open_tables(XTThrea
 	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
 }
 
-static void ha_release_exclusive_use(XTThreadPtr self __attribute__((unused)), XTSharePtr share)
+#ifdef PBXT_ALLOW_PRINTING
+static void ha_release_exclusive_use(XTThreadPtr self, XTSharePtr share)
+#else
+static void ha_release_exclusive_use(XTThreadPtr XT_UNUSED(self), XTSharePtr share)
+#endif
 {
 	XT_PRINT1(self, "ha_release_exclusive_use %s PBXT X UNLOCK\n", share->sh_table_path->ps_path);
 	xt_lock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
@@ -1629,11 +1685,23 @@ ST_FIELD_INFO pbxt_statistics_fields_inf
 	{ 0,		0,	MYSQL_TYPE_STRING,		0, 0, 0, SKIP_OPEN_TABLE}
 };
 
+#ifdef DRIZZLED
+static InfoSchemaTable	*pbxt_statistics_table;
+
+int pbxt_init_statitics(PluginRegistry &registry)
+#else
 int pbxt_init_statitics(void *p)
+#endif
 {
-	ST_SCHEMA_TABLE *schema = (ST_SCHEMA_TABLE *) p;
-	schema->fields_info = pbxt_statistics_fields_info;
-	schema->fill_table = pbxt_statistics_fill_table;
+#ifdef DRIZZLED
+	pbxt_statistics_table = (InfoSchemaTable *)xt_calloc_ns(sizeof(InfoSchemaTable));
+	pbxt_statistics_table->table_name= "PBXT_STATISTICS";
+	registry.add(pbxt_statistics_table);
+#else
+	ST_SCHEMA_TABLE *pbxt_statistics_table = (ST_SCHEMA_TABLE *) p;
+#endif
+	pbxt_statistics_table->fields_info = pbxt_statistics_fields_info;
+	pbxt_statistics_table->fill_table = pbxt_statistics_fill_table;
 
 #if defined(XT_WIN) && defined(XT_COREDUMP)
 	void register_crash_filter();
@@ -1645,8 +1713,16 @@ int pbxt_init_statitics(void *p)
 	return 0;
 }
 
-int pbxt_exit_statitics(void *p __attribute__((unused)))
+#ifdef DRIZZLED
+int pbxt_exit_statitics(PluginRegistry &registry)
+#else
+int pbxt_exit_statitics(void *XT_UNUSED(p))
+#endif
 {
+#ifdef DRIZZLED
+	registry.remove(pbxt_statistics_table);
+	xt_free_ns(pbxt_statistics_table);
+#endif
 	return(0);
 }
 
@@ -1765,7 +1841,7 @@ MX_TABLE_TYPES_T ha_pbxt::table_flags() 
  */
 #define FLAGS_ARE_READ_DYNAMICALLY
 
-MX_ULONG_T ha_pbxt::index_flags(uint inx __attribute__((unused)), uint part __attribute__((unused)), bool all_parts __attribute__((unused))) const
+MX_ULONG_T ha_pbxt::index_flags(uint XT_UNUSED(inx), uint XT_UNUSED(part), bool XT_UNUSED(all_parts)) const
 {
 	/* It would be nice if the dynamic version of this function works,
 	 * but it does not. MySQL loads this information when the table is openned,
@@ -1876,7 +1952,7 @@ void ha_pbxt::internal_close(THD *thd, s
  * Called from handler.cc by handler::ha_open(). The server opens all tables by
  * calling ha_open() which then calls the handler specific open().
  */
-int ha_pbxt::open(const char *table_path, int mode __attribute__((unused)), uint test_if_locked __attribute__((unused)))
+int ha_pbxt::open(const char *table_path, int XT_UNUSED(mode), uint XT_UNUSED(test_if_locked))
 {
 	THD			*thd = current_thd;
 	int			err = 0;
@@ -2104,9 +2180,9 @@ void ha_pbxt::init_auto_increment(xtWord
 }
 
 void ha_pbxt::get_auto_increment(MX_ULONGLONG_T offset, MX_ULONGLONG_T increment,
-                                 MX_ULONGLONG_T nb_desired_values __attribute__((unused)),
+                                 MX_ULONGLONG_T XT_UNUSED(nb_desired_values),
                                  MX_ULONGLONG_T *first_value,
-                                 MX_ULONGLONG_T *nb_reserved_values __attribute__((unused)))
+                                 MX_ULONGLONG_T *nb_reserved_values)
 {
 	register XTTableHPtr	tab;
 	MX_ULONGLONG_T			nr, nr_less_inc;
@@ -2225,6 +2301,14 @@ int ha_pbxt::write_row(byte *buf)
 	XT_PRINT1(pb_open_tab->ot_thread, "ha_pbxt::write_row %s\n", pb_share->sh_table_path->ps_path);
 	XT_DISABLED_TRACE(("INSERT tx=%d val=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(&buf[1])));
 	//statistic_increment(ha_write_count,&LOCK_status);
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+	err = pbms_write_row_blobs(table, buf, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_write_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
 
 	/* GOTCHA: I have a huge problem with the transaction statement.
 	 * It is not ALWAYS committed (I mean ha_commit_trans() is
@@ -2256,7 +2340,8 @@ int ha_pbxt::write_row(byte *buf)
 		int update_err = update_auto_increment();
 		if (update_err) {
 			ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
-			return update_err;
+			err = update_err;
+			goto done;
 		}
 		set_auto_increment(table->next_number_field);
 	}
@@ -2274,6 +2359,10 @@ int ha_pbxt::write_row(byte *buf)
 			pb_open_tab->ot_thread->st_update_id++;
 	}
 
+	done:
+#ifdef PBMS_ENABLED
+	pbms_completed(table, (err == 0));
+#endif
 	return err;
 }
 
@@ -2347,6 +2436,21 @@ int ha_pbxt::update_row(const byte * old
 	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
 		table->timestamp_field->set_time();
 
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	err = pbms_delete_row_blobs(table, old_data, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "update_row:pbms_delete_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+	err = pbms_write_row_blobs(table, new_data, &result);
+	if (err) { 
+		xt_logf(XT_NT_ERROR, "update_row:pbms_write_row_blobs() Error: %s", result.mr_message);
+		goto pbms_done;
+	}
+#endif
+
 	/* GOTCHA: We need to check the auto-increment value on update
 	 * because of the following test (which fails for InnoDB) -
 	 * auto_increment.test:
@@ -2369,6 +2473,11 @@ int ha_pbxt::update_row(const byte * old
 		err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
 
 	pb_open_tab->ot_table->tab_locks.xt_remove_temp_lock(pb_open_tab, TRUE);
+	
+#ifdef PBMS_ENABLED
+	pbms_done:
+	pbms_completed(table, (err == 0));
+#endif
 
 	return err;
 }
@@ -2392,6 +2501,16 @@ int ha_pbxt::delete_row(const byte * buf
 	XT_DISABLED_TRACE(("DELETE tx=%d val=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(&buf[1])));
 	//statistic_increment(ha_delete_count,&LOCK_status);
 
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	err = pbms_delete_row_blobs(table, buf, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_delete_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
+
 	if (!pb_open_tab->ot_thread->st_stat_trans) {
 		trans_register_ha(pb_mysql_thd, FALSE, pbxt_hton);
 		XT_PRINT0(pb_open_tab->ot_thread, "ha_pbxt::delete_row trans_register_ha all=FALSE\n");
@@ -2405,6 +2524,9 @@ int ha_pbxt::delete_row(const byte * buf
 
 	pb_open_tab->ot_table->tab_locks.xt_remove_temp_lock(pb_open_tab, TRUE);
 
+#ifdef PBMS_ENABLED
+	pbms_completed(table, (err == 0));
+#endif
 	return err;
 }
 
@@ -2491,7 +2613,7 @@ int ha_pbxt::delete_row(const byte * buf
  * commit;
  */
 
-int ha_pbxt::xt_index_in_range(register XTOpenTablePtr ot __attribute__((unused)), register XTIndexPtr ind,
+int ha_pbxt::xt_index_in_range(register XTOpenTablePtr XT_UNUSED(ot), register XTIndexPtr ind,
 	register XTIdxSearchKeyPtr search_key, xtWord1 *buf)
 {
 	/* If search key is given, this means we want an exact match. */
@@ -2698,7 +2820,7 @@ int ha_pbxt::xt_index_prev_read(XTOpenTa
 	return ha_log_pbxt_thread_error_for_mysql(FALSE);
 }
 
-int ha_pbxt::index_init(uint idx, bool sorted __attribute__((unused)))
+int ha_pbxt::index_init(uint idx, bool XT_UNUSED(sorted))
 {
 	XTIndexPtr ind;
 
@@ -2715,7 +2837,8 @@ int ha_pbxt::index_init(uint idx, bool s
 
 	/* The number of columns required: */
 	if (pb_open_tab->ot_is_modify) {
-		pb_open_tab->ot_cols_req = table->read_set->n_bits;
+
+		pb_open_tab->ot_cols_req = table->read_set->MX_BIT_SIZE();
 #ifdef XT_PRINT_INDEX_OPT
 		ind = (XTIndexPtr) pb_share->sh_dic_keys[idx];
 
@@ -2764,10 +2887,10 @@ int ha_pbxt::index_init(uint idx, bool s
 		 * seem to have this problem!
 		 */
 		ind = (XTIndexPtr) pb_share->sh_dic_keys[idx];
-		if (bitmap_is_subset(table->read_set, &ind->mi_col_map))
+		if (MX_BIT_IS_SUBSET(table->read_set, &ind->mi_col_map))
 			pb_key_read = TRUE;
 #ifdef XT_PRINT_INDEX_OPT
-		printf("index_init %s index %d cols req=%d/%d read_bits=%X write_bits=%X index_bits=%X converage=%d\n", pb_open_tab->ot_table->tab_name->ps_path, (int) idx, pb_open_tab->ot_cols_req, table->read_set->n_bits, (int) *table->read_set->bitmap, (int) *table->write_set->bitmap, (int) *ind->mi_col_map.bitmap, (int) (bitmap_is_subset(table->read_set, &ind->mi_col_map) != 0));
+		printf("index_init %s index %d cols req=%d/%d read_bits=%X write_bits=%X index_bits=%X converage=%d\n", pb_open_tab->ot_table->tab_name->ps_path, (int) idx, pb_open_tab->ot_cols_req, table->read_set->MX_BIT_SIZE(), (int) *table->read_set->bitmap, (int) *table->write_set->bitmap, (int) *ind->mi_col_map.bitmap, (int) (MX_BIT_IS_SUBSET(table->read_set, &ind->mi_col_map) != 0));
 #endif
 	}
 	
@@ -2845,7 +2968,7 @@ void ha_return_row(XTOpenTablePtr ot, u_
 }
 #endif
 
-int ha_pbxt::index_read_xt(byte * buf, uint idx, const byte *key, uint key_len __attribute__((unused)), enum ha_rkey_function find_flag __attribute__((unused)))
+int ha_pbxt::index_read_xt(byte * buf, uint idx, const byte *key, uint key_len, enum ha_rkey_function find_flag)
 {
 	int					err = 0;
 	XTIndexPtr			ind;
@@ -2887,9 +3010,12 @@ int ha_pbxt::index_read_xt(byte * buf, u
 			xt_idx_prep_key(ind, &search_key, ((find_flag == HA_READ_AFTER_KEY) ? XT_SEARCH_AFTER_KEY : 0) | prefix, (xtWord1 *) key, key_len);
 			if (!xt_idx_search(pb_open_tab, ind, &search_key))
 				err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
-			else
+			else {
 				err = xt_index_next_read(pb_open_tab, ind, pb_key_read,
 					(find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX) ? &search_key : NULL, buf);
+				if (err == HA_ERR_END_OF_FILE && find_flag == HA_READ_AFTER_KEY)
+					err = HA_ERR_KEY_NOT_FOUND;			
+			}
 			break;
 	}
 
@@ -2913,13 +3039,13 @@ int ha_pbxt::index_read_xt(byte * buf, u
  * row if available. If the key value is null, begin at the first key of the
  * index.
  */
-int ha_pbxt::index_read(byte * buf, const byte * key, uint key_len __attribute__((unused)), enum ha_rkey_function find_flag __attribute__((unused)))
+int ha_pbxt::index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag)
 {
 	//statistic_increment(ha_read_key_count,&LOCK_status);
 	return index_read_xt(buf, active_index, key, key_len, find_flag);
 }
 
-int ha_pbxt::index_read_idx(byte * buf, uint idx, const byte *key, uint key_len __attribute__((unused)), enum ha_rkey_function find_flag __attribute__((unused)))
+int ha_pbxt::index_read_idx(byte * buf, uint idx, const byte *key, uint key_len, enum ha_rkey_function find_flag)
 {
 	//statistic_increment(ha_read_key_count,&LOCK_status);
 	return index_read_xt(buf, idx, key, key_len, find_flag);
@@ -3147,9 +3273,24 @@ int ha_pbxt::rnd_init(bool scan)
 	XT_PRINT1(pb_open_tab->ot_thread, "ha_pbxt::rnd_init %s\n", pb_share->sh_table_path->ps_path);
 	XT_DISABLED_TRACE(("seq scan tx=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id));
 
+	/* Call xt_tab_seq_exit() to make sure the resources used by the previous
+	 * scan are freed. In particular make sure cache page ref count is decremented.
+	 * This is needed as rnd_init() can be called multiple times w/o matching calls
+	 * to rnd_end(). Our experience is that currently this is done in queries like:
+	 *
+	 * SELECT t1.c1,t2.c1 FROM t1 LEFT JOIN t2 USING (c1);
+	 * UPDATE t1 LEFT JOIN t2 USING (c1) SET t1.c1 = t2.c1 WHERE t1.c1 = t2.c1;
+	 *
+	 * when scanning inner tables. It is important to understand that in such a case
+	 * multiple calls to rnd_init() are not semantically equal to a new query. For
+	 * example, we cannot make row locks permanent as we do in rnd_end(), because
+	 * ha_pbxt::unlock_row can still be called.
+	 */
+	xt_tab_seq_exit(pb_open_tab);
+
 	/* The number of columns required: */
 	if (pb_open_tab->ot_is_modify)
-		pb_open_tab->ot_cols_req = table->read_set->n_bits;
+		pb_open_tab->ot_cols_req = table->read_set->MX_BIT_SIZE();
 	else {
 		pb_open_tab->ot_cols_req = ha_get_max_bit(table->read_set);
 
@@ -3243,7 +3384,7 @@ int ha_pbxt::rnd_next(byte *buf)
  *
  * Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc.
  */
-void ha_pbxt::position(const byte *record __attribute__((unused)))
+void ha_pbxt::position(const byte *XT_UNUSED(record))
 {
 	XT_TRACE_CALL();
 	ASSERT_NS(pb_ex_in_use);
@@ -3383,7 +3524,7 @@ int ha_pbxt::info(uint flag)
 		if (flag & HA_STATUS_VARIABLE) {
 			stats.deleted = ot->ot_table->tab_row_fnum;
 			stats.records = (ha_rows) (ot->ot_table->tab_row_eof_id - 1 - stats.deleted);
-			stats.data_file_length = ot->ot_table->tab_rec_eof_id;
+			stats.data_file_length = xt_rec_id_to_rec_offset(ot->ot_table, ot->ot_table->tab_rec_eof_id);
 			stats.index_file_length = xt_ind_node_to_offset(ot->ot_table, ot->ot_table->tab_ind_eof);
 			stats.delete_length = ot->ot_table->tab_rec_fnum * ot->ot_rec_size;
 			//check_time = info.check_time;
@@ -3434,10 +3575,15 @@ int ha_pbxt::info(uint flag)
 #endif
 
 #endif // SAFE_MUTEX
+#ifdef DRIZZLED
+			set_prefix(share->keys_in_use, share->keys);
+			share->keys_for_keyread&= share->keys_in_use;
+#else
 			share->keys_in_use.set_prefix(share->keys);
 			//share->keys_in_use.intersect_extended(info.key_map);
 			share->keys_for_keyread.intersect(share->keys_in_use);
 			//share->db_record_offset = info.record_offset;
+#endif
 			for (u_int i = 0; i < share->keys; i++) {
 				ind = pb_share->sh_dic_keys[i];
 
@@ -3445,7 +3591,7 @@ int ha_pbxt::info(uint flag)
 				if (ind->mi_seg_count == 1 && (ind->mi_flags & HA_NOSAME))
 					rec_per_key = 1;
 				else {
-					
+					rec_per_key = 1;	
 				}
 				for (u_int j = 0; j < table->key_info[i].key_parts; j++)
 	 				table->key_info[i].rec_per_key[j] = (ulong) rec_per_key;
@@ -3570,6 +3716,8 @@ int ha_pbxt::extra(enum ha_extra_functio
 				if (pb_open_tab)
 					pb_open_tab->ot_table->tab_locks.xt_make_lock_permanent(pb_open_tab, &self->st_lock_list);
 			}
+			if (pb_open_tab)
+				pb_open_tab->ot_for_update = 0;
 			break;
 		case HA_EXTRA_KEYREAD:
 			/* This means we so not need to read the entire record. */
@@ -3706,6 +3854,12 @@ int ha_pbxt::delete_all_rows()
 		 */
 		ha_close_share(self, pb_share);
 
+		/* MySQL documentation requires us to reset auto increment value to 1
+		 * on truncate even if the table was created with a different value. 
+		 * This is also consistent with other engines.
+		 */
+		dic.dic_min_auto_inc = 1;
+
 		xt_create_table(self, (XTPathStrPtr) path, &dic);
 		if (!pb_table_locked)
 			freer_(); // ha_release_exclusive_use(pb_share)
@@ -3737,7 +3891,7 @@ int ha_pbxt::delete_all_rows()
  * now agree with the MyISAM strategy.
  * 
  */
-int ha_pbxt::analyze(THD *thd __attribute__((unused)), HA_CHECK_OPT *check_opt __attribute__((unused)))
+int ha_pbxt::analyze(THD *thd, HA_CHECK_OPT *XT_UNUSED(check_opt))
 {
 	int				err = 0;
 	XTDatabaseHPtr	db;
@@ -3819,7 +3973,7 @@ int ha_pbxt::analyze(THD *thd __attribut
 	XT_RETURN(err);
 }
 
-int ha_pbxt::repair(THD *thd __attribute__((unused)), HA_CHECK_OPT *check_opt __attribute__((unused)))
+int ha_pbxt::repair(THD *XT_UNUSED(thd), HA_CHECK_OPT *XT_UNUSED(check_opt))
 {
 	return(HA_ADMIN_TRY_ALTER);
 }
@@ -3828,7 +3982,7 @@ int ha_pbxt::repair(THD *thd __attribute
  * This is mapped to "ALTER TABLE tablename TYPE=PBXT", which rebuilds
  * the table in MySQL.
  */
-int ha_pbxt::optimize(THD *thd __attribute__((unused)), HA_CHECK_OPT *check_opt __attribute__((unused)))
+int ha_pbxt::optimize(THD *XT_UNUSED(thd), HA_CHECK_OPT *XT_UNUSED(check_opt))
 {
 	return(HA_ADMIN_TRY_ALTER);
 }
@@ -3837,7 +3991,7 @@ int ha_pbxt::optimize(THD *thd __attribu
 extern int pbxt_mysql_trace_on;
 #endif
 
-int ha_pbxt::check(THD* thd, HA_CHECK_OPT* check_opt __attribute__((unused)))
+int ha_pbxt::check(THD* thd, HA_CHECK_OPT* XT_UNUSED(check_opt))
 {
 	int				err = 0;
 	XTThreadPtr		self;
@@ -3993,8 +4147,10 @@ xtPublic int ha_pbxt::external_lock(THD 
 			 * (or update statement) just saw.
 			 */
 			if (pb_open_tab) {
-				if (pb_open_tab->ot_for_update)
+				if (pb_open_tab->ot_for_update) {
 					self->st_visible_time = self->st_database->db_xn_end_time;
+					pb_open_tab->ot_for_update = 0;
+				}
 
 				if (pb_share->sh_recalc_selectivity) {
 					if ((pb_share->sh_table->tab_row_eof_id - 1 - pb_share->sh_table->tab_row_fnum) >= 200) {
@@ -4079,10 +4235,15 @@ xtPublic int ha_pbxt::external_lock(THD 
 			pb_open_tab->ot_is_modify = FALSE;
 			if ((pb_open_tab->ot_for_update = (lock_type == F_WRLCK))) {
 				switch ((int) thd_sql_command(thd)) {
-					case SQLCOM_UPDATE:
-					case SQLCOM_UPDATE_MULTI:
 					case SQLCOM_DELETE:
 					case SQLCOM_DELETE_MULTI:
+						/* turn DELETE IGNORE into normal DELETE. The IGNORE option causes problems because 
+						 * when a record is deleted we add an xlog record which we cannot "rollback" later
+						 * when we find that an FK-constraint has failed. 
+						 */
+						thd->lex->ignore = false;
+					case SQLCOM_UPDATE:
+					case SQLCOM_UPDATE_MULTI:
 					case SQLCOM_REPLACE:
 					case SQLCOM_REPLACE_SELECT:
 					case SQLCOM_INSERT:
@@ -4290,7 +4451,9 @@ int ha_pbxt::start_stmt(THD *thd, thr_lo
 	pb_open_tab->ot_for_update =
 		(lock_type != TL_READ && 
 		 lock_type != TL_READ_WITH_SHARED_LOCKS &&
+#ifndef DRIZZLED
 		 lock_type != TL_READ_HIGH_PRIORITY && 
+#endif
 		 lock_type != TL_READ_NO_INSERT);
 	pb_open_tab->ot_is_modify = FALSE;
 	if (pb_open_tab->ot_for_update) {
@@ -4557,9 +4720,12 @@ int ha_pbxt::delete_table(const char *ta
 {
 	THD				*thd = current_thd;
 	int				err = 0;
-	XTThreadPtr		self;
+	XTThreadPtr		self = NULL;
 	XTSharePtr		share;
 
+	STAT_TRACE(self, *thd_query(thd));
+	XT_PRINT1(self, "ha_pbxt::delete_table %s\n", table_path);
+
 	if (XTSystemTableShare::isSystemTable(table_path))
 		return delete_system_table(table_path);
 
@@ -4568,9 +4734,6 @@ int ha_pbxt::delete_table(const char *ta
 
 	self->st_ignore_fkeys = (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) != 0;
 
-	STAT_TRACE(self, *thd_query(thd));
-	XT_PRINT1(self, "ha_pbxt::delete_table %s\n", table_path);
-
 	try_(a) {
 		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
 
@@ -4586,16 +4749,23 @@ int ha_pbxt::delete_table(const char *ta
 			pushr_(ha_release_exclusive_use, share);
 			ha_close_open_tables(self, share, NULL);
 
-			xt_drop_table(self, (XTPathStrPtr) table_path);
+			xt_drop_table(self, (XTPathStrPtr) table_path, thd_sql_command(thd) == SQLCOM_DROP_DB);
 
 			freer_(); // ha_release_exclusive_use(share)
 			freer_(); // ha_unget_share(share)
 		}
 		catch_(b) {
-			/* If the table does not exist, just log the error and continue... */
+			/* In MySQL, if the table does not exist, just log the error and continue. This is
+ 			 * needed to delete the table in the case when CREATE TABLE fails and no PBXT disk
+ 			 * structures were created.
+ 			 * Drizzle, unlike MySQL, iterates over all handlers and tries to delete the table. It
+ 			 * stops when a handler returns TRUE, so in Drizzle we need to report the error.
+			 */
+#ifndef DRIZZLED
 			if (self->t_exception.e_xt_err == XT_ERR_TABLE_NOT_FOUND)
 				xt_log_and_clear_exception(self);
 			else
+#endif
 				throw_();
 		}
 		cont_(b);
@@ -4619,8 +4789,25 @@ int ha_pbxt::delete_table(const char *ta
 	}
 	catch_(a) {
 		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+#ifdef DRIZZLED
+		if (err == HA_ERR_NO_SUCH_TABLE)
+			err = ENOENT;
+#endif
 	}
 	cont_(a);
+	
+#ifdef PBMS_ENABLED
+	/* Call pbms_delete_table_with_blobs() last because it cannot be undone. */
+	if (!err) {
+		PBMSResultRec result;
+
+		if (pbms_delete_table_with_blobs(table_path, &result)) {
+			xt_logf(XT_NT_WARNING, "pbms_delete_table_with_blobs() Error: %s", result.mr_message);
+		}
+		
+		pbms_completed(NULL, true);
+	}
+#endif
 
 	return err;
 }
@@ -4681,6 +4868,16 @@ int ha_pbxt::rename_table(const char *fr
 
 	XT_PRINT2(self, "ha_pbxt::rename_table %s -> %s\n", from, to);
 
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	err = pbms_rename_table_with_blobs(from, to, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_rename_table_with_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
+
 	try_(a) {
 		xt_ha_open_database_of_table(self, (XTPathStrPtr) to);
 		to_db = self->st_database;
@@ -4709,10 +4906,6 @@ int ha_pbxt::rename_table(const char *fr
 		freer_(); // ha_release_exclusive_use(share)
 		freer_(); // ha_unget_share(share)
 
-#ifdef XT_STREAMING
-		/* PBMS remove the table? */
-		xt_pbms_rename_table(from, to);
-#endif
 		/*
 		 * If there are no more PBXT tables in the database, we
 		 * "drop the database", which deletes all PBXT resources
@@ -4732,11 +4925,15 @@ int ha_pbxt::rename_table(const char *fr
 		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
 	}
 	cont_(a);
+	
+#ifdef PBMS_ENABLED
+	pbms_completed(NULL, (err == 0));
+#endif
 
 	XT_RETURN(err);
 }
 
-int ha_pbxt::rename_system_table(const char *from __attribute__((unused)), const char *to __attribute__((unused)))
+int ha_pbxt::rename_system_table(const char *XT_UNUSED(from), const char *XT_UNUSED(to))
 {
 	return ER_NOT_SUPPORTED_YET;
 }
@@ -4771,7 +4968,7 @@ double ha_pbxt::scan_time()
 /*
  * The next method will never be called if you do not implement indexes.
  */
-double ha_pbxt::read_time(uint index __attribute__((unused)), uint ranges, ha_rows rows)
+double ha_pbxt::read_time(uint XT_UNUSED(index), uint ranges, ha_rows rows)
 {
 	double result = rows2double(ranges+rows);
 	return result;
@@ -4945,7 +5142,7 @@ void ha_pbxt::free_foreign_key_create_in
 	xt_free(NULL, str);
 }
 
-bool ha_pbxt::get_error_message(int error __attribute__((unused)), String *buf)
+bool ha_pbxt::get_error_message(int XT_UNUSED(error), String *buf)
 {
 	THD				*thd = current_thd;
 	int				err = 0;
@@ -5104,9 +5301,9 @@ struct st_mysql_sys_var
 #endif
 
 #ifdef USE_CONST_SAVE
-static void pbxt_record_cache_size_func(THD *thd __attribute__((unused)), struct st_mysql_sys_var *var, void *tgt, const void *save)
+static void pbxt_record_cache_size_func(THD *XT_UNUSED(thd), struct st_mysql_sys_var *var, void *tgt, const void *save)
 #else
-static void pbxt_record_cache_size_func(THD *thd __attribute__((unused)), struct st_mysql_sys_var *var, void *tgt, void *save)
+static void pbxt_record_cache_size_func(THD *XT_UNUSED(thd), struct st_mysql_sys_var *var, void *tgt, void *save)
 #endif
 {
 	xtInt8	record_cache_size;
@@ -5215,6 +5412,18 @@ static MYSQL_SYSVAR_INT(sweeper_priority
 	"Determines the priority of the background sweeper process, 0 = low (default), 1 = normal (same as user threads), 2 = high.",
 	NULL, NULL, XT_PRIORITY_LOW, XT_PRIORITY_LOW, XT_PRIORITY_HIGH, 1);
 
+#ifdef DRIZZLED
+static MYSQL_SYSVAR_INT(max_threads, pbxt_max_threads,
+	PLUGIN_VAR_OPCMDARG,
+	"The maximum number of threads used by PBXT",
+	NULL, NULL, 500, 20, 20000, 1);
+#else
+static MYSQL_SYSVAR_INT(max_threads, pbxt_max_threads,
+	PLUGIN_VAR_OPCMDARG,
+	"The maximum number of threads used by PBXT, 0 = set according to MySQL max_connections.",
+	NULL, NULL, 0, 0, 20000, 1);
+#endif
+
 static struct st_mysql_sys_var* pbxt_system_variables[] = {
   MYSQL_SYSVAR(index_cache_size),
   MYSQL_SYSVAR(record_cache_size),
@@ -5231,6 +5440,7 @@ static struct st_mysql_sys_var* pbxt_sys
   MYSQL_SYSVAR(auto_increment_mode),
   MYSQL_SYSVAR(offline_log_function),
   MYSQL_SYSVAR(sweeper_priority),
+  MYSQL_SYSVAR(max_threads),
   NULL
 };
 #endif
@@ -5241,8 +5451,8 @@ drizzle_declare_plugin(pbxt)
 mysql_declare_plugin(pbxt)
 #endif
 {
-	MYSQL_STORAGE_ENGINE_PLUGIN,
 #ifndef DRIZZLED
+	MYSQL_STORAGE_ENGINE_PLUGIN,
 	&pbxt_storage_engine,
 #endif
 	"PBXT",
@@ -5266,8 +5476,8 @@ mysql_declare_plugin(pbxt)
 	NULL						/* config options                  */
 },
 {
-	MYSQL_INFORMATION_SCHEMA_PLUGIN,
 #ifndef DRIZZLED
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
 	&pbxt_statitics,
 #endif
 	"PBXT_STATISTICS",

=== modified file 'storage/pbxt/src/ha_pbxt.h'
--- a/storage/pbxt/src/ha_pbxt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/ha_pbxt.h	2009-08-17 11:12:36 +0000
@@ -28,7 +28,7 @@
 #ifdef DRIZZLED
 #include <drizzled/common.h>
 #include <drizzled/handler.h>
-#include <drizzled/handlerton.h>
+#include <drizzled/plugin/storage_engine.h>
 #include <mysys/thr_lock.h>
 #else
 #include "mysql_priv.h"
@@ -51,6 +51,25 @@
 
 class ha_pbxt;
 
+#ifdef DRIZZLED
+
+class PBXTStorageEngine : public StorageEngine {
+public:
+	PBXTStorageEngine(std::string name_arg)
+	: StorageEngine(name_arg, HTON_NO_FLAGS) {}
+
+	/* override */ int close_connection(Session *);
+	/* override */ int commit(Session *, bool);
+	/* override */ int rollback(Session *, bool);
+	/* override */ handler *create(TABLE_SHARE *, MEM_ROOT *);
+	/* override */ void drop_database(char *);
+	/* override */ bool show_status(Session *, stat_print_fn *, enum ha_stat_type);
+};
+
+typedef PBXTStorageEngine handlerton;
+
+#endif
+
 extern handlerton *pbxt_hton;
 
 /*

=== modified file 'storage/pbxt/src/ha_xtsys.cc'
--- a/storage/pbxt/src/ha_xtsys.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/ha_xtsys.cc	2009-08-17 11:12:36 +0000
@@ -75,7 +75,7 @@ const char **ha_xtsys::bas_ext() const
 	return ha_pbms_exts;
 }
 
-int ha_xtsys::open(const char *table_path, int mode __attribute__((unused)), uint test_if_locked __attribute__((unused)))
+int ha_xtsys::open(const char *table_path, int XT_UNUSED(mode), uint XT_UNUSED(test_if_locked))
 {
 	THD				*thd = current_thd;
 	XTExceptionRec	e;
@@ -141,7 +141,7 @@ int ha_xtsys::close(void)
 	return err;
 }
 
-int ha_xtsys::rnd_init(bool scan __attribute__((unused)))
+int ha_xtsys::rnd_init(bool XT_UNUSED(scan))
 {
 	int err = 0;
 
@@ -185,7 +185,7 @@ int ha_xtsys::rnd_pos(byte * buf, byte *
 	return err;
 }
 
-int ha_xtsys::info(uint flag __attribute__((unused)))
+int ha_xtsys::info(uint XT_UNUSED(flag))
 {
 	return 0;
 }
@@ -211,7 +211,7 @@ int ha_xtsys::external_lock(THD *thd, in
 	return err;
 }
 
-THR_LOCK_DATA **ha_xtsys::store_lock(THD *thd __attribute__((unused)), THR_LOCK_DATA **to, enum thr_lock_type lock_type)
+THR_LOCK_DATA **ha_xtsys::store_lock(THD *XT_UNUSED(thd), THR_LOCK_DATA **to, enum thr_lock_type lock_type)
 {
 	if (lock_type != TL_IGNORE && ha_lock.type == TL_UNLOCK)
 		ha_lock.type = lock_type;
@@ -220,13 +220,13 @@ THR_LOCK_DATA **ha_xtsys::store_lock(THD
 }
 
 /* Note: ha_pbxt::delete_system_table is called instead. */
-int ha_xtsys::delete_table(const char *table_path __attribute__((unused)))
+int ha_xtsys::delete_table(const char *XT_UNUSED(table_path))
 {
 	/* Should never be called */
 	return 0;
 }
 
-int ha_xtsys::create(const char *name __attribute__((unused)), TABLE *table_arg __attribute__((unused)), HA_CREATE_INFO *create_info __attribute__((unused)))
+int ha_xtsys::create(const char *XT_UNUSED(name), TABLE *XT_UNUSED(table_arg), HA_CREATE_INFO *XT_UNUSED(create_info))
 {
 	/* Allow the table to be created.
 	 * This is required after a dump is restored.
@@ -234,7 +234,7 @@ int ha_xtsys::create(const char *name __
 	return 0;
 }
 
-bool ha_xtsys::get_error_message(int error __attribute__((unused)), String *buf)
+bool ha_xtsys::get_error_message(int XT_UNUSED(error), String *buf)
 {
 	THD				*thd = current_thd;
 	XTExceptionRec	e;

=== modified file 'storage/pbxt/src/ha_xtsys.h'
--- a/storage/pbxt/src/ha_xtsys.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/ha_xtsys.h	2009-08-17 11:12:36 +0000
@@ -59,7 +59,7 @@ public:
 
 	const char *table_type() const { return "PBXT"; }
 
-	const char *index_type(uint inx __attribute__((unused))) {
+	const char *index_type(uint XT_UNUSED(inx)) {
 		return "NONE";
 	}
 
@@ -69,7 +69,7 @@ public:
 		return HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
 	}
 
-	MX_ULONG_T index_flags(uint inx __attribute__((unused)), uint part __attribute__((unused)), bool all_parts __attribute__((unused))) const {
+	MX_ULONG_T index_flags(uint XT_UNUSED(inx), uint XT_UNUSED(part), bool XT_UNUSED(all_parts)) const {
 		return (HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | HA_KEYREAD_ONLY);
 	}
 	uint	max_supported_keys()			const { return 512; }

=== modified file 'storage/pbxt/src/hashtab_xt.cc'
--- a/storage/pbxt/src/hashtab_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/hashtab_xt.cc	2009-08-17 11:12:36 +0000
@@ -115,7 +115,7 @@ xtPublic void xt_ht_put(XTThreadPtr self
 	popr_();
 }
 
-xtPublic void *xt_ht_get(XTThreadPtr self __attribute__((unused)), XTHashTabPtr ht, void *key)
+xtPublic void *xt_ht_get(XTThreadPtr XT_UNUSED(self), XTHashTabPtr ht, void *key)
 {
 	XTHashItemPtr	item;
 	xtHashValue		h;
@@ -239,14 +239,14 @@ xtPublic void xt_ht_signal(XTThreadPtr s
 	xt_signal_cond(self, ht->ht_cond);
 }
 
-xtPublic void xt_ht_enum(struct XTThread *self __attribute__((unused)), XTHashTabPtr ht, XTHashEnumPtr en)
+xtPublic void xt_ht_enum(struct XTThread *XT_UNUSED(self), XTHashTabPtr ht, XTHashEnumPtr en)
 {
 	en->he_i = 0;
 	en->he_item = NULL;
 	en->he_ht = ht;
 }
 
-xtPublic void *xt_ht_next(struct XTThread *self __attribute__((unused)), XTHashEnumPtr en)
+xtPublic void *xt_ht_next(struct XTThread *XT_UNUSED(self), XTHashEnumPtr en)
 {
 	if (en->he_item) {
 		en->he_item = en->he_item->hi_next;

=== modified file 'storage/pbxt/src/heap_xt.cc'
--- a/storage/pbxt/src/heap_xt.cc	2009-04-02 20:36:52 +0000
+++ b/storage/pbxt/src/heap_xt.cc	2009-08-17 11:12:36 +0000
@@ -31,7 +31,7 @@
 #undef xt_heap_new
 #endif
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 xtPublic XTHeapPtr xt_mm_heap_new(XTThreadPtr self, size_t size, XTFinalizeFunc finalize, u_int line, c_char *file, xtBool track)
 #else
 xtPublic XTHeapPtr xt_heap_new(XTThreadPtr self, size_t size, XTFinalizeFunc finalize)
@@ -39,7 +39,7 @@ xtPublic XTHeapPtr xt_heap_new(XTThreadP
 {
 	volatile XTHeapPtr	hp;
 	
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	hp = (XTHeapPtr) xt_mm_calloc(self, size, line, file);
 	hp->h_track = track;
 	if (track)
@@ -65,21 +65,21 @@ xtPublic XTHeapPtr xt_heap_new(XTThreadP
 	return hp;
 }
 
-xtPublic void xt_check_heap(XTThreadPtr self __attribute__((unused)), XTHeapPtr hp __attribute__((unused)))
+xtPublic void xt_check_heap(XTThreadPtr XT_NDEBUG_UNUSED(self), XTHeapPtr XT_NDEBUG_UNUSED(hp))
 {
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	xt_mm_malloc_size(self, hp);
 #endif
 }
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 xtPublic void xt_mm_heap_reference(XTThreadPtr self, XTHeapPtr hp, u_int line, c_char *file)
 #else
 xtPublic void xt_heap_reference(XTThreadPtr, XTHeapPtr hp)
 #endif
 {
 	xt_spinlock_lock(&hp->h_lock);
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	if (hp->h_track)
 		printf("HEAP: +1 %d->%d %s:%d\n", (int) hp->h_ref_count, (int) hp->h_ref_count+1, file, (int) line);
 #endif
@@ -91,7 +91,7 @@ xtPublic void xt_heap_release(XTThreadPt
 {	
 	if (!hp)
 		return;
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	xt_spinlock_lock(&hp->h_lock);
 	ASSERT(hp->h_ref_count != 0);
 	xt_spinlock_unlock(&hp->h_lock);
@@ -100,7 +100,7 @@ xtPublic void xt_heap_release(XTThreadPt
 	if (hp->h_onrelease)
 		(*hp->h_onrelease)(self, hp);
 	if (hp->h_ref_count > 0) {
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	if (hp->h_track)
 		printf("HEAP: -1 %d->%d\n", (int) hp->h_ref_count, (int) hp->h_ref_count-1);
 #endif
@@ -116,12 +116,12 @@ xtPublic void xt_heap_release(XTThreadPt
 	xt_spinlock_unlock(&hp->h_lock);
 }
 
-xtPublic void xt_heap_set_release_callback(XTThreadPtr self __attribute__((unused)), XTHeapPtr hp, XTFinalizeFunc onrelease)
+xtPublic void xt_heap_set_release_callback(XTThreadPtr XT_UNUSED(self), XTHeapPtr hp, XTFinalizeFunc onrelease)
 {
 	hp->h_onrelease = onrelease;
 }
 
-xtPublic u_int xt_heap_get_ref_count(struct XTThread *self __attribute__((unused)), XTHeapPtr hp)
+xtPublic u_int xt_heap_get_ref_count(struct XTThread *XT_UNUSED(self), XTHeapPtr hp)
 {
 	return hp->h_ref_count;
 }

=== modified file 'storage/pbxt/src/heap_xt.h'
--- a/storage/pbxt/src/heap_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/heap_xt.h	2009-08-17 11:12:36 +0000
@@ -25,6 +25,7 @@
 
 #include "xt_defs.h"
 #include "lock_xt.h"
+#include "memory_xt.h"
 
 struct XTThread;
 
@@ -59,7 +60,7 @@ u_int		xt_heap_get_ref_count(struct XTTh
 
 void		xt_check_heap(struct XTThread *self, XTHeapPtr mem);
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 #define xt_heap_new(t, s, f)		xt_mm_heap_new(t, s, f, __LINE__, __FILE__, FALSE)
 #define xt_heap_new_track(t, s, f)	xt_mm_heap_new(t, s, f, __LINE__, __FILE__, TRUE)
 #define xt_heap_reference(t, s)		xt_mm_heap_reference(t, s, __LINE__, __FILE__)

=== modified file 'storage/pbxt/src/index_xt.cc'
--- a/storage/pbxt/src/index_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/index_xt.cc	2009-08-18 07:46:53 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <string.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -52,7 +56,7 @@
 //#define CHECK_AND_PRINT
 //#define CHECK_NODE_REFERENCE
 //#define TRACE_FLUSH
-//#define CHECK_PRINTS_RECORD_REFERENCES
+#define CHECK_PRINTS_RECORD_REFERENCES
 #else
 #define MAX_SEARCH_DEPTH			100
 #endif
@@ -77,6 +81,7 @@ static u_int idx_check_index(XTOpenTable
 #endif
 
 static xtBool idx_insert_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIdxKeyValuePtr key_value, xtIndexNodeID branch);
+static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value);
 
 #ifdef XT_TRACK_INDEX_UPDATES
 
@@ -163,7 +168,7 @@ static void track_dump_all(u_int max_blo
 
 #endif
 
-xtPublic void xt_ind_track_dump_block(XTTableHPtr tab __attribute__((unused)), xtIndexNodeID address __attribute__((unused)))
+xtPublic void xt_ind_track_dump_block(XTTableHPtr XT_UNUSED(tab), xtIndexNodeID XT_UNUSED(address))
 {
 #ifdef TRACK_ACTIVITY
 	u_int i = XT_NODE_ID(address)-1;
@@ -268,7 +273,7 @@ static xtBool idx_new_branch(XTOpenTable
 
 	if ((XT_NODE_ID(wrote_pos) = XT_NODE_ID(tab->tab_ind_free))) {
 		/* Use the block on the free list: */
-		if (!xt_ind_read_bytes(ot, wrote_pos, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block))
+		if (!xt_ind_read_bytes(ot, ind, wrote_pos, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block))
 			goto failed;
 		XT_NODE_ID(tab->tab_ind_free) = (xtIndexNodeID) XT_GET_DISK_8(free_block.if_next_block_8);
 		xt_unlock_mutex_ns(&tab->tab_ind_lock);
@@ -343,7 +348,7 @@ static xtBool idx_free_branch(XTOpenTabl
  * Simple compare functions
  */
 
-xtPublic int xt_compare_2_int4(XTIndexPtr ind __attribute__((unused)), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
+xtPublic int xt_compare_2_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
 {
 	int r;
 
@@ -357,7 +362,7 @@ xtPublic int xt_compare_2_int4(XTIndexPt
 	return r;
 }
 
-xtPublic int xt_compare_3_int4(XTIndexPtr ind __attribute__((unused)), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
+xtPublic int xt_compare_3_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
 {
 	int r;
 
@@ -381,7 +386,7 @@ xtPublic int xt_compare_3_int4(XTIndexPt
  * Tree branch sanning (searching nodes and leaves)
  */
 
-xtPublic void xt_scan_branch_single(struct XTTable *tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+xtPublic void xt_scan_branch_single(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	u_int				branch_size;
@@ -522,7 +527,7 @@ xtPublic void xt_scan_branch_single(stru
  * index (in the case of -1) or to the first value after the
  * the search key in the case of 1.
  */
-xtPublic void xt_scan_branch_fix(struct XTTable *tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+xtPublic void xt_scan_branch_fix(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	u_int				branch_size;
@@ -619,7 +624,7 @@ xtPublic void xt_scan_branch_fix(struct 
 	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
 }
 
-xtPublic void xt_scan_branch_fix_simple(struct XTTable *tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+xtPublic void xt_scan_branch_fix_simple(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	u_int				branch_size;
@@ -720,7 +725,7 @@ xtPublic void xt_scan_branch_fix_simple(
  * Variable length key values are stored as a sorted list. Since each list item has a variable length, we
  * must scan the list sequentially in order to find a key.
  */
-xtPublic void xt_scan_branch_var(struct XTTable *tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+xtPublic void xt_scan_branch_var(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	u_int			branch_size;
@@ -816,7 +821,7 @@ xtPublic void xt_scan_branch_var(struct 
 }
 
 /* Go to the next item in the node. */
-static void idx_next_branch_item(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+static void idx_next_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	xtWord1	*bitem;
@@ -834,7 +839,7 @@ static void idx_next_branch_item(XTTable
 	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, result->sr_item.i_node_ref_size);
 }
 
-xtPublic void xt_prev_branch_item_fix(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind __attribute__((unused)), XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+xtPublic void xt_prev_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	ASSERT_NS(result->sr_item.i_item_offset >= result->sr_item.i_item_size + result->sr_item.i_node_ref_size + result->sr_item.i_node_ref_size);
@@ -843,7 +848,7 @@ xtPublic void xt_prev_branch_item_fix(XT
 	result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + result->sr_item.i_item_offset, result->sr_item.i_node_ref_size);
 }
 
-xtPublic void xt_prev_branch_item_var(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+xtPublic void xt_prev_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
 {
 	XT_NODE_TEMP;
 	xtWord1	*bitem;
@@ -865,7 +870,20 @@ xtPublic void xt_prev_branch_item_var(XT
 	result->sr_item.i_item_offset = bitem - branch->tb_data;
 }
 
-static void idx_first_branch_item(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+static void idx_reload_item_fix(XTIndexPtr XT_NDEBUG_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+{
+	u_int branch_size;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	ASSERT_NS(result->sr_item.i_node_ref_size == (XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0));
+	ASSERT_NS(result->sr_item.i_item_size == ind->mi_key_size + XT_RECORD_REF_SIZE);
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	if (result->sr_item.i_item_offset > result->sr_item.i_total_size)
+		result->sr_item.i_item_offset = result->sr_item.i_total_size;
+	xt_get_res_record_ref(&branch->tb_data[result->sr_item.i_item_offset + result->sr_item.i_item_size - XT_RECORD_REF_SIZE], result); 
+}
+
+static void idx_first_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 {
 	XT_NODE_TEMP;
 	u_int branch_size;
@@ -903,7 +921,7 @@ static void idx_first_branch_item(XTTabl
 /*
  * Last means different things for leaf or node!
  */
-xtPublic void xt_last_branch_item_fix(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+xtPublic void xt_last_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 {
 	XT_NODE_TEMP;
 	u_int branch_size;
@@ -935,7 +953,7 @@ xtPublic void xt_last_branch_item_fix(XT
 	}
 }
 
-xtPublic void xt_last_branch_item_var(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+xtPublic void xt_last_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
 {
 	XT_NODE_TEMP;
 	u_int	branch_size;
@@ -986,6 +1004,218 @@ xtPublic void xt_last_branch_item_var(XT
 	}
 }
 
+xtPublic xtBool xt_idx_lazy_delete_on_leaf(XTIndexPtr ind, XTIndBlockPtr block, xtWord2 branch_size)
+{
+	ASSERT_NS(ind->mi_fix_key);
+	
+	/* Compact the leaf if more than half the items that fit on the page
+	 * are deleted: */
+	if (block->cp_del_count >= ind->mi_max_items/2)
+		return FALSE;
+
+	/* Compact the page if there is only 1 (or less) valid item left: */
+	if ((u_int) block->cp_del_count+1 >= ((u_int) branch_size - 2)/(ind->mi_key_size + XT_RECORD_REF_SIZE))
+		return FALSE;
+
+	return OK;
+}
+
+static xtBool idx_lazy_delete_on_node(XTIndexPtr ind, XTIndBlockPtr block, register XTIdxItemPtr item)
+{
+	ASSERT_NS(ind->mi_fix_key);
+	
+	/* Compact the node if more than 1/4 of the items that fit on the page
+	 * are deleted: */
+	if (block->cp_del_count >= ind->mi_max_items/4)
+		return FALSE;
+
+	/* Compact the page if there is only 1 (or less) valid item left: */
+	if ((u_int) block->cp_del_count+1 >= (item->i_total_size - item->i_node_ref_size)/(item->i_item_size + item->i_node_ref_size))
+		return FALSE;
+
+	return OK;
+}
+
+inline static xtBool idx_cmp_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
+{
+	xtWord1 *data;
+
+	data = &iref->ir_branch->tb_data[item->i_item_offset];
+	return memcmp(data, value->sv_key, value->sv_length) == 0;
+}
+
+inline static void idx_set_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
+{
+	xtWord1 *data;
+
+	data = &iref->ir_branch->tb_data[item->i_item_offset];
+	memcpy(data, value->sv_key, value->sv_length);
+	xt_set_val_record_ref(data + value->sv_length, value);
+	iref->ir_updated = TRUE;
+}
+
+inline static void idx_set_item_reference(XTIndReferencePtr iref, register XTIdxItemPtr item, xtRowID rec_id, xtRowID row_id)
+{
+	size_t	offset;
+	xtWord1	*data;
+
+	/* This is the offset of the reference in the item we found: */
+	offset = item->i_item_offset +item->i_item_size - XT_RECORD_REF_SIZE;
+	data = &iref->ir_branch->tb_data[offset];
+
+	xt_set_record_ref(data, rec_id, row_id);
+	iref->ir_updated = TRUE;
+}
+
+inline static void idx_set_item_row_id(XTIndReferencePtr iref, register XTIdxItemPtr item, xtRowID row_id)
+{
+	size_t	offset;
+	xtWord1	*data;
+
+	offset = 
+		/* This is the offset of the reference in the item we found: */
+		item->i_item_offset +item->i_item_size - XT_RECORD_REF_SIZE +
+		/* This is the offset of the row id in the reference: */
+		XT_RECORD_ID_SIZE;
+	data = &iref->ir_branch->tb_data[offset];
+
+	/* This update does not change the structure of page, so we do it without
+	 * copying the page before we write.
+	 */
+	XT_SET_DISK_4(data, row_id);
+	iref->ir_updated = TRUE;
+}
+
+inline static xtBool idx_is_item_deleted(register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
+{
+	xtWord1	*data;
+
+	data = &branch->tb_data[item->i_item_offset + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE];
+	return XT_GET_DISK_4(data) == (xtRowID) -1;
+}
+
+inline static void idx_set_item_deleted(XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	idx_set_item_row_id(iref, item, (xtRowID) -1);
+	
+	/* This should be safe because there is only one thread,
+	 * the sweeper, that does this!
+	 *
+	 * Threads that decrement this value have an xlock on
+	 * the page, or the index.
+	 */
+	iref->ir_block->cp_del_count++;
+}
+
+/*
+ * {LAZY-DEL-INDEX-ITEMS}
+ * Do a lazy delete of an item by just setting the Row ID
+ * to the delete indicator: row ID -1.
+ */
+static void idx_lazy_delete_branch_item(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	idx_set_item_deleted(iref, item);
+	xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_R_UPDATE, iref);
+}
+
+/*
+ * This function compacts the leaf, but preserves the
+ * position of the item.
+ */
+static xtBool idx_compact_leaf(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	int		item_idx, count, i, idx;
+	u_int	size;
+	xtWord1	*s_data;
+	xtWord1	*d_data;
+	xtWord1	*data;
+	xtRowID	row_id;
+
+	if (iref->ir_block->cb_handle_count) {
+		if (!xt_ind_copy_on_write(iref)) {
+			xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
+			return FAILED;
+		}
+	}
+
+	ASSERT_NS(!item->i_node_ref_size);
+	ASSERT_NS(ind->mi_fix_key);
+	size = item->i_item_size;
+	count = item->i_total_size / size;
+	item_idx = item->i_item_offset / size;
+	s_data = d_data = branch->tb_data;
+	idx = 0;
+	for (i=0; i<count; i++) {
+		data = s_data + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
+		row_id = XT_GET_DISK_4(data);
+		if (row_id == (xtRowID) -1) {
+			if (idx < item_idx)
+				item_idx--;
+		}
+		else {
+			if (d_data != s_data)
+				memcpy(d_data, s_data, size);
+			d_data += size;
+			idx++;
+		}
+		s_data += size;
+	}
+	iref->ir_block->cp_del_count = 0;
+	item->i_total_size = d_data - branch->tb_data;
+	ASSERT_NS(idx * size == item->i_total_size);
+	item->i_item_offset = item_idx * size;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
+	iref->ir_updated = TRUE;
+	return OK;
+}
+
+static xtBool idx_lazy_remove_leaf_item_right(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	int		item_idx, count, i;
+	u_int	size;
+	xtWord1	*s_data;
+	xtWord1	*d_data;
+	xtWord1	*data;
+	xtRowID	row_id;
+
+	ASSERT_NS(!item->i_node_ref_size);
+
+	if (iref->ir_block->cb_handle_count) {
+		if (!xt_ind_copy_on_write(iref)) {
+			xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
+			return FAILED;
+		}
+	}
+
+	ASSERT_NS(ind->mi_fix_key);
+	size = item->i_item_size;
+	count = item->i_total_size / size;
+	item_idx = item->i_item_offset / size;
+	s_data = d_data = branch->tb_data;
+	for (i=0; i<count; i++) {
+		if (i == item_idx)
+			item->i_item_offset = d_data - branch->tb_data;
+		else {
+			data = s_data + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
+			row_id = XT_GET_DISK_4(data);
+			if (row_id != (xtRowID) -1) {
+				if (d_data != s_data)
+					memcpy(d_data, s_data, size);
+				d_data += size;
+			}
+		}
+		s_data += size;
+	}
+	iref->ir_block->cp_del_count = 0;
+	item->i_total_size = d_data - branch->tb_data;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
+	iref->ir_updated = TRUE;
+	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, iref);
+	return OK;
+}
+
 /*
  * Remove an item and save to disk.
  */
@@ -1003,8 +1233,14 @@ static xtBool idx_remove_branch_item_rig
 	 * an Xlock on the cache block.
 	 */
 	if (iref->ir_block->cb_handle_count) {
-		if (!xt_ind_copy_on_write(iref))
+		if (!xt_ind_copy_on_write(iref)) {
+			xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
 			return FAILED;
+		}
+	}
+	if (ind->mi_lazy_delete) {
+		if (idx_is_item_deleted(branch, item))
+			iref->ir_block->cp_del_count--;
 	}
 	/* Remove the node reference to the left of the item: */
 	memmove(&branch->tb_data[item->i_item_offset],
@@ -1013,18 +1249,28 @@ static xtBool idx_remove_branch_item_rig
 	item->i_total_size -= size;
 	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
 	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
+	iref->ir_updated = TRUE;
 	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
 	return OK;
 }
 
-static xtBool idx_remove_branch_item_left(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item)
+static xtBool idx_remove_branch_item_left(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item, xtBool *lazy_delete_cleanup_required)
 {
 	register XTIdxBranchDPtr branch = iref->ir_branch;
 	u_int size = item->i_item_size + item->i_node_ref_size;
 
+	ASSERT_NS(item->i_node_ref_size);
 	if (iref->ir_block->cb_handle_count) {
-		if (!xt_ind_copy_on_write(iref))
+		if (!xt_ind_copy_on_write(iref)) {
+			xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
 			return FAILED;
+		}
+	}
+	if (ind->mi_lazy_delete) {
+		if (idx_is_item_deleted(branch, item))
+			iref->ir_block->cp_del_count--;
+		if (lazy_delete_cleanup_required)
+			*lazy_delete_cleanup_required = idx_lazy_delete_on_node(ind, iref->ir_block, item);
 	}
 	/* Remove the node reference to the left of the item: */
 	memmove(&branch->tb_data[item->i_item_offset - item->i_node_ref_size],
@@ -1033,11 +1279,12 @@ static xtBool idx_remove_branch_item_lef
 	item->i_total_size -= size;
 	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
 	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
+	iref->ir_updated = TRUE;
 	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
 	return OK;
 }
 
-static void idx_insert_leaf_item(XTIndexPtr ind __attribute__((unused)), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result)
+static void idx_insert_leaf_item(XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result)
 {
 	xtWord1 *item;
 
@@ -1053,7 +1300,7 @@ static void idx_insert_leaf_item(XTIndex
 	XT_SET_DISK_2(leaf->tb_size_2, XT_MAKE_LEAF_SIZE(result->sr_item.i_total_size));
 }
 
-static void idx_insert_node_item(XTTableHPtr tab __attribute__((unused)), XTIndexPtr ind __attribute__((unused)), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result, xtIndexNodeID branch)
+static void idx_insert_node_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result, xtIndexNodeID branch)
 {
 	xtWord1 *item;
 
@@ -1114,7 +1361,7 @@ static void idx_get_middle_branch_item(X
 	}
 }
 
-static size_t idx_write_branch_item(XTIndexPtr ind __attribute__((unused)), xtWord1 *item, XTIdxKeyValuePtr value)
+static size_t idx_write_branch_item(XTIndexPtr XT_UNUSED(ind), xtWord1 *item, XTIdxKeyValuePtr value)
 {
 	memcpy(item, value->sv_key, value->sv_length);
 	xt_set_val_record_ref(item + value->sv_length, value);
@@ -1133,23 +1380,38 @@ static xtBool idx_replace_node_key(XTOpe
 	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
-	if (!xt_ind_fetch(ot, current, XT_LOCK_WRITE, &iref))
+	if (!xt_ind_fetch(ot, ind, current, XT_LOCK_WRITE, &iref))
 		return FAILED;
 	if (iref.ir_block->cb_handle_count) {
 		if (!xt_ind_copy_on_write(&iref))
 			goto failed_1;
 	}
+	if (ind->mi_lazy_delete) {
+		ASSERT_NS(item_size == item->i_pos.i_item_size);
+		if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
+			iref.ir_block->cp_del_count--;
+	}
 	memmove(&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item_size],
 		&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item->i_pos.i_item_size],
 		item->i_pos.i_total_size - item->i_pos.i_item_offset - item->i_pos.i_item_size);
 	memcpy(&iref.ir_branch->tb_data[item->i_pos.i_item_offset],
 		item_buf, item_size);
+	if (ind->mi_lazy_delete) {
+		if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
+			iref.ir_block->cp_del_count++;
+	}
 	item->i_pos.i_total_size = item->i_pos.i_total_size + item_size - item->i_pos.i_item_size;
 	XT_SET_DISK_2(iref.ir_branch->tb_size_2, XT_MAKE_NODE_SIZE(item->i_pos.i_total_size));
 	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+	iref.ir_updated = TRUE;
 
+#ifdef DEBUG
+	if (ind->mi_lazy_delete)
+		ASSERT_NS(item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
+#endif
 	if (item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE)
 		return xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
 
@@ -1184,6 +1446,7 @@ static xtBool idx_replace_node_key(XTOpe
 	/* Change the size of the old branch: */
 	XT_SET_DISK_2(iref.ir_branch->tb_size_2, XT_MAKE_NODE_SIZE(result.sr_item.i_item_offset));
 	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+	iref.ir_updated = TRUE;
 
 	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
 
@@ -1237,7 +1500,8 @@ static xtBool idx_insert_node(XTOpenTabl
 	XTIdxBranchDPtr		new_branch_ptr;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	/* Insert a new branch (key, data)... */
 	if (!(stack_item = idx_pop(stack))) {
@@ -1268,7 +1532,7 @@ static xtBool idx_insert_node(XTOpenTabl
 	 * cache, and will remain in cache when we read again below for the
 	 * purpose of update.
 	 */
-	if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+	if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 		goto failed;
 	ASSERT_NS(XT_IS_NODE(XT_GET_DISK_2(iref.ir_branch->tb_size_2)));
 	ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
@@ -1280,6 +1544,7 @@ static xtBool idx_insert_node(XTOpenTabl
 		}
 		idx_insert_node_item(ot->ot_table, ind, iref.ir_branch, key_value, &result, branch);
 		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+		iref.ir_updated = TRUE;
 		ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
 		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
 		goto done_ok;
@@ -1314,6 +1579,7 @@ static xtBool idx_insert_node(XTOpenTabl
 			goto failed_2;
 	}
 	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
+	iref.ir_updated = TRUE;
 	xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
 
 	/* Insert the new branch into the parent node, using the new middle key value: */
@@ -1373,7 +1639,8 @@ static xtBool idx_check_duplicates(XTOpe
 	XTXactWaitRec		xw;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	retry:
 	idx_newstack(&stack);
@@ -1385,7 +1652,7 @@ static xtBool idx_check_duplicates(XTOpe
 	key_value->sv_flags = 0;
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref)) {
 			key_value->sv_flags = save_flags;
 			return FAILED;
 		}
@@ -1422,7 +1689,7 @@ static xtBool idx_check_duplicates(XTOpe
 			while ((node = idx_pop(&stack))) {
 				if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
 					current = node->i_branch;
-					if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 						return FAILED;
 					xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);
 					result.sr_item = node->i_pos;
@@ -1439,6 +1706,11 @@ static xtBool idx_check_duplicates(XTOpe
 			break;
 		}
 
+		if (ind->mi_lazy_delete) {
+			if (result.sr_row_id == (xtRowID) -1)
+				goto next_item;
+		}
+
 		switch (xt_tab_maybe_committed(ot, result.sr_rec_id, &xn_id, NULL, NULL)) {
 			case XT_MAYBE:
 				/* Record is not committed, wait for the transaction. */
@@ -1464,6 +1736,7 @@ static xtBool idx_check_duplicates(XTOpe
 				break;
 		}
 
+		next_item:
 		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
 
 		if (result.sr_item.i_node_ref_size) {
@@ -1473,7 +1746,7 @@ static xtBool idx_check_duplicates(XTOpe
 				if (!idx_push(&stack, current, &result.sr_item))
 					return FAILED;
 				current = result.sr_branch;
-				if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 					return FAILED;
 				idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
 				if (!result.sr_item.i_node_ref_size)
@@ -1489,6 +1762,14 @@ static xtBool idx_check_duplicates(XTOpe
 	return FAILED;
 }
 
+inline static void idx_still_on_key(XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
+{
+	if (search_key && search_key->sk_on_key) {
+		search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
+			search_key->sk_key_value.sv_key, &branch->tb_data[item->i_item_offset]) == 0;
+	}
+}
+
 /*
  * Insert a value into the given index. Return FALSE if an error occurs.
  */
@@ -1506,9 +1787,11 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 	size_t				new_size;
 	xtBool				check_for_dups = ind->mi_flags & (HA_UNIQUE_CHECK | HA_NOSAME) && !allow_dups;
 	xtBool				lock_structure = FALSE;
+	xtBool				updated = FALSE;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 #ifdef CHECK_AND_PRINT
 	//idx_check_index(ot, ind, TRUE);
@@ -1559,6 +1842,7 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 		XT_INDEX_READ_LOCK(ind, ot);
 
 	retry:
+	/* Create a root node if required: */
 	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
 		/* Index is empty, create a new one: */
 		ASSERT_NS(lock_structure);
@@ -1575,8 +1859,9 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 		goto done_ok;
 	}
 
+	/* Search down the tree for the insertion point. */
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_XLOCK_LEAF, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
 		if (result.sr_duplicate) {
@@ -1601,8 +1886,23 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 			}
 		}
 		if (result.sr_found) {
-			/* Node found, can happen during recovery of indexes! */
-			XTPageUnlockType utype;
+			/* Node found, can happen during recovery of indexes! 
+			 * We have found an exact match of both key and record.
+			 */
+			XTPageUnlockType	utype;
+			xtBool				overwrite = FALSE;
+
+			/* {LAZY-DEL-INDEX-ITEMS}
+			 * If the item has been lazy deleted, then just overwrite!
+			 */ 
+			if (result.sr_row_id == (xtRowID) -1) {
+				xtWord2 del_count;
+	
+				/* This is safe because we have an xlock on the leaf. */
+				if ((del_count = iref.ir_block->cp_del_count))
+					iref.ir_block->cp_del_count = del_count-1;
+				overwrite = TRUE;
+			}
 
 			if (!result.sr_row_id && row_id) {
 				/* {INDEX-RECOV_ROWID} Set the row-id
@@ -1610,20 +1910,11 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 				 * is not committed.
 				 * It will be removed later by the sweeper.
 				 */
-				size_t	offset;
-				xtWord1	*data;
-
-				offset = 
-					/* This is the offset of the reference in the item we found: */
-					result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE +
-					/* This is the offset of the row id in the reference: */
-					4;
-				data = &iref.ir_branch->tb_data[offset];
+				overwrite = TRUE;
+			}
 
-				/* This update does not change the structure of page, so we do it without
-				 * copying the page before we write.
-				 */
-				XT_SET_DISK_4(data, row_id);
+			if (overwrite) {
+				idx_set_item_row_id(&iref, &result.sr_item, row_id);
 				utype = result.sr_item.i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE;
 			}
 			else
@@ -1644,14 +1935,84 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 	/* Must be a leaf!: */
 	ASSERT_NS(!result.sr_item.i_node_ref_size);
 
+	updated = FALSE;
+	if (ind->mi_lazy_delete && iref.ir_block->cp_del_count) {
+		/* There are a number of possibilities:
+		 * - We could just replace a lazy deleted slot.
+		 * - We could compact and insert.
+		 * - We could just insert
+		 */
+
+		if (result.sr_item.i_item_offset > 0) {
+			/* Check if it can go into the previous node: */
+			XTIdxResultRec	t_res;
+
+			t_res.sr_item = result.sr_item;
+			xt_prev_branch_item_fix(ot->ot_table, ind, iref.ir_branch, &t_res);
+			if (t_res.sr_row_id != (xtRowID) -1)
+				goto try_current;
+
+			/* Yup, it can, but first check to see if it would be 
+			 * better to put it in the current node.
+			 * This is the case if the previous node key is not the
+			 * same as the key we are adding...
+			 */
+			if (result.sr_item.i_item_offset < result.sr_item.i_total_size &&
+				result.sr_row_id == (xtRowID) -1) {
+				if (!idx_cmp_item_key_fix(&iref, &t_res.sr_item, &key_value))
+					goto try_current;
+			}
+
+			idx_set_item_key_fix(&iref, &t_res.sr_item, &key_value);
+			iref.ir_block->cp_del_count--;
+			xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+			goto done_ok;
+		}
+
+		try_current:
+		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id == (xtRowID) -1) {
+				idx_set_item_key_fix(&iref, &result.sr_item, &key_value);
+				iref.ir_block->cp_del_count--;
+				xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+				goto done_ok;
+			}
+		}
+
+		/* Check if we must compact... 
+		 * It makes no sense to split as long as there are lazy deleted items
+		 * in the page. So, delete them if a split would otherwise be required!
+		 */
+		ASSERT_NS(key_value.sv_length + XT_RECORD_REF_SIZE == result.sr_item.i_item_size);
+		if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE > XT_INDEX_PAGE_DATA_SIZE) {
+			if (!idx_compact_leaf(ot, ind, &iref, &result.sr_item))
+				goto failed;
+			updated = TRUE;
+		}
+		
+		/* Fall through to the insert code... */
+		/* NOTE: if there were no lazy deleted items in the leaf, then
+		 * idx_compact_leaf is a NOP. This is the only case in which it may not
+		 * fall through and do the insert below.
+		 *
+		 * Normally, if the cp_del_count is correct then the insert
+		 * will work below, and the assertion here will not fail.
+		 *
+		 * In this case, the xt_ind_release() will correctly indicate an update.
+		 */
+		ASSERT_NS(result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE);
+	}
+
 	if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE) {
 		if (iref.ir_block->cb_handle_count) {
 			if (!xt_ind_copy_on_write(&iref))
 				goto failed_1;
 		}
+
 		idx_insert_leaf_item(ind, iref.ir_branch, &key_value, &result);
 		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
 		ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
+		iref.ir_updated = TRUE;
 		xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
 		goto done_ok;
 	}
@@ -1660,7 +2021,7 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 	 * Make sure we have a structural lock:
 	 */
 	if (!lock_structure) {
-		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
+		xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);
 		XT_INDEX_UNLOCK(ind, ot);
 		lock_structure = TRUE;
 		goto lock_and_retry;
@@ -1705,6 +2066,7 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 			goto failed_2;
 	}
 	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
+	iref.ir_updated = TRUE;
 	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
 
 	/* Insert the new branch into the parent node, using the new middle key value: */
@@ -1732,7 +2094,7 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 	idx_free_branch(ot, ind, new_branch);
 
 	failed_1:
-	xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
+	xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);
 
 	failed:
 	XT_INDEX_UNLOCK(ind, ot);
@@ -1747,18 +2109,175 @@ xtPublic xtBool xt_idx_insert(XTOpenTabl
 	return FAILED;
 }
 
+
+/* Remove the item at the top of 'stack' from its node ('iref' must reference that node on entry; it is released here).
+ * This is done by following the branch after the item down the tree to find a replacement
+ * for the deleted item! 'key_value->sv_key' is overwritten with the replacement key. Returns OK or FAILED.
+ */
+static xtBool idx_remove_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
+{
+	IdxStackItemPtr		delete_node;
+	XTIdxResultRec		result;
+	xtIndexNodeID		current;
+	xtBool				lazy_delete_cleanup_required = FALSE;
+	IdxStackItemPtr		current_top;
+
+	delete_node = idx_top(stack);
+	current = delete_node->i_branch;
+	result.sr_item = delete_node->i_pos;
+
+	/* Follow the branch after this item, then release the node (flagging an update if one was made): */
+	idx_next_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+
+	/* Go down the left-hand side until we reach a leaf, pushing each node passed onto the stack: */
+	while (XT_NODE_ID(current)) {
+		current = result.sr_branch;
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
+			return FAILED;
+		idx_first_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, iref);
+		if (!idx_push(stack, current, &result.sr_item))
+			return FAILED;
+	}
+
+	ASSERT_NS(XT_NODE_ID(current));
+	ASSERT_NS(!result.sr_item.i_node_ref_size);
+
+	if (!xt_ind_reserve(ot, stack->s_top + 2, iref->ir_branch)) {
+		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
+		return FAILED;
+	}
+	
+	/* This code removes lazy deleted items from the leaf,
+	 * before we promote an item from the leaf into the node.
+	 * This is not essential, but prevents lazy deleted
+	 * items from being propagated up the tree.
+	 */
+	if (ind->mi_lazy_delete) {
+		if (iref->ir_block->cp_del_count) {
+			if (!idx_compact_leaf(ot, ind, iref, &result.sr_item))
+				return FAILED;
+		}
+	}
+
+	/* Crawl back up the stack trace, looking for a key
+	 * that can be used to replace the deleted key.
+	 *
+	 * Any empty nodes on the way up can be removed!
+	 */
+	if (result.sr_item.i_total_size > 0) {
+		/* There is a key in the leaf, extract it, and put it in the node: */
+		memcpy(key_value->sv_key, &iref->ir_branch->tb_data[result.sr_item.i_item_offset], result.sr_item.i_item_size);
+		/* This call also frees the iref.ir_branch page! */
+		if (!idx_remove_branch_item_right(ot, ind, current, iref, &result.sr_item))
+			return FAILED;
+		if (!idx_replace_node_key(ot, ind, delete_node, stack, result.sr_item.i_item_size, key_value->sv_key))
+			return FAILED;
+		goto done_ok;
+	}
+
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, iref);
+
+	for (;;) {
+		/* The current node/leaf is empty, remove it: */
+		idx_free_branch(ot, ind, current);
+
+		current_top = idx_pop(stack);
+		current = current_top->i_branch;
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
+			return FAILED;
+		
+		if (current_top == delete_node) {
+			/* All children have been removed. Delete the key and done: */
+			if (!idx_remove_branch_item_right(ot, ind, current, iref, &current_top->i_pos))
+				return FAILED;
+			goto done_ok;
+		}
+
+		if (current_top->i_pos.i_total_size > current_top->i_pos.i_node_ref_size) {
+			/* This node still holds a key, save it as the replacement: */
+			memcpy(key_value->sv_key, &iref->ir_branch->tb_data[current_top->i_pos.i_item_offset], current_top->i_pos.i_item_size);
+			/* This function also frees the cache page: */
+			if (!idx_remove_branch_item_left(ot, ind, current, iref, &current_top->i_pos, &lazy_delete_cleanup_required))
+				return FAILED;
+			if (!idx_replace_node_key(ot, ind, delete_node, stack, current_top->i_pos.i_item_size, key_value->sv_key))
+				return FAILED;
+			/* idx_remove_branch_item_left() flagged that lazy delete cleanup is needed: fetch the node again and remove its lazy deleted items. */
+			if (lazy_delete_cleanup_required) {
+				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
+					return FAILED;
+				if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, iref, key_value))
+					return FAILED;
+			}
+			goto done_ok;
+		}
+		xt_ind_release(ot, ind, current_top->i_pos.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
+	}
+
+	done_ok:
+#ifdef XT_TRACK_INDEX_UPDATES
+	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
+#endif
+	return OK;
+}
+
+/* Remove every lazy deleted item (row ID == -1) from the node 'current', which 'iref' must reference on entry; 'iref' is released before returning OK.
+ * This function assumes we have a lock on the structure of the index.
+ */
+static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
+{
+	IdxBranchStackRec	stack;
+	XTIdxResultRec		result;
+
+	/* Scan the node from its first item, looking for lazy deleted items: */
+	idx_first_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);
+
+	for (;;) {
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id == (xtRowID) -1)
+				goto remove_item;
+			idx_next_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);
+		}
+		break;
+
+		remove_item:
+
+		idx_newstack(&stack);
+		if (!idx_push(&stack, current, &result.sr_item)) {
+			xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+			return FAILED;
+		}
+
+		if (!idx_remove_item_in_node(ot, ind, &stack, iref, key_value))
+			return FAILED;
+
+		/* Go back up to the node we are trying to
+		 * free of lazy deleted items (idx_remove_item_in_node() released it).
+		 */
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
+			return FAILED;
+		/* Load the item data again from the re-fetched page, then continue the scan: */
+		idx_reload_item_fix(ind, iref->ir_branch, &result);
+	}
+
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+	return OK;
+}
+
 static xtBool idx_delete(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxKeyValuePtr key_value)
 {
 	IdxBranchStackRec	stack;
 	xtIndexNodeID		current;
 	XTIndReferenceRec	iref;
 	XTIdxResultRec		result;
-	IdxStackItemPtr		delete_node = NULL;
-	IdxStackItemPtr		current_top = NULL;
 	xtBool				lock_structure = FALSE;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	/* The index appears to have no root: */
 	if (!XT_NODE_ID(ind->mi_root))
@@ -1776,17 +2295,37 @@ static xtBool idx_delete(XTOpenTablePtr 
 		goto done_ok;
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_XLOCK_LEAF, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_DEL_LEAF, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
 		if (!result.sr_item.i_node_ref_size) {
 			/* A leaf... */
 			if (result.sr_found) {
-				if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
-					goto failed;
+				if (ind->mi_lazy_delete) {
+					/* If we have a W lock, then fetch decided that we
+					 * need to compact the page.
+					 * The decision is made by xt_idx_lazy_delete_on_leaf() 
+					 */
+					if (!iref.ir_xlock)
+						idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item);
+					else {
+						if (!iref.ir_block->cp_del_count) {
+							if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
+								goto failed;
+						}
+						else {
+							if (!idx_lazy_remove_leaf_item_right(ot, ind, &iref, &result.sr_item))
+								goto failed;
+						}
+					}
+				}
+				else {
+					if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
+						goto failed;
+				}
 			}
 			else
-				xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
+				xt_ind_release(ot, ind, iref.ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, &iref);
 			goto done_ok;
 		}
 		if (!idx_push(&stack, current, &result.sr_item)) {
@@ -1803,6 +2342,35 @@ static xtBool idx_delete(XTOpenTablePtr 
 	/* Must be a non-leaf!: */
 	ASSERT_NS(result.sr_item.i_node_ref_size);
 
+	if (ind->mi_lazy_delete) {
+		if (!idx_lazy_delete_on_node(ind, iref.ir_block, &result.sr_item)) {
+			/* We need to remove some items from this node: */
+
+			if (!lock_structure) {
+				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+				XT_INDEX_UNLOCK(ind, ot);
+				lock_structure = TRUE;
+				goto lock_and_retry;
+			}
+
+			idx_set_item_deleted(&iref, &result.sr_item);
+			if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, &iref, key_value))
+				goto failed;
+			goto done_ok;
+		}
+
+		if (!ot->ot_table->tab_dic.dic_no_lazy_delete) {
+			/* {LAZY-DEL-INDEX-ITEMS}
+			 * We just set item to deleted, this is a significant time
+			 * saver.
+			 * But this item can only be cleaned up when all
+			 * items on the node below are deleted.
+			 */
+			idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item);
+			goto done_ok;
+		}
+	}
+
 	/* We will have to remove the key from a non-leaf node,
 	 * which means we are changing the structure of the index.
 	 * Make sure we have a structural lock:
@@ -1815,86 +2383,8 @@ static xtBool idx_delete(XTOpenTablePtr 
 	}
 
 	/* This is the item we will have to replace: */
-	delete_node = idx_top(&stack);
-
-	/* Follow the branch after this item: */
-	idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
-	ASSERT_NS(XT_NODE_ID(current));
-	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
-
-	/* Go down the left-hand side until we reach a leaf: */
-	while (XT_NODE_ID(current)) {
-		current = result.sr_branch;
-		if (!xt_ind_fetch(ot, current, XT_XLOCK_LEAF, &iref))
-			goto failed;
-		idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
-		if (!result.sr_item.i_node_ref_size)
-			break;
-		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
-		if (!idx_push(&stack, current, &result.sr_item))
-			goto failed;
-	}
-
-	ASSERT_NS(XT_NODE_ID(current));
-	ASSERT_NS(!result.sr_item.i_node_ref_size);
-
-	if (!xt_ind_reserve(ot, stack.s_top + 2, iref.ir_branch)) {
-		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
+	if (!idx_remove_item_in_node(ot, ind, &stack, &iref, key_value))
 		goto failed;
-	}
-
-	/* Crawl back up the stack trace, looking for a key
-	 * that can be used to replace the deleted key.
-	 *
-	 * Any empty nodes on the way up can be removed!
-	 */
-	if (result.sr_item.i_total_size > 0) {
-		/* There is a key in the leaf, extract it, and put it in the node: */
-		memcpy(key_value->sv_key, &iref.ir_branch->tb_data[result.sr_item.i_item_offset], result.sr_item.i_item_size);
-		/* This call also frees the iref.ir_branch page! */
-		if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
-			goto failed;
-		if (!idx_replace_node_key(ot, ind, delete_node, &stack, result.sr_item.i_item_size, key_value->sv_key))
-			goto failed;
-		goto done_ok_2;
-	}
-
-	xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
-
-	for (;;) {
-		/* The current node/leaf is empty, remove it: */
-		idx_free_branch(ot, ind, current);
-
-		current_top = idx_pop(&stack);
-		current = current_top->i_branch;
-		if (!xt_ind_fetch(ot, current, XT_XLOCK_LEAF, &iref))
-			goto failed;
-		
-		if (current_top == delete_node) {
-			/* All children have been removed. Delete the key and done: */
-			if (!idx_remove_branch_item_right(ot, ind, current, &iref, &current_top->i_pos))
-				goto failed;
-			goto done_ok_2;
-		}
-
-		if (current_top->i_pos.i_total_size > current_top->i_pos.i_node_ref_size) {
-			/* Save the key: */
-			memcpy(key_value->sv_key, &iref.ir_branch->tb_data[current_top->i_pos.i_item_offset], current_top->i_pos.i_item_size);
-			/* This function also frees the cache page: */
-			if (!idx_remove_branch_item_left(ot, ind, current, &iref, &current_top->i_pos))
-				goto failed;
-			if (!idx_replace_node_key(ot, ind, delete_node, &stack, current_top->i_pos.i_item_size, key_value->sv_key))
-				goto failed;
-			goto done_ok_2;
-		}
-		xt_ind_release(ot, ind, current_top->i_pos.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, &iref);
-	}
-
-
-	done_ok_2:
-#ifdef XT_TRACK_INDEX_UPDATES
-	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
-#endif
 
 	done_ok:
 	XT_INDEX_UNLOCK(ind, ot);
@@ -1945,7 +2435,8 @@ xtPublic xtBool xt_idx_update_row_id(XTO
 	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE + XT_MAX_RECORD_REF_SIZE];
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 #ifdef CHECK_AND_PRINT
 	idx_check_index(ot, ind, TRUE);
@@ -1989,7 +2480,7 @@ xtPublic xtBool xt_idx_update_row_id(XTO
 		goto done_ok;
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
 		if (result.sr_found || !result.sr_item.i_node_ref_size)
@@ -1999,23 +2490,10 @@ xtPublic xtBool xt_idx_update_row_id(XTO
 	}
 
 	if (result.sr_found) {
-		size_t	offset;
-		xtWord1	*data;
-
-		offset = 
-			/* This is the offset of the reference in the item we found: */
-			result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE +
-			/* This is the offset of the row id in the reference: */
-			4;
-		data = &iref.ir_branch->tb_data[offset];
-
-		/* This update does not change the structure of page, so we do it without
-		 * copying the page before we write.
-		 *
-		 * TODO: Check that concurrent reads can handle this!
+		/* TODO: Check that concurrent reads can handle this!
 		 * assuming the write is not atomic.
 		 */
-		XT_SET_DISK_4(data, row_id);
+		idx_set_item_row_id(&iref, &result.sr_item, row_id);
 		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
 	}
 	else
@@ -2076,7 +2554,8 @@ xtPublic xtBool xt_idx_search(XTOpenTabl
 	XTIdxResultRec		result;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	if (ot->ot_ind_rhandle) {
 		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
@@ -2110,7 +2589,7 @@ xtPublic xtBool xt_idx_search(XTOpenTabl
 		goto done_ok;
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
 		if (result.sr_found)
@@ -2124,6 +2603,17 @@ xtPublic xtBool xt_idx_search(XTOpenTabl
 		current = result.sr_branch;
 	}
 
+	if (ind->mi_lazy_delete) {
+		ignore_lazy_deleted_items:
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id != (xtRowID) -1) {
+				idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+				break;
+			}
+			idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+		}
+	}
+
 	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
 		IdxStackItemPtr node;
 
@@ -2134,12 +2624,39 @@ xtPublic xtBool xt_idx_search(XTOpenTabl
 		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 		while ((node = idx_pop(&stack))) {
 			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
-				xtRecordID rec_id;
-
-				if (!xt_ind_fetch(ot, node->i_branch, XT_LOCK_READ, &iref))
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
 					goto failed;
-				xt_get_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &rec_id, &ot->ot_curr_row_id);
-				ot->ot_curr_rec_id = rec_id;
+				xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);
+
+				if (ind->mi_lazy_delete) {
+					result.sr_item = node->i_pos;
+					if (result.sr_row_id == (xtRowID) -1) {
+						/* If this node pointer is lazy deleted, then
+						 * go down the next branch...
+						 */
+						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+						/* Go down to the bottom: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						goto ignore_lazy_deleted_items;
+					}
+					idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+				}
+
+				ot->ot_curr_rec_id = result.sr_rec_id;
+				ot->ot_curr_row_id = result.sr_row_id;
 				ot->ot_ind_state = node->i_pos;
 
 				/* Convert the pointer to a handle which can be used in later operations: */
@@ -2180,14 +2697,16 @@ xtPublic xtBool xt_idx_search(XTOpenTabl
 	//idx_check_index(ot, ind, TRUE);
 	//idx_check_on_key(ot);
 #endif
-	ASSERT_NS(iref.ir_ulock == XT_UNLOCK_NONE);
+	ASSERT_NS(iref.ir_xlock == 2);
+	ASSERT_NS(iref.ir_updated == 2);
 	return OK;
 
 	failed:
 	XT_INDEX_UNLOCK(ind, ot);
 	if (idx_out_of_memory_failure(ot))
 		goto retry_after_oom;
-	ASSERT_NS(iref.ir_ulock == XT_UNLOCK_NONE);
+	ASSERT_NS(iref.ir_xlock == 2);
+	ASSERT_NS(iref.ir_updated == 2);
 	return FAILED;
 }
 
@@ -2199,7 +2718,8 @@ xtPublic xtBool xt_idx_search_prev(XTOpe
 	XTIdxResultRec		result;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	if (ot->ot_ind_rhandle) {
 		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
@@ -2232,7 +2752,7 @@ xtPublic xtBool xt_idx_search_prev(XTOpe
 		goto done_ok;
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
 		if (result.sr_found)
@@ -2249,17 +2769,43 @@ xtPublic xtBool xt_idx_search_prev(XTOpe
 	if (result.sr_item.i_item_offset == 0) {
 		IdxStackItemPtr node;
 
-		/* We are at the end of a leaf node.
-		 * Go up the stack to find the start poition of the next key.
+		search_up_stack:
+		/* We are at the start of a leaf node.
+		 * Go up the stack to find the start position of the next key.
 		 * If we find none, then we are the end of the index.
 		 */
 		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 		while ((node = idx_pop(&stack))) {
 			if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {
-				if (!xt_ind_fetch(ot, node->i_branch, XT_LOCK_READ, &iref))
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
 					goto failed;
 				result.sr_item = node->i_pos;
 				ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+				if (ind->mi_lazy_delete) {
+					if (result.sr_row_id == (xtRowID) -1) {
+						/* Go down to the bottom, in order to scan the leaf backwards: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						/* If the leaf is empty we have to go up the stack again... */
+						if (result.sr_item.i_total_size == 0)
+							goto search_up_stack;
+
+						goto scan_back_in_leaf;
+					}
+				}
+
 				goto record_found;
 			}
 		}
@@ -2269,6 +2815,16 @@ xtPublic xtBool xt_idx_search_prev(XTOpe
 	/* We must just step once to the left in this leaf node... */
 	ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
 
+	if (ind->mi_lazy_delete) {
+		scan_back_in_leaf:
+		while (result.sr_row_id == (xtRowID) -1) {
+			if (result.sr_item.i_item_offset == 0)
+				goto search_up_stack;
+			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+		}
+		idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+	}
+
 	record_found:
 	ot->ot_curr_rec_id = result.sr_rec_id;
 	ot->ot_curr_row_id = result.sr_row_id;
@@ -2330,34 +2886,47 @@ xtPublic xtBool xt_idx_next(register XTO
 	XTIndReferenceRec	iref;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	ASSERT_NS(ot->ot_ind_rhandle);
 	xt_ind_lock_handle(ot->ot_ind_rhandle);
-	if (!ot->ot_ind_state.i_node_ref_size && 
-		ot->ot_ind_state.i_item_offset < ot->ot_ind_state.i_total_size && 
+	result.sr_item = ot->ot_ind_state;
+	if (!result.sr_item.i_node_ref_size && 
+		result.sr_item.i_item_offset < result.sr_item.i_total_size && 
 		ot->ot_ind_rhandle->ih_cache_reference) {
-		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset];
-		key_value.sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+		XTIdxItemRec prev_item;
+
+		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
+		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
 
-		result.sr_item = ot->ot_ind_state;
+		prev_item = result.sr_item;
 		idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+
+		if (ind->mi_lazy_delete) {
+			while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+				if (result.sr_row_id != (xtRowID) -1)
+					break;
+				prev_item = result.sr_item;
+				idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+			}
+		}
+
 		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
 			/* Still on key? */
-			if (search_key && search_key->sk_on_key) {
-				search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
-					search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
-			}
+			idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);
 			xt_ind_unlock_handle(ot->ot_ind_rhandle);
 			goto checked_on_key;
 		}
+
+		result.sr_item = prev_item;
 	}
 
 	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
-	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset + ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE], &key_value.sv_rec_id, &key_value.sv_row_id);
+	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &key_value.sv_rec_id, &key_value.sv_row_id);
 	key_value.sv_key = key_buf;
-	key_value.sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
-	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset], key_value.sv_length);
+	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);
 	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
 	ot->ot_ind_rhandle = NULL;
 
@@ -2375,7 +2944,7 @@ xtPublic xtBool xt_idx_next(register XTO
 	}
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
 		if (result.sr_item.i_node_ref_size) {
@@ -2389,7 +2958,7 @@ xtPublic xtBool xt_idx_next(register XTO
 					if (!idx_push(&stack, current, &result.sr_item))
 						goto failed;
 					current = result.sr_branch;
-					if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 						goto failed;
 					idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
 					if (!result.sr_item.i_node_ref_size)
@@ -2416,6 +2985,15 @@ xtPublic xtBool xt_idx_next(register XTO
 		current = result.sr_branch;
 	}
 
+	if (ind->mi_lazy_delete) {
+		ignore_lazy_deleted_items:
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id != (xtRowID) -1)
+				break;
+			idx_next_branch_item(NULL, ind, iref.ir_branch, &result);
+		}
+	}
+
 	/* Check the current position in a leaf: */
 	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
 		/* At the end: */
@@ -2428,10 +3006,37 @@ xtPublic xtBool xt_idx_next(register XTO
 		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 		while ((node = idx_pop(&stack))) {
 			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
-				if (!xt_ind_fetch(ot, node->i_branch, XT_LOCK_READ, &iref))
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
 					goto failed;
 				result.sr_item = node->i_pos;
 				xt_get_res_record_ref(&iref.ir_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &result);
+
+				if (ind->mi_lazy_delete) {
+					if (result.sr_row_id == (xtRowID) -1) {
+						/* If this node pointer is lazy deleted, then
+						 * go down the next branch...
+						 */
+						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+						/* Go down to the bottom: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						/* And scan the leaf... */
+						goto ignore_lazy_deleted_items;
+					}
+				}
+
 				goto unlock_check_on_key;
 			}
 		}
@@ -2503,32 +3108,39 @@ xtPublic xtBool xt_idx_prev(register XTO
 	IdxStackItemPtr		node;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	ASSERT_NS(ot->ot_ind_rhandle);
 	xt_ind_lock_handle(ot->ot_ind_rhandle);
-	if (!ot->ot_ind_state.i_node_ref_size && ot->ot_ind_state.i_item_offset > 0) {
-		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset];
-		key_value.sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+	result.sr_item = ot->ot_ind_state;
+	if (!result.sr_item.i_node_ref_size && result.sr_item.i_item_offset > 0) {
+		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
+		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
 
-		result.sr_item = ot->ot_ind_state;
 		ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
 
-		if (search_key && search_key->sk_on_key) {
-			search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
-				search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
+		if (ind->mi_lazy_delete) {
+			while (result.sr_row_id == (xtRowID) -1) {
+				if (result.sr_item.i_item_offset == 0)
+					goto research;
+				ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+			}
 		}
 
+		idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);
+
 		xt_ind_unlock_handle(ot->ot_ind_rhandle);
 		goto checked_on_key;
 	}
 
+	research:
 	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
 	key_value.sv_rec_id = ot->ot_curr_rec_id;
 	key_value.sv_row_id = 0;
 	key_value.sv_key = key_buf;
-	key_value.sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
-	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset], key_value.sv_length);
+	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);
 	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
 	ot->ot_ind_rhandle = NULL;
 
@@ -2546,29 +3158,39 @@ xtPublic xtBool xt_idx_prev(register XTO
 	}
 
 	while (XT_NODE_ID(current)) {
-		if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 			goto failed;
 		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
 		if (result.sr_item.i_node_ref_size) {
 			if (result.sr_found) {
 				/* If we have found the key in a node: */
 
+				search_down_stack:
 				/* Go down to the bottom: */
 				while (XT_NODE_ID(current)) {
 					xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 					if (!idx_push(&stack, current, &result.sr_item))
 						goto failed;
 					current = result.sr_branch;
-					if (!xt_ind_fetch(ot, current, XT_LOCK_READ, &iref))
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
 						goto failed;
 					ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
 					if (!result.sr_item.i_node_ref_size)
 						break;
 				}
 
-				/* Is the leaf not empty, then we are done... */
+				/* If the leaf is empty we have to go up the stack again... */
 				if (result.sr_item.i_total_size == 0)
 					break;
+
+				if (ind->mi_lazy_delete) {
+					while (result.sr_row_id == (xtRowID) -1) {
+						if (result.sr_item.i_item_offset == 0)
+							goto search_up_stack;
+						ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+					}
+				}
+
 				goto unlock_check_on_key;
 			}
 		}
@@ -2580,6 +3202,15 @@ xtPublic xtBool xt_idx_prev(register XTO
 			if (result.sr_item.i_item_offset == 0)
 				break;
 			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+			if (ind->mi_lazy_delete) {
+				while (result.sr_row_id == (xtRowID) -1) {
+					if (result.sr_item.i_item_offset == 0)
+						goto search_up_stack;
+					ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+				}
+			}
+
 			goto unlock_check_on_key;
 		}
 		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
@@ -2588,6 +3219,7 @@ xtPublic xtBool xt_idx_prev(register XTO
 		current = result.sr_branch;
 	}
 
+	search_up_stack:
 	/* We are at the start of a leaf node.
 	 * Go up the stack to find the start poition of the next key.
 	 * If we find none, then we are the end of the index.
@@ -2595,10 +3227,18 @@ xtPublic xtBool xt_idx_prev(register XTO
 	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
 	while ((node = idx_pop(&stack))) {
 		if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {
-			if (!xt_ind_fetch(ot, node->i_branch, XT_LOCK_READ, &iref))
+			if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
 				goto failed;
 			result.sr_item = node->i_pos;
 			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+			if (ind->mi_lazy_delete) {
+				if (result.sr_row_id == (xtRowID) -1) {
+					current = node->i_branch;
+					goto search_down_stack;
+				}
+			}
+
 			goto unlock_check_on_key;
 		}
 	}
@@ -2648,7 +3288,7 @@ xtPublic xtBool xt_idx_prev(register XTO
 }
 
 /* Return TRUE if the record matches the current index search! */
-xtPublic xtBool xt_idx_match_search(register XTOpenTablePtr ot __attribute__((unused)), register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, xtWord1 *buf, int mode)
+xtPublic xtBool xt_idx_match_search(register XTOpenTablePtr XT_UNUSED(ot), register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, xtWord1 *buf, int mode)
 {
 	int		r;
 	xtWord1	key_buf[XT_INDEX_MAX_KEY_SIZE];
@@ -2666,7 +3306,7 @@ xtPublic xtBool xt_idx_match_search(regi
 	return FALSE;
 }
 
-static void idx_set_index_selectivity(XTThreadPtr self __attribute__((unused)), XTOpenTablePtr ot, XTIndexPtr ind)
+static void idx_set_index_selectivity(XTThreadPtr self, XTOpenTablePtr ot, XTIndexPtr ind)
 {
 	static const xtRecordID MAX_RECORDS = 100;
 
@@ -2784,7 +3424,7 @@ static void idx_set_index_selectivity(XT
 	ot->ot_ind_rhandle = NULL;
 
 	failed:
-	ot->ot_table->tab_dic.dic_disable_index = XT_INDEX_CORRUPTED;
+	xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
 	xt_log_and_clear_exception_ns();
 	return;
 }
@@ -2834,10 +3474,11 @@ static u_int idx_check_node(XTOpenTableP
 	XTIndReferenceRec	iref;
 
 #ifdef DEBUG
-	iref.ir_ulock = XT_UNLOCK_NONE;
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
 #endif
 	ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
-	if (!xt_ind_fetch(ot, node, XT_LOCK_READ, &iref))
+	if (!xt_ind_fetch(ot, ind, node, XT_LOCK_READ, &iref))
 		return 0;
 
 	idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
@@ -2974,7 +3615,7 @@ xtPublic void xt_check_indices(XTOpenTab
 			track_block_exists(current);
 #endif
 			printf("%d ", (int) XT_NODE_ID(current));
-			if (!xt_ind_read_bytes(ot, current, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block)) {
+			if (!xt_ind_read_bytes(ot, *ind, current, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block)) {
 				xt_log_and_clear_exception_ns();
 				break;
 			}
@@ -3000,6 +3641,88 @@ xtPublic void xt_check_indices(XTOpenTab
 
 /*
  * -----------------------------------------------------------------------
+ * Load index
+ */
+
+static void idx_load_node(XTThreadPtr self, XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID node)
+{
+	XTIdxResultRec		result;
+	XTIndReferenceRec	iref;
+	/* Fetch 'node' with a read lock (throwing on failure), walk its items and recurse into every child node they reference. */
+	ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
+	if (!xt_ind_fetch(ot, ind, node, XT_LOCK_READ, &iref))
+		xt_throw(self);
+	/* A non-zero i_node_ref_size is treated here as "the item carries a node reference" (result.sr_branch) to descend into. */
+	idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+	if (result.sr_item.i_node_ref_size)
+		idx_load_node(self, ot, ind, result.sr_branch);
+	while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+		if (result.sr_item.i_node_ref_size)
+			idx_load_node(self, ot, ind, result.sr_branch);
+	}
+	/* NOTE(review): if a nested call throws, this release is not reached -- confirm the unwinding done by xt_throw() covers iref. */
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+}
+
+xtPublic void xt_load_indices(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTIndexPtr				*ind_ptr;
+	XTIndexPtr				ind;
+	xtIndexNodeID			current;
+	/* Visit every index in the table dictionary and fetch all nodes reachable from its root; tab_ind_flush_lock is held for the whole walk. */
+	xt_lock_mutex(self, &tab->tab_ind_flush_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_ind_flush_lock);
+	/* Each index is write-locked while its tree is traversed; an index whose root node ID is zero is skipped. */
+	ind_ptr = tab->tab_dic.dic_keys;
+	for (u_int k=0; k<tab->tab_dic.dic_key_count; k++, ind_ptr++) {
+		ind = *ind_ptr;
+		XT_INDEX_WRITE_LOCK(ind, ot);
+		if ((XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
+			idx_load_node(self, ot, ind, current);
+		XT_INDEX_UNLOCK(ind, ot);
+	}
+	/* NOTE(review): idx_load_node() can throw while the index write lock is held -- confirm that lock is released on that path. */
+	freer_(); // xt_unlock_mutex(&tab->tab_ind_flush_lock)
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Count the number of deleted entries in a node:
+ */
+
+/*
+ * {LAZY-DEL-INDEX-ITEMS}
+ *
+ * Use this function to count the number of deleted items
+ * in a node when it is loaded.
+ *
+ * The count (stored in block->cp_del_count) helps us decide if the node should be "packed".
+ */
+xtPublic void xt_ind_count_deleted_items(XTTableHPtr tab, XTIndexPtr ind, XTIndBlockPtr block)
+{
+	XTIdxResultRec		result;
+	int					del_count = 0;
+	xtWord2				branch_size;
+	/* Decode the branch size field (tb_size_2) stored in the block's page data. */
+	branch_size = XT_GET_DISK_2(((XTIdxBranchDPtr) block->cb_data)->tb_size_2);
+
+	/* A block length outside 2..XT_INDEX_PAGE_SIZE is possible when reading free pages: return without touching the count. */
+	if (XT_GET_INDEX_BLOCK_LEN(branch_size) < 2 || XT_GET_INDEX_BLOCK_LEN(branch_size) > XT_INDEX_PAGE_SIZE)
+		return;
+	/* Walk the items of the branch; an item whose row ID is (xtRowID) -1 is counted as deleted. */
+	idx_first_branch_item(tab, ind, (XTIdxBranchDPtr) block->cb_data, &result);
+	while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+		if (result.sr_row_id == (xtRowID) -1)
+			del_count++;
+		idx_next_branch_item(tab, ind, (XTIdxBranchDPtr) block->cb_data, &result);
+	}
+	block->cp_del_count = del_count;
+}
+
+/*
+ * -----------------------------------------------------------------------
  * Index consistant flush
  */
 
@@ -3408,7 +4131,7 @@ void XTIndexLogPool::ilp_init(struct XTT
 	xt_throw(self);
 }
 
-void XTIndexLogPool::ilp_close(struct XTThread *self __attribute__((unused)), xtBool lock)
+void XTIndexLogPool::ilp_close(struct XTThread *XT_UNUSED(self), xtBool lock)
 {
 	XTIndexLogPtr	il;
 
@@ -3570,7 +4293,7 @@ xtBool XTIndexLog::il_require_space(size
 	return OK;
 }
 
-xtBool XTIndexLog::il_write_byte(struct XTOpenTable *ot __attribute__((unused)), xtWord1 byte)
+xtBool XTIndexLog::il_write_byte(struct XTOpenTable *ot, xtWord1 byte)
 {
 	if (!il_require_space(1, ot->ot_thread))
 		return FAILED;
@@ -3579,7 +4302,7 @@ xtBool XTIndexLog::il_write_byte(struct 
 	return OK;
 }
 
-xtBool XTIndexLog::il_write_word4(struct XTOpenTable *ot __attribute__((unused)), xtWord4 value)
+xtBool XTIndexLog::il_write_word4(struct XTOpenTable *ot, xtWord4 value)
 {
 	xtWord1 *buffer;
 
@@ -3591,7 +4314,7 @@ xtBool XTIndexLog::il_write_word4(struct
 	return OK;
 }
 
-xtBool XTIndexLog::il_write_block(struct XTOpenTable *ot __attribute__((unused)), XTIndBlockPtr block)
+xtBool XTIndexLog::il_write_block(struct XTOpenTable *ot, XTIndBlockPtr block)
 {
 	XTIndPageDataDPtr	page_data;
 	xtIndexNodeID		node_id;
@@ -3618,7 +4341,7 @@ xtBool XTIndexLog::il_write_block(struct
 	return OK;
 }
 
-xtBool XTIndexLog::il_write_header(struct XTOpenTable *ot __attribute__((unused)), size_t head_size, xtWord1 *head_buf)
+xtBool XTIndexLog::il_write_header(struct XTOpenTable *ot, size_t head_size, xtWord1 *head_buf)
 {
 	XTIndHeadDataDPtr	head_data;
 

=== modified file 'storage/pbxt/src/index_xt.h'
--- a/storage/pbxt/src/index_xt.h	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/index_xt.h	2009-08-17 11:12:36 +0000
@@ -24,6 +24,7 @@
 #define __xt_index_h__
 
 #ifdef DRIZZLED
+#include <drizzled/definitions.h>
 #include <mysys/my_bitmap.h>
 #else
 #include <mysql_version.h>
@@ -34,7 +35,6 @@
 #include "linklist_xt.h"
 #include "datalog_xt.h"
 #include "datadic_xt.h"
-//#include "cache_xt.h"
 
 #ifndef MYSQL_VERSION_ID
 #error MYSQL_VERSION_ID must be defined!
@@ -109,7 +109,7 @@ class Field;
 
 #define XT_MAX_RECORD_REF_SIZE		8
 
-#define XT_INDEX_PAGE_DATA_SIZE		XT_INDEX_PAGE_SIZE - 2			/* NOTE: 2 == offsetof(XTIdxBranchDRec, tb_data) */
+#define XT_INDEX_PAGE_DATA_SIZE		(XT_INDEX_PAGE_SIZE - 2)			/* NOTE: 2 == offsetof(XTIdxBranchDRec, tb_data) */
 
 #define XT_MAKE_LEAF_SIZE(x)		((x) + offsetof(XTIdxBranchDRec, tb_data))
 
@@ -218,7 +218,7 @@ typedef struct XTIndFreeList {
  * in 32 threads on smalltab: runTest(SMALL_INSERT_TEST, 32, dbUrl)
  */
 /*
- * XT_INDEX_USE_RW_MUTEX:
+ * XT_INDEX_USE_RWMUTEX:
  * But the RW mutex is a close second, if not just as fast.
  * If it is at least as fast, then it is better because read lock
  * overhead is then zero.
@@ -240,17 +240,24 @@ typedef struct XTIndFreeList {
  * Latest test show that RW mutex is slightly faster:
  * 127460 to 123574 payment transactions.
  */
-#define XT_INDEX_USE_RW_MUTEX
+
+#ifdef XT_NO_ATOMICS
+#define XT_INDEX_USE_PTHREAD_RW
+#else
+//#define XT_INDEX_USE_RWMUTEX
 //#define XT_INDEX_USE_PTHREAD_RW
+//#define XT_INDEX_SPINXSLOCK
+#define XT_TAB_ROW_USE_XSMUTEX
+#endif
 
-#ifdef XT_INDEX_USE_FASTWRLOCK
-#define XT_INDEX_LOCK_TYPE				XTFastRWLockRec
-#define XT_INDEX_INIT_LOCK(s, i)		xt_fastrwlock_init(s, &(i)->mi_rwlock)
-#define XT_INDEX_FREE_LOCK(s, i)		xt_fastrwlock_free(s, &(i)->mi_rwlock)	
-#define XT_INDEX_READ_LOCK(i, o)		xt_fastrwlock_slock(&(i)->mi_rwlock, (o)->ot_thread)
-#define XT_INDEX_WRITE_LOCK(i, o)		xt_fastrwlock_xlock(&(i)->mi_rwlock, (o)->ot_thread)
-#define XT_INDEX_UNLOCK(i, o)			xt_fastrwlock_unlock(&(i)->mi_rwlock, (o)->ot_thread)
-#define XT_INDEX_HAVE_XLOCK(i, o)		TRUE
+#ifdef XT_TAB_ROW_USE_XSMUTEX
+#define XT_INDEX_LOCK_TYPE				XTXSMutexRec
+#define XT_INDEX_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_xsmutex_free(s, &(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_xsmutex_slock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_xsmutex_xlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_UNLOCK(i, o)			xt_xsmutex_unlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_HAVE_XLOCK(i, o)		((i)->mi_rwlock.xsm_xlocker == (o)->ot_thread->t_id)	/* TRUE if the index's XS mutex (mi_rwlock) records the caller's thread ID as its exclusive locker. */
 #elif defined(XT_INDEX_USE_PTHREAD_RW)
 #define XT_INDEX_LOCK_TYPE				xt_rwlock_type
 #define XT_INDEX_INIT_LOCK(s, i)		xt_init_rwlock_with_autoname(s, &(i)->mi_rwlock)
@@ -259,7 +266,15 @@ typedef struct XTIndFreeList {
 #define XT_INDEX_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(&(i)->mi_rwlock)
 #define XT_INDEX_UNLOCK(i, o)			xt_unlock_rwlock_ns(&(i)->mi_rwlock)
 #define XT_INDEX_HAVE_XLOCK(i, o)		TRUE
-#else // XT_INDEX_USE_RW_MUTEX
+#elif defined(XT_INDEX_SPINXSLOCK)
+#define XT_INDEX_LOCK_TYPE				XTSpinXSLockRec
+#define XT_INDEX_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_spinxslock_free(s, &(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_spinxslock_slock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_spinxslock_xlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_UNLOCK(i, o)			xt_spinxslock_unlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_HAVE_XLOCK(i, o)		((i)->mi_rwlock.nrw_xlocker == (o)->ot_thread->t_id)
+#else // XT_INDEX_USE_RWMUTEX
 #define XT_INDEX_LOCK_TYPE				XTRWMutexRec
 #define XT_INDEX_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, &(i)->mi_rwlock)
 #define XT_INDEX_FREE_LOCK(s, i)		xt_rwmutex_free(s, &(i)->mi_rwlock)	
@@ -289,22 +304,24 @@ typedef struct XTIndex {
 	XTIndFreeListPtr	mi_free_list;				/* List of free pages for this index. */
 	
 	/* Protected by the mi_dirty_lock: */
-	XTSpinLockRec		mi_dirty_lock;			/* Spin lock protecting the dirty & free lists. */
+	XTSpinLockRec		mi_dirty_lock;				/* Spin lock protecting the dirty & free lists. */
 	struct XTIndBlock	*mi_dirty_list;				/* List of dirty pages for this index. */
 	u_int				mi_dirty_blocks;			/* Count of the dirty blocks. */
 
 	/* Index contants: */
 	u_int				mi_flags;
 	u_int				mi_key_size;
+	u_int				mi_max_items;				/* The maximum number of items that can fit in a leaf node. */
 	xtBool				mi_low_byte_first;
 	xtBool				mi_fix_key;
+	xtBool				mi_lazy_delete;				/* TRUE if index entries are "lazy deleted". */
 	u_int				mi_single_type;				/* Used when the index contains a single field. */
 	u_int				mi_select_total;
 	XTScanBranchFunc	mi_scan_branch;
 	XTPrevItemFunc		mi_prev_item;
 	XTLastItemFunc		mi_last_item;
 	XTSimpleCompFunc	mi_simple_comp_key;
-	MY_BITMAP			mi_col_map;					/* Bit-map of columns in the index. */
+	MX_BITMAP			mi_col_map;					/* Bit-map of columns in the index. */
 	u_int				mi_subset_of;				/* Indicates if this index is a complete subset of someother index. */
 	u_int				mi_seg_count;
 	XTIndexSegRec		mi_seg[200];
@@ -344,6 +361,7 @@ typedef struct XTDictionary {
 	Field				**dic_blob_cols;
 
 	/* MySQL related information. NULL when no tables are open from MySQL side! */
+	xtBool				dic_no_lazy_delete;			/* FALSE if lazy delete is OK. */
 	u_int				dic_disable_index;			/* Non-zero if the index cannot be used. */
 	u_int				dic_index_ver;				/* The version of the index. */
 	u_int				dic_key_count;
@@ -462,6 +480,8 @@ xtBool	xt_idx_prev(register struct XTOpe
 xtBool	xt_idx_read(struct XTOpenTable *ot, struct XTIndex *ind, xtWord1 *rec_buf);
 void	xt_ind_set_index_selectivity(XTThreadPtr self, struct XTOpenTable *ot);
 void	xt_check_indices(struct XTOpenTable *ot);
+void	xt_load_indices(XTThreadPtr self, struct XTOpenTable *ot);
+void	xt_ind_count_deleted_items(struct XTTable *ot, struct XTIndex *ind, struct XTIndBlock *block);
 xtBool	xt_flush_indices(struct XTOpenTable *ot, off_t *bytes_flushed, xtBool have_table_lock);
 void	xt_ind_track_dump_block(struct XTTable *tab, xtIndexNodeID address);
 
@@ -482,6 +502,7 @@ void	xt_prev_branch_item_var(struct XTTa
 
 void	xt_last_branch_item_fix(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result);
 void	xt_last_branch_item_var(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result);
+xtBool	xt_idx_lazy_delete_on_leaf(XTIndexPtr ind, struct XTIndBlock *block, xtWord2 branch_size);
 
 //#define TRACK_ACTIVITY
 #ifdef TRACK_ACTIVITY

=== modified file 'storage/pbxt/src/lock_xt.cc'
--- a/storage/pbxt/src/lock_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/lock_xt.cc	2009-08-17 11:12:36 +0000
@@ -25,6 +25,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <stdio.h>
 
 #include "lock_xt.h"
@@ -40,6 +44,16 @@
 #endif
 
 /*
+ * This function should never be called. It indicates a link error:
+ * it logs "Atomic operations not supported" for the given function, file and line, then aborts.
+ */
+xtPublic void xt_log_atomic_error_and_abort(c_char *func, c_char *file, u_int line)
+{
+	xt_logf(NULL, func, file, line, XT_LOG_ERROR, "%s", "Atomic operations not supported\n");
+	abort();
+}
+
+/*
  * -----------------------------------------------------------------------
  * ROW LOCKS, LIST BASED
  */
@@ -715,7 +729,7 @@ xtBool xt_init_row_locks(XTRowLocksPtr r
 	return OK;
 }
 
-void xt_exit_row_locks(XTRowLocksPtr rl __attribute__((unused)))
+void xt_exit_row_locks(XTRowLocksPtr rl)
 {
 	for (int i=0; i<XT_ROW_LOCK_GROUP_COUNT; i++) {
 		xt_spinlock_free(NULL, &rl->rl_groups[i].lg_lock);
@@ -982,7 +996,7 @@ xtBool old_xt_init_row_locks(XTRowLocksP
 	return OK;
 }
 
-void old_xt_exit_row_locks(XTRowLocksPtr rl __attribute__((unused)))
+void old_xt_exit_row_locks(XTRowLocksPtr XT_UNUSED(rl))
 {
 }
 
@@ -1007,10 +1021,6 @@ xtPublic void xt_exit_row_lock_list(XTRo
  * SPECIAL EXCLUSIVE/SHARED (XS) LOCK
  */
 
-#define XT_GET1(x)		*(x)
-#define XT_SET4(x, y)	xt_atomic_set4(x, y)
-#define XT_GET4(x)		xt_atomic_get4(x)
-
 #ifdef XT_THREAD_LOCK_INFO
 xtPublic void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl, const char *n)
 #else
@@ -1023,7 +1033,7 @@ xtPublic void xt_rwmutex_init(XTThreadPt
 #endif
 	xt_init_mutex_with_autoname(self, &xsl->xs_lock);
 	xt_init_cond(self, &xsl->xs_cond);
-	XT_SET4(&xsl->xs_state, 0);
+	xt_atomic_set4(&xsl->xs_state, 0);
 	xsl->xs_xlocker = 0;
 	/* Must be aligned! */
 	ASSERT(xt_thr_maximum_threads == xt_align_size(xt_thr_maximum_threads, XT_XS_LOCK_ALIGN));
@@ -1068,7 +1078,7 @@ xtPublic xtBool xt_rwmutex_xlock(XTRWMut
 	}
 
 	/* I am the locker (set state before locker!): */
-	XT_SET4(&xsl->xs_state, 0);
+	xt_atomic_set4(&xsl->xs_state, 0);
 	xsl->xs_xlocker = thd_id;
 
 	/* Wait for all the read lockers: */
@@ -1078,7 +1088,7 @@ xtPublic xtBool xt_rwmutex_xlock(XTRWMut
 			 * Just in case of this, we keep the wait time down!
 			 */
 			if (!xt_timed_wait_cond_ns(&xsl->xs_cond, &xsl->xs_lock, 10)) {
-				XT_SET4(&xsl->xs_state, 0);
+				xt_atomic_set4(&xsl->xs_state, 0);
 				xsl->xs_xlocker = 0;
 				xt_unlock_mutex_ns(&xsl->xs_lock);
 				return FAILED;
@@ -1087,11 +1097,11 @@ xtPublic xtBool xt_rwmutex_xlock(XTRWMut
 		/* State can be incremented in parallel by a reader
 		 * thread!
 		 */
-		XT_SET4(&xsl->xs_state, xsl->xs_state + 1);
+		xt_atomic_set4(&xsl->xs_state, xsl->xs_state + 1);
 	}
 
 	/* I have waited for all: */
-	XT_SET4(&xsl->xs_state, xt_thr_maximum_threads);
+	xt_atomic_set4(&xsl->xs_state, xt_thr_maximum_threads);
 
 #ifdef XT_THREAD_LOCK_INFO
 	xt_thread_lock_info_add_owner(&xsl->xs_lock_info);
@@ -1107,7 +1117,7 @@ xtPublic xtBool xt_rwmutex_slock(XTRWMut
 #endif
 	ASSERT_NS(xt_get_self()->t_id == thd_id);
 
-	xt_flushed_inc1(&xsl->x.xs_rlock[thd_id]);
+	xt_atomic_inc1(&xsl->x.xs_rlock[thd_id]);
 
 	if (xsl->x.xs_rlock[thd_id] > 1)
 		return OK;
@@ -1158,7 +1168,7 @@ xtPublic xtBool xt_rwmutex_unlock(XTRWMu
 		/* I have an X lock. */
 		ASSERT_NS(xsl->x.xs_rlock[thd_id] == XT_NO_LOCK);
 		ASSERT_NS(xsl->xs_state == xt_thr_maximum_threads);
-		XT_SET4(&xsl->xs_state, 0);
+		xt_atomic_set4(&xsl->xs_state, 0);
 		xsl->xs_xlocker = 0;
 		xt_unlock_mutex_ns(&xsl->xs_lock);
 		/* Wake up any other X or shared lockers: */
@@ -1201,7 +1211,7 @@ xtPublic xtBool xt_rwmutex_unlock(XTRWMu
 						return FAILED;
 					}
 				}
-				xt_flushed_dec1(&xsl->x.xs_rlock[thd_id]);
+				xt_atomic_dec1(&xsl->x.xs_rlock[thd_id]);
 				xt_unlock_mutex_ns(&xsl->xs_lock);
 			}
 			else
@@ -1213,7 +1223,7 @@ xtPublic xtBool xt_rwmutex_unlock(XTRWMu
 				 * try to get the lock xs_lock, I could hand for the duration
 				 * of the X lock.
 				 */
-				xt_flushed_dec1(&xsl->x.xs_rlock[thd_id]);
+				xt_atomic_dec1(&xsl->x.xs_rlock[thd_id]);
 		}
 	}
 #ifdef XT_THREAD_LOCK_INFO
@@ -1228,13 +1238,14 @@ xtPublic xtBool xt_rwmutex_unlock(XTRWMu
  */
 
 #ifdef XT_THREAD_LOCK_INFO
-xtPublic void xt_spinlock_init(XTThreadPtr self __attribute__((unused)), XTSpinLockPtr spl, const char *n)
+xtPublic void xt_spinlock_init(XTThreadPtr self, XTSpinLockPtr spl, const char *n)
 #else
-xtPublic void xt_spinlock_init(XTThreadPtr self __attribute__((unused)), XTSpinLockPtr spl)
+xtPublic void xt_spinlock_init(XTThreadPtr self, XTSpinLockPtr spl)
 #endif
 {
+	(void) self;
 	spl->spl_lock = 0;
-#ifdef XT_SPL_DEFAULT
+#ifdef XT_NO_ATOMICS
 	xt_init_mutex(self, &spl->spl_mutex);
 #endif
 #ifdef DEBUG
@@ -1246,9 +1257,10 @@ xtPublic void xt_spinlock_init(XTThreadP
 #endif
 }
 
-xtPublic void xt_spinlock_free(XTThreadPtr self __attribute__((unused)), XTSpinLockPtr spl __attribute__((unused)))
+xtPublic void xt_spinlock_free(XTThreadPtr XT_UNUSED(self), XTSpinLockPtr spl)
 {
-#ifdef XT_SPL_DEFAULT
+	(void) spl;
+#ifdef XT_NO_ATOMICS
 	xt_free_mutex(&spl->spl_mutex);
 #endif
 #ifdef XT_THREAD_LOCK_INFO
@@ -1266,7 +1278,7 @@ xtPublic xtBool xt_spinlock_spin(XTSpinL
 			if (!*lck) {
 				/* Try to get the lock: */
 				if (!xt_spinlock_set(spl))
-					return OK;
+					goto done_ok;
 			}
 		}
 
@@ -1274,6 +1286,7 @@ xtPublic xtBool xt_spinlock_spin(XTSpinL
 		xt_critical_wait();
 	}
 
+	done_ok:
 	return OK;
 }
 
@@ -1400,147 +1413,96 @@ xtPublic void xt_fastlock_wakeup(XTFastL
 /*
  * -----------------------------------------------------------------------
  * READ/WRITE SPIN LOCK
+ *
+ * An extremely genius very fast read/write lock based on atomics!
  */
 
 #ifdef XT_THREAD_LOCK_INFO
-xtPublic void xt_spinrwlock_init(struct XTThread *self, XTSpinRWLockPtr srw, const char *name)
+xtPublic void xt_spinxslock_init(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs, const char *name)
 #else
-xtPublic void xt_spinrwlock_init(struct XTThread *self, XTSpinRWLockPtr srw)
+xtPublic void xt_spinxslock_init(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs)
 #endif
 {
-	xt_spinlock_init_with_autoname(self, &srw->srw_lock);
-	xt_spinlock_init_with_autoname(self, &srw->srw_state_lock);
-	srw->srw_state = 0;
-	srw->srw_xlocker = 0;
-	/* Must be aligned! */
-	ASSERT(xt_thr_maximum_threads == xt_align_size(xt_thr_maximum_threads, XT_XS_LOCK_ALIGN));
-	srw->x.srw_rlock = (xtWord1 *) xt_calloc(self, xt_thr_maximum_threads);
+	sxs->sxs_xlocked = 0;
+	sxs->sxs_rlock_count = 0;
+	sxs->sxs_wait_count = 0;
+#ifdef DEBUG
+	sxs->sxs_locker = 0;
+#endif
 #ifdef XT_THREAD_LOCK_INFO
-	srw->srw_name = name;
-	xt_thread_lock_info_init(&srw->srw_lock_info, srw);
+	sxs->sxs_name = name;
+	xt_thread_lock_info_init(&sxs->sxs_lock_info, sxs);
 #endif
 }
 
-xtPublic void xt_spinrwlock_free(struct XTThread *self, XTSpinRWLockPtr srw)
+xtPublic void xt_spinxslock_free(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs)
 {
-	if (srw->x.srw_rlock)
-		xt_free(self, (void *) srw->x.srw_rlock);
-	xt_spinlock_free(self, &srw->srw_lock);
-	xt_spinlock_free(self, &srw->srw_state_lock);
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_free(&srw->srw_lock_info);
+	xt_thread_lock_info_free(&sxs->sxs_lock_info);
+#else
+	(void) sxs;
 #endif
 }
 
-xtPublic xtBool xt_spinrwlock_xlock(XTSpinRWLockPtr srw, xtThreadID thd_id)
+xtPublic xtBool xt_spinxslock_xlock(XTSpinXSLockPtr sxs, xtThreadID XT_NDEBUG_UNUSED(thd_id))
 {
-	xt_spinlock_lock(&srw->srw_lock);
-	ASSERT_NS(srw->x.srw_rlock[thd_id] == XT_NO_LOCK);
-	
-	xt_spinlock_lock(&srw->srw_state_lock);
-
-	/* Set the state before xlocker (dirty read!) */
-	srw->srw_state = 0;
-
-	/* I am the locker: */
-	srw->srw_xlocker = thd_id;
+	register xtWord2 set;
 
-	/* Wait for all the read lockers: */
-	while (srw->srw_state < xt_thr_current_max_threads) {
-		while (srw->x.srw_rlock[srw->srw_state]) {
-			xt_spinlock_unlock(&srw->srw_state_lock);
-			/* Wait for this reader, during this time, the reader
-			 * himself, may increment the state. */
-			xt_critical_wait();
-			xt_spinlock_lock(&srw->srw_state_lock);
-		}
-		/* State can be incremented in parallel by a reader
-		 * thread!
-		 */
-		srw->srw_state++;
+	/* Wait for exclusive locker: */
+	for (;;) {
+		set = xt_atomic_tas2(&sxs->sxs_xlocked, 1);
+		if (!set)
+			break;
+		xt_yield();
 	}
 
-	/* I have waited for all: */
-	srw->srw_state = xt_thr_maximum_threads;
+#ifdef DEBUG
+	sxs->sxs_locker = thd_id;
+#endif
 
-	xt_spinlock_unlock(&srw->srw_state_lock);
+	/* Wait for all the readers to wait! */
+	while (sxs->sxs_wait_count < sxs->sxs_rlock_count)
+		xt_yield();
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+	xt_thread_lock_info_add_owner(&sxs->sxs_lock_info);
 #endif
-
 	return OK;
 }
 
-xtPublic xtBool xt_spinrwlock_slock(XTSpinRWLockPtr srw, xtThreadID thd_id)
+xtPublic xtBool xt_spinxslock_slock(XTSpinXSLockPtr sxs)
 {
-	ASSERT_NS(srw->x.srw_rlock[thd_id] == XT_NO_LOCK);
-	srw->x.srw_rlock[thd_id] = XT_WANT_LOCK;
+	xt_atomic_inc2(&sxs->sxs_rlock_count);
+
 	/* Check if there could be an X locker: */
-	if (srw->srw_xlocker) {
-		/* There is an X locker.
-		 * If srw_state < thd_id then the X locker will wait for me.
-		 * So I should not wait!
-		 */
-		if (srw->srw_state >= thd_id) {
-			/* If srw->srw_state >= thd_id, then the locker may have, or
-			 * has already checked me, and I will have to wait.
-			 *
-			 * Otherwise, srw_state <= thd_id, which means the
-			 * X locker has not checked me, and will still wait for me (or 
-			 * is already waiting for me). In this case, I will have to
-			 * take the mutex to make sure exactly how far he
-			 * is with the checking.
-			 */
-			xt_spinlock_lock(&srw->srw_state_lock);
-			while (srw->srw_state > thd_id && srw->srw_xlocker) {
-				xt_spinlock_unlock(&srw->srw_state_lock);
-				xt_critical_wait();
-				xt_spinlock_lock(&srw->srw_state_lock);
-			}
-			xt_spinlock_unlock(&srw->srw_state_lock);
-		}
+	if (sxs->sxs_xlocked) {
+		/* I am waiting... */
+		xt_atomic_inc2(&sxs->sxs_wait_count);
+		while (sxs->sxs_xlocked)
+			xt_yield();
+		xt_atomic_dec2(&sxs->sxs_wait_count);
 	}
-	/* There is no exclusive locker, so we have the read lock: */
-	srw->x.srw_rlock[thd_id] = XT_HAVE_LOCK;
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+	xt_thread_lock_info_add_owner(&sxs->sxs_lock_info);
 #endif
-
 	return OK;
 }
 
-xtPublic xtBool xt_spinrwlock_unlock(XTSpinRWLockPtr srw, xtThreadID thd_id)
+xtPublic xtBool xt_spinxslock_unlock(XTSpinXSLockPtr sxs, xtBool xlocked)
 {
-	if (srw->srw_xlocker == thd_id) {
-		/* I have an X lock. */
-		ASSERT_NS(srw->srw_state == xt_thr_maximum_threads);
-		srw->srw_state = 0;
-		srw->srw_xlocker = 0;
-		xt_spinlock_unlock(&srw->srw_lock);
-	}
-	else {
-		/* I have a shared lock: */
-		ASSERT_NS(srw->x.srw_rlock[thd_id] == XT_HAVE_LOCK);
-		ASSERT_NS(srw->srw_state != xt_thr_maximum_threads);
-		srw->x.srw_rlock[thd_id] = XT_NO_LOCK;
-		if (srw->srw_xlocker && srw->srw_state == thd_id) {
-			xt_spinlock_lock(&srw->srw_state_lock);
-			if (srw->srw_xlocker && srw->srw_state == thd_id) {
-				/* If the X locker is waiting for me,
-				 * then allow him to continue. 
-				 */
-				srw->srw_state = thd_id+1;
-			}
-			xt_spinlock_unlock(&srw->srw_state_lock);
-		}
+	if (xlocked) {
+#ifdef DEBUG
+		sxs->sxs_locker = 0;
+#endif
+		sxs->sxs_xlocked = 0;
 	}
+	else
+		xt_atomic_dec2(&sxs->sxs_rlock_count);
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_release_owner(&srw->srw_lock_info);
+	xt_thread_lock_info_release_owner(&sxs->sxs_lock_info);
 #endif
-
 	return OK;
 }
 
@@ -1550,194 +1512,159 @@ xtPublic xtBool xt_spinrwlock_unlock(XTS
  */
 
 #ifdef XT_THREAD_LOCK_INFO
-xtPublic void xt_fastrwlock_init(struct XTThread *self, XTFastRWLockPtr frw, const char *n)
+xtPublic void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm, const char *name)
 #else
-xtPublic void xt_fastrwlock_init(struct XTThread *self, XTFastRWLockPtr frw)
+xtPublic void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm)
 #endif
 {
-	xt_fastlock_init_with_autoname(self, &frw->frw_lock);
-	frw->frw_xlocker = NULL;
-	xt_spinlock_init_with_autoname(self, &frw->frw_state_lock);
-	frw->frw_state = 0;
-	frw->frw_read_waiters = 0;
-	/* Must be aligned! */
-	ASSERT(xt_thr_maximum_threads == xt_align_size(xt_thr_maximum_threads, XT_XS_LOCK_ALIGN));
-	frw->x.frw_rlock = (xtWord1 *) xt_calloc(self, xt_thr_maximum_threads);
+	xt_init_mutex_with_autoname(self, &xsm->xsm_lock);
+	xt_init_cond(self, &xsm->xsm_cond);
+	xt_init_cond(self, &xsm->xsm_cond_2);
+	xsm->xsm_xlocker = 0;
+	xsm->xsm_rlock_count = 0;
+	xsm->xsm_wait_count = 0;
+#ifdef DEBUG
+	xsm->xsm_locker = 0;
+#endif
 #ifdef XT_THREAD_LOCK_INFO
-	frw->frw_name = n;
-	xt_thread_lock_info_init(&frw->frw_lock_info, frw);
+	xsm->xsm_name = name;
+	xt_thread_lock_info_init(&xsm->xsm_lock_info, xsm);
 #endif
 }
 
-xtPublic void xt_fastrwlock_free(struct XTThread *self, XTFastRWLockPtr frw)
+xtPublic void xt_xsmutex_free(struct XTThread *XT_UNUSED(self), XTXSMutexLockPtr xsm)
 {
-	if (frw->x.frw_rlock)
-		xt_free(self, (void *) frw->x.frw_rlock);
-	xt_fastlock_free(self, &frw->frw_lock);
-	xt_spinlock_free(self, &frw->frw_state_lock);
+	xt_free_mutex(&xsm->xsm_lock);
+	xt_free_cond(&xsm->xsm_cond);
+	xt_free_cond(&xsm->xsm_cond_2);
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_free(&frw->frw_lock_info);
+	xt_thread_lock_info_free(&xsm->xsm_lock_info);
 #endif
 }
 
-xtPublic xtBool xt_fastrwlock_xlock(XTFastRWLockPtr frw, struct XTThread *thread)
+xtPublic xtBool xt_xsmutex_xlock(XTXSMutexLockPtr xsm, xtThreadID thd_id)
 {
-	xt_fastlock_lock(&frw->frw_lock, thread);
-	ASSERT_NS(frw->x.frw_rlock[thread->t_id] == XT_NO_LOCK);
-	
-	xt_spinlock_lock(&frw->frw_state_lock);
-
-	/* Set the state before xlocker (dirty read!) */
-	frw->frw_state = 0;
+	xt_lock_mutex_ns(&xsm->xsm_lock);
 
-	/* I am the locker: */
-	frw->frw_xlocker = thread;
-
-	/* Wait for all the read lockers: */
-	while (frw->frw_state < xt_thr_current_max_threads) {
-		while (frw->x.frw_rlock[frw->frw_state]) {
-			xt_lock_thread(thread);
-			xt_spinlock_unlock(&frw->frw_state_lock);
-			/* Wait for this reader. We rely on the reader to free
-			 * us from this wait! */
-			if (!xt_wait_thread(thread)) {
-				xt_unlock_thread(thread);
-				frw->frw_state = 0;
-				frw->frw_xlocker = NULL;
-				xt_fastlock_unlock(&frw->frw_lock, thread);
-				return FAILED;
-			}
-			xt_unlock_thread(thread);
-			xt_spinlock_lock(&frw->frw_state_lock);
+	/* Wait for exclusive locker: */
+	while (xsm->xsm_xlocker) {
+		if (!xt_timed_wait_cond_ns(&xsm->xsm_cond, &xsm->xsm_lock, 10000)) {
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
+			return FAILED;
 		}
-		/* State can be incremented in parallel by a reader
-		 * thread!
-		 */
-		frw->frw_state++;
 	}
 
-	/* I have waited for all: */
-	frw->frw_state = xt_thr_maximum_threads;
-
-	xt_spinlock_unlock(&frw->frw_state_lock);
+	/* GOTCHA: You would think this is not necessary...
+	 * But it does not always work, if a normal assignment is used.
+	 * The reason is, I guess, on MMP the assignment is not
+	 * always immediately visible to other processors, because they
+	 * have old versions of this variable in their cache.
+	 *
+	 * But this is required, because the locking mechanism is based
+	 * on:
+	 * Locker: sets xlocker, tests rlock_count
+	 * Reader: incs rlock_count, tests xlocker
+	 *
+	 * The test, in both cases, may not read stale values.
+	 * volatile does not help, because this just turns compiler
+	 * optimisations off.
+	 */
+	xt_atomic_set4(&xsm->xsm_xlocker, thd_id);
+
+	/* Wait for all the readers to wait! */
+	while (xsm->xsm_wait_count < xsm->xsm_rlock_count) {
+		/* {RACE-WR_MUTEX} Here as well: */
+		if (!xt_timed_wait_cond_ns(&xsm->xsm_cond, &xsm->xsm_lock, 100)) {
+			xsm->xsm_xlocker = 0;
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
+			return FAILED;
+		}
+	}
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_add_owner(&frw->frw_lock_info);
+	xt_thread_lock_info_add_owner(&xsm->xsm_lock_info);
 #endif
-
 	return OK;
 }
 
-xtPublic xtBool xt_fastrwlock_slock(XTFastRWLockPtr frw, struct XTThread *thread)
+xtPublic xtBool xt_xsmutex_slock(XTXSMutexLockPtr xsm, xtThreadID XT_UNUSED(thd_id))
 {
-	xtThreadID thd_id = thread->t_id;
+	xt_atomic_inc2(&xsm->xsm_rlock_count);
 
-	ASSERT_NS(frw->x.frw_rlock[thd_id] == XT_NO_LOCK);
-	frw->x.frw_rlock[thd_id] = XT_WANT_LOCK;
 	/* Check if there could be an X locker: */
-	if (frw->frw_xlocker) {
-		/* There is an X locker.
-		 * If frw_state < thd_id then the X locker will wait for me.
-		 * So I should not wait!
-		 */
-		if (frw->frw_state >= thd_id) {
-			/* If frw->frw_state >= thd_id, then the locker may have, or
-			 * has already checked me, and I will have to wait.
-			 *
-			 * Otherwise, frw_state <= thd_id, which means the
-			 * X locker has not checked me, and will still wait for me (or 
-			 * is already waiting for me). In this case, I will have to
-			 * take the mutex to make sure exactly how far he
-			 * is with the checking.
-			 */
-			xt_spinlock_lock(&frw->frw_state_lock);
-			frw->frw_read_waiters++;
-			frw->x.frw_rlock[thd_id] = XT_WAITING;
-			while (frw->frw_state > thd_id && frw->frw_xlocker) {
-				xt_lock_thread(thread);
-				xt_spinlock_unlock(&frw->frw_state_lock);
-				if (!xt_wait_thread(thread)) {
-					xt_unlock_thread(thread);
-					xt_spinlock_lock(&frw->frw_state_lock);
-					frw->frw_read_waiters--;
-					frw->x.frw_rlock[thd_id] = XT_NO_LOCK;
-					xt_spinlock_unlock(&frw->frw_state_lock);
-					return FAILED;
-				}
-				xt_unlock_thread(thread);
-				xt_spinlock_lock(&frw->frw_state_lock);
+	if (xsm->xsm_xlocker) {
+		/* I am waiting... */
+		xt_lock_mutex_ns(&xsm->xsm_lock);
+		xsm->xsm_wait_count++;
+		/* Wake up the xlocker: */
+		if (xsm->xsm_xlocker && xsm->xsm_wait_count == xsm->xsm_rlock_count) {
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+				xsm->xsm_wait_count--;
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
 			}
-			frw->x.frw_rlock[thd_id] = XT_HAVE_LOCK;
-			frw->frw_read_waiters--;
-			xt_spinlock_unlock(&frw->frw_state_lock);
-			return OK;
 		}
+		while (xsm->xsm_xlocker) {
+			if (!xt_timed_wait_cond_ns(&xsm->xsm_cond_2, &xsm->xsm_lock, 10000)) {
+				xsm->xsm_wait_count--;
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		xsm->xsm_wait_count--;
+		xt_unlock_mutex_ns(&xsm->xsm_lock);
 	}
-	/* There is no exclusive locker, so we have the read lock: */
-	frw->x.frw_rlock[thd_id] = XT_HAVE_LOCK;
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_add_owner(&frw->frw_lock_info);
+	xt_thread_lock_info_add_owner(&xsm->xsm_lock_info);
 #endif
-
 	return OK;
 }
 
-xtPublic xtBool xt_fastrwlock_unlock(XTFastRWLockPtr frw, struct XTThread *thread)
+xtPublic xtBool xt_xsmutex_unlock(XTXSMutexLockPtr xsm, xtThreadID thd_id)
 {
-	xtThreadID thd_id = thread->t_id;
-
-	if (frw->frw_xlocker == thread) {
-		/* I have an X lock. */
-		ASSERT_NS(frw->frw_state == xt_thr_maximum_threads);
-		frw->frw_state = 0;
-		frw->frw_xlocker = NULL;
-
-		/* Wake up all read waiters: */
-		if (frw->frw_read_waiters) {
-			xt_spinlock_lock(&frw->frw_state_lock);
-			if (frw->frw_read_waiters) {
-				XTThreadPtr	target;
-
-				for (u_int i=0; i<xt_thr_current_max_threads; i++) {
-					if (frw->x.frw_rlock[i] == XT_WAITING) {
-						if ((target = xt_thr_array[i])) {
-							xt_lock_thread(target);
-							xt_signal_thread(target);
-							xt_unlock_thread(target);
-						}
-					}
-				}
+	if (xsm->xsm_xlocker == thd_id) {
+		xsm->xsm_xlocker = 0;
+		if (xsm->xsm_wait_count) {
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond_2)) {
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
 			}
-			xt_spinlock_unlock(&frw->frw_state_lock);
 		}
-		xt_fastlock_unlock(&frw->frw_lock, thread);
+		else {
+			/* Wake up any other X or shared lockers: */
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		xt_unlock_mutex_ns(&xsm->xsm_lock);
 	}
 	else {
-		/* I have a shared lock: */
-		ASSERT_NS(frw->x.frw_rlock[thd_id] == XT_HAVE_LOCK);
-		ASSERT_NS(frw->frw_state != xt_thr_maximum_threads);
-		frw->x.frw_rlock[thd_id] = XT_NO_LOCK;
-		if (frw->frw_xlocker && frw->frw_state == thd_id) {
-			xt_spinlock_lock(&frw->frw_state_lock);
-			if (frw->frw_xlocker && frw->frw_state == thd_id) {
+		/* Taking the advice from {RACE-WR_MUTEX} I do the decrement
+		 * after I have a lock!
+		 */
+		if (xsm->xsm_xlocker) {
+			xt_lock_mutex_ns(&xsm->xsm_lock);
+			xt_atomic_dec2(&xsm->xsm_rlock_count);
+			if (xsm->xsm_xlocker && xsm->xsm_wait_count == xsm->xsm_rlock_count) {
 				/* If the X locker is waiting for me,
 				 * then allow him to continue. 
 				 */
-				frw->frw_state = thd_id+1;
-				/* Wake him up: */
-				xt_lock_thread(frw->frw_xlocker);
-				xt_signal_thread(frw->frw_xlocker);
-				xt_unlock_thread(frw->frw_xlocker);
+				if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+					xt_unlock_mutex_ns(&xsm->xsm_lock);
+					return FAILED;
+				}
 			}
-			xt_spinlock_unlock(&frw->frw_state_lock);
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
 		}
+		else
+			xt_atomic_dec2(&xsm->xsm_rlock_count);
 	}
 
 #ifdef XT_THREAD_LOCK_INFO
-	xt_thread_lock_info_release_owner(&frw->frw_lock_info);
+	xt_thread_lock_info_release_owner(&xsm->xsm_lock_info);
 #endif
-
 	return OK;
 }
 
@@ -1747,9 +1674,9 @@ xtPublic xtBool xt_fastrwlock_unlock(XTF
  */
 
 #ifdef XT_THREAD_LOCK_INFO
-xtPublic void xt_atomicrwlock_init(struct XTThread XT_UNUSED(*self), XTAtomicRWLockPtr arw, const char *n)
+xtPublic void xt_atomicrwlock_init(struct XTThread *XT_UNUSED(self), XTAtomicRWLockPtr arw, const char *n)
 #else
-xtPublic void xt_atomicrwlock_init(struct XTThread XT_UNUSED(*self), XTAtomicRWLockPtr arw)
+xtPublic void xt_atomicrwlock_init(struct XTThread *XT_UNUSED(self), XTAtomicRWLockPtr arw)
 #endif
 {
 	arw->arw_reader_count = 0;
@@ -1760,14 +1687,18 @@ xtPublic void xt_atomicrwlock_init(struc
 #endif
 }
 
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_atomicrwlock_free(struct XTThread *, XTAtomicRWLockPtr arw)
+#else
 xtPublic void xt_atomicrwlock_free(struct XTThread *, XTAtomicRWLockPtr XT_UNUSED(arw))
+#endif
 {
 #ifdef XT_THREAD_LOCK_INFO
 	xt_thread_lock_info_free(&arw->arw_lock_info);
 #endif
 }
 
-xtPublic xtBool xt_atomicrwlock_xlock(XTAtomicRWLockPtr arw, xtThreadID XT_UNUSED(thr_id))
+xtPublic xtBool xt_atomicrwlock_xlock(XTAtomicRWLockPtr arw, xtThreadID XT_NDEBUG_UNUSED(thr_id))
 {
 	register xtWord2 set;
 
@@ -1819,16 +1750,118 @@ xtPublic xtBool xt_atomicrwlock_slock(XT
 
 xtPublic xtBool xt_atomicrwlock_unlock(XTAtomicRWLockPtr arw, xtBool xlocked)
 {
-	if (xlocked)
+	if (xlocked) {
+#ifdef DEBUG
+		arw->arw_locker = 0;
+#endif
 		arw->arw_xlock_set = 0;
+	}
 	else
 		xt_atomic_dec2(&arw->arw_reader_count);
 
 #ifdef XT_THREAD_LOCK_INFO
 	xt_thread_lock_info_release_owner(&arw->arw_lock_info);
 #endif
+
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * "SKEW" ATOMIC READ/WRITE LOCK (BASED ON ATOMIC OPERATIONS)
+ *
+ * This lock type favors writers. It only works if the proportion of readers
+ * to writers is high.
+ */
+
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_skewrwlock_init(struct XTThread *XT_UNUSED(self), XTSkewRWLockPtr srw, const char *n)
+#else
+xtPublic void xt_skewrwlock_init(struct XTThread *XT_UNUSED(self), XTSkewRWLockPtr srw)
+#endif
+{
+	srw->srw_reader_count = 0;
+	srw->srw_xlock_set = 0;
+#ifdef XT_THREAD_LOCK_INFO
+	srw->srw_name = n;
+	xt_thread_lock_info_init(&srw->srw_lock_info, srw);
+#endif
+}
+
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_skewrwlock_free(struct XTThread *, XTSkewRWLockPtr srw)
+#else
+xtPublic void xt_skewrwlock_free(struct XTThread *, XTSkewRWLockPtr XT_UNUSED(srw))
+#endif
+{
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&srw->srw_lock_info);
+#endif
+}
+
+xtPublic xtBool xt_skewrwlock_xlock(XTSkewRWLockPtr srw, xtThreadID XT_NDEBUG_UNUSED(thr_id))
+{
+	register xtWord2 set;
+
+	/* First get an exclusive lock: */
+	for (;;) {
+		set = xt_atomic_tas2(&srw->srw_xlock_set, 1);
+		if (!set)
+			break;
+		xt_yield();
+	}
+
+	/* Wait for the remaining readers: */
+	while (srw->srw_reader_count)
+		xt_yield();
+
+#ifdef DEBUG
+	srw->srw_locker = thr_id;
+#endif
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+#endif
+	return OK;
+}
+
+xtPublic xtBool xt_skewrwlock_slock(XTSkewRWLockPtr srw)
+{
+	/* Wait for an exclusive lock: */
+	retry:
+	for (;;) {
+		if (!srw->srw_xlock_set)
+			break;
+		xt_yield();
+	}
+
+	/* Add a reader: */
+	xt_atomic_inc2(&srw->srw_reader_count);
+
+	/* Check for xlock again: */
+	if (srw->srw_xlock_set) {
+		xt_atomic_dec2(&srw->srw_reader_count);
+		goto retry;
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+#endif
+	return OK;
+}
+
+xtPublic xtBool xt_skewrwlock_unlock(XTSkewRWLockPtr srw, xtBool xlocked)
+{
+	if (xlocked)
+		srw->srw_xlock_set = 0;
+	else
+		xt_atomic_dec2(&srw->srw_reader_count);
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&srw->srw_lock_info);
+#endif
 #ifdef DEBUG
-	arw->arw_locker = 0;
+	srw->srw_locker = 0;
 #endif
 
 	return OK;
@@ -1844,15 +1877,17 @@ xtPublic xtBool xt_atomicrwlock_unlock(X
 #define JOB_PRINT			3
 #define JOB_INCREMENT		4
 #define JOB_SNOOZE			5
+#define JOB_DOUBLE_INC		6
 
 #define LOCK_PTHREAD_RW		1
 #define LOCK_PTHREAD_MUTEX	2
-#define LOCK_FASTRW			3
+#define LOCK_RWMUTEX		3
 #define LOCK_SPINLOCK		4
 #define LOCK_FASTLOCK		5
-#define LOCK_SPINRWLOCK		6
-#define LOCK_FASTRWLOCK		7
+#define LOCK_SPINXSLOCK		6
+#define LOCK_XSMUTEX		7
 #define LOCK_ATOMICRWLOCK	8
+#define LOCK_SKEWRWLOCK		9
 
 typedef struct XSLockTest {
 	u_int			xs_interations;
@@ -1864,18 +1899,19 @@ typedef struct XSLockTest {
 	XTSpinLockRec	xs_spinlock;
 	xt_mutex_type	xs_mutex;
 	XTFastLockRec	xs_fastlock;
-	XTSpinRWLockRec	xs_spinrwlock;
-	XTFastRWLockRec	xs_fastrwlock;
+	XTSpinXSLockRec	xs_spinrwlock;
+	XTXSMutexRec	xs_fastrwlock;
 	XTAtomicRWLockRec xs_atomicrwlock;
+	XTSkewRWLockRec xs_skewrwlock;
 	int				xs_progress;
 	xtWord4			xs_inc;
 } XSLockTestRec, *XSLockTestPtr;
 
-static void lck_free_thread_data(XTThreadPtr self __attribute__((unused)), void *data __attribute__((unused)))
+static void lck_free_thread_data(XTThreadPtr XT_UNUSED(self), void *XT_UNUSED(data))
 {
 }
 
-static void lck_do_job(XTThreadPtr self, int job, XSLockTestPtr data)
+static void lck_do_job(XTThreadPtr self, int job, XSLockTestPtr data, xtBool reader)
 {
 	char b1[2048], b2[2048];
 
@@ -1900,6 +1936,16 @@ static void lck_do_job(XTThreadPtr self,
 			xt_sleep_milli_second(10);
 			data->xs_inc++;
 			break;
+		case JOB_DOUBLE_INC:
+			if (reader) {
+				if ((data->xs_inc & 1) != 0)
+					printf("Noooo!\n");
+			}
+			else {
+				data->xs_inc++;
+				data->xs_inc++;
+			}
+			break;
 	}
 }
 
@@ -1929,29 +1975,34 @@ static void *lck_run_reader(XTThreadPtr 
 			printf("- %s %d\n", self->t_name, i+1);
 		if (data->xs_which_lock == LOCK_PTHREAD_RW) {
 			xt_slock_rwlock_ns(&data->xs_plock);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
 			xt_unlock_rwlock_ns(&data->xs_plock);
 		}
-		else if (data->xs_which_lock == LOCK_FASTRW) {
+		else if (data->xs_which_lock == LOCK_RWMUTEX) {
 			xt_rwmutex_slock(&data->xs_lock, self->t_id);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
 			xt_rwmutex_unlock(&data->xs_lock, self->t_id);
 		}
-		else if (data->xs_which_lock == LOCK_SPINRWLOCK) {
-			xt_spinrwlock_slock(&data->xs_spinrwlock, self->t_id);
-			lck_do_job(self, data->xs_which_job, data);
-			xt_spinrwlock_unlock(&data->xs_spinrwlock, self->t_id);
-		}
-		else if (data->xs_which_lock == LOCK_FASTRWLOCK) {
-			xt_fastrwlock_slock(&data->xs_fastrwlock, self);
-			lck_do_job(self, data->xs_which_job, data);
-			xt_fastrwlock_unlock(&data->xs_fastrwlock, self);
+		else if (data->xs_which_lock == LOCK_SPINXSLOCK) {
+			xt_spinxslock_slock(&data->xs_spinrwlock);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
+			xt_spinxslock_unlock(&data->xs_spinrwlock, FALSE);
+		}
+		else if (data->xs_which_lock == LOCK_XSMUTEX) {
+			xt_xsmutex_slock(&data->xs_fastrwlock, self->t_id);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
+			xt_xsmutex_unlock(&data->xs_fastrwlock, self->t_id);
 		}
 		else if (data->xs_which_lock == LOCK_ATOMICRWLOCK) {
 			xt_atomicrwlock_slock(&data->xs_atomicrwlock);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
 			xt_atomicrwlock_unlock(&data->xs_atomicrwlock, FALSE);
 		}
+		else if (data->xs_which_lock == LOCK_SKEWRWLOCK) {
+			xt_skewrwlock_slock(&data->xs_skewrwlock);
+			lck_do_job(self, data->xs_which_job, data, TRUE);
+			xt_skewrwlock_unlock(&data->xs_skewrwlock, FALSE);
+		}
 		else
 			ASSERT(FALSE);
 	}
@@ -1971,29 +2022,34 @@ static void *lck_run_writer(XTThreadPtr 
 			printf("- %s %d\n", self->t_name, i+1);
 		if (data->xs_which_lock == LOCK_PTHREAD_RW) {
 			xt_xlock_rwlock_ns(&data->xs_plock);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_unlock_rwlock_ns(&data->xs_plock);
 		}
-		else if (data->xs_which_lock == LOCK_FASTRW) {
+		else if (data->xs_which_lock == LOCK_RWMUTEX) {
 			xt_rwmutex_xlock(&data->xs_lock, self->t_id);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_rwmutex_unlock(&data->xs_lock, self->t_id);
 		}
-		else if (data->xs_which_lock == LOCK_SPINRWLOCK) {
-			xt_spinrwlock_xlock(&data->xs_spinrwlock, self->t_id);
-			lck_do_job(self, data->xs_which_job, data);
-			xt_spinrwlock_unlock(&data->xs_spinrwlock, self->t_id);
-		}
-		else if (data->xs_which_lock == LOCK_FASTRWLOCK) {
-			xt_fastrwlock_xlock(&data->xs_fastrwlock, self);
-			lck_do_job(self, data->xs_which_job, data);
-			xt_fastrwlock_unlock(&data->xs_fastrwlock, self);
+		else if (data->xs_which_lock == LOCK_SPINXSLOCK) {
+			xt_spinxslock_xlock(&data->xs_spinrwlock, self->t_id);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
+			xt_spinxslock_unlock(&data->xs_spinrwlock, TRUE);
+		}
+		else if (data->xs_which_lock == LOCK_XSMUTEX) {
+			xt_xsmutex_xlock(&data->xs_fastrwlock, self->t_id);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
+			xt_xsmutex_unlock(&data->xs_fastrwlock, self->t_id);
 		}
 		else if (data->xs_which_lock == LOCK_ATOMICRWLOCK) {
 			xt_atomicrwlock_xlock(&data->xs_atomicrwlock, self->t_id);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_atomicrwlock_unlock(&data->xs_atomicrwlock, TRUE);
 		}
+		else if (data->xs_which_lock == LOCK_SKEWRWLOCK) {
+			xt_skewrwlock_xlock(&data->xs_skewrwlock, self->t_id);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
+			xt_skewrwlock_unlock(&data->xs_skewrwlock, TRUE);
+		}
 		else
 			ASSERT(FALSE);
 	}
@@ -2011,7 +2067,7 @@ static void lck_print_test(XSLockTestRec
 		case LOCK_PTHREAD_MUTEX:
 			printf("pthread mutex");
 			break;
-		case LOCK_FASTRW:
+		case LOCK_RWMUTEX:
 			printf("fast read/write mutex");
 			break;
 		case LOCK_SPINLOCK:
@@ -2020,15 +2076,18 @@ static void lck_print_test(XSLockTestRec
 		case LOCK_FASTLOCK:
 			printf("fast mutex");
 			break;
-		case LOCK_SPINRWLOCK:
+		case LOCK_SPINXSLOCK:
 			printf("spin read/write lock");
 			break;
-		case LOCK_FASTRWLOCK:
-			printf("fast read/write lock");
+		case LOCK_XSMUTEX:
+			printf("fast x/s mutex");
 			break;
 		case LOCK_ATOMICRWLOCK:
 			printf("atomic read/write lock");
 			break;
+		case LOCK_SKEWRWLOCK:
+			printf("skew read/write lock");
+			break;
 	}
 
 	switch (data->xs_which_job) {
@@ -2063,17 +2122,17 @@ static void *lck_run_mutex_locker(XTThre
 			printf("- %s %d\n", self->t_name, i+1);
 		if (data->xs_which_lock == LOCK_PTHREAD_MUTEX) {
 			xt_lock_mutex_ns(&data->xs_mutex);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_unlock_mutex_ns(&data->xs_mutex);
 		}
 		else if (data->xs_which_lock == LOCK_SPINLOCK) {
 			xt_spinlock_lock(&data->xs_spinlock);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_spinlock_unlock(&data->xs_spinlock);
 		}
 		else if (data->xs_which_lock == LOCK_FASTLOCK) {
 			xt_fastlock_lock(&data->xs_fastlock, self);
-			lck_do_job(self, data->xs_which_job, data);
+			lck_do_job(self, data->xs_which_job, data, FALSE);
 			xt_fastlock_unlock(&data->xs_fastlock, self);
 		}
 		else
@@ -2164,15 +2223,19 @@ xtPublic void xt_unit_test_read_write_lo
 	memset(&data, 0, sizeof(data));
 
 	printf("TEST: xt_unit_test_read_write_locks\n");
+	printf("size of XTXSMutexRec = %d\n", (int) sizeof(XTXSMutexRec));
+	printf("size of pthread_cond_t = %d\n", (int) sizeof(pthread_cond_t));
+	printf("size of pthread_mutex_t = %d\n", (int) sizeof(pthread_mutex_t));
 	xt_rwmutex_init_with_autoname(self, &data.xs_lock);
 	xt_init_rwlock_with_autoname(self, &data.xs_plock);
-	xt_spinrwlock_init_with_autoname(self, &data.xs_spinrwlock);
-	xt_fastrwlock_init_with_autoname(self, &data.xs_fastrwlock);
+	xt_spinxslock_init_with_autoname(self, &data.xs_spinrwlock);
+	xt_xsmutex_init_with_autoname(self, &data.xs_fastrwlock);
 	xt_atomicrwlock_init_with_autoname(self, &data.xs_atomicrwlock);
+	xt_skewrwlock_init_with_autoname(self, &data.xs_skewrwlock);
 
 	/**
 	data.xs_interations = 10;
-	data.xs_which_lock = LOCK_FASTRW; // LOCK_PTHREAD_RW, LOCK_FASTRW, LOCK_SPINRWLOCK, LOCK_FASTRWLOCK
+	data.xs_which_lock = LOCK_RWMUTEX; // LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX
 	data.xs_which_job = JOB_PRINT;
 	data.xs_debug_print = TRUE;
 	data.xs_progress = 0;
@@ -2184,7 +2247,7 @@ xtPublic void xt_unit_test_read_write_lo
 
 	/**
 	data.xs_interations = 4000;
-	data.xs_which_lock = LOCK_FASTRW; // LOCK_PTHREAD_RW, LOCK_FASTRW, LOCK_SPINRWLOCK, LOCK_FASTRWLOCK
+	data.xs_which_lock = LOCK_RWMUTEX; // LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX
 	data.xs_which_job = JOB_SLEEP;
 	data.xs_debug_print = TRUE;
 	data.xs_progress = 200;
@@ -2194,37 +2257,52 @@ xtPublic void xt_unit_test_read_write_lo
 	lck_reader_writer_test(self, &data, 4, 2);
 	**/
 
+	// LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX, LOCK_ATOMICRWLOCK, LOCK_SKEWRWLOCK
 	/**/
-	data.xs_interations = 1000000;
-	data.xs_which_lock = LOCK_FASTRW; // LOCK_PTHREAD_RW, LOCK_FASTRW, LOCK_SPINRWLOCK, LOCK_FASTRWLOCK, LOCK_ATOMICRWLOCK
-	data.xs_which_job = JOB_INCREMENT;
+	data.xs_interations = 100000;
+	data.xs_which_lock = LOCK_XSMUTEX;
+	data.xs_which_job = JOB_DOUBLE_INC; // JOB_INCREMENT, JOB_DOUBLE_INC
 	data.xs_debug_print = FALSE;
 	data.xs_progress = 0;
 	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 0, 5);
+	//lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 10, 5);
 	/**/
 
-	/**
+	/**/
 	data.xs_interations = 10000;
-	data.xs_which_lock = LOCK_FASTRW; // LOCK_PTHREAD_RW, LOCK_FASTRW, LOCK_SPINRWLOCK, LOCK_FASTRWLOCK
+	data.xs_which_lock = LOCK_XSMUTEX;
 	data.xs_which_job = JOB_MEMCPY;
 	data.xs_debug_print = FALSE;
 	data.xs_progress = 0;
-	lck_reader_writer_test(self, &data, 10, 5);
-	**/
+	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 0, 5);
+	//lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 10, 5);
+	/**/
 
-	/**
+	/**/
 	data.xs_interations = 1000;
-	data.xs_which_lock = LOCK_FASTRW; // LOCK_PTHREAD_RW, LOCK_FASTRW, LOCK_SPINRWLOCK, LOCK_FASTRWLOCK
-	data.xs_which_job = JOB_SLEEP;
+	data.xs_which_lock = LOCK_XSMUTEX;
+	data.xs_which_job = JOB_SLEEP; // JOB_SLEEP, JOB_SNOOZE
 	data.xs_debug_print = FALSE;
 	data.xs_progress = 0;
-	lck_reader_writer_test(self, &data, 10, 5);
-	**/
+	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	/**/
 
 	xt_rwmutex_free(self, &data.xs_lock);
 	xt_free_rwlock(&data.xs_plock);
-	xt_spinrwlock_free(self, &data.xs_spinrwlock);
-	xt_fastrwlock_free(self, &data.xs_fastrwlock);
+	xt_spinxslock_free(self, &data.xs_spinrwlock);
+	xt_xsmutex_free(self, &data.xs_fastrwlock);
+	xt_atomicrwlock_free(self, &data.xs_atomicrwlock);
+	xt_skewrwlock_free(self, &data.xs_skewrwlock);
 }
 
 xtPublic void xt_unit_test_mutex_locks(XTThreadPtr self)

=== modified file 'storage/pbxt/src/lock_xt.h'
--- a/storage/pbxt/src/lock_xt.h	2009-05-09 04:01:53 +0000
+++ b/storage/pbxt/src/lock_xt.h	2009-08-17 11:12:36 +0000
@@ -36,96 +36,16 @@ struct XTOpenTable;
 struct XTXactData;
 struct XTTable;
 
-/* Possibilities are 2 = align 4 or 2 = align 8 */
-#define XT_XS_LOCK_SHIFT		2
-#define XT_XS_LOCK_ALIGN		(1 << XT_XS_LOCK_SHIFT)
-
-/* This lock is fast for reads but slow for writes.
- * Use this lock in situations where you have 99% reads,
- * and then some potentially long writes.
- */
-typedef struct XTRWMutex {
-#ifdef DEBUG
-	struct XTThread				*xs_lock_thread;
-	u_int						xs_inited;
-#endif
-#ifdef XT_THREAD_LOCK_INFO
-	XTThreadLockInfoRec			xs_lock_info;
-	const char				    *xs_name;
-#endif
-	xt_mutex_type				xs_lock;
-	xt_cond_type				xs_cond;
-	volatile xtWord4			xs_state;
-	volatile xtThreadID			xs_xlocker;
-	union {
-#if XT_XS_LOCK_ALIGN == 4
-		volatile xtWord4		*xs_rlock_align;
-#else
-		volatile  xtWord8		*xs_rlock_align;
-#endif
-		volatile  xtWord1		*xs_rlock;
-	}							x;
-} XTRWMutexRec, *XTRWMutexPtr;
-
-#ifdef XT_THREAD_LOCK_INFO
-#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
-void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl, const char *name);
-#else
-#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b)
-void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl);
-#endif
-void xt_rwmutex_free(struct XTThread *self, XTRWMutexPtr xsl);
-xtBool xt_rwmutex_xlock(XTRWMutexPtr xsl, xtThreadID thd_id);
-xtBool xt_rwmutex_slock(XTRWMutexPtr xsl, xtThreadID thd_id);
-xtBool xt_rwmutex_unlock(XTRWMutexPtr xsl, xtThreadID thd_id);
-
-#ifdef XT_WIN
-#define XT_SPL_WIN32_ASM
-#else
-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
-#define XT_SPL_GNUC_X86
-#else
-#define XT_SPL_DEFAULT
-#endif
-#endif
-
-#ifdef XT_SOLARIS
-/* Use Sun atomic operations library
- * http://docs.sun.com/app/docs/doc/816-5168/atomic-ops-3c?a=view
- */
-#define XT_SPL_SOLARIS_LIB
-#endif
-
-#ifdef XT_SPL_SOLARIS_LIB
+#ifdef XT_ATOMIC_SOLARIS_LIB
 #include <atomic.h>
 #endif
 
-typedef struct XTSpinLock {
-	volatile xtWord4			spl_lock;
-#ifdef XT_SPL_DEFAULT
-	xt_mutex_type				spl_mutex;
-#endif
-#ifdef DEBUG
-	struct XTThread				*spl_locker;
-#endif
-#ifdef XT_THREAD_LOCK_INFO
-	XTThreadLockInfoRec			spl_lock_info;
-	const char				    *spl_name;
-#endif
-} XTSpinLockRec, *XTSpinLockPtr;
+void xt_log_atomic_error_and_abort(c_char *func, c_char *file, u_int line);
 
-#ifdef XT_THREAD_LOCK_INFO
-#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
-void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp, const char *name);
-#else
-#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b)
-void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp);
-#endif
-void	xt_spinlock_free(struct XTThread *self, XTSpinLockPtr sp);
-xtBool	xt_spinlock_spin(XTSpinLockPtr spl);
-#ifdef DEBUG
-void	xt_spinlock_set_thread(XTSpinLockPtr spl);
-#endif
+/*
+ * -----------------------------------------------------------------------
+ * ATOMIC OPERATIONS
+ */
 
 /*
  * This macro is to remind me where it was safe
@@ -137,37 +57,38 @@ void	xt_spinlock_set_thread(XTSpinLockPt
  * is written atomically.
  * But the operations themselves are not atomic!
  */
-inline void xt_flushed_inc1(volatile xtWord1 *mptr)
+inline void xt_atomic_inc1(volatile xtWord1 *mptr)
 {
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, mptr
 	__asm MOV  DL, BYTE PTR [ECX]
 	__asm INC  DL
 	__asm XCHG DL, BYTE PTR [ECX]
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	xtWord1 val;
 
 	asm volatile ("movb %1,%0" : "=r" (val) : "m" (*mptr) : "memory");
 	val++;
 	asm volatile ("xchgb %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	atomic_inc_8(mptr);
 #else
 	*mptr++;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 }
 
-inline xtWord1 xt_flushed_dec1(volatile xtWord1 *mptr)
+inline xtWord1 xt_atomic_dec1(volatile xtWord1 *mptr)
 {
 	xtWord1 val;
 
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, mptr
 	__asm MOV  DL, BYTE PTR [ECX]
 	__asm DEC  DL
 	__asm MOV  val, DL
 	__asm XCHG DL, BYTE PTR [ECX]
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	xtWord1 val2;
 
 	asm volatile ("movb %1, %0" : "=r" (val) : "m" (*mptr) : "memory");
@@ -176,55 +97,58 @@ inline xtWord1 xt_flushed_dec1(volatile 
 	/* Should work, but compiler makes a mistake?
 	 * asm volatile ("xchgb %1, %0" : : "r" (val), "m" (*mptr) : "memory");
 	 */
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	val = atomic_dec_8_nv(mptr);
 #else
 	val = --(*mptr);
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 	return val;
 }
 
 inline void xt_atomic_inc2(volatile xtWord2 *mptr)
 {
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm LOCK INC	WORD PTR mptr
-#elif defined(XT_SPL_GNUC_X86)
-        asm volatile ("lock; incw %0" : : "m" (*mptr) : "memory");
-#elif defined(__GNUC__)
+#elif defined(XT_ATOMIC_GNUC_X86)
+	asm volatile ("lock; incw %0" : : "m" (*mptr) : "memory");
+#elif defined(XT_ATOMIC_GCC_OPS)
 	__sync_fetch_and_add(mptr, 1);
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	atomic_inc_16_nv(mptr);
 #else
 	(*mptr)++;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 }
 
 inline void xt_atomic_dec2(volatile xtWord2 *mptr)
 {
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm LOCK DEC	WORD PTR mptr
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	asm volatile ("lock; decw %0" : : "m" (*mptr) : "memory");
-#elif defined(__GNUC__)
+#elif defined(XT_ATOMIC_GCC_OPS)
 	__sync_fetch_and_sub(mptr, 1);
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	atomic_dec_16_nv(mptr);
 #else
 	--(*mptr);
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 }
 
 /* Atomic test and set 2 byte word! */
 inline xtWord2 xt_atomic_tas2(volatile xtWord2 *mptr, xtWord2 val)
 {
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, mptr
 	__asm MOV  DX, val
 	__asm XCHG DX, WORD PTR [ECX]
 	__asm MOV  val, DX
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	asm volatile ("xchgw %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	val = atomic_swap_16(mptr, val);
 #else
 	/* Yikes! */
@@ -232,43 +156,80 @@ inline xtWord2 xt_atomic_tas2(volatile x
 
 	val = *mptr;
 	*mptr = nval;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 	return val;
 }
 
 inline void xt_atomic_set4(volatile xtWord4 *mptr, xtWord4 val)
 {
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, mptr
 	__asm MOV  EDX, val
 	__asm XCHG EDX, DWORD PTR [ECX]
 	//__asm MOV  DWORD PTR [ECX], EDX
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	asm volatile ("xchgl %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
 	//asm volatile ("movl %0,%1" : "=r" (val) : "m" (*mptr) : "memory");
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	atomic_swap_32(mptr, val);
 #else
 	*mptr = val;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 }
 
-inline xtWord4 xt_atomic_get4(volatile xtWord4 *mptr)
-{
-	xtWord4 val;
-
-#ifdef XT_SPL_WIN32_ASM
-	__asm MOV ECX, mptr
-	__asm MOV EDX, DWORD PTR [ECX]
-	__asm MOV val, EDX
-#elif defined(XT_SPL_GNUC_X86)
-	asm volatile ("movl %1,%0" : "=r" (val) : "m" (*mptr) : "memory");
+inline xtWord4 xt_atomic_tas4(volatile xtWord4 *mptr, xtWord4 val)
+{				
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  EDX, val
+	__asm XCHG EDX, DWORD PTR [ECX]
+	__asm MOV  val, EDX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	val = val;
+	asm volatile ("xchgl %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	val = atomic_swap_32(mptr, val);
 #else
-	val = *mptr;
+	*mptr = val;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
 #endif
 	return val;
 }
 
+/*
+ * -----------------------------------------------------------------------
+ * DIFFERENT TYPES OF LOCKS
+ */
+
+typedef struct XTSpinLock {
+	volatile xtWord4			spl_lock;
+#ifdef XT_NO_ATOMICS
+	xt_mutex_type				spl_mutex;
+#endif
+#ifdef DEBUG
+	struct XTThread				*spl_locker;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			spl_lock_info;
+	const char				    *spl_name;
+#endif
+} XTSpinLockRec, *XTSpinLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp, const char *name);
+#else
+#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b)
+void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp);
+#endif
+void	xt_spinlock_free(struct XTThread *self, XTSpinLockPtr sp);
+xtBool	xt_spinlock_spin(XTSpinLockPtr spl);
+#ifdef DEBUG
+void	xt_spinlock_set_thread(XTSpinLockPtr spl);
+#endif
+
 /* Code for test and set is derived from code by Larry Zhou and
  * Google: http://code.google.com/p/google-perftools
  */
@@ -278,15 +239,15 @@ inline xtWord4 xt_spinlock_set(XTSpinLoc
 	volatile xtWord4	*lck;
 				
 	lck = &spl->spl_lock;
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, lck
 	__asm MOV  EDX, 1
 	__asm XCHG EDX, DWORD PTR [ECX]
 	__asm MOV  prv, EDX
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	prv = 1;
 	asm volatile ("xchgl %1,%0" : "=r" (prv) : "m" (*lck), "0" (prv) : "memory");
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	prv = atomic_swap_32(lck, 1);
 #else
 	/* The default implementation just uses a mutex, and
@@ -312,15 +273,15 @@ inline xtWord4 xt_spinlock_reset(XTSpinL
 	spl->spl_locker = NULL;
 #endif
 	lck = &spl->spl_lock;
-#ifdef XT_SPL_WIN32_ASM
+#ifdef XT_ATOMIC_WIN32_X86
 	__asm MOV  ECX, lck
 	__asm MOV  EDX, 0
 	__asm XCHG EDX, DWORD PTR [ECX]
 	__asm MOV  prv, EDX
-#elif defined(XT_SPL_GNUC_X86)
+#elif defined(XT_ATOMIC_GNUC_X86)
 	prv = 0;
 	asm volatile ("xchgl %1,%0" : "=r" (prv) : "m" (*lck), "0" (prv) : "memory");
-#elif defined(XT_SPL_SOLARIS_LIB)
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
 	prv = atomic_swap_32(lck, 0);
 #else
 	*lck = 0;
@@ -359,9 +320,48 @@ inline void xt_spinlock_unlock(XTSpinLoc
 #endif
 }
 
-void xt_unit_test_read_write_locks(struct XTThread *self);
-void xt_unit_test_mutex_locks(struct XTThread *self);
-void xt_unit_test_create_threads(struct XTThread *self);
+/* Possibilities are 2 = align 4 or 3 = align 8 */
+#define XT_XS_LOCK_SHIFT		2
+#define XT_XS_LOCK_ALIGN		(1 << XT_XS_LOCK_SHIFT)
+
+/* This lock is fast for reads but slow for writes.
+ * Use this lock in situations where you have 99% reads,
+ * and then some potentially long writes.
+ */
+typedef struct XTRWMutex {
+#ifdef DEBUG
+	struct XTThread				*xs_lock_thread;
+	u_int						xs_inited;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			xs_lock_info;
+	const char				    *xs_name;
+#endif
+	xt_mutex_type				xs_lock;
+	xt_cond_type				xs_cond;
+	volatile xtWord4			xs_state;
+	volatile xtThreadID			xs_xlocker;
+	union {
+#if XT_XS_LOCK_ALIGN == 4
+		volatile xtWord4		*xs_rlock_align;
+#else
+		volatile  xtWord8		*xs_rlock_align;
+#endif
+		volatile  xtWord1		*xs_rlock;
+	}							x;
+} XTRWMutexRec, *XTRWMutexPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl, const char *name);
+#else
+#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b)
+void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl);
+#endif
+void xt_rwmutex_free(struct XTThread *self, XTRWMutexPtr xsl);
+xtBool xt_rwmutex_xlock(XTRWMutexPtr xsl, xtThreadID thd_id);
+xtBool xt_rwmutex_slock(XTRWMutexPtr xsl, xtThreadID thd_id);
+xtBool xt_rwmutex_unlock(XTRWMutexPtr xsl, xtThreadID thd_id);
 
 #define XT_FAST_LOCK_MAX_WAIT	100
 
@@ -410,7 +410,7 @@ inline xtBool xt_fastlock_lock(XTFastLoc
 #endif
 }
 
-inline void xt_fastlock_unlock(XTFastLockPtr fal, struct XTThread *thread __attribute__((unused)))
+inline void xt_fastlock_unlock(XTFastLockPtr fal, struct XTThread *XT_UNUSED(thread))
 {
 	if (fal->fal_wait_count)
 		xt_fastlock_wakeup(fal);
@@ -423,73 +423,61 @@ inline void xt_fastlock_unlock(XTFastLoc
 #endif
 }
 
-typedef struct XTSpinRWLock {
-	XTSpinLockRec				srw_lock;
-	volatile xtThreadID			srw_xlocker;
-	XTSpinLockRec				srw_state_lock;
-	volatile u_int				srw_state;
-	union {
-#if XT_XS_LOCK_ALIGN == 4
-		volatile xtWord4		*srw_rlock_align;
-#else
-		volatile  xtWord8		*srw_rlock_align;
-#endif
-		volatile  xtWord1		*srw_rlock;
-	} x;
+#define XT_SXS_SLOCK_COUNT		2
 
+typedef struct XTSpinXSLock {
+	volatile xtWord2			sxs_xlocked;
+	volatile xtWord2			sxs_rlock_count;
+	volatile xtWord2			sxs_wait_count;			/* The number of readers waiting for the xlocker. */
+#ifdef DEBUG
+	xtThreadID					sxs_locker;
+#endif
 #ifdef XT_THREAD_LOCK_INFO
-	XTThreadLockInfoRec			srw_lock_info;
-	const char				    *srw_name;
+	XTThreadLockInfoRec			sxs_lock_info;
+	const char				    *sxs_name;
 #endif
-
-} XTSpinRWLockRec, *XTSpinRWLockPtr;
+} XTSpinXSLockRec, *XTSpinXSLockPtr;
 
 #ifdef XT_THREAD_LOCK_INFO
-#define xt_spinrwlock_init_with_autoname(a,b) xt_spinrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
-void xt_spinrwlock_init(struct XTThread *self, XTSpinRWLockPtr xsl, const char *name);
-#else
-#define xt_spinrwlock_init_with_autoname(a,b) xt_spinrwlock_init(a,b)
-void xt_spinrwlock_init(struct XTThread *self, XTSpinRWLockPtr xsl);
-#endif
-void xt_spinrwlock_free(struct XTThread *self, XTSpinRWLockPtr xsl);
-xtBool xt_spinrwlock_xlock(XTSpinRWLockPtr xsl, xtThreadID thd_id);
-xtBool xt_spinrwlock_slock(XTSpinRWLockPtr xsl, xtThreadID thd_id);
-xtBool xt_spinrwlock_unlock(XTSpinRWLockPtr xsl, xtThreadID thd_id);
-
-typedef struct XTFastRWLock {
-	XTFastLockRec				frw_lock;
-	struct XTThread				*frw_xlocker;
-	XTSpinLockRec				frw_state_lock;
-	volatile u_int				frw_state;
-	u_int						frw_read_waiters;
-	union {
-#if XT_XS_LOCK_ALIGN == 4
-		volatile xtWord4		*frw_rlock_align;
+#define xt_spinxslock_init_with_autoname(a,b) xt_spinxslock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_spinxslock_init(struct XTThread *self, XTSpinXSLockPtr sxs, const char *name);
 #else
-		volatile  xtWord8		*frw_rlock_align;
+#define xt_spinxslock_init_with_autoname(a,b) xt_spinxslock_init(a,b)
+void xt_spinxslock_init(struct XTThread *self, XTSpinXSLockPtr sxs);
 #endif
-		volatile  xtWord1		*frw_rlock;
-	} x;
+void xt_spinxslock_free(struct XTThread *self, XTSpinXSLockPtr sxs);
+xtBool xt_spinxslock_xlock(XTSpinXSLockPtr sxs, xtThreadID thd_id);
+xtBool xt_spinxslock_slock(XTSpinXSLockPtr sxs);
+xtBool xt_spinxslock_unlock(XTSpinXSLockPtr sxs, xtBool xlocked);
 
+typedef struct XTXSMutexLock {
+	xt_mutex_type				xsm_lock;
+	xt_cond_type				xsm_cond;
+	xt_cond_type				xsm_cond_2;
+	volatile xtThreadID			xsm_xlocker;
+	volatile xtWord2			xsm_rlock_count;
+	volatile xtWord2			xsm_wait_count;			/* The number of readers waiting for the xlocker. */
+#ifdef DEBUG
+	xtThreadID					xsm_locker;
+#endif
 #ifdef XT_THREAD_LOCK_INFO
-	XTThreadLockInfoRec			frw_lock_info;
-	const char				    *frw_name;
+	XTThreadLockInfoRec			xsm_lock_info;
+	const char				    *xsm_name;
 #endif
-
-} XTFastRWLockRec, *XTFastRWLockPtr;
+} XTXSMutexRec, *XTXSMutexLockPtr;
 
 #ifdef XT_THREAD_LOCK_INFO
-#define xt_fastrwlock_init_with_autoname(a,b) xt_fastrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
-void xt_fastrwlock_init(struct XTThread *self, XTFastRWLockPtr frw, const char *name);
+#define xt_xsmutex_init_with_autoname(a,b) xt_xsmutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm, const char *name);
 #else
-#define xt_fastrwlock_init_with_autoname(a,b) xt_fastrwlock_init(a,b)
-void xt_fastrwlock_init(struct XTThread *self, XTFastRWLockPtr frw);
+#define xt_xsmutex_init_with_autoname(a,b) xt_xsmutex_init(a,b)
+void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm);
 #endif
 
-void xt_fastrwlock_free(struct XTThread *self, XTFastRWLockPtr frw);
-xtBool xt_fastrwlock_xlock(XTFastRWLockPtr frw, struct XTThread *thread);
-xtBool xt_fastrwlock_slock(XTFastRWLockPtr frw, struct XTThread *thread);
-xtBool xt_fastrwlock_unlock(XTFastRWLockPtr frw, struct XTThread *thread);
+void xt_xsmutex_free(struct XTThread *self, XTXSMutexLockPtr xsm);
+xtBool xt_xsmutex_xlock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
+xtBool xt_xsmutex_slock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
+xtBool xt_xsmutex_unlock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
 
 typedef struct XTAtomicRWLock {
 	volatile xtWord2			arw_reader_count;
@@ -516,6 +504,35 @@ xtBool xt_atomicrwlock_xlock(XTAtomicRWL
 xtBool xt_atomicrwlock_slock(XTAtomicRWLockPtr xsl);
 xtBool xt_atomicrwlock_unlock(XTAtomicRWLockPtr xsl, xtBool xlocked);
 
+typedef struct XTSkewRWLock {
+	volatile xtWord2			srw_reader_count;
+	volatile xtWord2			srw_xlock_set;
+
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			srw_lock_info;
+	const char				    *srw_name;
+#endif
+#ifdef DEBUG
+	xtThreadID					srw_locker;
+#endif
+} XTSkewRWLockRec, *XTSkewRWLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_skewrwlock_init_with_autoname(a,b) xt_skewrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_skewrwlock_init(struct XTThread *self, XTSkewRWLockPtr xsl, const char *name);
+#else
+#define xt_skewrwlock_init_with_autoname(a,b) xt_skewrwlock_init(a,b)
+void xt_skewrwlock_init(struct XTThread *self, XTSkewRWLockPtr xsl);
+#endif
+void xt_skewrwlock_free(struct XTThread *self, XTSkewRWLockPtr xsl);
+xtBool xt_skewrwlock_xlock(XTSkewRWLockPtr xsl, xtThreadID thr_id);
+xtBool xt_skewrwlock_slock(XTSkewRWLockPtr xsl);
+xtBool xt_skewrwlock_unlock(XTSkewRWLockPtr xsl, xtBool xlocked);
+
+void xt_unit_test_read_write_locks(struct XTThread *self);
+void xt_unit_test_mutex_locks(struct XTThread *self);
+void xt_unit_test_create_threads(struct XTThread *self);
+
 /*
  * -----------------------------------------------------------------------
  * ROW LOCKS

=== modified file 'storage/pbxt/src/locklist_xt.cc'
--- a/storage/pbxt/src/locklist_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/locklist_xt.cc	2009-08-17 11:12:36 +0000
@@ -59,13 +59,13 @@ void xt_thread_lock_info_init(XTThreadLo
 	ptr->li_lock_type = XTThreadLockInfo::RW_LOCK;
 }
 
-void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTFastRWLock *lock)
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTXSMutexLock *lock)
 {
 	ptr->li_fast_rwlock = lock;
 	ptr->li_lock_type   = XTThreadLockInfo::FAST_RW_LOCK;
 }
 
-void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinRWLock *lock)
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinXSLock *lock)
 {
 	ptr->li_spin_rwlock = lock;
 	ptr->li_lock_type   = XTThreadLockInfo::SPIN_RW_LOCK;
@@ -77,6 +77,12 @@ void xt_thread_lock_info_init(XTThreadLo
 	ptr->li_lock_type   = XTThreadLockInfo::ATOMIC_RW_LOCK;
 }
 
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSkewRWLock *lock)
+{
+	ptr->li_skew_rwlock = lock;
+	ptr->li_lock_type   = XTThreadLockInfo::SKEW_RW_LOCK;
+}
+
 void xt_thread_lock_info_free(XTThreadLockInfoPtr ptr)
 {
 	/* TODO: check to see if it's present in a thread's list */
@@ -163,12 +169,12 @@ void xt_trace_thread_locks(XTThread *sel
 				lock_name = li->li_fast_lock->fal_name;
 				break;
 			case XTThreadLockInfo::FAST_RW_LOCK:
-				lock_type = "XTFastRWLock";
-				lock_name = li->li_fast_rwlock->frw_name;
+				lock_type = "XTXSMutexLock";
+				lock_name = li->li_fast_rwlock->xsm_name;
 				break;
 			case XTThreadLockInfo::SPIN_RW_LOCK:
 				lock_type = "XTSpinRWLock";
-				lock_name = li->li_spin_rwlock->srw_name;
+				lock_name = li->li_spin_rwlock->sxs_name;
 				break;
 			case XTThreadLockInfo::ATOMIC_RW_LOCK:
 				lock_type = "XTAtomicRWLock";

=== modified file 'storage/pbxt/src/locklist_xt.h'
--- a/storage/pbxt/src/locklist_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/locklist_xt.h	2009-08-17 11:12:36 +0000
@@ -25,7 +25,7 @@
 #define __xt_locklist_h__
 
 #ifdef DEBUG
-//#define XT_THREAD_LOCK_INFO
+#define XT_THREAD_LOCK_INFO
 #ifndef XT_WIN
 /* We need DEBUG_LOCKING in order to enable pthread function wrappers */
 #define DEBUG_LOCKING
@@ -40,9 +40,10 @@ struct XTRWMutex;
 struct xt_mutex_struct;
 struct xt_rwlock_struct;
 struct XTFastLock;
-struct XTFastRWLock;
-struct XTSpinRWLock;
+struct XTXSMutexLock;
+struct XTSpinXSLock;
 struct XTAtomicRWLock;
+struct XTSkewRWLock;
 
 #ifdef XT_THREAD_LOCK_INFO
 
@@ -61,7 +62,7 @@ struct XTAtomicRWLock;
  */
 typedef struct XTThreadLockInfo {
 
-	enum LockType { SPIN_LOCK, RW_MUTEX, MUTEX, RW_LOCK, FAST_LOCK, FAST_RW_LOCK, SPIN_RW_LOCK, ATOMIC_RW_LOCK };
+	enum LockType { SPIN_LOCK, RW_MUTEX, MUTEX, RW_LOCK, FAST_LOCK, FAST_RW_LOCK, SPIN_RW_LOCK, ATOMIC_RW_LOCK, SKEW_RW_LOCK };
 
 	LockType		  li_lock_type;
 
@@ -69,11 +70,12 @@ typedef struct XTThreadLockInfo {
 		XTSpinLock       *li_spin_lock;	  // SPIN_LOCK
 		XTRWMutex        *li_rw_mutex;	  // RW_MUTEX
 		XTFastLock		 *li_fast_lock;   // FAST_LOCK
-		XTFastRWLock	 *li_fast_rwlock; // FAST_RW_LOCK
-		XTSpinRWLock	 *li_spin_rwlock; // SPIN_RW_LOCK
+		XTXSMutexLock	 *li_fast_rwlock; // FAST_RW_LOCK
+		XTSpinXSLock	 *li_spin_rwlock; // SPIN_RW_LOCK
 		XTAtomicRWLock	 *li_atomic_rwlock; // ATOMIC_RW_LOCK
 		xt_mutex_struct  *li_mutex;		  // MUTEX
 		xt_rwlock_struct *li_rwlock;	  // RW_LOCK
+		XTSkewRWLock	 *li_skew_rwlock;	// SKEW_RW_LOCK
 	};
 } 
 XTThreadLockInfoRec, *XTThreadLockInfoPtr;
@@ -81,11 +83,12 @@ XTThreadLockInfoRec, *XTThreadLockInfoPt
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinLock *lock);
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTRWMutex *lock);
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTFastLock *lock);
-void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTFastRWLock *lock);
-void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinRWLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTXSMutexLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinXSLock *lock);
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTAtomicRWLock *lock);
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_mutex_struct *lock);
 void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_rwlock_struct *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSkewRWLock *lock);
 void xt_thread_lock_info_free(XTThreadLockInfoPtr ptr);
 
 void xt_thread_lock_info_add_owner (XTThreadLockInfoPtr ptr);

=== modified file 'storage/pbxt/src/memory_xt.cc'
--- a/storage/pbxt/src/memory_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/memory_xt.cc	2009-08-17 11:12:36 +0000
@@ -117,7 +117,7 @@ xtPublic xtBool	xt_realloc(XTThreadPtr s
 	return OK;
 }
 
-xtPublic void xt_free(XTThreadPtr self __attribute__((unused)), void *ptr)
+xtPublic void xt_free(XTThreadPtr XT_UNUSED(self), void *ptr)
 {
 	free(ptr);
 }
@@ -186,7 +186,7 @@ xtPublic void xt_free_ns(void *ptr)
 	free(ptr);
 }
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 
 /*
  * -----------------------------------------------------------------------
@@ -678,7 +678,7 @@ void xt_mm_memset(void *block, void *des
 	memset(dest, value, size);
 }
 
-void *xt_mm_malloc(XTThreadPtr self, size_t size, u_int line __attribute__((unused)), c_char *file __attribute__((unused)))
+void *xt_mm_malloc(XTThreadPtr self, size_t size, u_int line, c_char *file)
 {
 	unsigned char *p;
 
@@ -695,6 +695,8 @@ void *xt_mm_malloc(XTThreadPtr self, siz
 	*(p + size + MEM_DEBUG_HDR_SIZE) = MEM_TRAILER_BYTE;
 	*(p + size + MEM_DEBUG_HDR_SIZE + 1L) = MEM_TRAILER_BYTE;
 
+	(void) line;
+	(void) file;
 #ifdef RECORD_MM
 	xt_lock_mutex(self, &mm_mutex);
 	mm_add_core_ptr(self, p + MEM_DEBUG_HDR_SIZE, 0, line, file);
@@ -704,7 +706,7 @@ void *xt_mm_malloc(XTThreadPtr self, siz
 	return p + MEM_DEBUG_HDR_SIZE;
 }
 
-void *xt_mm_calloc(XTThreadPtr self, size_t size, u_int line __attribute__((unused)), c_char *file __attribute__((unused)))
+void *xt_mm_calloc(XTThreadPtr self, size_t size, u_int line, c_char *file)
 {
 	unsigned char *p;
 	
@@ -719,6 +721,8 @@ void *xt_mm_calloc(XTThreadPtr self, siz
 	*(p + size + MEM_DEBUG_HDR_SIZE) = MEM_TRAILER_BYTE;
 	*(p + size + MEM_DEBUG_HDR_SIZE + 1L) = MEM_TRAILER_BYTE;
 
+	(void) line;
+	(void) file;
 #ifdef RECORD_MM
 	xt_lock_mutex(self, &mm_mutex);
 	mm_add_core_ptr(self, p + MEM_DEBUG_HDR_SIZE, 0, line, file);
@@ -849,7 +853,7 @@ void xt_mm_check_ptr(XTThreadPtr self, v
 
 xtPublic xtBool xt_init_memory(void)
 {
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	XTThreadPtr	self = NULL;
 
 	if (!xt_init_mutex_with_autoname(NULL, &mm_mutex))
@@ -875,7 +879,7 @@ xtPublic void debug_ik_sum(void);
 
 xtPublic void xt_exit_memory(void)
 {
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	long	mm;
 	int		i;
 
@@ -919,7 +923,7 @@ xtPublic void xt_exit_memory(void)
  * MEMORY ALLOCATION UTILITIES
  */
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 char	*xt_mm_dup_string(XTThreadPtr self, c_char *str, u_int line, c_char *file)
 #else
 char	*xt_dup_string(XTThreadPtr self, c_char *str)
@@ -931,7 +935,7 @@ char	*xt_dup_string(XTThreadPtr self, c_
 	if (!str)
 		return NULL;
 	len = strlen(str);
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 	new_str = (char *) xt_mm_malloc(self, len + 1, line, file);
 #else
 	new_str = (char *) xt_malloc(self, len + 1);
@@ -1020,7 +1024,7 @@ xtPublic xtBool	xt_realloc(XTThreadPtr s
 	return *ptr != NULL;
 }
 
-xtPublic void xt_free(XTThreadPtr self __attribute__((unused)), void *ptr)
+xtPublic void xt_free(XTThreadPtr XT_UNUSED(self), void *ptr)
 {
 	char	*old_ptr;
 	xtWord4 size;

=== modified file 'storage/pbxt/src/memory_xt.h'
--- a/storage/pbxt/src/memory_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/memory_xt.h	2009-08-17 11:12:36 +0000
@@ -30,6 +30,10 @@
 struct XTThread;
 
 #ifdef DEBUG
+#define DEBUG_MEMORY
+#endif
+
+#ifdef DEBUG_MEMORY
 
 #define XT_MM_STACK_TRACE	200
 #define XT_MM_TRACE_DEPTH	4
@@ -109,7 +113,7 @@ void	xt_free_ns(void *ptr);
 
 #endif
 
-#ifdef DEBUG
+#ifdef DEBUG_MEMORY
 #define xt_dup_string(t, s)		xt_mm_dup_string(t, s, __LINE__, __FILE__)
 
 char	*xt_mm_dup_string(struct XTThread *self, const char *path, u_int line, const char *file);

=== modified file 'storage/pbxt/src/myxt_xt.cc'
--- a/storage/pbxt/src/myxt_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/myxt_xt.cc	2009-08-18 07:46:53 +0000
@@ -52,12 +52,12 @@ extern pthread_key_t THR_Session;
 #include "myxt_xt.h"
 #include "strutil_xt.h"
 #include "database_xt.h"
-#ifdef XT_STREAMING
-#include "streaming_xt.h"
-#endif
 #include "cache_xt.h"
 #include "datalog_xt.h"
 
+static void		myxt_bitmap_init(XTThreadPtr self, MX_BITMAP *map, u_int n_bits);
+static void		myxt_bitmap_free(XTThreadPtr self, MX_BITMAP *map);
+
 #ifdef DRIZZLED
 #define swap_variables(TYPE, a, b) \
   do {                             \
@@ -143,7 +143,7 @@ static void my_store_blob_length(byte *p
 
 static int my_compare_text(MX_CONST_CHARSET_INFO *charset_info, uchar *a, uint a_length,
 				uchar *b, uint b_length, my_bool part_key,
-				my_bool skip_end_space __attribute__((unused)))
+				my_bool XT_UNUSED(skip_end_space))
 {
 	if (!part_key)
 		/* The last parameter is diff_if_only_endspace_difference, which means
@@ -632,7 +632,6 @@ static char *mx_get_length_and_data(Fiel
 		case DRIZZLE_TYPE_DATE:
 		case DRIZZLE_TYPE_NEWDECIMAL:
 		case DRIZZLE_TYPE_ENUM:
-		case DRIZZLE_TYPE_VIRTUAL:
 #endif
 			break;
 	}
@@ -751,7 +750,6 @@ static void mx_set_length_and_data(Field
 		case DRIZZLE_TYPE_DATE:
 		case DRIZZLE_TYPE_NEWDECIMAL:
 		case DRIZZLE_TYPE_ENUM:
-		case DRIZZLE_TYPE_VIRTUAL:
 #endif
 			break;
 	}
@@ -764,7 +762,7 @@ static void mx_set_length_and_data(Field
 		bzero(from, field->pack_length());
 }
 
-xtPublic void myxt_set_null_row_from_key(XTOpenTablePtr ot __attribute__((unused)), XTIndexPtr ind, xtWord1 *record)
+xtPublic void myxt_set_null_row_from_key(XTOpenTablePtr XT_UNUSED(ot), XTIndexPtr ind, xtWord1 *record)
 {
 	register XTIndexSegRec *keyseg = ind->mi_seg;
 
@@ -800,7 +798,7 @@ xtPublic void myxt_set_default_row_from_
 }
 
 /* Derived from _mi_put_key_in_record */
-xtPublic xtBool myxt_create_row_from_key(XTOpenTablePtr ot __attribute__((unused)), XTIndexPtr ind, xtWord1 *b_value, u_int key_len, xtWord1 *dest_buff)
+xtPublic xtBool myxt_create_row_from_key(XTOpenTablePtr XT_UNUSED(ot), XTIndexPtr ind, xtWord1 *b_value, u_int key_len, xtWord1 *dest_buff)
 {
 	byte					*record = (byte *) dest_buff;
 	register byte			*key;
@@ -935,8 +933,8 @@ xtPublic xtBool myxt_create_row_from_key
 
 #ifdef CHECK_KEYS
 	err:
-#endif
 	return FAILED;				/* Crashed row */
+#endif
 }
 
 /*
@@ -1715,7 +1713,7 @@ xtPublic void myxt_get_column_as_string(
 
 		/* Required by store() - or an assertion will fail: */
 		if (table->read_set)
-			bitmap_set_bit(table->read_set, col_idx);
+			MX_BIT_SET(table->read_set, col_idx);
 
 		save = field->ptr;
 		xt_lock_mutex(self, &tab->tab_dic_field_lock);
@@ -1743,7 +1741,7 @@ xtPublic xtBool myxt_set_column(XTOpenTa
 
 	/* Required by store() - or an assertion will fail: */
 	if (table->write_set)
-		bitmap_set_bit(table->write_set, col_idx);
+		MX_BIT_SET(table->write_set, col_idx);
 
 	mx_set_notnull_in_record(field, buffer);
 
@@ -1875,7 +1873,12 @@ xtPublic void myxt_print_key(XTIndexPtr 
 
 static void my_close_table(TABLE *table)
 {
-#ifndef DRIZZLED
+#ifdef DRIZZLED
+	TABLE_SHARE	*share;
+
+	share = (TABLE_SHARE *) ((char *) table + sizeof(TABLE));
+	share->free_table_share();
+#else
 	closefrm(table, 1);  // TODO: Q, why did Stewart remove this?
 #endif
 	xt_free_ns(table);
@@ -1885,7 +1888,7 @@ static void my_close_table(TABLE *table)
  * This function returns NULL if the table cannot be opened 
  * because this is not a MySQL thread.
  */ 
-static TABLE *my_open_table(XTThreadPtr self, XTDatabaseHPtr db __attribute__((unused)), XTPathStrPtr tab_path)
+static TABLE *my_open_table(XTThreadPtr self, XTDatabaseHPtr XT_UNUSED(db), XTPathStrPtr tab_path)
 {
 	THD			*thd = current_thd;
 	char		path_buffer[PATH_MAX];
@@ -1946,6 +1949,18 @@ static TABLE *my_open_table(XTThreadPtr 
 	new_lex.current_select= NULL;
 	lex_start(thd);
 
+#ifdef DRIZZLED
+	share->init(db_name, 0, name, path);
+	if ((error = open_table_def(thd, share)) ||
+		(error = open_table_from_share(thd, share, "", 0, (uint32_t) READ_ALL, 0, table, OTM_OPEN)))
+	{
+		xt_free(self, table);
+		lex_end(&new_lex);
+		thd->lex = old_lex;
+		xt_throw_ulxterr(XT_CONTEXT, XT_ERR_LOADING_MYSQL_DIC, (u_long) error);
+		return NULL;
+	}
+#else
 #if MYSQL_VERSION_ID < 60000
 #if MYSQL_VERSION_ID < 50123
 	init_tmp_table_share(share, db_name, 0, name, path);
@@ -1980,6 +1995,7 @@ static TABLE *my_open_table(XTThreadPtr 
 		xt_throw_ulxterr(XT_CONTEXT, XT_ERR_LOADING_MYSQL_DIC, (u_long) error);
 		return NULL;
 	}
+#endif
 
 	lex_end(&new_lex);
 	thd->lex = old_lex;
@@ -1989,8 +2005,10 @@ static TABLE *my_open_table(XTThreadPtr 
 	 * plugin_shutdown() and reap_plugins() in sql_plugin.cc
 	 * from doing their job on shutdown!
 	 */
+#ifndef DRIZZLED
 	plugin_unlock(NULL, table->s->db_plugin);
 	table->s->db_plugin = NULL;
+#endif
 	return table;
 }
 
@@ -2069,6 +2087,11 @@ static xtBool my_is_not_null_int4(XTInde
 	return (seg->type == HA_KEYTYPE_LONG_INT && !(seg->flag & HA_NULL_PART));
 }
 
+/* The MY_BITMAP definition in Drizzle does not like it when a NULL
+ * pointer is used to calculate the offset, so 8 is used as the base!?
+ */
+#define MX_OFFSETOF(x, y)		((size_t)(&((x *) 8)->y) - 8)
+
 /* Derived from ha_myisam::create and mi_create */
 static XTIndexPtr my_create_index(XTThreadPtr self, TABLE *table_arg, u_int idx, KEY *index)
 {
@@ -2084,7 +2107,7 @@ static XTIndexPtr my_create_index(XTThre
 
 	enter_();
 
-	pushsr_(ind, my_deref_index_data, (XTIndexPtr) xt_calloc(self, offsetof(XTIndexRec, mi_seg) + sizeof(XTIndexSegRec) * index->key_parts));
+	pushsr_(ind, my_deref_index_data, (XTIndexPtr) xt_calloc(self, MX_OFFSETOF(XTIndexRec, mi_seg) + sizeof(XTIndexSegRec) * index->key_parts));
 
 	XT_INDEX_INIT_LOCK(self, ind);
 	xt_init_mutex_with_autoname(self, &ind->mi_flush_lock);
@@ -2235,7 +2258,7 @@ static XTIndexPtr my_create_index(XTThre
 
 		/* NOTE: do not set if the field is only partially in the index!!! */
 		if (!partial_field)
-			bitmap_fast_test_and_set(&ind->mi_col_map, field->field_index);
+			MX_BIT_FAST_TEST_AND_SET(&ind->mi_col_map, field->field_index);
 	}
 
 	if (key_length > XT_INDEX_MAX_KEY_SIZE)
@@ -2243,6 +2266,7 @@ static XTIndexPtr my_create_index(XTThre
 
 	/* This is the maximum size of the index on disk: */
 	ind->mi_key_size = key_length;
+	ind->mi_max_items = (XT_INDEX_PAGE_SIZE-2) / (key_length+XT_RECORD_REF_SIZE);
 
 	if (ind->mi_fix_key) {
 		/* Special case for not-NULL 4 byte int value: */
@@ -2281,6 +2305,7 @@ static XTIndexPtr my_create_index(XTThre
 		ind->mi_prev_item = xt_prev_branch_item_var;
 		ind->mi_last_item = xt_last_branch_item_var;
 	}
+	ind->mi_lazy_delete = ind->mi_fix_key && ind->mi_max_items >= 4;
 
 	XT_NODE_ID(ind->mi_root) = 0;
 
@@ -2344,6 +2369,10 @@ xtPublic void myxt_setup_dictionary(XTTh
 	KEY_PART_INFO	*key_part;
 	KEY_PART_INFO	*key_part_end;
 
+#ifndef XT_USE_LAZY_DELETE
+	dic->dic_no_lazy_delete = TRUE;
+#endif
+
 	dic->dic_ind_cols_req = 0;
 	for (uint i=0; i<TS(my_tab)->keys; i++) {
 		index = &my_tab->key_info[i];
@@ -2602,7 +2631,7 @@ xtPublic void myxt_setup_dictionary(XTTh
 	dic->dic_mysql_rec_size = TS(my_tab)->reclength;
 }
 
-static u_int my_get_best_superset(XTThreadPtr self __attribute__((unused)), XTDictionaryPtr dic, XTIndexPtr ind)
+static u_int my_get_best_superset(XTThreadPtr XT_UNUSED(self), XTDictionaryPtr dic, XTIndexPtr ind)
 {
 	XTIndexPtr	super_ind;
 	u_int		super = 0;
@@ -2762,7 +2791,7 @@ static void ha_create_dd_index(XTThreadP
 	}
 }
 
-static char *my_type_to_string(XTThreadPtr self, Field *field, TABLE *my_tab __attribute__((unused)))
+static char *my_type_to_string(XTThreadPtr self, Field *field, TABLE *XT_UNUSED(my_tab))
 {
 	char		buffer[MAX_FIELD_WIDTH + 400], *ptr;
 	String		type((char *) buffer, sizeof(buffer), system_charset_info);
@@ -2834,7 +2863,7 @@ xtPublic XTDDTable *myxt_create_table_fr
  * MySQL CHARACTER UTILITIES
  */
 
-xtPublic void myxt_static_convert_identifier(XTThreadPtr self __attribute__((unused)), MX_CHARSET_INFO *cs, char *from, char *to, size_t to_len)
+xtPublic void myxt_static_convert_identifier(XTThreadPtr XT_UNUSED(self), MX_CHARSET_INFO *cs, char *from, char *to, size_t to_len)
 {
 	uint errors;
 
@@ -2877,11 +2906,16 @@ xtPublic char *myxt_convert_table_name(X
 	return to;
 }
 
-xtPublic void myxt_static_convert_table_name(XTThreadPtr self __attribute__((unused)), char *from, char *to, size_t to_len)
+xtPublic void myxt_static_convert_table_name(XTThreadPtr XT_UNUSED(self), char *from, char *to, size_t to_len)
 {
 	tablename_to_filename(from, to, to_len);
 }
 
+xtPublic void myxt_static_convert_file_name(char *from, char *to, size_t to_len)
+{
+	filename_to_tablename(from, to, to_len);
+}
+
 xtPublic int myxt_strcasecmp(char * a, char *b)
 {
 	return my_strcasecmp(&my_charset_utf8_general_ci, a, b);
@@ -2913,90 +2947,11 @@ xtPublic MX_CHARSET_INFO *myxt_getcharse
 	return &my_charset_utf8_general_ci;
 }
 
-#ifdef XT_STREAMING
-xtPublic xtBool myxt_use_blobs(XTOpenTablePtr ot, void **ret_pbms_table, xtWord1 *rec_buf)
-{
-	void	*pbms_table;
-	XTTable	*tab = ot->ot_table;
-	u_int	idx = 0;
-	Field	*field;
-	char	*blob_ref;
-	xtWord4	len;
-	char	in_url[PBMS_BLOB_URL_SIZE];
-	char	*out_url;
-
-	if (!xt_pbms_open_table(&pbms_table, tab->tab_name->ps_path))
-		return FAILED;
-
-	for (idx=0; idx<tab->tab_dic.dic_blob_count; idx++) {
-		field = tab->tab_dic.dic_blob_cols[idx];
-		if ((blob_ref = mx_get_length_and_data(field, (char *) rec_buf, &len)) && len) {
-			xt_strncpy(PBMS_BLOB_URL_SIZE, in_url, blob_ref, len);
-
-			if (!xt_pbms_use_blob(pbms_table, &out_url, in_url, field->field_index)) {
-				xt_pbms_close_table(pbms_table);
-				return FAILED;
-			}
-
-			if (out_url) {
-				len = strlen(out_url);
-				mx_set_length_and_data(field, (char *) rec_buf, len, out_url);
-			}
-		}
-	}
-	*ret_pbms_table = pbms_table;
-	return OK;
-}
-
-xtPublic void myxt_unuse_blobs(XTOpenTablePtr ot __attribute__((unused)), void *pbms_table)
-{
-	xt_pbms_close_table(pbms_table);
-}
-
-xtPublic xtBool myxt_retain_blobs(XTOpenTablePtr ot __attribute__((unused)), void *pbms_table, xtRecordID rec_id)
-{
-	xtBool				ok;
-	PBMSEngineRefRec	eng_ref;
-
-	memset(&eng_ref, 0, sizeof(PBMSEngineRefRec));
-	XT_SET_DISK_8(eng_ref.er_data, rec_id);
-	ok = xt_pbms_retain_blobs(pbms_table, &eng_ref);
-	xt_pbms_close_table(pbms_table);
-	return ok;
-}
-
-xtPublic void myxt_release_blobs(XTOpenTablePtr ot, xtWord1 *rec_buf, xtRecordID rec_id)
-{
-	void				*pbms_table;
-	XTTable				*tab = ot->ot_table;
-	u_int				idx = 0;
-	Field				*field;
-	char				*blob_ref;
-	xtWord4				len;
-	char				in_url[PBMS_BLOB_URL_SIZE];
-	PBMSEngineRefRec	eng_ref;
-
-	memset(&eng_ref, 0, sizeof(PBMSEngineRefRec));
-	XT_SET_DISK_8(eng_ref.er_data, rec_id);
-
-	if (!xt_pbms_open_table(&pbms_table, tab->tab_name->ps_path))
-		return;
-
-	for (idx=0; idx<tab->tab_dic.dic_blob_count; idx++) {
-		field = tab->tab_dic.dic_blob_cols[idx];
-		if ((blob_ref = mx_get_length_and_data(field, (char *) rec_buf, &len)) && len) {
-			xt_strncpy(PBMS_BLOB_URL_SIZE, in_url, blob_ref, len);
-
-			xt_pbms_release_blob(pbms_table, in_url, field->field_index, &eng_ref);
-		}
-	}
-
-	xt_pbms_close_table(pbms_table);
-}
-#endif // XT_STREAMING
-
 xtPublic void *myxt_create_thread()
 {
+#ifdef DRIZZLED
+	return (void *) 1;
+#else
 	THD *new_thd;
 
 	if (my_thread_init()) {
@@ -3015,8 +2970,14 @@ xtPublic void *myxt_create_thread()
 	lex_start(new_thd);
 
 	return (void *) new_thd;
+#endif
 }
 
+#ifdef DRIZZLED
+xtPublic void myxt_destroy_thread(void *, xtBool)
+{
+}
+#else
 xtPublic void myxt_destroy_thread(void *thread, xtBool end_threads)
 {
 	THD *thd = (THD *) thread;
@@ -3043,6 +3004,7 @@ xtPublic void myxt_destroy_thread(void *
 	if (end_threads)
 		my_thread_end();
 }
+#endif
 
 xtPublic XTThreadPtr myxt_get_self()
 {
@@ -3182,7 +3144,7 @@ xtPublic void myxt_get_status(XTThreadPt
  * MySQL Bit Maps
  */
 
-xtPublic void myxt_bitmap_init(XTThreadPtr self, MY_BITMAP *map, u_int n_bits)
+static void myxt_bitmap_init(XTThreadPtr self, MX_BITMAP *map, u_int n_bits)
 {
 	my_bitmap_map	*buf;
     uint			size_in_bytes = (((n_bits) + 31) / 32) * 4;
@@ -3194,7 +3156,7 @@ xtPublic void myxt_bitmap_init(XTThreadP
 	bitmap_clear_all(map);
 }
 
-xtPublic void myxt_bitmap_free(XTThreadPtr self, MY_BITMAP *map)
+static void myxt_bitmap_free(XTThreadPtr self, MX_BITMAP *map)
 {
 	if (map->bitmap) {
 		xt_free(self, map->bitmap);

=== modified file 'storage/pbxt/src/myxt_xt.h'
--- a/storage/pbxt/src/myxt_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/myxt_xt.h	2009-08-18 07:46:53 +0000
@@ -70,6 +70,7 @@ XTDDTable	*myxt_create_table_from_table(
 void		myxt_static_convert_identifier(XTThreadPtr self, struct charset_info_st *cs, char *from, char *to, size_t to_len);
 char		*myxt_convert_identifier(XTThreadPtr self, struct charset_info_st *cs, char *from);
 void		myxt_static_convert_table_name(XTThreadPtr self, char *from, char *to, size_t to_len);
+void		myxt_static_convert_file_name(char *from, char *to, size_t to_len);
 char		*myxt_convert_table_name(XTThreadPtr self, char *from);
 int			myxt_strcasecmp(char * a, char *b);
 int			myxt_isspace(struct charset_info_st *cs, char a);
@@ -78,13 +79,6 @@ int			myxt_isdigit(struct charset_info_s
 
 struct charset_info_st *myxt_getcharset(bool convert);
 
-#ifdef XT_STREAMING
-xtBool		myxt_use_blobs(XTOpenTablePtr ot, void **ret_pbms_table, xtWord1 *rec_buf);
-void		myxt_unuse_blobs(XTOpenTablePtr ot, void *pbms_table);
-xtBool		myxt_retain_blobs(XTOpenTablePtr ot, void *pbms_table, xtRecordID record);
-void		myxt_release_blobs(XTOpenTablePtr ot, xtWord1 *rec_buf, xtRecordID record);
-#endif
-
 void		*myxt_create_thread();
 void		myxt_destroy_thread(void *thread, xtBool end_threads);
 XTThreadPtr	myxt_get_self();
@@ -92,9 +86,6 @@ XTThreadPtr	myxt_get_self();
 int			myxt_statistics_fill_table(XTThreadPtr self, void *th, void *ta, void *co, MX_CONST void *ch);
 void		myxt_get_status(XTThreadPtr self, XTStringBufferPtr strbuf);
 
-void		myxt_bitmap_init(XTThreadPtr self, MY_BITMAP *map, u_int n_bits);
-void		myxt_bitmap_free(XTThreadPtr self, MY_BITMAP *map);
-
 class XTDDColumnFactory
 {
 public:

=== modified file 'storage/pbxt/src/pbms.h'
--- a/storage/pbxt/src/pbms.h	2009-05-09 04:01:53 +0000
+++ b/storage/pbxt/src/pbms.h	2009-08-18 07:46:53 +0000
@@ -16,7 +16,8 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  *
- * Paul McCullagh
+ * Original author: Paul McCullagh
+ * Continued development: Barry Leslie
  * H&G2JCtL
  *
  * 2007-06-01
@@ -37,21 +38,26 @@
 #include <dirent.h>
 #include <signal.h>
 #include <ctype.h>
+#include <errno.h>
+
 
 #ifdef USE_PRAGMA_INTERFACE
 #pragma interface			/* gcc class implementation */
 #endif
 
+/*			2	10		1			10			20			10				10			20				20
+ * Format: "~*"<db_id><'~' || '_'><tab_id>"-"<blob_id>"-"<auth_code>"-"<server_id>"-"<blob_ref_id>"-"<blob_size>
+ */
+//If URL_FMT changes do not forget to update couldBeURL() in this file.
+ 
+#define URL_FMT "~*%lu%c%lu-%llu-%lx-%lu-%llu-%llu"
+
 #define MS_SHARED_MEMORY_MAGIC			0x7E9A120C
 #define MS_ENGINE_VERSION				1
-#define MS_CALLBACK_VERSION				1
-#define MS_SHARED_MEMORY_VERSION		1
-#define MS_ENGINE_LIST_SIZE				80
+#define MS_CALLBACK_VERSION				4
+#define MS_SHARED_MEMORY_VERSION		2
+#define MS_ENGINE_LIST_SIZE				10
 #define MS_TEMP_FILE_PREFIX				"pbms_temp_"
-#define MS_TEMP_FILE_PREFIX				"pbms_temp_"
-
-#define MS_RESULT_MESSAGE_SIZE			300
-#define MS_RESULT_STACK_SIZE			200
 
 #define MS_BLOB_HANDLE_SIZE				300
 
@@ -68,146 +74,81 @@
 #define MS_ERR_UNKNOWN_DB				8
 #define MS_ERR_REMOVING_REPO			9
 #define MS_ERR_DATABASE_DELETED			10
+#define MS_ERR_DUPLICATE				11						/* Attempt to insert a duplicate key into a system table. */
+#define MS_ERR_INVALID_RECORD			12
+#define MS_ERR_RECOVERY_IN_PROGRESS		13
+#define MS_ERR_DUPLICATE_DB				14
+#define MS_ERR_DUPLICATE_DB_ID			15
+#define MS_ERR_INVALID_OPERATION		16
 
 #define MS_LOCK_NONE					0
 #define MS_LOCK_READONLY				1
 #define MS_LOCK_READ_WRITE				2
 
-#define MS_XACT_NONE					0
-#define MS_XACT_BEGIN					1
-#define MS_XACT_COMMIT					2
-#define MS_XACT_ROLLBACK				3
-
-#define PBMS_ENGINE_REF_LEN				8
-#define PBMS_BLOB_URL_SIZE				200
+#define PBMS_BLOB_URL_SIZE				120
 
 #define PBMS_FIELD_COL_SIZE				128
 #define PBMS_FIELD_COND_SIZE			300
 
+#define MS_RESULT_MESSAGE_SIZE			300
+#define MS_RESULT_STACK_SIZE			200
+
+typedef struct PBMSResultRec {
+	int						mr_code;								/* Engine specific error code. */ 
+	char					mr_message[MS_RESULT_MESSAGE_SIZE];		/* Error message, required if non-zero return code. */
+	char					mr_stack[MS_RESULT_STACK_SIZE];			/* Trace information about where the error occurred. */
+} PBMSResultRec, *PBMSResultPtr;
+
+
 
 typedef struct PBMSBlobID {
+	u_int32_t				bi_db_id;	
 	u_int64_t				bi_blob_size;	
 	u_int64_t				bi_blob_id;				// or repo file offset if type = REPO
+	u_int64_t				bi_blob_ref_id;			
 	u_int32_t				bi_tab_id;				// or repo ID if type = REPO
 	u_int32_t				bi_auth_code;
 	u_int32_t				bi_blob_type;
 } PBMSBlobIDRec, *PBMSBlobIDPtr;
 
-typedef struct PBMSResultRec {
-	int						mr_code;								/* Engine specific error code. */ 
-	char					mr_message[MS_RESULT_MESSAGE_SIZE];		/* Error message, required if non-zero return code. */
-	char					mr_stack[MS_RESULT_STACK_SIZE];			/* Trace information about where the error occurred. */
-} PBMSResultRec, *PBMSResultPtr;
-
-typedef struct PBMSEngineRefRec {
-	unsigned char			er_data[PBMS_ENGINE_REF_LEN];
-} PBMSEngineRefRec, *PBMSEngineRefPtr;
-
 typedef struct PBMSBlobURL {
 	char					bu_data[PBMS_BLOB_URL_SIZE];
 } PBMSBlobURLRec, *PBMSBlobURLPtr;
 
-typedef struct PBMSFieldRef {
-	char					fr_column[PBMS_FIELD_COL_SIZE];
-	char					fr_cond[PBMS_FIELD_COND_SIZE];
-} PBMSFieldRefRec, *PBMSFieldRefPtr;
-/*
- * The engine must free its resources for the given thread.
- */
-typedef void (*MSCloseConnFunc)(void *thd);
-
-/* Before access BLOBs of a table, the streaming engine will open the table.
- * Open tables are managed as a pool by the streaming engine.
- * When a request is received, the streaming engine will ask all
- * registered engine to open the table. The engine must return a NULL
- * open_table pointer if it does not handle the table.
- * A callback allows an engine to request all open tables to be
- * closed by the streaming engine.
- */
-typedef int (*MSOpenTableFunc)(void *thd, const char *table_url, void **open_table, PBMSResultPtr result);
-typedef void (*MSCloseTableFunc)(void *thd, void *open_table);
-
-/*
- * When the streaming engine wants to use an open table handle from the
- * pool, it calls the lock table function.
- */ 
-typedef int (*MSLockTableFunc)(void *thd, int *xact, void *open_table, int lock_type, PBMSResultPtr result);
-typedef int (*MSUnlockTableFunc)(void *thd, int xact, void *open_table, PBMSResultPtr result);
-
-/* This function is used to locate and send a BLOB on the given stream.
- */
-typedef int (*MSSendBLOBFunc)(void *thd, void *open_table, const char *blob_column, const char *blob_url, void *stream, PBMSResultPtr result);
-
-/*
- * Lookup and engine reference, and return readable text.
- */
-typedef int (*MSLookupRefFunc)(void *thd, void *open_table, unsigned short col_index, PBMSEngineRefPtr eng_ref, PBMSFieldRefPtr feild_ref, PBMSResultPtr result);
-
 typedef struct PBMSEngineRec {
 	int						ms_version;							/* MS_ENGINE_VERSION */
 	int						ms_index;							/* The index into the engine list. */
 	int						ms_removing;						/* TRUE (1) if the engine is being removed. */
-	const char				*ms_engine_name;
-	void					*ms_engine_info;
-	MSCloseConnFunc			ms_close_conn;
-	MSOpenTableFunc			ms_open_table;
-	MSCloseTableFunc		ms_close_table;
-	MSLockTableFunc			ms_lock_table;
-	MSUnlockTableFunc		ms_unlock_table;
-	MSSendBLOBFunc			ms_send_blob;
-	MSLookupRefFunc			ms_lookup_ref;
+	int						ms_internal;						/* TRUE (1) if the engine is supported directly in the mysql/drizzle handler code. */
+	char					ms_engine_name[32];
 } PBMSEngineRec, *PBMSEnginePtr;
 
 /*
  * This function should never be called directly, it is called
  * by deregisterEngine() below.
  */
-typedef void (*ECDeregisterdFunc)(PBMSEnginePtr engine);
-
-typedef void (*ECTableCloseAllFunc)(const char *table_url);
-
-typedef int (*ECSetContentLenFunc)(void *stream, off_t len, PBMSResultPtr result);
-
-typedef int (*ECWriteHeadFunc)(void *stream, PBMSResultPtr result);
-
-typedef int (*ECWriteStreamFunc)(void *stream, void *buffer, size_t len, PBMSResultPtr result);
-
-/*
- * The engine should call this function from
- * its own close connection function!
- */
-typedef int (*ECCloseConnFunc)(void *thd, PBMSResultPtr result);
+typedef void (*ECRegisterdFunc)(PBMSEnginePtr engine);
 
-/*
- * Call this function before retaining or releasing BLOBs in a row.
- */
-typedef int (*ECOpenTableFunc)(void **open_table, char *table_path, PBMSResultPtr result);
+typedef void (*ECDeregisterdFunc)(PBMSEnginePtr engine);
 
 /*
- * Call this function when the operation is complete.
+ * Call this function to store a BLOB in the repository; the BLOB's
+ * URL will be returned. The returned URL buffer is expected to be at least
+ * PBMS_BLOB_URL_SIZE long.
+ *
+ * The BLOB URL must still be retained or it will automatically be deleted after a timeout expires.
  */
-typedef void (*ECCloseTableFunc)(void *open_table);
+typedef int (*ECCreateBlobsFunc)(bool built_in, const char *db_name, const char *tab_name, char *blob, size_t blob_len, char *blob_url, unsigned short col_index, PBMSResultPtr result);
 
 /*
  * Call this function for each BLOB to be retained. When a BLOB is used, the 
- * URL may be changed. The returned URL is valid as long as the the
- * table is open.
+ * URL may be changed. The returned URL buffer is expected to be at least
+ * PBMS_BLOB_URL_SIZE long.
  *
  * The returned URL must be inserted into the row in place of the given
  * URL.
  */
-typedef int (*ECUseBlobFunc)(void *open_table, char **ret_blob_url, char *blob_url, unsigned short col_index, PBMSResultPtr result);
-
-/*
- * Reference Blobs that has been uploaded to the streaming engine.
- *
- * All BLOBs specified by the use blob function are retained by
- * this function.
- *
- * The engine reference is a (unaligned) 8 byte value which
- * identifies the row that the BLOBs are in.
- */
-typedef int (*ECRetainBlobsFunc)(void *open_table, PBMSEngineRefPtr eng_ref, PBMSResultPtr result);
+typedef int (*ECRetainBlobsFunc)(bool built_in, const char *db_name, const char *tab_name, char *ret_blob_url, char *blob_url, unsigned short col_index, PBMSResultPtr result);
 
 /*
  * If a row containing a BLOB is deleted, then the BLOBs in the
@@ -216,27 +157,24 @@ typedef int (*ECRetainBlobsFunc)(void *o
  * Note: if a table is dropped, all the BLOBs referenced by the
  * table are automatically released.
  */
-typedef int (*ECReleaseBlobFunc)(void *open_table, char *blob_url, unsigned short col_index, PBMSEngineRefPtr eng_ref, PBMSResultPtr result);
+typedef int (*ECReleaseBlobFunc)(bool built_in, const char *db_name, const char *tab_name, char *blob_url, PBMSResultPtr result);
 
-typedef int (*ECDropTable)(const char *table_path, PBMSResultPtr result);
+typedef int (*ECDropTable)(bool built_in, const char *db_name, const char *tab_name, PBMSResultPtr result);
 
-typedef int (*ECRenameTable)(const char *from_table, const char *to_table, PBMSResultPtr result);
+typedef int (*ECRenameTable)(bool built_in, const char *db_name, const char *from_table, const char *to_table, PBMSResultPtr result);
+
+typedef void (*ECCallCompleted)(bool built_in, bool ok);
 
 typedef struct PBMSCallbacksRec {
 	int						cb_version;							/* MS_CALLBACK_VERSION */
+	ECRegisterdFunc			cb_register;
 	ECDeregisterdFunc		cb_deregister;
-	ECTableCloseAllFunc		cb_table_close_all;
-	ECSetContentLenFunc		cb_set_cont_len;
-	ECWriteHeadFunc			cb_write_head;
-	ECWriteStreamFunc		cb_write_stream;
-	ECCloseConnFunc			cb_close_conn;
-	ECOpenTableFunc			cb_open_table;
-	ECCloseTableFunc		cb_close_table;
-	ECUseBlobFunc			cb_use_blob;
-	ECRetainBlobsFunc		cb_retain_blobs;
+	ECCreateBlobsFunc		cb_create_blob;
+	ECRetainBlobsFunc		cb_retain_blob;
 	ECReleaseBlobFunc		cb_release_blob;
 	ECDropTable				cb_drop_table;
 	ECRenameTable			cb_rename_table;
+	ECCallCompleted			cb_completed;
 } PBMSCallbacksRec, *PBMSCallbacksPtr;
 
 typedef struct PBMSSharedMemoryRec {
@@ -251,24 +189,18 @@ typedef struct PBMSSharedMemoryRec {
 	PBMSEnginePtr			sm_engine_list[MS_ENGINE_LIST_SIZE];
 } PBMSSharedMemoryRec, *PBMSSharedMemoryPtr;
 
-#ifndef PBMS_API
-#ifndef PBMS_CLIENT_API
-Please define he value of PBMS_API
-#endif
-#else
+#ifdef PBMS_API
 
 class PBMS_API
 {
 private:
 	const char *temp_prefix[3];
+	bool built_in;
 
 public:
 	PBMS_API(): sharedMemory(NULL) { 
 		int i = 0;
 		temp_prefix[i++] = MS_TEMP_FILE_PREFIX;
-#ifdef MS_TEMP_FILE_PREFIX
-		temp_prefix[i++] = MS_TEMP_FILE_PREFIX;
-#endif
 		temp_prefix[i++] = NULL;
 		
 	}
@@ -276,6 +208,43 @@ public:
 	~PBMS_API() { }
 
 	/*
+	 * This method is called by the PBMS engine during startup.
+	 */
+	int PBMSStartup(PBMSCallbacksPtr callbacks, PBMSResultPtr result) {
+		int err;
+		
+		deleteTempFiles();
+		err = getSharedMemory(true, result);
+		if (!err)
+			sharedMemory->sm_callbacks = callbacks;
+			
+		return err;
+	}
+
+	/*
+	 * This method is called by the PBMS engine during shutdown.
+	 */
+	void PBMSShutdown() {
+		
+		if (!sharedMemory)
+			return;
+			
+		lock();
+		sharedMemory->sm_callbacks = NULL;
+
+		bool empty = true;
+		for (int i=0; i<sharedMemory->sm_list_len && empty; i++) {
+			if (sharedMemory->sm_engine_list[i]) 
+				empty = false;
+		}
+
+		unlock();
+		
+		if (empty) 
+			removeSharedMemory();
+	}
+
+	/*
 	 * Register the engine with the Stream Engine.
 	 */
 	int registerEngine(PBMSEnginePtr engine, PBMSResultPtr result) {
@@ -283,6 +252,7 @@ public:
 
 		deleteTempFiles();
 
+		// The first engine to register creates the shared memory.
 		if ((err = getSharedMemory(true, result)))
 			return err;
 
@@ -292,6 +262,10 @@ public:
 				engine->ms_index = i;
 				if (i >= sharedMemory->sm_list_len)
 					sharedMemory->sm_list_len = i+1;
+				if (sharedMemory->sm_callbacks)
+					sharedMemory->sm_callbacks->cb_register(engine);
+					
+				built_in = (engine->ms_internal == 1);
 				return MS_OK;
 			}
 		}
@@ -322,7 +296,7 @@ public:
 		PBMSResultRec result;
 		int err;
 
-		if ((err = getSharedMemory(true, &result)))
+		if ((err = getSharedMemory(false, &result)))
 			return;
 
 		lock();
@@ -342,207 +316,98 @@ public:
 
 		unlock();
 
-		if (empty) {
-			char	temp_file[100];
-
-			sharedMemory->sm_magic = 0;
-			free(sharedMemory);
-			sharedMemory = NULL;
-			const char **prefix = temp_prefix;
-			while (*prefix) {
-				getTempFileName(temp_file, *prefix, getpid());
-				unlink(temp_file);
-				prefix++;
-			}
-		}
+		if (empty) 
+			removeSharedMemory();
 	}
 
-	void closeAllTables(const char *table_url)
+	void removeSharedMemory() 
 	{
-		PBMSResultRec	result;
-		int					err;
-
-		if ((err = getSharedMemory(true, &result)))
-			return;
+		const char **prefix = temp_prefix;
+		char	temp_file[100];
 
+		// Do not remove the shared memory until after
+		// the PBMS engine has shutdown.
 		if (sharedMemory->sm_callbacks)
-			sharedMemory->sm_callbacks->cb_table_close_all(table_url);
-	}
-
-	int setContentLength(void *stream, off_t len, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		return sharedMemory->sm_callbacks->cb_set_cont_len(stream, len, result);
-	}
-
-	int writeHead(void *stream, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		return sharedMemory->sm_callbacks->cb_write_head(stream, result);
-	}
-
-	int writeStream(void *stream, void *buffer, size_t len, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		return sharedMemory->sm_callbacks->cb_write_stream(stream, buffer, len, result);
-	}
-
-	int closeConn(void *thd, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		if (!sharedMemory->sm_callbacks)
-			return MS_OK;
-
-		return sharedMemory->sm_callbacks->cb_close_conn(thd, result);
-	}
-
-	int openTable(void **open_table, char *table_path, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		if (!sharedMemory->sm_callbacks) {
-			*open_table = NULL;
-			return MS_OK;
+			return;
+			
+		sharedMemory->sm_magic = 0;
+		free(sharedMemory);
+		sharedMemory = NULL;
+		
+		while (*prefix) {
+			getTempFileName(temp_file, *prefix, getpid());
+			unlink(temp_file);
+			prefix++;
 		}
-
-		return sharedMemory->sm_callbacks->cb_open_table(open_table, table_path, result);
-	}
-
-	int closeTable(void *open_table, PBMSResultPtr result)
-	{
-		int err;
-
-		if ((err = getSharedMemory(true, result)))
-			return err;
-
-		if (sharedMemory->sm_callbacks && open_table)
-			sharedMemory->sm_callbacks->cb_close_table(open_table);
-		return MS_OK;
 	}
-
-	int couldBeURL(char *blob_url)
-	/* ~*test/~1-150-2b5e0a7-0[*<blob size>][.ext] */
-	/* ~*test/_1-150-2b5e0a7-0[*<blob size>][.ext] */
-	{
-		char	*ptr;
-		size_t	len;
-		bool have_blob_size = false;
-
-		if (blob_url) {
-			if ((len = strlen(blob_url))) {
-				/* Too short: */
-				if (len <= 10)
-					return 0;
-
-				/* Required prefix: */
-				/* NOTE: ~> is deprecated v0.5.4+, now use ~* */
-				if (*blob_url != '~' || (*(blob_url + 1) != '>' && *(blob_url + 1) != '*'))
-					return 0;
-
-				ptr = blob_url + len - 1;
-
-				/* Allow for an optional extension: */
-				if (!isdigit(*ptr)) {
-					while (ptr > blob_url && *ptr != '/' && *ptr != '.')
-						ptr--;
-					if (ptr == blob_url || *ptr != '.')
-						return 0;
-					if (ptr == blob_url || !isdigit(*ptr))
-						return 0;
-				}
 	
-				// field 1: server id OR blob size
-				do_again:
-				while (ptr > blob_url && isdigit(*ptr))
-					ptr--;
-
-				if (ptr != blob_url && *ptr == '*' && !have_blob_size) {
-					ptr--;
-					have_blob_size = true;
-					goto do_again;
-				}
-				
-				if (ptr == blob_url || *ptr != '-')
-					return 0;
-					
-					
-				// field 2: Authoration code
-				ptr--;
-				if (!isxdigit(*ptr))
-					return 0;
-
-				while (ptr > blob_url && isxdigit(*ptr))
-					ptr--;
-
-				if (ptr == blob_url || *ptr != '-')
-					return 0;
-					
-				// field 3:offset
-				ptr--;
-				if (!isxdigit(*ptr))
-					return 0;
-					
-				while (ptr > blob_url && isdigit(*ptr))
-					ptr--;
-
-				if (ptr == blob_url || *ptr != '-')
-					return 0;
-					
-					
-				// field 4:Table id
-				ptr--;
-				if (!isdigit(*ptr))
-					return 0;
-
-				while (ptr > blob_url && isdigit(*ptr))
-					ptr--;
-
-				/* NOTE: ^ and : are deprecated v0.5.4+, now use ! and ~ */
-				if (ptr == blob_url || (*ptr != '^' && *ptr != ':' && *ptr != '_' && *ptr != '~'))
-					return 0;
-				ptr--;
-
-				if (ptr == blob_url || *ptr != '/')
-					return 0;
-				ptr--;
-				if (ptr == blob_url)
-					return 0;
-				return 1;
+	int couldBeURL(char *blob_url, int size)
+	{
+		if (blob_url && (size < PBMS_BLOB_URL_SIZE)) {
+			char			buffer[PBMS_BLOB_URL_SIZE+1];
+			u_int32_t		db_id = 0;
+			u_int32_t		tab_id = 0;
+			u_int64_t		blob_id = 0;
+			u_int64_t		blob_ref_id = 0;
+			u_int64_t		blob_size = 0;
+			u_int32_t		auth_code = 0;
+			u_int32_t		server_id = 0;
+			char		type, junk[5];
+			int			scanned;
+
+			junk[0] = 0;
+			if (blob_url[size]) { // There is no guarantee that the URL will be null terminated.
+				memcpy(buffer, blob_url, size);
+				buffer[size] = 0;
+				blob_url = buffer;
+			}
+			
+			scanned = sscanf(blob_url, URL_FMT"%4s", &db_id, &type, &tab_id, &blob_id, &auth_code, &server_id, &blob_ref_id, &blob_size, junk);
+			if (scanned != 8) {// If junk is found at the end this will also result in an invalid URL. 
+		printf("Bad URL \"%s\": scanned = %d, junk: %d, %d, %d, %d\n", blob_url, scanned, junk[0], junk[1], junk[2], junk[3]); 
+				return 0;
 			}
+			
+			if (junk[0] || (type != '~' && type != '_')) {
+		printf("Bad URL \"%s\": scanned = %d, junk: %d, %d, %d, %d\n", blob_url, scanned, junk[0], junk[1], junk[2], junk[3]); 
+				return 0;
+			}
+		
+			return 1;
 		}
+		
 		return 0;
 	}
-
-	int useBlob(void *open_table, char **ret_blob_url, char *blob_url, unsigned short col_index, PBMSResultPtr result)
+	
+	int  retainBlob(const char *db_name, const char *tab_name, char *ret_blob_url, char *blob_url, size_t blob_size, unsigned short col_index, PBMSResultPtr result)
 	{
 		int err;
+		char safe_url[PBMS_BLOB_URL_SIZE+1];
 
-		if ((err = getSharedMemory(true, result)))
+
+		if ((err = getSharedMemory(false, result)))
 			return err;
 
-		if (!couldBeURL(blob_url)) {
-			*ret_blob_url = NULL;
-			return MS_OK;
+		if (!couldBeURL(blob_url, blob_size)) {
+		
+			if (!sharedMemory->sm_callbacks)  {
+				*ret_blob_url = 0;
+				return MS_OK;
+			}
+			err = sharedMemory->sm_callbacks->cb_create_blob(built_in, db_name, tab_name, blob_url, blob_size, ret_blob_url, col_index, result);
+			if (err)
+				return err;
+				
+			blob_url = ret_blob_url;
+		} else {
+			// Make sure the url is a C string:
+			if (blob_url[blob_size]) {
+				memcpy(safe_url, blob_url, blob_size);
+				safe_url[blob_size] = 0;
+				blob_url = safe_url;
+			}
 		}
+		
 
 		if (!sharedMemory->sm_callbacks) {
 			result->mr_code = MS_ERR_INCORRECT_URL;
@@ -551,64 +416,71 @@ public:
 			return MS_ERR_INCORRECT_URL;
 		}
 
-		return sharedMemory->sm_callbacks->cb_use_blob(open_table, ret_blob_url, blob_url, col_index, result);
+		return sharedMemory->sm_callbacks->cb_retain_blob(built_in, db_name, tab_name, ret_blob_url, blob_url, col_index, result);
 	}
 
-	int retainBlobs(void *open_table, PBMSEngineRefPtr eng_ref, PBMSResultPtr result)
+	int releaseBlob(const char *db_name, const char *tab_name, char *blob_url, size_t blob_size, PBMSResultPtr result)
 	{
 		int err;
+		char safe_url[PBMS_BLOB_URL_SIZE+1];
 
-		if ((err = getSharedMemory(true, result)))
+		if ((err = getSharedMemory(false, result)))
 			return err;
 
 		if (!sharedMemory->sm_callbacks)
 			return MS_OK;
 
-		return sharedMemory->sm_callbacks->cb_retain_blobs(open_table, eng_ref, result);
+		if (!couldBeURL(blob_url, blob_size))
+			return MS_OK;
+
+		if (blob_url[blob_size]) {
+			memcpy(safe_url, blob_url, blob_size);
+			safe_url[blob_size] = 0;
+			blob_url = safe_url;
+		}
+		
+		return sharedMemory->sm_callbacks->cb_release_blob(built_in, db_name, tab_name, blob_url, result);
 	}
 
-	int releaseBlob(void *open_table, char *blob_url, unsigned short col_index, PBMSEngineRefPtr eng_ref, PBMSResultPtr result)
+	int dropTable(const char *db_name, const char *tab_name, PBMSResultPtr result)
 	{
 		int err;
 
-		if ((err = getSharedMemory(true, result)))
+		if ((err = getSharedMemory(false, result)))
 			return err;
 
 		if (!sharedMemory->sm_callbacks)
 			return MS_OK;
-
-		if (!couldBeURL(blob_url))
-			return MS_OK;
-
-		return sharedMemory->sm_callbacks->cb_release_blob(open_table, blob_url, col_index, eng_ref, result);
+			
+		return sharedMemory->sm_callbacks->cb_drop_table(built_in, db_name, tab_name, result);
 	}
 
-	int dropTable(const char *table_path, PBMSResultPtr result)
+	int renameTable(const char *db_name, const char *from_table, const char *to_table, PBMSResultPtr result)
 	{
 		int err;
 
-		if ((err = getSharedMemory(true, result)))
+		if ((err = getSharedMemory(false, result)))
 			return err;
 
 		if (!sharedMemory->sm_callbacks)
 			return MS_OK;
 			
-		return sharedMemory->sm_callbacks->cb_drop_table(table_path, result);
+		return sharedMemory->sm_callbacks->cb_rename_table(built_in, db_name, from_table, to_table, result);
 	}
 
-	int renameTable(const char *from_table, const char *to_table, PBMSResultPtr result)
+	void completed(int ok)
 	{
-		int err;
+		PBMSResultRec result;
 
-		if ((err = getSharedMemory(true, result)))
-			return err;
+		if (getSharedMemory(false, &result))
+			return;
 
 		if (!sharedMemory->sm_callbacks)
-			return MS_OK;
+			return;
 			
-		return sharedMemory->sm_callbacks->cb_rename_table(from_table, to_table, result);
+		sharedMemory->sm_callbacks->cb_completed(built_in, ok);
 	}
-
+	
 	volatile PBMSSharedMemoryPtr sharedMemory;
 
 private:
@@ -618,7 +490,6 @@ private:
 		int		r;
 		char	temp_file[100];
 		const char	**prefix = temp_prefix;
-		void		*tmp_p = NULL;
 
 		if (sharedMemory)
 			return MS_OK;
@@ -644,8 +515,7 @@ private:
 			}
 
 			buffer[tfer] = 0;
-			sscanf(buffer, "%p", &tmp_p);
-			sharedMemory = (PBMSSharedMemoryPtr) tmp_p;
+			sscanf(buffer, "%p", &sharedMemory);
 			if (!sharedMemory || sharedMemory->sm_magic != MS_SHARED_MEMORY_MAGIC) {
 				if (!create)
 					return MS_OK;
@@ -661,9 +531,9 @@ private:
 					return setOSResult(errno, "fseek", temp_file, result);
 				}
 
-				sprintf(buffer, "%p", (void *) sharedMemory);
+				sprintf(buffer, "%p", sharedMemory);
 				tfer = write(tmp_f, buffer, strlen(buffer));
-				if (tfer != (ssize_t) strlen(buffer)) {
+				if (tfer != strlen(buffer)) {
 					close(tmp_f);
 					return setOSResult(errno, "write", temp_file, result);
 				}
@@ -782,19 +652,20 @@ private:
 
 	void deleteTempFiles()
 	{
-		struct dirent *entry;
+		struct dirent	*entry;
 		struct dirent	*result;
 		DIR				*odir;
 		int				err;
-		size_t				sz;
+		size_t			sz;
 		char			temp_file[100];
 
-#ifdef XT_SOLARIS
+#ifdef __sun
 		sz = sizeof(struct dirent) + pathconf("/tmp/", _PC_NAME_MAX); // Solaris, see readdir(3C)
 #else
 		sz = sizeof(struct dirent);
 #endif
-		entry = (struct dirent*)malloc(sz);
+		if (!(entry = (struct dirent *) malloc(sz)))
+			return;
 		if (!(odir = opendir("/tmp/")))
 			return;
 		err = readdir_r(odir, entry, &result);
@@ -846,25 +717,25 @@ extern void PBMSDeinitBlobStreamingThrea
 extern void PBMSGetError(void *v_bs_thread, PBMSResultPtr result);
 
 /* 
-* PBMSCreateBlob():Creates a new blob in the database of the given size. cont_type can be NULL.
+* PBMSCreateBlob():Creates a new blob in the database of the given size.
 */
-extern bool PBMSCreateBlob(PBMSBlobIDPtr blob_id, char *database_name, char *cont_type, u_int64_t size);
+extern bool PBMSCreateBlob(PBMSBlobIDPtr blob_id, char *database_name, u_int64_t size);
 
 /* 
 * PBMSWriteBlob():Write the data to the blob in one or more chunks. The total size of all the chuncks of 
 * data written to the blob must match the size specified when the blob was created.
 */
-extern bool PBMSWriteBlob(PBMSBlobIDPtr blob_id, char *database_name, char *data, size_t size, size_t offset);
+extern bool PBMSWriteBlob(PBMSBlobIDPtr blob_id, char *data, size_t size, size_t offset);
 
 /* 
 * PBMSReadBlob():Read the blob data out of the blob in one or more chunks.
 */
-extern bool PBMSReadBlob(PBMSBlobIDPtr blob_id, char *database_name, char *buffer, size_t *size, size_t offset);
+extern bool PBMSReadBlob(PBMSBlobIDPtr blob_id, char *buffer, size_t *size, size_t offset);
 
 /*
 * PBMSIDToURL():Convert a blob id to a blob URL. The 'url' buffer must be atleast  PBMS_BLOB_URL_SIZE bytes in size.
 */
-extern bool PBMSIDToURL(PBMSBlobIDPtr blob_id, char *database_name, char *url);
+extern bool PBMSIDToURL(PBMSBlobIDPtr blob_id, char *url);
 
 /*
 * PBMSIDToURL():Convert a blob URL to a blob ID.

=== added file 'storage/pbxt/src/pbms_enabled.cc'
--- a/storage/pbxt/src/pbms_enabled.cc	1970-01-01 00:00:00 +0000
+++ b/storage/pbxt/src/pbms_enabled.cc	2009-08-18 07:46:53 +0000
@@ -0,0 +1,238 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Barry Leslie
+ *
+ * 2009-07-16
+ *
+ * H&G2JCtL
+ *
+ * PBMS interface used to enable engines for use with the PBMS engine.
+ *
+ * For an example on how to build this into an engine have a look at the PBXT engine
+ * in file ha_pbxt.cc. Search for 'PBMS_ENABLED'.
+ *
+ */
+
+#define PBMS_API	pbms_enabled_api
+
+#include "pbms_enabled.h"
+#ifdef DRIZZLED
+#include <sys/stat.h>
+#include <drizzled/common_includes.h>
+#include <drizzled/plugin.h>
+#else
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#define session_alloc(sess, size) thd_alloc(sess, size);
+#define current_session current_thd
+#endif 
+
+#define GET_BLOB_FIELD(t, i) (Field_blob *)(t->field[t->s->blob_field[i]])
+#define DB_NAME(f) (f->table->s->db.str)
+#define TAB_NAME(f) (*(f->table_name))
+
+static PBMS_API pbms_api;
+
+PBMSEngineRec enabled_engine = {
+	MS_ENGINE_VERSION
+};
+
+//====================
+bool pbms_initialize(const char *engine_name, bool isServer, PBMSResultPtr result)
+{
+	int						err;
+
+	strncpy(enabled_engine.ms_engine_name, engine_name, 32);
+	enabled_engine.ms_internal = isServer;
+	enabled_engine.ms_engine_name[31] = 0;
+
+	err = pbms_api.registerEngine(&enabled_engine, result);
+
+	return (err == 0);
+}
+
+
+//====================
+void pbms_finalize()
+{
+	pbms_api.deregisterEngine(&enabled_engine);
+}
+
+//====================
+int pbms_write_row_blobs(TABLE *table, uchar *row_buffer, PBMSResultPtr result)
+{
+	Field_blob *field;
+	char *blob_rec, *blob;
+	size_t packlength, i, org_length, length;
+	char blob_url_buffer[PBMS_BLOB_URL_SIZE];
+	int err;
+	String type_name;
+
+	if (table->s->blob_fields == 0)
+		return 0;
+		
+	for (i= 0; i < table->s->blob_fields; i++) {
+		field = GET_BLOB_FIELD(table, i);
+
+		// Note: field->type() always returns MYSQL_TYPE_BLOB regardless of the type of BLOB
+		field->sql_type(type_name);
+		if (strcasecmp(type_name.c_ptr(), "LongBlob"))
+			continue;
+			
+		// Get the blob record:
+		blob_rec = (char *)row_buffer + field->offset(field->table->record[0]);
+		packlength = field->pack_length() - field->table->s->blob_ptr_size;
+
+		memcpy(&blob, blob_rec +packlength, sizeof(char*));
+		org_length = field->get_length((uchar *)blob_rec);
+
+		
+		// Signal PBMS to record a new reference to the BLOB.
+		// If 'blob' is not a BLOB URL then it will be stored in the repository as a new BLOB
+		// and a reference to it will be created.
+		err = pbms_api.retainBlob(DB_NAME(field), TAB_NAME(field), blob_url_buffer, blob, org_length, field->field_index, result);
+		if (err)
+			return err;
+			
+		// If the BLOB length changed reset it. 
+		// This will happen if the BLOB data was replaced with a BLOB reference. 
+		length = strlen(blob_url_buffer)  +1;
+		if ((length != org_length) || memcmp(blob_url_buffer, blob, length)) {
+			if (length != org_length) {
+				field->store_length((uchar *)blob_rec, packlength, length);
+			}
+			
+			if (length > org_length) {
+				// This can only happen if the BLOB URL is actually larger than the BLOB itself.
+				blob = (char *) session_alloc(current_session, length);
+				memcpy(blob_rec+packlength, &blob, sizeof(char*));
+			}			
+			memcpy(blob, blob_url_buffer, length);
+		} 
+	}
+	
+	return 0;
+}
+
+//====================
+int pbms_delete_row_blobs(TABLE *table, const uchar *row_buffer, PBMSResultPtr result)
+{
+	Field_blob *field;
+	const char *blob_rec;
+	char *blob;
+	size_t packlength, i, length;
+	int err;
+	String type_name;
+
+	if (table->s->blob_fields == 0)
+		return 0;
+		
+	for (i= 0; i < table->s->blob_fields; i++) {
+		field = GET_BLOB_FIELD(table, i);
+
+		// Note: field->type() always returns MYSQL_TYPE_BLOB regardless of the type of BLOB
+		field->sql_type(type_name);
+		if (strcasecmp(type_name.c_ptr(), "LongBlob"))
+			continue;
+			
+		// Get the blob record:
+		blob_rec = (char *)row_buffer + field->offset(field->table->record[0]);
+		packlength = field->pack_length() - field->table->s->blob_ptr_size;
+
+		length = field->get_length((uchar *)blob_rec);
+		memcpy(&blob, blob_rec +packlength, sizeof(char*));
+		
+		// Signal PBMS to delete the reference to the BLOB.
+		err = pbms_api.releaseBlob(DB_NAME(field), TAB_NAME(field), blob, length, result);
+		if (err)
+			return err;
+	}
+	
+	return 0;
+}
+
+#define MAX_NAME_SIZE 64
+static void parse_table_path(const char *path, char *db_name, char *tab_name)
+{
+	const char *ptr = path + strlen(path) -1, *eptr;
+	int len;
+	
+	*db_name = *tab_name = 0;
+	
+	while ((ptr > path) && (*ptr != '/'))ptr --;
+	if (*ptr != '/') 
+		return;
+		
+	strncpy(tab_name, ptr+1, MAX_NAME_SIZE);
+	tab_name[MAX_NAME_SIZE-1] = 0;
+	eptr = ptr;
+	ptr--;
+	
+	while ((ptr > path) && (*ptr != '/'))ptr --;
+	if (*ptr != '/') 
+		return;
+	ptr++;
+	
+	len = eptr - ptr;
+	if (len >= MAX_NAME_SIZE)
+		len = MAX_NAME_SIZE-1;
+		
+	memcpy(db_name, ptr, len);
+	db_name[len] = 0;
+	
+}
+
+//====================
+int pbms_rename_table_with_blobs(const char *old_table_path, const char *new_table_path, PBMSResultPtr result)
+{
+	char o_db_name[MAX_NAME_SIZE], n_db_name[MAX_NAME_SIZE], o_tab_name[MAX_NAME_SIZE], n_tab_name[MAX_NAME_SIZE];
+
+	parse_table_path(old_table_path, o_db_name, o_tab_name);
+	parse_table_path(new_table_path, n_db_name, n_tab_name);
+	
+	if (strcmp(o_db_name, n_db_name)) {
+		result->mr_code = MS_ERR_INVALID_OPERATION;
+		strcpy(result->mr_message, "PBMS does not support renaming tables across databases.");
+		strcpy(result->mr_stack, "pbms_rename_table_with_blobs()");
+		return MS_ERR_INVALID_OPERATION;
+	}
+	
+	
+	 return pbms_api.renameTable(o_db_name, o_tab_name, n_tab_name, result);
+}
+
+//====================
+int pbms_delete_table_with_blobs(const char *table_path, PBMSResultPtr result)
+{
+	char db_name[MAX_NAME_SIZE], tab_name[MAX_NAME_SIZE];
+		
+	parse_table_path(table_path, db_name, tab_name);
+
+	return pbms_api.dropTable(db_name, tab_name, result);
+}
+
+//====================
+void pbms_completed(TABLE *table, bool ok)
+{
+	if ((!table) || (table->s->blob_fields != 0))
+		pbms_api.completed(ok) ;
+		
+	 return ;
+}
+

=== added file 'storage/pbxt/src/pbms_enabled.h'
--- a/storage/pbxt/src/pbms_enabled.h	1970-01-01 00:00:00 +0000
+++ b/storage/pbxt/src/pbms_enabled.h	2009-08-18 07:46:53 +0000
@@ -0,0 +1,110 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Barry Leslie
+ *
+ * 2009-07-16
+ *
+ * H&G2JCtL
+ *
+ * PBMS interface used to enable engines for use with the PBMS engine.
+ *
+ * For an example on how to build this into an engine have a look at the PBXT engine
+ * in file ha_pbxt.cc. Search for 'PBMS_ENABLED'.
+ *
+ */
+
+
+#ifndef __PBMS_ENABLED_H__
+#define __PBMS_ENABLED_H__
+
+#include "pbms.h"
+
+#ifdef DRIZZLED
+#include <drizzled/server_includes.h>
+#define TABLE Table
+#else
+#include <mysql_priv.h>
+#endif
+
+/*
+ * pbms_initialize() should be called from the engine plugin's 'init()' function.
+ * The engine_name is the name of your engine, "PBXT" or "InnoDB" for example.
+ *
+ * The isServer flag indicates if this entire server is being enabled. This is only
+ * true if this is being built into the server's handler code above the engine level
+ * calls. 
+ */
+extern bool pbms_initialize(const char *engine_name, bool isServer, PBMSResultPtr result);
+
+/*
+ * pbms_finalize() should be called from the engine plugin's 'deinit()' function.
+ */
+extern void pbms_finalize();
+
+/*
+ * pbms_write_row_blobs() should be called from the engine's 'write_row' function.
+ * It can alter the row data so it must be called before any other function using the row data.
+ * It should also be called from engine's 'update_row' function for the new row.
+ *
+ * pbms_completed() must be called after calling pbms_write_row_blobs() and just before
+ * returning from write_row() to indicate if the operation completed successfully.
+ */
+extern int pbms_write_row_blobs(TABLE *table, uchar *buf, PBMSResultPtr result);
+
+/*
+ * pbms_delete_row_blobs() should be called from the engine's 'delete_row' function.
+ * It should also be called from engine's 'update_row' function for the old row.
+ *
+ * pbms_completed() must be called after calling pbms_delete_row_blobs() and just before
+ * returning from delete_row() to indicate if the operation completed successfully.
+ */
+extern int pbms_delete_row_blobs(TABLE *table, const uchar *buf, PBMSResultPtr result);
+
+/*
+ * pbms_rename_table_with_blobs() should be called from the engine's 'rename_table' function.
+ *
+ * NOTE: Renaming tables across databases is not supported.
+ *
+ * pbms_completed() must be called after calling pbms_rename_table_with_blobs() and just before
+ * returning from rename_table() to indicate if the operation completed successfully.
+ */
+extern int pbms_rename_table_with_blobs(const char *old_table_path, const char *new_table_path, PBMSResultPtr result);
+
+/*
+ * pbms_delete_table_with_blobs() should be called from the engine's 'delete_table' function.
+ *
+ * NOTE: Currently pbms_delete_table_with_blobs() cannot be undone, so it should only
+ * be called after the host engine has successfully dropped its table.
+ *
+ * pbms_completed() must be called after calling pbms_delete_table_with_blobs() and just before
+ * returning from delete_table() to indicate if the operation completed successfully.
+ */
+extern int pbms_delete_table_with_blobs(const char *table_path, PBMSResultPtr result);
+
+/*
+ * pbms_completed() must be called to indicate success or failure of an operation after having
+ * called pbms_write_row_blobs(), pbms_delete_row_blobs(), pbms_rename_table_with_blobs(), or
+ * pbms_delete_table_with_blobs().
+ *
+ * pbms_completed() has the effect of committing or rolling back the changes made if the session
+ * is in 'autocommit' mode.
+ */
+extern void pbms_completed(TABLE *table, bool ok);
+
+#endif

=== modified file 'storage/pbxt/src/pthread_xt.cc'
--- a/storage/pbxt/src/pthread_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/pthread_xt.cc	2009-08-18 07:46:53 +0000
@@ -395,20 +395,31 @@ int xt_p_cond_timedwait(xt_cond_type *co
 
 int xt_p_join(pthread_t thread, void **value)
 {
-	switch (WaitForSingleObject(thread, INFINITE)) {
-		case WAIT_OBJECT_0: 
-		case WAIT_TIMEOUT:
-			/* Don't do this! According to the Win docs:
-			 * _endthread automatically closes the thread handle 
-			 * (whereas _endthreadex does not). Therefore, when using 
-			 * _beginthread and _endthread, do not explicitly close the 
-			 * thread handle by calling the Win32 CloseHandle API.
-			CloseHandle(thread);
-			 */
-			break;
-		case WAIT_FAILED:
-			return GetLastError();
+	DWORD exitcode;
+
+	while(1) {
+		switch (WaitForSingleObject(thread, 10000)) {
+			case WAIT_OBJECT_0:
+				return 0;
+			case WAIT_TIMEOUT:
+				/* Don't do this! According to the Win docs:
+				 * _endthread automatically closes the thread handle
+				 * (whereas _endthreadex does not). Therefore, when using
+				 * _beginthread and _endthread, do not explicitly close the
+				 * thread handle by calling the Win32 CloseHandle API.
+				CloseHandle(thread);
+				 */
+				/* This is done so that if the thread was not [yet] in the running
+				 * state when this function was called we won't deadlock here.
+				 */
+				if (GetExitCodeThread(thread, &exitcode) && (exitcode == STILL_ACTIVE))
+					break;
+				return 0;
+			case WAIT_FAILED:
+				return GetLastError();
+		}
 	}
+
 	return 0;
 }
 

=== modified file 'storage/pbxt/src/restart_xt.cc'
--- a/storage/pbxt/src/restart_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/restart_xt.cc	2009-08-18 07:46:53 +0000
@@ -410,7 +410,7 @@ typedef struct XTOperation {
 	xtLogOffset				or_log_offset;
 } XTOperationRec, *XTOperationPtr;
 
-static int xres_cmp_op_seq(struct XTThread *self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int xres_cmp_op_seq(struct XTThread *XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtOpSeqNo		lf_op_seq = *((xtOpSeqNo *) a);
 	XTOperationPtr	lf_ptr = (XTOperationPtr) b;
@@ -480,19 +480,6 @@ static xtBool xres_add_index_entries(XTO
 		return OK;
 
 	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
-		/*
-		key.sk_on_key = FALSE;
-		key.sk_key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
-		key.sk_key_value.sv_rec_id = rec_offset;
-		key.sk_key_value.sv_key = key.sk_key_buf;
-		key.sk_key_value.sv_length = myxt_create_key_from_row(*ind, key.sk_key_buf, rec_data, NULL);
-		if (!xt_idx_search(ot, *ind, &key)) {
-			ot->ot_err_index_no = (*ind)->mi_index_no;
-			return FAILED;
-		}
-		if (!key.sk_on_key) {
-		}
-		*/
 		if (!xt_idx_insert(ot, *ind, row_id, rec_id, rec_data, NULL, TRUE)) {
 			/* Check the error, certain errors are recoverable! */
 			XTThreadPtr self = xt_get_self();
@@ -509,7 +496,7 @@ static xtBool xres_add_index_entries(XTO
 			/* TODO: Write something to the index header to indicate that
 			 * it is corrupted.
 			 */
-			tab->tab_dic.dic_disable_index = XT_INDEX_CORRUPTED;
+			xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
 			xt_log_and_clear_exception_ns();
 			return OK;
 		}
@@ -642,6 +629,9 @@ static void xres_apply_change(XTThreadPt
 	xtWord1				*rec_data = NULL;
 	XTTabRecFreeDPtr	free_data;
 
+	if (tab->tab_dic.dic_key_count == 0)
+		check_index = FALSE;
+
 	switch (record->xl.xl_status_1) {
 		case XT_LOG_ENT_REC_MODIFIED:
 		case XT_LOG_ENT_UPDATE:
@@ -651,20 +641,25 @@ static void xres_apply_change(XTThreadPt
 		case XT_LOG_ENT_INSERT_BG:
 		case XT_LOG_ENT_DELETE_BG:
 			rec_id = XT_GET_DISK_4(record->xu.xu_rec_id_4);
+
+			/* This should be done before we apply the change to the table, as otherwise we lose
+			 * the key value that we need to remove from the index
+			 */
+			if (check_index && record->xl.xl_status_1 == XT_LOG_ENT_REC_MODIFIED) {
+				if ((rec_data = xres_load_record(self, ot, rec_id, NULL, 0, rec_buf, tab->tab_dic.dic_ind_cols_req)))
+					xres_remove_index_entries(ot, rec_id, rec_data);			
+			}
+
 			len = (size_t) XT_GET_DISK_2(record->xu.xu_size_2);
 			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), len, (xtWord1 *) &record->xu.xu_rec_type_1, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
 				xt_throw(self);
 			tab->tab_bytes_to_flush += len;
 
-			if (check_index && ot->ot_table->tab_dic.dic_key_count) {
+			if (check_index) {
 				switch (record->xl.xl_status_1) {
 					case XT_LOG_ENT_DELETE:
 					case XT_LOG_ENT_DELETE_BG:
 						break;
-					case XT_LOG_ENT_REC_MODIFIED:
-						if ((rec_data = xres_load_record(self, ot, rec_id, NULL, 0, rec_buf, tab->tab_dic.dic_ind_cols_req)))
-							xres_remove_index_entries(ot, rec_id, rec_data);
-						/* No break required: */
 					default:
 						if ((rec_data = xres_load_record(self, ot, rec_id, &record->xu.xu_rec_type_1, len, rec_buf, tab->tab_dic.dic_ind_cols_req))) {
 							row_id = XT_GET_DISK_4(record->xu.xu_row_id_4);
@@ -859,9 +854,6 @@ static void xres_apply_change(XTThreadPt
 						goto do_rec_freed;
 					record_loaded = TRUE;
 				}
-#ifdef XT_STREAMING
-				myxt_release_blobs(ot, rec_data, rec_id);
-#endif
 			}
 
 			if (record->xl.xl_status_1 == XT_LOG_ENT_REC_REMOVED_EXT) {
@@ -967,31 +959,12 @@ static void xres_apply_change(XTThreadPt
 
 			if (check_index) {
 				cols_required = tab->tab_dic.dic_ind_cols_req;
-#ifdef XT_STREAMING
-				if (tab->tab_dic.dic_blob_cols_req > cols_required)
-					cols_required = tab->tab_dic.dic_blob_cols_req;
-#endif
 				if (!(rec_data = xres_load_record(self, ot, rec_id, &record->rb.rb_rec_type_1, rec_size, rec_buf, cols_required)))
 					goto go_on_to_free;
 				record_loaded = TRUE;
 				xres_remove_index_entries(ot, rec_id, rec_data);
 			}
 
-#ifdef XT_STREAMING
-			if (tab->tab_dic.dic_blob_count) {
-				if (!record_loaded) {
-					cols_required = tab->tab_dic.dic_blob_cols_req;
-					if (!(rec_data = xres_load_record(self, ot, rec_id, &record->rb.rb_rec_type_1, rec_size, rec_buf, cols_required)))
-						/* [(7)] REMOVE is followed by FREE:
-						goto get_rec_offset;
-						*/
-						goto go_on_to_free;
-					record_loaded = TRUE;
-				}
-				myxt_release_blobs(ot, rec_data, rec_id);
-			}
-#endif
-
 			if (data_log_id && data_log_offset && log_over_size) {
 				if (!ot->ot_thread->st_dlog_buf.dlb_delete_log(data_log_id, data_log_offset, log_over_size, tab->tab_id, rec_id, self)) {
 					if (ot->ot_thread->t_exception.e_xt_err != XT_ERR_BAD_EXT_RECORD &&
@@ -1560,7 +1533,7 @@ static xtBool xres_delete_data_log(XTDat
 	return OK;
 }
 
-static int xres_comp_flush_tabs(XTThreadPtr self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int xres_comp_flush_tabs(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtTableID				tab_id = *((xtTableID *) a);
 	XTCheckPointTablePtr	cp_tab = (XTCheckPointTablePtr) b;
@@ -1868,7 +1841,7 @@ void XTXactRestart::xres_init(XTThreadPt
 	exit_();
 }
 
-void XTXactRestart::xres_exit(XTThreadPtr self __attribute__((unused)))
+void XTXactRestart::xres_exit(XTThreadPtr XT_UNUSED(self))
 {
 }
 
@@ -2700,7 +2673,7 @@ xtPublic xtBool xt_begin_checkpoint(XTDa
 		XTXactSegPtr 	seg;
 
 		seg = &db->db_xn_idx[i];
-		XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, self);
+		XT_XACT_READ_LOCK(&seg->xs_tab_lock, self);
 		for (u_int j=0; j<XT_XN_HASH_TABLE_SIZE; j++) {
 			XTXactDataPtr	xact;
 			
@@ -2716,7 +2689,7 @@ xtPublic xtBool xt_begin_checkpoint(XTDa
 				xact = xact->xd_next_xact;
 			}
 		}
-		XT_XACT_UNLOCK(&seg->xs_tab_lock, self);
+		XT_XACT_UNLOCK(&seg->xs_tab_lock, self, FALSE);
 	}
 
 #ifdef TRACE_CHECKPOINT
@@ -3201,3 +3174,36 @@ xtPublic void xt_dump_xlogs(XTDatabaseHP
 	done:
 	db->db_xlog.xlog_seq_exit(&seq);
 }
+
+/* ----------------------------------------------------------------------
+ * D A T A B A S E   R E C O V E R Y   T H R E A D
+ */
+
+extern XTDatabaseHPtr pbxt_database;
+
+static void *xn_xres_run_recovery_thread(XTThreadPtr self)
+{
+	THD *mysql_thread;
+
+	mysql_thread = (THD *)myxt_create_thread();
+
+	while(!ha_resolve_by_legacy_type(mysql_thread, DB_TYPE_PBXT))
+		xt_sleep_milli_second(1);
+
+	xt_open_database(self, mysql_real_data_home, TRUE);
+	pbxt_database = self->st_database;
+	xt_heap_reference(self, pbxt_database);
+	myxt_destroy_thread(mysql_thread, TRUE);
+
+	return NULL;
+}
+
+xtPublic void xt_xres_start_database_recovery(XTThreadPtr self)
+{
+	char name[PATH_MAX];
+
+	sprintf(name, "DB-RECOVERY-%s", xt_last_directory_of_path(mysql_real_data_home));
+	xt_remove_dir_char(name);
+	XTThreadPtr thread = xt_create_daemon(self, name);
+	xt_run_thread(self, thread, xn_xres_run_recovery_thread);
+}

=== modified file 'storage/pbxt/src/restart_xt.h'
--- a/storage/pbxt/src/restart_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/restart_xt.h	2009-08-17 11:12:36 +0000
@@ -131,4 +131,6 @@ xtWord8	xt_bytes_since_last_checkpoint(s
 void xt_print_log_record(xtLogID log, off_t offset, XTXactLogBufferDPtr record);
 void xt_dump_xlogs(struct XTDatabase *db, xtLogID start_log);
 
+xtPublic void xt_xres_start_database_recovery(XTThreadPtr self, const char *path);
+
 #endif

=== modified file 'storage/pbxt/src/sortedlist_xt.cc'
--- a/storage/pbxt/src/sortedlist_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/sortedlist_xt.cc	2009-08-17 11:12:36 +0000
@@ -234,7 +234,7 @@ xtPublic void xt_sl_delete_item_at(struc
 	XT_MEMMOVE(sl->sl_data, &sl->sl_data[idx * sl->sl_item_size], &sl->sl_data[(idx+1) * sl->sl_item_size], (sl->sl_usage_count-idx) * sl->sl_item_size);
 }
 
-xtPublic void xt_sl_remove_from_front(struct XTThread *self __attribute__((unused)), XTSortedListPtr sl, size_t items)
+xtPublic void xt_sl_remove_from_front(struct XTThread *XT_UNUSED(self), XTSortedListPtr sl, size_t items)
 {
 	if (sl->sl_usage_count <= items)
 		xt_sl_set_size(sl, 0);

=== removed file 'storage/pbxt/src/streaming_xt.cc'
--- a/storage/pbxt/src/streaming_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/streaming_xt.cc	1970-01-01 00:00:00 +0000
@@ -1,624 +0,0 @@
-/* Copyright (c) 2005 PrimeBase Technologies GmbH
- *
- * PrimeBase XT
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * 2006-06-07	Paul McCullagh
- *
- * H&G2JCtL
- *
- * This file contains PBXT streaming interface.
- */
-
-#include "xt_config.h"
-
-#ifdef XT_STREAMING
-#include "ha_pbxt.h"
-
-#include "thread_xt.h"
-#include "strutil_xt.h"
-#include "table_xt.h"
-#include "myxt_xt.h"
-#include "xaction_xt.h"
-#include "database_xt.h"
-#include "streaming_xt.h"
-
-extern PBMSEngineRec pbxt_engine;
-
-static PBMS_API pbxt_streaming;
-
-/* ----------------------------------------------------------------------
- * INIT & EXIT
- */
-
-xtPublic xtBool xt_init_streaming(void)
-{
-	XTThreadPtr				self = NULL;
-	int						err;
-	PBMSResultRec		result;
-
-	if ((err = pbxt_streaming.registerEngine(&pbxt_engine, &result))) {
-		xt_logf(XT_CONTEXT, XT_LOG_ERROR, "%s\n", result.mr_message);
-		return FAILED;
-	}
-	return OK;
-}
-
-xtPublic void xt_exit_streaming(void)
-{
-	pbxt_streaming.deregisterEngine(&pbxt_engine);
-}
-
-/* ----------------------------------------------------------------------
- * UTILITY FUNCTIONS
- */
-
-static void str_result_to_exception(XTExceptionPtr e, int r, PBMSResultPtr result)
-{
-	char *str, *end_str;
-
-	e->e_xt_err = r;
-	e->e_sys_err = result->mr_code;
-	xt_strcpy(XT_ERR_MSG_SIZE, e->e_err_msg, result->mr_message);
-
-	e->e_source_line = 0;
-	str = result->mr_stack;
-	if ((end_str = strchr(str, '('))) {
-		xt_strcpy_term(XT_MAX_FUNC_NAME_SIZE, e->e_func_name, str, '(');
-		str = end_str+1;
-		if ((end_str = strchr(str, ':'))) {
-			xt_strcpy_term(XT_SOURCE_FILE_NAME_SIZE, e->e_source_file, str, ':');
-			str = end_str+1;
-			if ((end_str = strchr(str, ')'))) {
-				char number[40];
-				
-				xt_strcpy_term(40, number, str, ')');
-				e->e_source_line = atol(number);
-				str = end_str+1;
-				if (*str == '\n')
-					str++;
-			}
-		}
-	}
-	
-	if (e->e_source_line == 0) {
-		*e->e_func_name = 0;
-		*e->e_source_file = 0;
-		xt_strcpy(XT_ERR_MSG_SIZE, e->e_catch_trace, result->mr_stack);
-	}
-	else
-		xt_strcpy(XT_ERR_MSG_SIZE, e->e_catch_trace, str);
-}
-
-static void str_exception_to_result(XTExceptionPtr e, PBMSResultPtr result)
-{
-	int len;
-
-	if (e->e_sys_err)
-		result->mr_code = e->e_sys_err;
-	else
-		result->mr_code = e->e_xt_err;
-	xt_strcpy(MS_RESULT_MESSAGE_SIZE, result->mr_message, e->e_err_msg);
-	xt_strcpy(MS_RESULT_STACK_SIZE, result->mr_stack, e->e_func_name);
-	xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, "(");
-	xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, e->e_source_file);
-	xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, ":");
-	xt_strcati(MS_RESULT_STACK_SIZE, result->mr_stack, (int) e->e_source_line);
-	xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, ")");
-	len = strlen(result->mr_stack);
-	if (strncmp(result->mr_stack, e->e_catch_trace, len) == 0)
-		xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, e->e_catch_trace + len);
-	else {
-		xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, "\n");
-		xt_strcat(MS_RESULT_STACK_SIZE, result->mr_stack, e->e_catch_trace);
-	}
-}
-
-static XTIndexPtr str_find_index(XTTableHPtr tab, u_int *col_list, u_int col_cnt)
-{
-	u_int			i, j;
-	XTIndexPtr		*ind;					/* MySQL/PBXT key description */
-
-	ind = tab->tab_dic.dic_keys;
-	for (i=0; i<tab->tab_dic.dic_key_count; i++) {
-		if ((*ind)->mi_seg_count == col_cnt) {
-			for (j=0; j<(*ind)->mi_seg_count; j++) {
-				if ((*ind)->mi_seg[j].col_idx != col_list[j])
-					goto loop;
-			}
-			return *ind;
-		}
-		
-		loop:
-		ind++;
-	}
-	return NULL;
-}
-
-static XTThreadPtr str_set_current_thread(THD *thd, PBMSResultPtr result)
-{
-	XTThreadPtr		self;
-	XTExceptionRec	e;
-
-	if (!(self = xt_ha_set_current_thread(thd, &e))) {
-		str_exception_to_result(&e, result);
-		return NULL;
-	}
-	return self;
-}
-
-/* ----------------------------------------------------------------------
- * BLOB STREAMING INTERFACE
- */
-
-static void pbxt_close_conn(void *thread)
-{
-	xt_ha_close_connection((THD *) thread);
-}
-
-static int pbxt_open_table(void *thread, const char *table_url, void **open_table, PBMSResultPtr result)
-{
-	THD				*thd = (THD *) thread;
-	XTThreadPtr		self;
-	XTTableHPtr		tab = NULL;
-	XTOpenTablePtr	ot = NULL;
-	int				err = MS_OK;
-
-	if (!(self = str_set_current_thread(thd, result)))
-		return MS_ERR_ENGINE;
-
-	try_(a) {
-		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_url);
-		if (!(tab = xt_use_table(self, (XTPathStrPtr) table_url, FALSE, TRUE, NULL))) {
-			err = MS_ERR_UNKNOWN_TABLE;
-			goto done;
-		}
-		if (!(ot = xt_open_table(tab)))
-			throw_();
-		ot->ot_thread = self;
-		done:;
-	}
-	catch_(a) {
-		str_exception_to_result(&self->t_exception, result);
-		err = MS_ERR_ENGINE;
-	}
-	cont_(a);
-	if (tab)
-		xt_heap_release(self, tab);
-	*open_table = ot;
-	return err;
-}
-
-static void pbxt_close_table(void *thread, void *open_table_ptr)
-{
-	THD						*thd = (THD *) thread;
-	volatile XTThreadPtr	self, new_self = NULL;
-	XTOpenTablePtr			ot = (XTOpenTablePtr) open_table_ptr;
-	XTExceptionRec			e;
-
-	if (thd) {
-		if (!(self = xt_ha_set_current_thread(thd, &e))) {
-			xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
-			return;
-		}
-	}
-	else if (!(self = xt_get_self())) {
-		if (!(new_self = xt_create_thread("TempForClose", FALSE, TRUE, &e))) {
-			xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
-			return;
-		}
-		self = new_self;
-	}
-
-	ot->ot_thread = self;
-	try_(a) {
-		xt_close_table(ot, TRUE, FALSE);
-	}
-	catch_(a) {
-		xt_log_and_clear_exception(self);
-	}
-	cont_(a);
-	if (new_self)
-		xt_free_thread(self);
-}
-
-static int pbxt_lock_table(void *thread, int *xact, void *open_table, int lock_type, PBMSResultPtr result)
-{
-	THD				*thd = (THD *) thread;
-	XTThreadPtr		self;
-	XTOpenTablePtr	ot = (XTOpenTablePtr) open_table;
-	int				err = MS_OK;
-
-	if (!(self = str_set_current_thread(thd, result)))
-		return MS_ERR_ENGINE;
-
-	if (lock_type != MS_LOCK_NONE) {
-		try_(a) {
-			xt_ha_open_database_of_table(self, ot->ot_table->tab_name);
-			ot->ot_thread = self;
-		}
-		catch_(a) {
-			str_exception_to_result(&self->t_exception, result);
-			err = MS_ERR_ENGINE;
-		}
-		cont_(a);
-	}
-
-	if (!err && *xact == MS_XACT_BEGIN) {
-		if (self->st_xact_data)
-			*xact = MS_XACT_NONE;
-		else {
-			if (xt_xn_begin(self)) {
-				*xact = MS_XACT_COMMIT;
-			}
-			else {
-				str_exception_to_result(&self->t_exception, result);
-				err = MS_ERR_ENGINE;
-			}
-		}
-	}
-
-	return err;
-}
-
-static int pbxt_unlock_table(void *thread, int xact, void *open_table __attribute__((unused)), PBMSResultPtr result)
-{
-	THD				*thd = (THD *) thread;
-	XTThreadPtr		self = xt_ha_thd_to_self(thd);
-	int				err = MS_OK;
-
-	if (xact == MS_XACT_COMMIT) {
-		if (!xt_xn_commit(self)) {
-			str_exception_to_result(&self->t_exception, result);
-			err = MS_ERR_ENGINE;
-		}
-	}
-	else if (xact == MS_XACT_ROLLBACK) {
-		xt_xn_rollback(self);
-	}
-
-	return err;
-}
-
-static int pbxt_send_blob(void *thread, void *open_table, const char *blob_column, const char *blob_url_p, void *stream, PBMSResultPtr result)
-{
-	THD					*thd = (THD *) thread;
-	XTThreadPtr			self = xt_ha_thd_to_self(thd);
-	XTOpenTablePtr		ot = (XTOpenTablePtr) open_table;
-	int					err = MS_OK;
-	u_int				blob_col_idx, col_idx;
-	char				col_name[XT_IDENTIFIER_NAME_SIZE];
-	XTStringBufferRec	value;
-	u_int				col_list[XT_MAX_COLS_PER_INDEX];
-	u_int				col_cnt;
-	char				col_names[XT_ERR_MSG_SIZE - 200];
-	XTIdxSearchKeyRec	search_key;
-	XTIndexPtr			ind;
-	char				*blob_data;
-	size_t				blob_len;
-	const char			*blob_url = blob_url_p;
-
-	memset(&value, 0, sizeof(value));
-
-	*col_names = 0;
-
-	ot->ot_thread = self;
-	try_(a) {
-		if (ot->ot_row_wbuf_size < ot->ot_table->tab_dic.dic_mysql_buf_size) {
-			xt_realloc(self, (void **) &ot->ot_row_wbuffer, ot->ot_table->tab_dic.dic_mysql_buf_size);
-			ot->ot_row_wbuf_size = ot->ot_table->tab_dic.dic_mysql_buf_size;
-		}
-
-		xt_strcpy_url(XT_IDENTIFIER_NAME_SIZE, col_name, blob_column);
-		if (!myxt_find_column(ot, &blob_col_idx, col_name))
-			xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_COLUMN_NOT_FOUND, ot->ot_table->tab_name, blob_column);
-
-		/* Prepare a row for the condition: */
-		const char *ptr;
-
-		col_cnt = 0;
-		while (*blob_url) {
-			ptr = xt_strchr(blob_url, '=');
-			xt_strncpy_url(XT_IDENTIFIER_NAME_SIZE, col_name, blob_url, (size_t) (ptr - blob_url));
-			if (!myxt_find_column(ot, &col_idx, col_name))
-				xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_COLUMN_NOT_FOUND, ot->ot_table->tab_name, col_name);
-			if (*col_names)
-				xt_strcat(sizeof(col_names), col_names, ", ");
-			xt_strcat(sizeof(col_names), col_names, col_name);
-			blob_url = ptr;
-			if (*blob_url == '=')
-				blob_url++;
-			ptr = xt_strchr(blob_url, '&');
-			value.sb_len = 0;
-			xt_sb_concat_url_len(self, &value, blob_url, (size_t) (ptr - blob_url));
-			blob_url = ptr;
-			if (*blob_url == '&')
-				blob_url++;
-			if (!myxt_set_column(ot, (char *) ot->ot_row_rbuffer, col_idx, value.sb_cstring, value.sb_len))
-				xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_CONVERSION, ot->ot_table->tab_name, col_name);
-			if (col_cnt < XT_MAX_COLS_PER_INDEX) {
-				col_list[col_cnt] = col_idx;
-				col_cnt++;
-			}
-		}
-
-		/* Find a matching index: */		
-		if (!(ind = str_find_index(ot->ot_table, col_list, col_cnt)))
-			xt_throw_ixterr(XT_CONTEXT, XT_ERR_NO_MATCHING_INDEX, col_names);
-
-		search_key.sk_key_value.sv_flags = 0;
-		search_key.sk_key_value.sv_rec_id = 0;
-		search_key.sk_key_value.sv_row_id = 0;
-		search_key.sk_key_value.sv_key = search_key.sk_key_buf;
-		search_key.sk_key_value.sv_length = myxt_create_key_from_row(ind, search_key.sk_key_buf, ot->ot_row_rbuffer, NULL);
-		search_key.sk_on_key = FALSE;
-
-		if (!xt_idx_search(ot, ind, &search_key))
-			xt_throw(self);
-
-		if (!ot->ot_curr_rec_id)
-			xt_throw_taberr(XT_CONTEXT, XT_ERR_NO_ROWS, ot->ot_table->tab_name);
-			
-		while (ot->ot_curr_rec_id) {
-			if (!search_key.sk_on_key)
-				xt_throw_taberr(XT_CONTEXT, XT_ERR_NO_ROWS, ot->ot_table->tab_name);
-
-			retry:
-			/* X TODO - Check if the write buffer is big enough here! */
-			switch (xt_tab_read_record(ot, ot->ot_row_wbuffer)) {
-				case FALSE:
-					if (xt_idx_next(ot, ind, &search_key))
-						break;
-				case XT_ERR:
-					xt_throw(self);
-				case XT_NEW:
-					if (xt_idx_match_search(ot, ind, &search_key, ot->ot_row_wbuffer, XT_S_MODE_MATCH))
-						goto success;
-					if (!xt_idx_next(ot, ind, &search_key))
-						xt_throw(self);
-					break;
-				case XT_RETRY:
-					goto retry;
-				default:
-					goto success;
-			}
-		}
-
-		success:
-		myxt_get_column_data(ot, (char *) ot->ot_row_wbuffer, blob_col_idx, &blob_data, &blob_len);
-
-		/* 
-		 * Write the content length, then write the HTTP
-		 * header, and then the content.
-		 */
-		err = pbxt_streaming.setContentLength(stream, blob_len, result);
-		if (!err)
-			err = pbxt_streaming.writeHead(stream, result);
-		if (!err)
-			err = pbxt_streaming.writeStream(stream, (void *) blob_data, blob_len, result);
-	}
-	catch_(a) {
-		str_exception_to_result(&self->t_exception, result);
-		if (result->mr_code == XT_ERR_NO_ROWS)
-			err = MS_ERR_NOT_FOUND;
-		else
-			err = MS_ERR_ENGINE;
-	}
-	cont_(a);
-	xt_sb_set_size(NULL, &value, 0);
-	return err;
-}
-
-int pbxt_lookup_ref(void *thread, void *open_table, unsigned short col_index, PBMSEngineRefPtr eng_ref, PBMSFieldRefPtr field_ref, PBMSResultPtr result)
-{
-	THD				*thd = (THD *) thread;
-	XTThreadPtr		self = xt_ha_thd_to_self(thd);
-	XTOpenTablePtr	ot = (XTOpenTablePtr) open_table;
-	int				err = MS_OK;
-	u_int			i, len;
-	char			*data;
-	XTIndexPtr		ind = NULL;
-
-	ot->ot_thread = self;
-	if (ot->ot_row_wbuf_size < ot->ot_table->tab_dic.dic_mysql_buf_size) {
-		xt_realloc(self, (void **) &ot->ot_row_wbuffer, ot->ot_table->tab_dic.dic_mysql_buf_size);
-		ot->ot_row_wbuf_size = ot->ot_table->tab_dic.dic_mysql_buf_size;
-	}
-
-	ot->ot_curr_rec_id = (xtRecordID) XT_GET_DISK_8(eng_ref->er_data);
-	switch (xt_tab_dirty_read_record(ot, ot->ot_row_wbuffer)) {
-		case FALSE:
-			err = MS_ERR_ENGINE;
-			break;
-		default:
-			break;
-	}
-
-	if (err) {
-		str_exception_to_result(&self->t_exception, result);
-		goto exit;
-	}
-
-	myxt_get_column_name(ot, col_index, PBMS_FIELD_COL_SIZE, field_ref->fr_column);
-
-	for (i=0; i<ot->ot_table->tab_dic.dic_key_count; i++) {
-		ind = ot->ot_table->tab_dic.dic_keys[i];
-		if (ind->mi_flags & (HA_UNIQUE_CHECK | HA_NOSAME))
-			break; 
-	}
-
-	if (ind) {
-		len = 0;
-		data = field_ref->fr_cond;
-		for (i=0; i<ind->mi_seg_count; i++) {
-			if (i > 0) {
-				xt_strcat(PBMS_FIELD_COND_SIZE, data, "&");
-				len = strlen(data);
-			}
-			myxt_get_column_name(ot, ind->mi_seg[i].col_idx, PBMS_FIELD_COND_SIZE - len, data + len);
-			len = strlen(data);
-			xt_strcat(PBMS_FIELD_COND_SIZE, data, "=");
-			len = strlen(data);
-			myxt_get_column_as_string(ot, (char *) ot->ot_row_wbuffer, ind->mi_seg[i].col_idx, PBMS_FIELD_COND_SIZE - len, data + len);
-			len = strlen(data);
-		}
-	}
-	else
-		xt_strcpy(PBMS_FIELD_COND_SIZE, field_ref->fr_cond, "*no unique key*");
-
-	exit:
-	return err;
-}
-
-PBMSEngineRec pbxt_engine = {
-	MS_ENGINE_VERSION,
-	0,
-	FALSE,
-	"PBXT",
-	NULL,
-	pbxt_close_conn,
-	pbxt_open_table,
-	pbxt_close_table,
-	pbxt_lock_table,
-	pbxt_unlock_table,
-	pbxt_send_blob,
-	pbxt_lookup_ref
-};
-
-/* ----------------------------------------------------------------------
- * CALL IN FUNCTIONS
- */
-
-xtPublic void xt_pbms_close_all_tables(const char *table_url)
-{
-	pbxt_streaming.closeAllTables(table_url);
-}
-
-xtPublic xtBool xt_pbms_close_connection(void *thd, XTExceptionPtr e)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.closeConn(thd, &result);
-	if (err) {
-		str_result_to_exception(e, err, &result);
-		return FAILED;
-	}
-	return OK;
-}
-
-xtPublic xtBool xt_pbms_open_table(void **open_table, char *table_path)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.openTable(open_table, table_path, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		return FAILED;
-	}
-	return OK;
-}
-
-xtPublic void xt_pbms_close_table(void *open_table)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.closeTable(open_table, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		xt_log_exception(thread, &thread->t_exception, XT_LOG_DEFAULT);
-	}
-}
-
-xtPublic xtBool xt_pbms_use_blob(void *open_table, char **ret_blob_url, char *blob_url, unsigned short col_index)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.useBlob(open_table, ret_blob_url, blob_url, col_index, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		return FAILED;
-	}
-	return OK;
-}
-
-xtPublic xtBool xt_pbms_retain_blobs(void *open_table, PBMSEngineRefPtr eng_ref)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.retainBlobs(open_table, eng_ref, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		return FAILED;
-	}
-	return OK;
-}
-
-xtPublic void xt_pbms_release_blob(void *open_table, char *blob_url, unsigned short col_index, PBMSEngineRefPtr eng_ref)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.releaseBlob(open_table, blob_url, col_index, eng_ref, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		xt_log_exception(thread, &thread->t_exception, XT_LOG_DEFAULT);
-	}
-}
-
-xtPublic void xt_pbms_drop_table(const char *table_path)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.dropTable(table_path, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		xt_log_exception(thread, &thread->t_exception, XT_LOG_DEFAULT);
-	}
-}
-
-xtPublic void xt_pbms_rename_table(const char *from_table, const char *to_table)
-{
-	PBMSResultRec	result;
-	int				err;
-
-	err = pbxt_streaming.renameTable(from_table, to_table, &result);
-	if (err) {
-		XTThreadPtr	thread = xt_get_self();
-
-		str_result_to_exception(&thread->t_exception, err, &result);
-		xt_log_exception(thread, &thread->t_exception, XT_LOG_DEFAULT);
-	}
-}
-
-#endif // XT_STREAMING

=== removed file 'storage/pbxt/src/streaming_xt.h'
--- a/storage/pbxt/src/streaming_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/streaming_xt.h	1970-01-01 00:00:00 +0000
@@ -1,46 +0,0 @@
-/* Copyright (c) 2005 PrimeBase Technologies GmbH
- *
- * PrimeBase XT
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * 2006-06-07	Paul McCullagh
- *
- * H&G2JCtL
- *
- * This file contains PBXT streaming interface.
- */
-
-#ifndef __streaming_xt_h__
-#define __streaming_xt_h__
-
-#include "xt_defs.h"
-#define PBMS_API	pbms_api_PBXT
-#include "pbms.h"
-
-xtBool xt_init_streaming(void);
-void xt_exit_streaming(void);
-
-void	xt_pbms_close_all_tables(const char *table_url);
-xtBool	xt_pbms_close_connection(void *thd, XTExceptionPtr e);
-xtBool	xt_pbms_open_table(void **open_table, char *table_path);
-void	xt_pbms_close_table(void *open_table);
-xtBool	xt_pbms_use_blob(void *open_table, char **ret_blob_url, char *blob_url, unsigned short col_index);
-xtBool	xt_pbms_retain_blobs(void *open_table, PBMSEngineRefPtr eng_ref);
-void	xt_pbms_release_blob(void *open_table, char *blob_url, unsigned short col_index, PBMSEngineRefPtr eng_ref);
-void	xt_pbms_drop_table(const char *table_path);
-void	xt_pbms_rename_table(const char *from_table, const char *to_table);
-
-#endif

=== modified file 'storage/pbxt/src/strutil_xt.cc'
--- a/storage/pbxt/src/strutil_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/strutil_xt.cc	2009-08-18 07:46:53 +0000
@@ -365,9 +365,10 @@ xtPublic void xt_int8_to_byte_size(xtInt
 	sprintf(string, "%s %s (%"PRId64" bytes)", val_str, unit, value);
 }
 
+/* Version number must also be set in configure.in! */
 xtPublic c_char *xt_get_version(void)
 {
-	return "1.0.08 RC";
+	return "1.0.08c RC";
 }
 
 /* Copy and URL decode! */

=== modified file 'storage/pbxt/src/systab_xt.cc'
--- a/storage/pbxt/src/systab_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/systab_xt.cc	2009-08-17 11:12:36 +0000
@@ -217,7 +217,7 @@ bool XTLocationTable::seqScanNext(char *
 	uint32		len;
 	Field		*curr_field;
 	byte		*save;
-	MY_BITMAP	*save_write_set;
+	MX_BITMAP	*save_write_set;
 
 	last_access = CS_GET_DISK_4(blob->rb_last_access_4);
 	last_ref = CS_GET_DISK_4(blob->rb_last_ref_4);
@@ -336,7 +336,6 @@ bool XTLocationTable::seqScanNext(char *
 	table->write_set = save_write_set;
 	return true;
 #endif
-	return false;
 }
 
 void XTLocationTable::loadRow(char *buf, xtWord4 row_id)
@@ -345,7 +344,7 @@ void XTLocationTable::loadRow(char *buf,
 	Field			*curr_field;
 	XTTablePathPtr	tp_ptr;
 	byte			*save;
-	MY_BITMAP		*save_write_set;
+	MX_BITMAP		*save_write_set;
 
 	/* ASSERT_COLUMN_MARKED_FOR_WRITE is failing when
 	 * I use store()!??
@@ -386,7 +385,7 @@ void XTLocationTable::loadRow(char *buf,
 	table->write_set = save_write_set;
 }
 
-xtWord4 XTLocationTable::seqScanPos(xtWord1 *buf __attribute__((unused)))
+xtWord4 XTLocationTable::seqScanPos(xtWord1 *XT_UNUSED(buf))
 {
 	return lt_index-1;
 }
@@ -451,7 +450,7 @@ bool XTStatisticsTable::seqScanNext(char
 void XTStatisticsTable::loadRow(char *buf, xtWord4 rec_id)
 {
 	TABLE			*table = ost_my_table;
-	MY_BITMAP		*save_write_set;
+	MX_BITMAP		*save_write_set;
 	Field			*curr_field;
 	byte			*save;
 	const char		*stat_name;
@@ -503,7 +502,7 @@ void XTStatisticsTable::loadRow(char *bu
 	table->write_set = save_write_set;
 }
 
-xtWord4 XTStatisticsTable::seqScanPos(xtWord1 *buf __attribute__((unused)))
+xtWord4 XTStatisticsTable::seqScanPos(xtWord1 *XT_UNUSED(buf))
 {
 	return tt_index-1;
 }
@@ -531,14 +530,14 @@ void st_path_to_table_name(size_t size, 
 		*str = '.';
 }
 
-void XTSystemTableShare::startUp(XTThreadPtr self __attribute__((unused)))
+void XTSystemTableShare::startUp(XTThreadPtr XT_UNUSED(self))
 {
 	thr_lock_init(&sys_location_lock);
 	thr_lock_init(&sys_statistics_lock);
 	sys_lock_inited = TRUE;
 }
 
-void XTSystemTableShare::shutDown(XTThreadPtr self __attribute__((unused)))
+void XTSystemTableShare::shutDown(XTThreadPtr XT_UNUSED(self))
 {
 	if (sys_lock_inited) {
 		thr_lock_delete(&sys_location_lock);
@@ -588,7 +587,7 @@ bool XTSystemTableShare::doesSystemTable
 	return false;
 }
 
-void XTSystemTableShare::createSystemTables(XTThreadPtr self __attribute__((unused)), XTDatabaseHPtr db __attribute__((unused)))
+void XTSystemTableShare::createSystemTables(XTThreadPtr XT_UNUSED(self), XTDatabaseHPtr XT_UNUSED(db))
 {
 	int		i = 0;
 

=== modified file 'storage/pbxt/src/systab_xt.h'
--- a/storage/pbxt/src/systab_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/systab_xt.h	2009-08-17 11:12:36 +0000
@@ -85,15 +85,15 @@ public:
 	virtual bool use() { return true; }
 	virtual bool unuse() { return true; }
 	virtual bool seqScanInit() { return true; }
-	virtual bool seqScanNext(char *buf __attribute__((unused)), bool *eof) {
+	virtual bool seqScanNext(char *XT_UNUSED(buf), bool *eof) {
 		*eof = true;
 		return false;
 	}
 	virtual int	getRefLen() { return 4; }
-	virtual xtWord4 seqScanPos(xtWord1 *buf __attribute__((unused))) {
+	virtual xtWord4 seqScanPos(xtWord1 *XT_UNUSED(buf)) {
 		return 0;
 	}
-	virtual bool seqScanRead(xtWord4 rec_id __attribute__((unused)), char *buf __attribute__((unused))) {
+	virtual bool seqScanRead(xtWord4 XT_UNUSED(rec_id), char *XT_UNUSED(buf)) {
 		return true;
 	}
 

=== modified file 'storage/pbxt/src/tabcache_xt.cc'
--- a/storage/pbxt/src/tabcache_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/tabcache_xt.cc	2009-08-17 11:12:36 +0000
@@ -26,6 +26,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <signal.h>
 
 #include "pthread_xt.h"
@@ -63,7 +67,7 @@ xtPublic void xt_tc_init(XTThreadPtr sel
 		for (u_int i=0; i<XT_TC_SEGMENT_COUNT; i++) {
 			xt_tab_cache.tcm_segment[i].tcs_cache_in_use = 0;
 			xt_tab_cache.tcm_segment[i].tcs_hash_table = (XTTabCachePagePtr *) xt_calloc(self, xt_tab_cache.tcm_hash_size * sizeof(XTTabCachePagePtr));
-			xt_rwmutex_init_with_autoname(self, &xt_tab_cache.tcm_segment[i].tcs_lock);
+			TAB_CAC_INIT_LOCK(self, &xt_tab_cache.tcm_segment[i].tcs_lock);
 		}
 
 		xt_init_mutex_with_autoname(self, &xt_tab_cache.tcm_lock);
@@ -97,7 +101,7 @@ xtPublic void xt_tc_exit(XTThreadPtr sel
 
 			xt_free(self, xt_tab_cache.tcm_segment[i].tcs_hash_table);
 			xt_tab_cache.tcm_segment[i].tcs_hash_table = NULL;
-			xt_rwmutex_free(self, &xt_tab_cache.tcm_segment[i].tcs_lock);
+			TAB_CAC_FREE_LOCK(self, &xt_tab_cache.tcm_segment[i].tcs_lock);
 		}
 	}
 
@@ -213,7 +217,7 @@ xtBool XTTabCache::xt_tc_write(XT_ROW_RE
 	page->tcp_dirty = TRUE;
 	ASSERT_NS(page->tcp_db_id == tci_table->tab_db->db_id && page->tcp_tab_id == tci_table->tab_id);
 	*op_seq = tci_table->tab_seq.ts_set_op_seq(page);
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	return OK;
 }
 
@@ -269,21 +273,36 @@ xtBool XTTabCache::xt_tc_write_cond(XTTh
 	page->tcp_dirty = TRUE;
 	ASSERT(page->tcp_db_id == tci_table->tab_db->db_id && page->tcp_tab_id == tci_table->tab_id);
 	*op_seq = tci_table->tab_seq.ts_set_op_seq(page);
-	xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 	return TRUE;
 
 	no_change:
-	xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 	return FALSE;
 }
 
 xtBool XTTabCache::xt_tc_read(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread)
 {
+#ifdef XT_USE_ROW_REC_MMAP_FILES
 	return tc_read_direct(file, ref_id, size, data, thread);
+#else
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, TRUE, thread))
+		return FAILED;
+	/* A read must be completely on a page: */
+	ASSERT_NS(offset + size <= tci_page_size);
+	memcpy(data, page->tcp_data + offset, size);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+	return OK;
+#endif
 }
 
 xtBool XTTabCache::xt_tc_read_4(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord4 *value, XTThreadPtr thread)
 {
+#ifdef XT_USE_ROW_REC_MMAP_FILES
 	register u_int				page_idx;
 	register XTTabCachePagePtr	page;
 	register XTTabCacheSegPtr	seg;
@@ -300,7 +319,7 @@ xtBool XTTabCache::xt_tc_read_4(XT_ROW_R
 	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];
 	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;
 
-	xt_rwmutex_slock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
 	page = seg->tcs_hash_table[hash_idx];
 	while (page) {
 		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
@@ -311,53 +330,60 @@ xtBool XTTabCache::xt_tc_read_4(XT_ROW_R
 			ASSERT_NS(offset + 4 <= this->tci_page_size);
 			buffer = page->tcp_data + offset;
 			*value = XT_GET_DISK_4(buffer);
-			xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 			return OK;
 		}
 		page = page->tcp_next;
 	}
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 
-#ifdef XT_USE_ROW_REC_MMAP_FILES
 	return xt_pread_fmap_4(file, address, value, &thread->st_statistics.st_rec, thread);
 #else
-	xtWord1 data[4];
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+	xtWord1				*data;
 
-	if (!XT_PREAD_RR_FILE(file, address, 4, 4, data, NULL, &thread->st_statistics.st_rec, thread))
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, TRUE, thread))
 		return FAILED;
+	/* A read must be completely on a page: */
+	ASSERT_NS(offset + 4 <= tci_page_size);
+	data = page->tcp_data + offset;
 	*value = XT_GET_DISK_4(data);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	return OK;
 #endif
 }
 
-xtBool XTTabCache::xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCachePagePtr *ret_page, size_t *offset, XTThreadPtr thread)
+xtBool XTTabCache::xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtBool load, XTTabCachePagePtr *ret_page, size_t *offset, XTThreadPtr thread)
 {
 	XTTabCachePagePtr	page;
 	XTTabCacheSegPtr	seg;
 
-#ifdef XT_SEQ_SCAN_FROM_MEMORY
-	if (!tc_fetch_direct(file, ref_id, &seg, &page, offset, thread))
-		return FAILED;
-	if (!seg) {
-		*ret_page = NULL;
-		return OK;
+	if (load) {
+		if (!tc_fetch(file, ref_id, &seg, &page, offset, TRUE, thread))
+			return FAILED;
+	}
+	else {
+		if (!tc_fetch_direct(file, ref_id, &seg, &page, offset, thread))
+			return FAILED;
+		if (!seg) {
+			*ret_page = NULL;
+			return OK;
+		}
 	}
-#else
-	if (!tc_fetch(file, ref_id, &seg, &page, offset, TRUE, thread))
-		return FAILED;
-#endif
 	page->tcp_lock_count++;
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	*ret_page = page;
 	return OK;
 }
 
-void XTTabCache::xt_tc_release_page(XT_ROW_REC_FILE_PTR file __attribute__((unused)), XTTabCachePagePtr page, XTThreadPtr thread)
+void XTTabCache::xt_tc_release_page(XT_ROW_REC_FILE_PTR XT_UNUSED(file), XTTabCachePagePtr page, XTThreadPtr thread)
 {
 	XTTabCacheSegPtr	seg;
 
 	seg = &xt_tab_cache.tcm_segment[page->tcp_seg];
-	xt_rwmutex_xlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
 
 #ifdef DEBUG
 	XTTabCachePagePtr lpage, ppage;
@@ -379,7 +405,7 @@ void XTTabCache::xt_tc_release_page(XT_R
 	if (page->tcp_lock_count > 0)
 		page->tcp_lock_count--;
 
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 }
 
 xtBool XTTabCache::xt_tc_read_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 *data, XTThreadPtr thread)
@@ -412,7 +438,7 @@ xtBool XTTabCache::tc_read_direct(XT_ROW
 	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];
 	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;
 
-	xt_rwmutex_slock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
 	page = seg->tcs_hash_table[hash_idx];
 	while (page) {
 		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
@@ -421,12 +447,12 @@ xtBool XTTabCache::tc_read_direct(XT_ROW
 			offset = (ref_id % this->tci_rows_per_page) * this->tci_rec_size;
 			ASSERT_NS(offset + size <= this->tci_page_size);
 			memcpy(data, page->tcp_data + offset, size);
-			xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 			return OK;
 		}
 		page = page->tcp_next;
 	}
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	if (!XT_PREAD_RR_FILE(file, address, size, 0, data, &red_size, &thread->st_statistics.st_rec, thread))
 		return FAILED;
 	memset(data + red_size, 0, size - red_size);
@@ -450,7 +476,7 @@ xtBool XTTabCache::tc_fetch_direct(XT_RO
 	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];
 	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;
 
-	xt_rwmutex_xlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
 	page = seg->tcs_hash_table[hash_idx];
 	while (page) {
 		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
@@ -460,7 +486,7 @@ xtBool XTTabCache::tc_fetch_direct(XT_RO
 		}
 		page = page->tcp_next;
 	}
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	*ret_seg = NULL;
 	*ret_page = NULL;
 	return OK;
@@ -492,7 +518,7 @@ xtBool XTTabCache::tc_fetch(XT_ROW_REC_F
 	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];
 	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;
 
-	xt_rwmutex_slock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
 	page = seg->tcs_hash_table[hash_idx];
 	while (page) {
 		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
@@ -528,7 +554,7 @@ xtBool XTTabCache::tc_fetch(XT_ROW_REC_F
 		}
 		page = page->tcp_next;
 	}
-	xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
 	
 	/* Page not found, allocate a new page: */
 	size_t page_size = offsetof(XTTabCachePageRec, tcp_data) + this->tci_page_size;
@@ -674,7 +700,7 @@ xtBool XTTabCache::tc_fetch(XT_ROW_REC_F
 #endif
 
 	/* Add the page to the cache! */
-	xt_rwmutex_xlock(&seg->tcs_lock, thread->t_id);
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
 	page = seg->tcs_hash_table[hash_idx];
 	while (page) {
 		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
@@ -898,11 +924,11 @@ static size_t tabc_free_page(XTThreadPtr
 	}
 
 	seg = &dcg->tcm_segment[page->tcp_seg];
-	xt_rwmutex_xlock(&seg->tcs_lock, self->t_id);
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, self->t_id);
 
 	if (page->tcp_dirty) {
 		if (!was_dirty) {
-			xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 			goto retry_2;
 		}
 
@@ -923,7 +949,7 @@ static size_t tabc_free_page(XTThreadPtr
 				XTDatabaseHPtr db = tab->tab_db;
 
 				rewait:
-				xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+				TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 
 				/* Flush the log, in case this is holding up the
 				 * writer!
@@ -963,7 +989,7 @@ static size_t tabc_free_page(XTThreadPtr
 				db->db_wr_freeer_waiting = FALSE;
 				freer_(); // xt_unlock_mutex(&db->db_wr_lock)
 
-				xt_rwmutex_xlock(&seg->tcs_lock, self->t_id);
+				TAB_CAC_WRITE_LOCK(&seg->tcs_lock, self->t_id);
 				if (XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, page->tcp_op_seq))
 					goto rewait;
 			}
@@ -988,11 +1014,11 @@ static size_t tabc_free_page(XTThreadPtr
 			 */
 			if ((page = page->tcp_mr_used)) {
 				page_cnt++;
-				xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+				TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 				goto retry_2;
 			}
 		}
-		xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+		TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 		dcg->tcm_free_try_count++;				
 
 		/* Starting to spin, free the threads: */
@@ -1047,7 +1073,7 @@ static size_t tabc_free_page(XTThreadPtr
 	seg->tcs_cache_in_use -= freed_space;
 	xt_free_ns(page);
 
-	xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 	self->st_statistics.st_rec_cache_frees++;
 	dcg->tcm_free_try_count = 0;
 	return freed_space;
@@ -1156,7 +1182,7 @@ static void *tabc_fr_run_thread(XTThread
 	return NULL;
 }
 
-static void tabc_fr_free_thread(XTThreadPtr self, void *data __attribute__((unused)))
+static void tabc_fr_free_thread(XTThreadPtr self, void *XT_UNUSED(data))
 {
 	if (xt_tab_cache.tcm_freeer_thread) {
 		xt_lock_mutex(self, &xt_tab_cache.tcm_freeer_lock);
@@ -1238,7 +1264,7 @@ xtPublic void xt_load_pages(XTThreadPtr 
 	while (rec_id<tab->tab_row_eof_id) {
 		if (!tab->tab_rows.tc_fetch(ot->ot_row_file, rec_id, &seg, &page, &poffset, TRUE, self))
 			xt_throw(self);
-		xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+		TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 		rec_id += tab->tab_rows.tci_rows_per_page;
 	}
 
@@ -1246,7 +1272,7 @@ xtPublic void xt_load_pages(XTThreadPtr 
 	while (rec_id<tab->tab_rec_eof_id) {
 		if (!tab->tab_recs.tc_fetch(ot->ot_rec_file, rec_id, &seg, &page, &poffset, TRUE, self))
 			xt_throw(self);
-		xt_rwmutex_unlock(&seg->tcs_lock, self->t_id);
+		TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
 		rec_id += tab->tab_recs.tci_rows_per_page;
 	}
 }

=== modified file 'storage/pbxt/src/tabcache_xt.h'
--- a/storage/pbxt/src/tabcache_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/tabcache_xt.h	2009-08-17 11:12:36 +0000
@@ -125,11 +125,11 @@ typedef struct XTTableSeq {
 		xt_init_mutex_with_autoname(self, &ts_ns_lock);
 	}
 
-	void xt_op_seq_set(XTThreadPtr self __attribute__((unused)), xtOpSeqNo n) {
+	void xt_op_seq_set(XTThreadPtr XT_UNUSED(self), xtOpSeqNo n) {
 		ts_next_seq = n;
 	}
 
-	void xt_op_seq_exit(XTThreadPtr self __attribute__((unused))) {
+	void xt_op_seq_exit(XTThreadPtr XT_UNUSED(self)) {
 		xt_free_mutex(&ts_ns_lock);
 	}
 
@@ -150,12 +150,50 @@ typedef struct XTTableSeq {
 #endif
 } XTTableSeqRec, *XTTableSeqPtr;
 
+#ifdef XT_NO_ATOMICS
+#define TAB_CAC_USE_PTHREAD_RW
+#else
+//#define TAB_CAC_USE_RWMUTEX
+//#define TAB_CAC_USE_PTHREAD_RW
+//#define IDX_USE_SPINXSLOCK
+#define TAB_CAC_USE_XSMUTEX
+#endif
+
+#ifdef TAB_CAC_USE_XSMUTEX
+#define TAB_CAC_LOCK_TYPE				XTXSMutexRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_xsmutex_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_xsmutex_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_xsmutex_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_xsmutex_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_xsmutex_unlock(i, o)
+#elif defined(TAB_CAC_USE_PTHREAD_RW)
+#define TAB_CAC_LOCK_TYPE				xt_rwlock_type
+#define TAB_CAC_INIT_LOCK(s, i)			xt_init_rwlock(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_free_rwlock(i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_slock_rwlock_ns(i)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(i)
+#define TAB_CAC_UNLOCK(i, o)			xt_unlock_rwlock_ns(i)
+#elif defined(TAB_CAC_USE_RWMUTEX)
+#define TAB_CAC_LOCK_TYPE				XTRWMutexRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_rwmutex_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_rwmutex_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_rwmutex_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_rwmutex_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_rwmutex_unlock(i, o)
+#elif defined(TAB_CAC_USE_SPINXSLOCK)
+#define TAB_CAC_LOCK_TYPE				XTSpinXSLockRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_spinxslock_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_spinxslock_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_spinxslock_unlock(i, o)
+#endif
+
 /* A disk cache segment. The cache is divided into a number of segments
  * to improve concurrency.
  */
 typedef struct XTTabCacheSeg {
-	XTRWMutexRec				tcs_lock;						/* The cache segment read/write lock. */
-	//xt_cond_type			tcs_cond;
+	TAB_CAC_LOCK_TYPE		tcs_lock;						/* The cache segment read/write lock. */
 	XTTabCachePagePtr		*tcs_hash_table;
 	size_t					tcs_cache_in_use;
 } XTTabCacheSegRec, *XTTabCacheSegPtr;
@@ -220,7 +258,7 @@ public:
 	xtBool					xt_tc_read(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread);
 	xtBool					xt_tc_read_4(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord4 *data, XTThreadPtr thread);
 	xtBool					xt_tc_read_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 *data, XTThreadPtr thread);
-	xtBool					xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCachePagePtr *page, size_t *offset, XTThreadPtr thread);
+	xtBool					xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtBool load, XTTabCachePagePtr *page, size_t *offset, XTThreadPtr thread);
 	void					xt_tc_release_page(XT_ROW_REC_FILE_PTR file, XTTabCachePagePtr page, XTThreadPtr thread);
 	xtBool					tc_fetch(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCacheSegPtr *ret_seg, XTTabCachePagePtr *ret_page, size_t *offset, xtBool read, XTThreadPtr thread);
 

=== modified file 'storage/pbxt/src/table_xt.cc'
--- a/storage/pbxt/src/table_xt.cc	2009-04-02 20:36:52 +0000
+++ b/storage/pbxt/src/table_xt.cc	2009-08-18 07:46:53 +0000
@@ -35,7 +35,7 @@
 #include <drizzled/common.h>
 #include <mysys/thr_lock.h>
 #include <drizzled/dtcollation.h>
-#include <drizzled/handlerton.h>
+#include <drizzled/plugin/storage_engine.h>
 #else
 #include "mysql_priv.h"
 #endif
@@ -47,9 +47,6 @@
 #include "myxt_xt.h"
 #include "cache_xt.h"
 #include "trace_xt.h"
-#ifdef XT_STREAMING
-#include "streaming_xt.h"
-#endif
 #include "index_xt.h"
 #include "restart_xt.h"
 #include "systab_xt.h"
@@ -293,17 +290,17 @@ static void tab_get_row_file_name(char *
 	sprintf(table_name, "%s-%lu.xtr", name, (u_long) tab_id);
 }
 
-static void tab_get_data_file_name(char *table_name, char *name, xtTableID tab_id __attribute__((unused)))
+static void tab_get_data_file_name(char *table_name, char *name, xtTableID XT_UNUSED(tab_id))
 {
 	sprintf(table_name, "%s.xtd", name);
 }
 
-static void tab_get_index_file_name(char *table_name, char *name, xtTableID tab_id __attribute__((unused)))
+static void tab_get_index_file_name(char *table_name, char *name, xtTableID XT_UNUSED(tab_id))
 {
 	sprintf(table_name, "%s.xti", name);
 }
 
-static void tab_free_by_id(XTThreadPtr self __attribute__((unused)), void *thunk __attribute__((unused)), void *item)
+static void tab_free_by_id(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
 {
 	XTTableEntryPtr	te_ptr = (XTTableEntryPtr) item;
 
@@ -315,7 +312,7 @@ static void tab_free_by_id(XTThreadPtr s
 	te_ptr->te_table = NULL;
 }
 
-static int tab_comp_by_id(XTThreadPtr self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int tab_comp_by_id(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtTableID		te_id = *((xtTableID *) a);
 	XTTableEntryPtr	te_ptr = (XTTableEntryPtr) b;
@@ -327,14 +324,14 @@ static int tab_comp_by_id(XTThreadPtr se
 	return 1;
 }
 
-static void tab_free_path(XTThreadPtr self __attribute__((unused)), void *thunk __attribute__((unused)), void *item)
+static void tab_free_path(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
 {
 	XTTablePathPtr	tp_ptr = *((XTTablePathPtr *) item);
 
 	xt_free(self, tp_ptr);
 }
 
-static int tab_comp_path(XTThreadPtr self __attribute__((unused)), register const void *thunk __attribute__((unused)), register const void *a, register const void *b)
+static int tab_comp_path(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	char			*path = (char *) a;
 	XTTablePathPtr	tp_ptr = *((XTTablePathPtr *) b);
@@ -398,7 +395,7 @@ xtPublic xtBool xt_describe_tables_next(
 	return_(TRUE);
 }
 
-xtPublic void xt_describe_tables_exit(XTThreadPtr self __attribute__((unused)), XTTableDescPtr td)
+xtPublic void xt_describe_tables_exit(XTThreadPtr XT_UNUSED(self), XTTableDescPtr td)
 {
 	if (td->td_open_dir)
 		xt_dir_close(NULL, td->td_open_dir);
@@ -410,8 +407,11 @@ xtPublic void xt_tab_init_db(XTThreadPtr
 {
 	XTTableDescRec		desc;
 	XTTableEntryRec		te_tab;
+	XTTableEntryPtr		te_ptr;
 	XTTablePathPtr		db_path;
+	char				pbuf[PATH_MAX];
 	int					len;
+	u_int				edx;
 
 	enter_();
 	pushr_(xt_tab_exit_db, db);
@@ -425,7 +425,6 @@ xtPublic void xt_tab_init_db(XTThreadPtr
 	if (db->db_multi_path) {
 		XTOpenFilePtr	of;
 		char			*buffer, *ptr, *path;
-		char			pbuf[PATH_MAX];
 
 		xt_strcpy(PATH_MAX, pbuf, db->db_main_path);
 		xt_add_location_file(PATH_MAX, pbuf);
@@ -490,6 +489,27 @@ xtPublic void xt_tab_init_db(XTThreadPtr
 	}
 	freer_(); // xt_describe_tables_exit(&desc)
 
+	/* 
+	 * The purpose of this code is to ensure that all tables are opened and cached,
+	 * which is actually only required if tables have foreign key references.
+	 *
+	 * In other words, a side effect of this code is that FK references between tables
+	 * are registered, and checked.
+	 *
+	 * Unfortunately we don't know if a table is referenced by a FK, so we have to open
+	 * all tables.
+	 * 
+	 * Tables cannot be opened in the loop above because db->db_table_by_id, which is
+	 * built there, is used by xt_use_table_no_lock().
+	 */
+	xt_enum_tables_init(&edx);
+	while ((te_ptr = xt_enum_tables_next(self, db, &edx))) {
+		xt_strcpy(PATH_MAX, pbuf, te_ptr->te_tab_path->tp_path);
+		xt_add_dir_char(PATH_MAX, pbuf);
+		xt_strcat(PATH_MAX, pbuf, te_ptr->te_tab_name);
+		xt_heap_release(self, xt_use_table_no_lock(self, db, (XTPathStrPtr)pbuf, FALSE, FALSE, NULL, NULL));
+	}
+
 	popr_(); // Discard xt_tab_exit_db(db)
 	exit_();
 }
@@ -605,8 +625,9 @@ xtPublic void xt_tab_exit_db(XTThreadPtr
 	}
 }
 
-static void tab_check_table(XTThreadPtr self __attribute__((unused)), XTTableHPtr tab __attribute__((unused)))
+static void tab_check_table(XTThreadPtr self, XTTableHPtr XT_UNUSED(tab))
 {
+	(void) self;
 	enter_();
 	exit_();
 }
@@ -661,7 +682,7 @@ xtPublic void xt_enum_tables_init(u_int 
 	*edx = 0;
 }
 
-xtPublic XTTableEntryPtr xt_enum_tables_next(XTThreadPtr self __attribute__((unused)), XTDatabaseHPtr db, u_int *edx)
+xtPublic XTTableEntryPtr xt_enum_tables_next(XTThreadPtr XT_UNUSED(self), XTDatabaseHPtr db, u_int *edx)
 {
 	XTTableEntryPtr en_ptr;
 
@@ -727,6 +748,12 @@ static xtBool tab_find_table(XTThreadPtr
 	return FALSE;
 }
 
+xtPublic void xt_tab_disable_index(XTTableHPtr tab, u_int ind_error)
+{
+	tab->tab_dic.dic_disable_index = ind_error;
+	xt_tab_set_table_repair_pending(tab);
+}
+
 xtPublic void xt_tab_set_index_error(XTTableHPtr tab)
 {
 	switch (tab->tab_dic.dic_disable_index) {
@@ -803,22 +830,39 @@ static void tab_load_index_header(XTThre
 			tab->tab_index_page_size = XT_GET_DISK_4(index_fmt->if_page_size_4);
 		}	
 
+#ifdef XT_USE_LAZY_DELETE
+		if (tab->tab_dic.dic_index_ver <= XT_IND_NO_LAZY_DELETE)
+			tab->tab_dic.dic_no_lazy_delete = TRUE;
+		else
+			tab->tab_dic.dic_no_lazy_delete = FALSE;
+#else
+		tab->tab_dic.dic_no_lazy_delete = TRUE;
+#endif
+
 		/* Incorrect version of index is handled by allowing a sequential scan, but no index access.
 		 * Recovery with the wrong index type will not recover the indexes, a REPAIR TABLE
 		 * will be required!
 		 */
 		if (tab->tab_dic.dic_index_ver != XT_IND_CURRENT_VERSION) {
-			if (tab->tab_dic.dic_index_ver != XT_IND_CURRENT_VERSION)
-				tab->tab_dic.dic_disable_index = XT_INDEX_TOO_OLD;
-			else
-				tab->tab_dic.dic_disable_index = XT_INDEX_TOO_NEW;
+			switch (tab->tab_dic.dic_index_ver) {
+				case XT_IND_NO_LAZY_DELETE:
+				case XT_IND_LAZY_DELETE_OK:
+					/* I can handle this type of index. */
+					break;
+				default:
+					if (tab->tab_dic.dic_index_ver < XT_IND_CURRENT_VERSION)
+						xt_tab_disable_index(tab, XT_INDEX_TOO_OLD);
+					else
+						xt_tab_disable_index(tab, XT_INDEX_TOO_NEW);
+					break;
+			}
 		}
 		else if (tab->tab_index_page_size != XT_INDEX_PAGE_SIZE)
-			tab->tab_dic.dic_disable_index = XT_INDEX_BAD_BLOCK;
+			xt_tab_disable_index(tab, XT_INDEX_BAD_BLOCK);
 	}
 	else {
 		memset(tab->tab_index_head, 0, XT_INDEX_HEAD_SIZE);
-		tab->tab_dic.dic_disable_index = XT_INDEX_MISSING;
+		xt_tab_disable_index(tab, XT_INDEX_MISSING);
 		tab->tab_index_header_size = XT_INDEX_HEAD_SIZE;
 		tab->tab_index_page_size = XT_INDEX_PAGE_SIZE;
 		tab->tab_dic.dic_index_ver = 0;
@@ -1089,6 +1133,8 @@ static int tab_new_handle(XTThreadPtr se
 
 	xt_heap_set_release_callback(self, tab, tab_onrelease);
 
+	tab->tab_repair_pending = xt_tab_is_table_repair_pending(tab);
+
 	popr_(); // Discard xt_heap_release(tab)
 
 	xt_ht_put(self, db->db_tables, tab);
@@ -1216,11 +1262,6 @@ static XTOpenTablePoolPtr tab_lock_table
 		return_(NULL);
 	}
 
-#ifdef XT_STREAMING
-	/* Tell PBMS to close all open tables of this sort: */
-	xt_pbms_close_all_tables(name->ps_path);
-#endif
-
 	/* Wait for all open tables to close: */
 	xt_db_wait_for_open_tables(self, table_pool);
 
@@ -1297,9 +1338,6 @@ xtPublic void xt_create_table(XTThreadPt
 
 		/* Remove the PBMS table: */
 		ASSERT(xt_get_self() == self);
-#ifdef XT_STREAMING
-		xt_pbms_drop_table(name->ps_path);
-#endif
 
 		/* Remove the table from the directory. It will get a new
 		 * ID so the handle in the directory will no longer be valid.
@@ -1572,7 +1610,7 @@ xtPublic void xt_create_table(XTThreadPt
 	exit_();
 }
 
-xtPublic void xt_drop_table(XTThreadPtr self, XTPathStrPtr tab_name)
+xtPublic void xt_drop_table(XTThreadPtr self, XTPathStrPtr tab_name, xtBool drop_db)
 {
 	XTDatabaseHPtr		db = self->st_database;
 	XTOpenTablePoolPtr	table_pool;
@@ -1596,8 +1634,16 @@ xtPublic void xt_drop_table(XTThreadPtr 
 		tab_id = tab->tab_id;	/* tab is not null if returned table_pool is not null */
 		/* check if other tables refer this */
 		if (!self->st_ignore_fkeys) 
-			can_drop = tab->tab_dic.dic_table->checkCanDrop();
+			can_drop = tab->tab_dic.dic_table->checkCanDrop(drop_db);
 	}
+#ifdef DRIZZLED 
+	/* See the comment in ha_pbxt::delete_table regarding the different implementation of DROP TABLE
+         * in MySQL and Drizzle
+         */
+	else {
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_TABLE_NOT_FOUND);
+	}
+#endif
 
 	if (can_drop) {
 		if (tab_id) {
@@ -1614,9 +1660,6 @@ xtPublic void xt_drop_table(XTThreadPtr 
 			tab_delete_table_files(self, tab_name, tab_id);
 
 			ASSERT(xt_get_self() == self);
-#ifdef XT_STREAMING
-			xt_pbms_drop_table(tab_name->ps_path);
-#endif
 			if ((te_ptr = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &tab_id))) {
 				tab_remove_table_path(self, db, te_ptr->te_tab_path);
 				xt_sl_delete(self, db->db_table_by_id, &tab_id);
@@ -1733,6 +1776,7 @@ xtPublic void xt_check_table(XTThreadPtr
 	u_llong					max_comp_rec_len = 0;
 	size_t					rec_size;
 	size_t					row_size;
+	u_llong					ext_data_len = 0;
 
 #if defined(DUMP_CHECK_TABLE) || defined(CHECK_TABLE_STATS)
 	printf("\nCHECK TABLE: %s\n", tab->tab_name->ps_path);
@@ -1832,6 +1876,7 @@ xtPublic void xt_check_table(XTThreadPtr
 				printf("record-X ");
 #endif
 				alloc_rec_count++;
+				ext_data_len += XT_GET_DISK_4(rec_buf->re_log_dat_siz_4);
 				row_size = XT_GET_DISK_4(rec_buf->re_log_dat_siz_4) + ot->ot_rec_size - XT_REC_EXT_HEADER_SIZE;
 				alloc_rec_bytes += row_size;
 				if (!min_comp_rec_len || row_size < min_comp_rec_len)
@@ -1887,6 +1932,9 @@ xtPublic void xt_check_table(XTThreadPtr
 	}
 	
 #ifdef CHECK_TABLE_STATS
+	if (!tab->tab_dic.dic_rec_fixed)
+		printf("Extendend data length   = %llu\n", ext_data_len);
+	
 	if (alloc_rec_count) {
 		printf("Minumum comp. rec. len. = %llu\n", (u_llong) min_comp_rec_len);
 		printf("Average comp. rec. len. = %llu\n", (u_llong) ((double) alloc_rec_bytes / (double) alloc_rec_count + (double) 0.5));
@@ -2055,6 +2103,8 @@ xtPublic void xt_rename_table(XTThreadPt
 	popr_(); // Discard xt_free(te_new_name);
 
 	tab = xt_use_table_no_lock(self, db, new_name, FALSE, FALSE, &dic, NULL);
+	/* All renamed tables are considered repaired! */
+	xt_tab_table_repaired(tab);
 	xt_heap_release(self, tab);
 
 	freer_(); // myxt_free_dictionary(&dic)
@@ -2306,6 +2356,9 @@ xtPublic XTOpenTablePtr tab_open_table(X
 		return NULL;
 	memset(ot, 0, offsetof(XTOpenTableRec, ot_ind_wbuf));
 
+	ot->ot_seq_page = NULL;
+	ot->ot_seq_data = NULL;
+
 	self = xt_get_self();
 	try_(a) {
 		xt_heap_reference(self, tab);
@@ -3353,6 +3406,16 @@ xtPublic int xt_tab_dirty_read_record(re
 	return OK;
 }
 
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+/* Loading into the cache is not required;
+ * instead we copy from the memory map to load
+ * the data.
+ */
+#define TAB_ROW_LOAD_CACHE		FALSE
+#else
+#define TAB_ROW_LOAD_CACHE		TRUE
+#endif
+
 /*
  * Pull the entire row pointer file into memory.
  */
@@ -3376,7 +3439,7 @@ xtPublic void xt_tab_load_row_pointers(X
 		end_offset = xt_row_id_to_row_offset(tab, eof_rec_id);
 		rec_id = 1;
 		while (rec_id < eof_rec_id) {
-			if (!tab->tab_rows.xt_tc_get_page(ot->ot_row_file, rec_id, &page, &poffset, self))
+			if (!tab->tab_rows.xt_tc_get_page(ot->ot_row_file, rec_id, TAB_ROW_LOAD_CACHE, &page, &poffset, self))
 				xt_throw(self);
 			if (page)
 				tab->tab_rows.xt_tc_release_page(ot->ot_row_file, page, self);
@@ -3392,7 +3455,7 @@ xtPublic void xt_tab_load_row_pointers(X
 				XT_LOCK_MEMORY_PTR(buff_ptr, ot->ot_row_file, offset, tfer, &self->st_statistics.st_rec, self);
 				if (buff_ptr) {
 					memcpy(buffer, buff_ptr, tfer);
-					XT_UNLOCK_MEMORY_PTR(ot->ot_row_file, self);
+					XT_UNLOCK_MEMORY_PTR(ot->ot_row_file, buff_ptr, TRUE, self);
 				}
 			}
 			rec_id += tab->tab_rows.tci_rows_per_page;
@@ -3521,7 +3584,7 @@ static void tab_restore_exception(XTExce
  * FALSE if the record has already been freed. 
  *
  */
-xtPublic int xt_tab_remove_record(XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *rec_data, xtRecordID *prev_var_id, xtBool clean_delete, xtRowID row_id, xtXactID xn_id __attribute__((unused)))
+xtPublic int xt_tab_remove_record(XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *rec_data, xtRecordID *prev_var_id, xtBool clean_delete, xtRowID row_id, xtXactID XT_UNUSED(xn_id))
 {
 	register XTTableHPtr	tab = ot->ot_table;
 	size_t					rec_size;
@@ -3664,49 +3727,6 @@ xtPublic int xt_tab_remove_record(XTOpen
 		}
 	}
 
-#ifdef XT_STREAMING
-	if (tab->tab_dic.dic_blob_count) {
-		/* If the record contains any LONGBLOB then check how much
-		 * space we need.
-		 */
-		size_t blob_size;
-
-		switch (old_rec_type) {
-			case XT_TAB_STATUS_DELETE:
-			case XT_TAB_STATUS_DEL_CLEAN:
-				break;
-			case XT_TAB_STATUS_FIXED:
-			case XT_TAB_STATUS_FIX_CLEAN:
-				/* Should not be the case, record with LONGBLOB can never be fixed! */
-				break;
-			case XT_TAB_STATUS_VARIABLE:
-			case XT_TAB_STATUS_VAR_CLEAN:
-				cols_req = tab->tab_dic.dic_blob_cols_req;
-				cols_in_buffer = cols_req;
-				blob_size = myxt_load_row_length(ot, rec_size - XT_REC_FIX_HEADER_SIZE, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, &cols_in_buffer);
-				if (cols_in_buffer < cols_req)
-					blob_size = tab->tab_dic.dic_rec_size;
-				else 
-					blob_size += XT_REC_FIX_HEADER_SIZE;
-				if (blob_size > rec_size)
-					rec_size = blob_size;
-				break;
-			case XT_TAB_STATUS_EXT_DLOG:
-			case XT_TAB_STATUS_EXT_CLEAN:
-				cols_req = tab->tab_dic.dic_blob_cols_req;
-				cols_in_buffer = cols_req;
-				blob_size = myxt_load_row_length(ot, rec_size - XT_REC_EXT_HEADER_SIZE, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, &cols_in_buffer);
-				if (cols_in_buffer < cols_req)
-					blob_size = tab->tab_dic.dic_rec_size;
-				else 
-					blob_size += XT_REC_EXT_HEADER_SIZE;
-				if (blob_size > rec_size)
-					rec_size = blob_size;
-				break;
-		}
-	}
-#endif
-
 	set_removed:
 	if (XT_REC_IS_EXT_DLOG(old_rec_type)) {
 		/* {LOCK-EXT-REC} Lock, and read again to make sure that the
@@ -3810,7 +3830,7 @@ static xtRowID tab_new_row(XTOpenTablePt
 				xt_unlock_mutex_ns(&tab->tab_row_lock);
 				return 0;
 			}
-			xt_rwmutex_unlock(&seg->tcs_lock, ot->ot_thread->t_id);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, ot->ot_thread->t_id);
 		}
 		tab->tab_row_eof_id++;
 	}
@@ -4343,15 +4363,6 @@ xtPublic xtBool xt_tab_new_record(XTOpen
 	xtRowID					row_id;
 	u_int					idx_cnt = 0;
 	XTIndexPtr				*ind;
-#ifdef XT_STREAMING
-	void					*pbms_table;
-
-	/* PBMS: Reference BLOBs!? */
-	if (tab->tab_dic.dic_blob_count) {
-		if (!myxt_use_blobs(ot, &pbms_table, rec_buf))
-			return FAILED;
-	}
-#endif
 
 	if (!myxt_store_row(ot, &rec_info, (char *) rec_buf))
 		goto failed_0;
@@ -4386,17 +4397,6 @@ xtPublic xtBool xt_tab_new_record(XTOpen
 		}
 	}
 
-#ifdef XT_STREAMING
-	/* Reference the BLOBs in the row: */
-	if (tab->tab_dic.dic_blob_count) {
-		if (!myxt_retain_blobs(ot, pbms_table, rec_info.ri_rec_id)) {
-			pbms_table = NULL;
-			goto failed_2;
-		}
-		pbms_table = NULL;
-	}
-#endif
-
 	/* Do the foreign key stuff: */
 	if (ot->ot_table->tab_dic.dic_table->dt_fkeys.size() > 0) {
 		if (!ot->ot_table->tab_dic.dic_table->insertRow(ot, rec_buf))
@@ -4417,10 +4417,6 @@ xtPublic xtBool xt_tab_new_record(XTOpen
 	tab_free_row_on_fail(ot, tab, row_id);
 
 	failed_0:
-#ifdef XT_STREAMING
-	if (tab->tab_dic.dic_blob_count && pbms_table)
-		myxt_unuse_blobs(ot, pbms_table);
-#endif
 	return FAILED;
 }
 
@@ -4524,15 +4520,6 @@ static xtBool tab_overwrite_record(XTOpe
 	xtLogOffset				log_offset;
 	xtBool					prev_ext_rec;
 
-#ifdef XT_STREAMING
-	void					*pbms_table;
-
-	if (tab->tab_dic.dic_blob_count) {
-		if (!myxt_use_blobs(ot, &pbms_table, after_buf))
-			return FAILED;
-	}
-#endif
-
 	if (!myxt_store_row(ot, &rec_info, (char *) after_buf))
 		goto failed_0;
 
@@ -4596,16 +4583,6 @@ static xtBool tab_overwrite_record(XTOpe
 	if (prev_ext_rec)
 		tab_free_ext_record_on_fail(ot, rec_id, &prev_rec_head, TRUE);
 
-#ifdef XT_STREAMING
-	if (tab->tab_dic.dic_blob_count) {
-		/* Retain the BLOBs new record: */
-		if (!myxt_retain_blobs(ot, pbms_table, rec_id))
-			return FAILED;
-		/* Release the BLOBs in the old record: */
-		myxt_release_blobs(ot, before_buf, rec_id);
-	}
-#endif
-
 	return OK;
 
 	failed_2:
@@ -4648,11 +4625,6 @@ static xtBool tab_overwrite_record(XTOpe
 		tab_free_ext_record_on_fail(ot, rec_id, &prev_rec_head, TRUE);
 
 	failed_0:
-#ifdef XT_STREAMING
-	/* Unuse the BLOBs of the new record: */
-	if (tab->tab_dic.dic_blob_count && pbms_table)
-		myxt_unuse_blobs(ot, pbms_table);
-#endif
 	return FAILED;
 }
 
@@ -4666,10 +4638,6 @@ xtPublic xtBool xt_tab_update_record(XTO
 	u_int					idx_cnt = 0;
 	XTIndexPtr				*ind;
 
-#ifdef XT_STREAMING
-	void					*pbms_table;
-#endif
-
 	/*
 	 * Originally only the flag ot->ot_curr_updated was checked, and if it was on, then
 	 * tab_overwrite_record() was called, but this caused crashes in some cases like:
@@ -4709,14 +4677,6 @@ xtPublic xtBool xt_tab_update_record(XTO
 	row_id = ot->ot_curr_row_id;
 	self = ot->ot_thread;
 
-#ifdef XT_STREAMING
-	/* PBMS: Reference BLOBs!? */
-	if (tab->tab_dic.dic_blob_count) {
-		if (!myxt_use_blobs(ot, &pbms_table, after_buf))
-			return FAILED;
-	}
-#endif
-
 	if (!myxt_store_row(ot, &rec_info, (char *) after_buf))
 		goto failed_0;
 
@@ -4766,17 +4726,6 @@ xtPublic xtBool xt_tab_update_record(XTO
 		}
 	}
 
-#ifdef XT_STREAMING
-	/* Reference the BLOBs in the row: */
-	if (tab->tab_dic.dic_blob_count) {
-		if (!myxt_retain_blobs(ot, pbms_table, rec_info.ri_rec_id)) {
-			pbms_table = NULL;
-			goto failed_2;
-		}
-		pbms_table = NULL;
-	}
-#endif
-
 	if (ot->ot_table->tab_dic.dic_table->dt_trefs || ot->ot_table->tab_dic.dic_table->dt_fkeys.size() > 0) {
 		if (!ot->ot_table->tab_dic.dic_table->updateRow(ot, before_buf, after_buf))
 			goto failed_2;
@@ -4793,10 +4742,6 @@ xtPublic xtBool xt_tab_update_record(XTO
 	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
 
 	failed_0:
-#ifdef XT_STREAMING
-	if (tab->tab_dic.dic_blob_count && pbms_table)
-		myxt_unuse_blobs(ot, pbms_table);
-#endif
 	return FAILED;
 }
 
@@ -4906,6 +4851,7 @@ xtPublic xtBool xt_tab_seq_init(XTOpenTa
 	register XTTableHPtr tab = ot->ot_table;
 	
 	ot->ot_seq_page = NULL;
+	ot->ot_seq_data = NULL;
 	ot->ot_on_page = FALSE;
 	ot->ot_seq_offset = 0;
 
@@ -4958,6 +4904,7 @@ xtPublic void xt_tab_seq_reset(XTOpenTab
 	ot->ot_seq_rec_id = 0;
 	ot->ot_seq_eof_id = 0;
 	ot->ot_seq_page = NULL;
+	ot->ot_seq_data = NULL;
 	ot->ot_on_page = FALSE;
 	ot->ot_seq_offset = 0;
 }
@@ -4970,23 +4917,40 @@ xtPublic void xt_tab_seq_exit(XTOpenTabl
 		tab->tab_recs.xt_tc_release_page(ot->ot_rec_file, ot->ot_seq_page, ot->ot_thread);
 		ot->ot_seq_page = NULL;
 	}
+	if (ot->ot_seq_data)
+		XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_seq_data, TRUE, ot->ot_thread);
 	ot->ot_on_page = FALSE;
 }
 
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+#define TAB_SEQ_LOAD_CACHE		FALSE
+#else
+#ifdef XT_SEQ_SCAN_LOADS_CACHE
+#define TAB_SEQ_LOAD_CACHE		TRUE
+#else
+#define TAB_SEQ_LOAD_CACHE		FALSE
+#endif
+#endif
+
 xtPublic xtBool xt_tab_seq_next(XTOpenTablePtr ot, xtWord1 *buffer, xtBool *eof)
 {
 	register XTTableHPtr	tab = ot->ot_table;
 	register size_t			rec_size = tab->tab_dic.dic_rec_size;
 	xtWord1					*buff_ptr;
 	xtRecordID				new_rec_id;
-	xtBool					ptr_locked;
 	xtRecordID				invalid_rec = 0;
-	XTTabRecHeadDRec		rec_head;
 
 	next_page:
 	if (!ot->ot_on_page) {
-		if (!(ot->ot_on_page = tab->tab_recs.xt_tc_get_page(ot->ot_rec_file, ot->ot_seq_rec_id, &ot->ot_seq_page, &ot->ot_seq_offset, ot->ot_thread)))
+		if (!(ot->ot_on_page = tab->tab_recs.xt_tc_get_page(ot->ot_rec_file, ot->ot_seq_rec_id, TAB_SEQ_LOAD_CACHE, &ot->ot_seq_page, &ot->ot_seq_offset, ot->ot_thread)))
 			return FAILED;
+		if (!ot->ot_seq_page) {
+			XT_LOCK_MEMORY_PTR(ot->ot_seq_data, ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, ot->ot_seq_rec_id), tab->tab_rows.tci_page_size, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread);
+			if (!ot->ot_seq_data)
+				return FAILED;
+			ot->ot_on_page = TRUE;
+			ot->ot_seq_offset = 0;
+		}
 	}
 
 	next_record:
@@ -5001,22 +4965,19 @@ xtPublic xtBool xt_tab_seq_next(XTOpenTa
 			tab->tab_recs.xt_tc_release_page(ot->ot_rec_file, ot->ot_seq_page, ot->ot_thread);
 			ot->ot_seq_page = NULL;
 		}
+		if (ot->ot_seq_data)
+			/* FALSE here means that in the case of non-memory mapped
+			 * files we "keep" the lock.
+			 */
+			XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_seq_data, FALSE, ot->ot_thread);
 		ot->ot_on_page = FALSE;
 		goto next_page;
 	}
 
-	if (ot->ot_seq_page) {
-		ptr_locked = FALSE;
+	if (ot->ot_seq_page)
 		buff_ptr = ot->ot_seq_page->tcp_data + ot->ot_seq_offset;
-	}
-	else {
-		size_t red_size;
-
-		ptr_locked = TRUE;
-		if (!xt_pread_fmap(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, ot->ot_seq_rec_id), sizeof(XTTabRecHeadDRec), sizeof(XTTabRecHeadDRec), &rec_head, &red_size, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
-			return FAILED;
-		buff_ptr = (xtWord1 *) &rec_head;
-	}
+	else
+		buff_ptr = ot->ot_seq_data + ot->ot_seq_offset;
 
 	/* This is the current record: */
 	ot->ot_curr_rec_id = ot->ot_seq_rec_id;
@@ -5033,7 +4994,6 @@ xtPublic xtBool xt_tab_seq_next(XTOpenTa
 		case XT_ERR:
 			goto failed;
 		case XT_NEW:
-			ptr_locked = FALSE;
 			buff_ptr = ot->ot_row_rbuffer;
 			if (!xt_tab_get_rec_data(ot, new_rec_id, rec_size, ot->ot_row_rbuffer))
 				return XT_ERR;
@@ -5066,8 +5026,6 @@ xtPublic xtBool xt_tab_seq_next(XTOpenTa
 			invalid_rec = 0;
 			goto next_record;
 		default:
-			if (ptr_locked)
-				XT_LOCK_MEMORY_PTR(buff_ptr, ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, ot->ot_curr_rec_id), tab->tab_rows.tci_page_size, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread);
 			break;
 	}
 
@@ -5099,17 +5057,176 @@ xtPublic xtBool xt_tab_seq_next(XTOpenTa
 			break;
 		}
 	}
-	if (ptr_locked)
-		XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_thread);
 
 	*eof = FALSE;
 	return OK;
 
 	failed_1:
-	if (ptr_locked)
-		XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_thread);
 
 	failed:
 	return FAILED;
 }
 
+/*
+ * -----------------------------------------------------------------------
+ * REPAIR TABLE
+ */
+
+#define REP_FIND		0
+#define REP_ADD			1
+#define REP_DEL			2
+
+static xtBool tab_exec_repair_pending(XTDatabaseHPtr db, int what, char *table_name)
+{
+	XTThreadPtr			thread = xt_get_self();
+	char				file_path[PATH_MAX];
+	XTOpenFilePtr		of = NULL;
+	int					len;
+	char				*buffer = NULL, *ptr, *name;
+	char				ch;
+	xtBool				found = FALSE;
+
+	xt_strcpy(PATH_MAX, file_path, db->db_main_path);
+	xt_add_pbxt_file(PATH_MAX, file_path, "repair-pending");
+	
+	if (what == REP_ADD) {
+		if (!xt_open_file_ns(&of, file_path, XT_FS_CREATE | XT_FS_MAKE_PATH))
+			return FALSE;
+	}
+	else {
+		if (!xt_open_file_ns(&of, file_path, XT_FS_DEFAULT))
+			return FALSE;
+	}
+	if (!of)
+		return FALSE;
+
+	len = (int) xt_seek_eof_file(NULL, of);
+	
+	if (!(buffer = (char *) xt_malloc_ns(len + 1)))
+		goto failed;
+
+	if (!xt_pread_file(of, 0, len, len, buffer, NULL, &thread->st_statistics.st_x, thread))
+		goto failed;
+
+	buffer[len] = 0;
+	ptr = buffer;
+	for(;;) {
+		name = ptr;
+		while (*ptr && *ptr != '\n' && *ptr != '\r')
+			ptr++;
+		if (ptr > name) {
+			ch = *ptr;
+			*ptr = 0;
+			if (xt_tab_compare_names(name, table_name) == 0) {
+				*ptr = ch;
+				found = TRUE;
+				break;
+			}	
+			*ptr = ch;
+		}
+		if (!*ptr)
+			break;
+		ptr++;
+	}
+
+	switch (what) {
+		case REP_ADD:
+			if (!found) {
+				/* Remove any trailing empty lines: */
+				while (len > 0) {
+					if (buffer[len-1] != '\n' && buffer[len-1] != '\r')
+						break;
+					len--;
+				}
+				if (len > 0) {
+					if (!xt_pwrite_file(of, len, 1, (void *) "\n", &thread->st_statistics.st_x, thread))
+						goto failed;
+					len++;
+				}
+				if (!xt_pwrite_file(of, len, strlen(table_name), table_name, &thread->st_statistics.st_x, thread))
+					goto failed;
+				len += strlen(table_name);
+				if (!xt_set_eof_file(NULL, of, len))
+					goto failed;
+			}
+			break;
+		case REP_DEL:
+			if (found) {
+				if (*ptr != '\0')
+					ptr++;
+				memmove(name, ptr, len - (ptr - buffer));
+				len = len - (ptr - name);
+
+				/* Remove trailing empty lines: */
+				while (len > 0) {
+					if (buffer[len-1] != '\n' && buffer[len-1] != '\r')
+						break;
+					len--;
+				}
+
+				if (len > 0) {
+					if (!xt_pwrite_file(of, 0, len, buffer, &thread->st_statistics.st_x, thread))
+						goto failed;
+					if (!xt_set_eof_file(NULL, of, len))
+						goto failed;
+				}
+			}
+			break;
+	}
+
+	xt_close_file_ns(of);
+	xt_free_ns(buffer);
+
+	if (len == 0)
+		xt_fs_delete(NULL, file_path);
+	return found;
+
+	failed:
+	if (of)
+		xt_close_file_ns(of);
+	if (buffer)
+		xt_free_ns(buffer);
+	xt_log_and_clear_exception(thread);
+	return FALSE;
+}
+
+xtPublic void tab_make_table_name(XTTableHPtr tab, char *table_name, size_t size)
+{
+	char	name_buf[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+	xt_2nd_last_name_of_path(sizeof(name_buf), name_buf, tab->tab_name->ps_path);
+	myxt_static_convert_file_name(name_buf, table_name, size);
+	xt_strcat(size, table_name, ".");
+	myxt_static_convert_file_name(xt_last_name_of_path(tab->tab_name->ps_path), name_buf, sizeof(name_buf));
+	xt_strcat(size, table_name, name_buf);
+}
+
+xtPublic xtBool xt_tab_is_table_repair_pending(XTTableHPtr tab)
+{
+	char table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+	tab_make_table_name(tab, table_name, sizeof(table_name));
+	return tab_exec_repair_pending(tab->tab_db, REP_FIND, table_name);
+}
+
+xtPublic void xt_tab_table_repaired(XTTableHPtr tab)
+{
+	if (tab->tab_repair_pending) {
+		char table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+		tab->tab_repair_pending = FALSE;
+		tab_make_table_name(tab, table_name, sizeof(table_name));
+		tab_exec_repair_pending(tab->tab_db, REP_DEL, table_name);
+	}
+}
+
+xtPublic void xt_tab_set_table_repair_pending(XTTableHPtr tab)
+{
+	if (!tab->tab_repair_pending) {
+		char table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+		tab->tab_repair_pending = TRUE;
+		tab_make_table_name(tab, table_name, sizeof(table_name));
+		tab_exec_repair_pending(tab->tab_db, REP_ADD, table_name);
+	}
+}

=== modified file 'storage/pbxt/src/table_xt.h'
--- a/storage/pbxt/src/table_xt.h	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/table_xt.h	2009-08-18 07:46:53 +0000
@@ -45,7 +45,17 @@ struct XTTablePath;
 #define XT_TAB_INCOMPATIBLE_VERSION	4
 #define XT_TAB_CURRENT_VERSION		5
 
-#define XT_IND_CURRENT_VERSION		3
+/* This version of the index does not have lazy
+ * delete. The new version is compatible with
+ * this and maintains the old format.
+ */
+#define XT_IND_NO_LAZY_DELETE		3
+#define XT_IND_LAZY_DELETE_OK		4
+#ifdef XT_USE_LAZY_DELETE
+#define XT_IND_CURRENT_VERSION		XT_IND_LAZY_DELETE_OK
+#else
+#define XT_IND_CURRENT_VERSION		XT_IND_NO_LAZY_DELETE
+#endif
 
 #define XT_HEAD_BUFFER_SIZE			1024
 
@@ -100,15 +110,21 @@ struct XTTablePath;
 #define XT_TAB_POOL_CLOSED			3				/* Cannot open table at the moment, the pool is closed. */
 #define XT_TAB_FAILED				4
 
-#define XT_TAB_ROW_USE_RW_MUTEX
+#ifdef XT_NO_ATOMICS
+#define XT_TAB_ROW_USE_PTHREAD_RW
+#else
+//#define XT_TAB_ROW_USE_RWMUTEX
+//#define XT_TAB_ROW_USE_SPINXSLOCK
+#define XT_TAB_ROW_USE_XSMUTEX
+#endif
 
-#ifdef XT_TAB_ROW_USE_FASTWRLOCK
-#define XT_TAB_ROW_LOCK_TYPE			XTFastRWLockRec
-#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_fastrwlock_init(s, i)
-#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_fastrwlock_free(s, i)	
-#define XT_TAB_ROW_READ_LOCK(i, s)		xt_fastrwlock_slock(i, s)
-#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_fastrwlock_xlock(i, s)
-#define XT_TAB_ROW_UNLOCK(i, s)			xt_fastrwlock_unlock(i, s)
+#ifdef XT_TAB_ROW_USE_XSMUTEX
+#define XT_TAB_ROW_LOCK_TYPE			XTXSMutexRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_xsmutex_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_xsmutex_slock(i, (s)->t_id)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_xsmutex_xlock(i, (s)->t_id)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_xsmutex_unlock(i, (s)->t_id)
 #elif defined(XT_TAB_ROW_USE_PTHREAD_RW)
 #define XT_TAB_ROW_LOCK_TYPE			xt_rwlock_type
 #define XT_TAB_ROW_INIT_LOCK(s, i)		xt_init_rwlock(s, i)
@@ -116,16 +132,23 @@ struct XTTablePath;
 #define XT_TAB_ROW_READ_LOCK(i, s)		xt_slock_rwlock_ns(i)
 #define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
 #define XT_TAB_ROW_UNLOCK(i, s)			xt_unlock_rwlock_ns(i)
-#elif defined(XT_TAB_ROW_USE_RW_MUTEX)
+#elif defined(XT_TAB_ROW_USE_RWMUTEX)
 #define XT_TAB_ROW_LOCK_TYPE			XTRWMutexRec
 #define XT_TAB_ROW_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, i)
 #define XT_TAB_ROW_FREE_LOCK(s, i)		xt_rwmutex_free(s, i)	
 #define XT_TAB_ROW_READ_LOCK(i, s)		xt_rwmutex_slock(i, (s)->t_id)
 #define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_rwmutex_xlock(i, (s)->t_id)
 #define XT_TAB_ROW_UNLOCK(i, s)			xt_rwmutex_unlock(i, (s)->t_id)
+#elif defined(XT_TAB_ROW_USE_SPINXSLOCK)
+#define XT_TAB_ROW_LOCK_TYPE			XTSpinXSLockRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_spinxslock_slock(i, (s)->t_id)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_spinxslock_xlock(i, (s)->t_id)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_spinxslock_unlock(i, (s)->t_id)
 #else
 #define XT_TAB_ROW_LOCK_TYPE			XTSpinLockRec
-#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_spinlock_init(s, i)
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_spinlock_init_with_autoname(s, i)
 #define XT_TAB_ROW_FREE_LOCK(s, i)		xt_spinlock_free(s, i)	
 #define XT_TAB_ROW_READ_LOCK(i, s)		xt_spinlock_lock(i)
 #define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_spinlock_lock(i)
@@ -310,6 +333,7 @@ typedef struct XTTable : public XTHeap {
 	/* Values that belong in the header when flushed! */
 	xtBool					tab_flush_pending;						/* TRUE if the table needs to be flushed */
 	xtBool					tab_recovery_done;						/* TRUE if the table has been recovered */
+	xtBool					tab_repair_pending;						/* TRUE if the table has been marked for repair */
 	xtBool					tab_temporary;							/* TRUE if this is a temporary table {TEMP-TABLES}. */
 	off_t					tab_bytes_to_flush;						/* Number of bytes of the record/row files to flush. */
 
@@ -441,6 +465,9 @@ typedef struct XTOpenTable {
 	xtRecordID				ot_seq_rec_id;							/* Current position of a sequential scan. */
 	xtRecordID				ot_seq_eof_id;							/* The EOF at the start of the sequential scan. */
 	XTTabCachePagePtr		ot_seq_page;							/* If ot_seq_buffer is non-NULL, then a page has been locked! */
+	xtWord1					*ot_seq_data;							/* Non-NULL if the data references memory mapped memory, or if it was
+																	 * allocated if no memory mapping is being used.
+																	 */
 	xtBool					ot_on_page;
 	size_t					ot_seq_offset;							/* Offset on the current page. */
 } XTOpenTableRec, *XTOpenTablePtr;
@@ -488,7 +515,7 @@ XTTableHPtr			xt_use_table_no_lock(XTThr
 int					xt_use_table_by_id(struct XTThread *self, XTTableHPtr *tab, struct XTDatabase *db, xtTableID tab_id);
 XTOpenTablePtr		xt_open_table(XTTableHPtr tab);
 void				xt_close_table(XTOpenTablePtr ot, xtBool flush, xtBool have_table_lock);
-void				xt_drop_table(struct XTThread *self, XTPathStrPtr name);
+void				xt_drop_table(struct XTThread *self, XTPathStrPtr name, xtBool drop_db);
 void				xt_check_table(XTThreadPtr self, XTOpenTablePtr tab);
 void				xt_rename_table(struct XTThread *self, XTPathStrPtr old_name, XTPathStrPtr new_name);
 
@@ -536,8 +563,13 @@ xtBool				xt_tab_put_eof_rec_data(XTOpen
 xtBool				xt_tab_put_log_op_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer);
 xtBool				xt_tab_put_log_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq);
 xtBool				xt_tab_get_rec_data(register XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer);
+void				xt_tab_disable_index(XTTableHPtr tab, u_int ind_error);
 void				xt_tab_set_index_error(XTTableHPtr tab);
 
+xtBool				xt_tab_is_table_repair_pending(XTTableHPtr tab);
+void				xt_tab_table_repaired(XTTableHPtr tab);
+void				xt_tab_set_table_repair_pending(XTTableHPtr tab);
+
 inline off_t		xt_row_id_to_row_offset(register XTTableHPtr tab, xtRowID row_id)
 {
 	return (off_t) tab->tab_rows.tci_header_size + (off_t) (row_id - 1) * (off_t) tab->tab_rows.tci_rec_size;
@@ -605,3 +637,4 @@ inline xtIndexNodeID xt_ind_offset_to_no
 	while (0)
 
 #endif
+

=== modified file 'storage/pbxt/src/thread_xt.cc'
--- a/storage/pbxt/src/thread_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/thread_xt.cc	2009-08-17 11:12:36 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #ifndef XT_WIN
 #include <unistd.h>
 #include <sys/time.h>
@@ -177,7 +181,7 @@ static void thr_log_newline(XTThreadPtr 
 #endif
 #endif
 
-void xt_log_flush(XTThreadPtr self __attribute__((unused)))
+void xt_log_flush(XTThreadPtr XT_UNUSED(self))
 {
 	fflush(log_file);
 }
@@ -466,7 +470,7 @@ static void thr_free_resources(XTThreadP
 	}
 }
 
-xtPublic void xt_bug(XTThreadPtr self __attribute__((unused)))
+xtPublic void xt_bug(XTThreadPtr XT_UNUSED(self))
 {
 	static int *bug_ptr = NULL;
 	
@@ -532,7 +536,11 @@ xtPublic void xt_throw_error(XTThreadPtr
 
 #define XT_SYS_ERR_SIZE		300
 
-static c_char *thr_get_sys_error(int err, char *err_msg __attribute__((unused)))
+#ifdef XT_WIN
+static c_char *thr_get_sys_error(int err, char *err_msg)
+#else
+static c_char *thr_get_sys_error(int err, char *XT_UNUSED(err_msg))
+#endif
 {
 #ifdef XT_WIN
 	char *ptr;
@@ -638,7 +646,7 @@ static c_char *thr_get_err_string(int xt
 		case XT_ERR_INDEX_CORRUPTED:		str = "Table `%s` index is corrupted, REPAIR TABLE required"; break;
 		case XT_ERR_NO_INDEX_CACHE:			str = "Not enough index cache memory to handle concurrent updates"; break;
 		case XT_ERR_INDEX_LOG_CORRUPT:		str = "Index log corrupt: '%s'"; break;
-		case XT_ERR_TOO_MANY_THREADS:		str = "Too many threads: %s, increase max_connections"; break;
+		case XT_ERR_TOO_MANY_THREADS:		str = "Too many threads: %s, increase pbxt_max_threads"; break;
 		case XT_ERR_TOO_MANY_WAITERS:		str = "Too many waiting threads: %s"; break;
 		case XT_ERR_INDEX_OLD_VERSION:		str = "Table `%s` index created by an older version, REPAIR TABLE required"; break;
 		case XT_ERR_PBXT_TABLE_EXISTS:		str = "System table cannot be dropped because PBXT table still exists"; break;
@@ -869,13 +877,18 @@ xtPublic void xt_log_errno(XTThreadPtr s
  * -----------------------------------------------------------------------
  * Assertions and failures (one breakpoints for all failures)
  */
+//#define CRASH_ON_ASSERT
 
-xtPublic xtBool xt_assert(XTThreadPtr self __attribute__((unused)), c_char *expr, c_char *func, c_char *file, u_int line)
+xtPublic xtBool xt_assert(XTThreadPtr self, c_char *expr, c_char *func, c_char *file, u_int line)
 {
+	(void) self;
 #ifdef DEBUG
 	//xt_set_fflush(TRUE);
 	//xt_dump_trace();
 	printf("%s(%s:%d) %s\n", func, file, (int) line, expr);
+#ifdef CRASH_ON_ASSERT
+	abort();
+#endif
 #ifdef XT_WIN
 	FatalAppExit(0, "Assertion Failed!");
 #endif
@@ -981,11 +994,13 @@ static xtBool thr_setup_signals(void)
 }
 #endif
 
-static void *thr_main(void *data)
+typedef void *(*ThreadMainFunc)(XTThreadPtr self);
+
+extern "C" void *thr_main(void *data)
 {
 	ThreadDataPtr	td = (ThreadDataPtr) data;
 	XTThreadPtr		self = td->td_thr;
-	void			*(*start_routine)(XTThreadPtr);
+	ThreadMainFunc		start_routine;
 	void			*return_data;
 
 	enter_();
@@ -1857,7 +1872,7 @@ xtPublic void xt_signal_thread(XTThreadP
 	xt_broadcast_cond_ns(&target->t_cond);
 }
 
-xtPublic void xt_terminate_thread(XTThreadPtr self __attribute__((unused)), XTThreadPtr target)
+xtPublic void xt_terminate_thread(XTThreadPtr XT_UNUSED(self), XTThreadPtr target)
 {
 	target->t_quit = TRUE;
 	target->t_delayed_signal = SIGTERM;

=== modified file 'storage/pbxt/src/thread_xt.h'
--- a/storage/pbxt/src/thread_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/thread_xt.h	2009-08-17 11:12:36 +0000
@@ -44,14 +44,6 @@
  * Macros and defines
  */
 
-#ifdef XT_WIN
-#define __FUNC__						__FUNCTION__
-#elif defined(XT_SOLARIS)
-#define __FUNC__						"__func__"
-#else
-#define __FUNC__						__PRETTY_FUNCTION__
-#endif
-
 #define XT_ERR_MSG_SIZE					(PATH_MAX + 200)
 
 #ifdef DEBUG
@@ -291,6 +283,12 @@ typedef struct XTThread {
 	xtBool					st_xact_long_running;			/* TRUE if this is a long running writer transaction. */
 	xtWord4					st_visible_time;				/* Transactions committed before this time are visible. */
 	XTDataLogBufferRec		st_dlog_buf;
+	
+	/* A list of the last 10 transactions run by this connection: */
+#ifdef XT_WAIT_FOR_CLEANUP
+	u_int					st_last_xact;
+	xtXactID				st_prev_xact[XT_MAX_XACT_BEHIND];
+#endif
 
 	int						st_xact_mode;					/* The transaction mode. */
 	xtBool					st_ignore_fkeys;				/* TRUE if we must ignore foreign keys. */

=== modified file 'storage/pbxt/src/trace_xt.cc'
--- a/storage/pbxt/src/trace_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/trace_xt.cc	2009-08-17 11:12:36 +0000
@@ -72,6 +72,12 @@ xtPublic xtBool xt_init_trace(void)
 	trace_log_offset = 0;
 	trace_log_end = 0;
 	trace_stat_count = 0;
+
+#ifdef XT_TRACK_CONNECTIONS
+	for (int i=0; i<XT_TRACK_MAX_CONNS; i++)
+		xt_track_conn_info[i].cu_t_id = i;
+#endif
+
 	return TRUE;
 }
 
@@ -343,3 +349,45 @@ xtPublic void xt_ftracef(char *fmt, ...)
 	va_end(ap);
 }
 
+/*
+ * -----------------------------------------------------------------------
+ * CONNECTION TRACKING
+ */
+
+#ifdef XT_TRACK_CONNECTIONS
+XTConnInfoRec	xt_track_conn_info[XT_TRACK_MAX_CONNS];
+
+static int trace_comp_conn_info(const void *a, const void *b)
+{
+	XTConnInfoPtr	ci_a = (XTConnInfoPtr) a, ci_b = (XTConnInfoPtr) b;
+
+	if (ci_a->ci_curr_xact_id > ci_b->ci_curr_xact_id)
+		return 1;
+	if (ci_a->ci_curr_xact_id < ci_b->ci_curr_xact_id)
+		return -1;
+	return 0;
+}
+
+xtPublic void xt_dump_conn_tracking(void)
+{
+	XTConnInfoRec	conn_info[XT_TRACK_MAX_CONNS];
+	XTConnInfoPtr	ptr;
+
+	memcpy(conn_info, xt_track_conn_info, sizeof(xt_track_conn_info));
+	qsort(conn_info, XT_TRACK_MAX_CONNS, sizeof(XTConnInfoRec), trace_comp_conn_info);
+
+	ptr = conn_info;
+	for (int i=0; i<XT_TRACK_MAX_CONNS; i++) {
+		if (ptr->ci_curr_xact_id || ptr->ci_prev_xact_id) {
+			printf("%3d curr=%d prev=%d prev-time=%ld\n", (int) ptr->cu_t_id, (int) ptr->ci_curr_xact_id, (int) ptr->ci_prev_xact_id, (long) ptr->ci_prev_xact_time);
+			if (i+1<XT_TRACK_MAX_CONNS) {
+				printf("    diff=%d\n", (int) (ptr+1)->ci_curr_xact_id - (int) ptr->ci_curr_xact_id);
+			}
+		}
+		ptr++;
+	}
+}
+
+#endif
+
+

=== modified file 'storage/pbxt/src/trace_xt.h'
--- a/storage/pbxt/src/trace_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/trace_xt.h	2009-08-17 11:12:36 +0000
@@ -46,4 +46,29 @@ void	xt_ftracef(char *fmt, ...);
 //#define PBXT_HANDLER_TRACE
 #endif
 
+/*
+ * -----------------------------------------------------------------------
+ * CONNECTION TRACKING
+ */
+
+#define XT_TRACK_CONNECTIONS
+
+#ifdef XT_TRACK_CONNECTIONS
+#define XT_TRACK_MAX_CONNS		500
+
+typedef struct XTConnInfo {
+	xtThreadID			cu_t_id;
+	xtXactID			ci_curr_xact_id;
+	xtWord8				ci_xact_start;
+
+	xtXactID			ci_prev_xact_id;
+	xtWord8				ci_prev_xact_time;
+} XTConnInfoRec, *XTConnInfoPtr;
+
+extern XTConnInfoRec xt_track_conn_info[XT_TRACK_MAX_CONNS];
+
+void	xt_dump_conn_tracking(void);
+
+#endif
+
 #endif

=== modified file 'storage/pbxt/src/util_xt.cc'
--- a/storage/pbxt/src/util_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/util_xt.cc	2009-08-17 11:12:36 +0000
@@ -61,7 +61,7 @@ xtPublic xtWord8 xt_time_now(void)
 	return ms;
 }
 
-xtPublic void xt_free_nothing(struct XTThread XT_UNUSED(*thr), void XT_UNUSED(*x))
+xtPublic void xt_free_nothing(struct XTThread *XT_UNUSED(thread), void *XT_UNUSED(x))
 {
 }
 

=== modified file 'storage/pbxt/src/xaction_xt.cc'
--- a/storage/pbxt/src/xaction_xt.cc	2009-04-02 10:03:14 +0000
+++ b/storage/pbxt/src/xaction_xt.cc	2009-08-17 11:12:36 +0000
@@ -23,6 +23,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <time.h>
 #include <signal.h>
 
@@ -48,7 +52,7 @@
 #endif
 
 static void xn_sw_wait_for_xact(XTThreadPtr self, XTDatabaseHPtr db, u_int hsecs);
-static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)), int *flags, xtXactID *start, xtXactID *end, xtThreadID *thd_id);
+static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), int *flags, xtXactID *start, xtXactID *end, xtThreadID *thd_id);
 static xtBool xn_get_xact_pointer(XTDatabaseHPtr db, xtXactID xn_id, XTXactDataPtr *xact_ptr);
 
 /* ============================================================================================== */
@@ -203,7 +207,7 @@ typedef struct XNWaitFor {
 	xtXactID				wf_for_me_xn_id;		/* The transaction we are waiting for. */
 } XNWaitForRec, *XNWaitForPtr;
 
-static int xn_compare_wait_for(XTThreadPtr XT_UNUSED(self), register const void XT_UNUSED(*thunk), register const void *a, register const void *b)
+static int xn_compare_wait_for(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
 {
 	xtXactID		*x = (xtXactID *) a;
 	XNWaitForPtr	y = (XNWaitForPtr) b;
@@ -215,7 +219,7 @@ static int xn_compare_wait_for(XTThreadP
 	return 1;
 }
 
-static void xn_free_wait_for(XTThreadPtr XT_UNUSED(self), void XT_UNUSED(*thunk), void XT_UNUSED(*item))
+static void xn_free_wait_for(XTThreadPtr XT_UNUSED(self), void *XT_UNUSED(thunk), void *XT_UNUSED(item))
 {
 }
 
@@ -446,7 +450,9 @@ xtPublic xtBool xt_xn_wait_for_xact(XTTh
 			xt_timed_wait_cond_ns(&my_wt->wt_cond, &my_wt->wt_lock, WAIT_FOR_XACT_TIME);
 		}
 
+		/* Unreachable
 		xt_unlock_mutex_ns(&my_wt->wt_lock);
+		*/
 	}
 
 	if (xw) {
@@ -753,12 +759,13 @@ xtPublic xtXactID xt_xn_get_curr_id(XTDa
 	return curr_xn_id;
 }
 
-xtPublic XTXactDataPtr xt_xn_add_old_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)))
+xtPublic XTXactDataPtr xt_xn_add_old_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
 {
 	register XTXactDataPtr	xact;
 	register XTXactSegPtr 	seg;
 	register XTXactDataPtr	*hash;
 
+	(void) thread;
 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
 	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
 	hash = &seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
@@ -778,7 +785,7 @@ xtPublic XTXactDataPtr xt_xn_add_old_xac
 		 */
 		db->db_sw_faster |= XT_SW_NO_MORE_XACT_SLOTS;
 		if (!(xact = (XTXactDataPtr) xt_malloc_ns(sizeof(XTXactDataRec)))) {
-			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 			return NULL;
 		}
 	}
@@ -797,7 +804,7 @@ xtPublic XTXactDataPtr xt_xn_add_old_xac
 		seg->xs_last_xn_id = xn_id;
 
 	done_ok:
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 #ifdef HIGH_X
 	tot_alloced++;
 	if (tot_alloced > high_alloced)
@@ -806,12 +813,13 @@ xtPublic XTXactDataPtr xt_xn_add_old_xac
 	return xact;
 }
 
-static XTXactDataPtr xn_add_new_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)))
+static XTXactDataPtr xn_add_new_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
 {
 	register XTXactDataPtr	xact;
 	register XTXactSegPtr 	seg;
 	register XTXactDataPtr	*hash;
 
+	(void) thread;
 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
 	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
 	hash = &seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
@@ -825,7 +833,7 @@ static XTXactDataPtr xn_add_new_xact(XTD
 		 */
 		db->db_sw_faster |= XT_SW_NO_MORE_XACT_SLOTS;
 		if (!(xact = (XTXactDataPtr) xt_malloc_ns(sizeof(XTXactDataRec)))) {
-			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 			return NULL;
 		}
 	}
@@ -841,7 +849,7 @@ static XTXactDataPtr xn_add_new_xact(XTD
 	xact->xd_flags = 0;
 
 	seg->xs_last_xn_id = xn_id;
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 #ifdef HIGH_X
 	tot_alloced++;
 	if (tot_alloced > high_alloced)
@@ -850,7 +858,7 @@ static XTXactDataPtr xn_add_new_xact(XTD
 	return xact;
 }
 
-static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)), int *flags, xtXactID *start, xtWord4 *end, xtThreadID *thd_id)
+static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), int *flags, xtXactID *start, xtWord4 *end, xtThreadID *thd_id)
 {
 	register XTXactSegPtr 	seg;
 	register XTXactDataPtr	xact;
@@ -874,7 +882,7 @@ static xtBool xn_get_xact_details(XTData
 		}
 		xact = xact->xd_next_xact;
 	}
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
 	return found;
 }
 
@@ -900,11 +908,11 @@ static xtBool xn_get_xact_pointer(XTData
 		}
 		xact = xact->xd_next_xact;
 	}
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
 	return found;
 }
 
-static xtBool xn_get_xact_start(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)), xtLogID *log_id, xtLogOffset *log_offset)
+static xtBool xn_get_xact_start(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), xtLogID *log_id, xtLogOffset *log_offset)
 {
 	register XTXactSegPtr 	seg;
 	register XTXactDataPtr	xact;
@@ -922,12 +930,12 @@ static xtBool xn_get_xact_start(XTDataba
 		}
 		xact = xact->xd_next_xact;
 	}
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
 	return found;
 }
 
 /* NOTE: this function may only be used by the sweeper or the recovery process. */
-xtPublic XTXactDataPtr xt_xn_get_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)))
+xtPublic XTXactDataPtr xt_xn_get_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread))
 {
 	register XTXactSegPtr 	seg;
 	register XTXactDataPtr	xact;
@@ -940,7 +948,7 @@ xtPublic XTXactDataPtr xt_xn_get_xact(XT
 			break;
 		xact = xact->xd_next_xact;
 	}
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
 	return xact;
 }
 
@@ -948,11 +956,12 @@ xtPublic XTXactDataPtr xt_xn_get_xact(XT
  * Delete a transaction, return TRUE if the transaction
  * was found.
  */
-xtPublic xtBool xt_xn_delete_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread __attribute__((unused)))
+xtPublic xtBool xt_xn_delete_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
 {
 	XTXactDataPtr	xact, pxact = NULL;
 	XTXactSegPtr 	seg;
 
+	(void) thread;
 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
 	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
 	xact = seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
@@ -963,13 +972,13 @@ xtPublic xtBool xt_xn_delete_xact(XTData
 			else
 				 seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE] = xact->xd_next_xact;
 			xn_free_xact(db, seg, xact);
-			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 			return TRUE;
 		}
 		pxact = xact;
 		xact = xact->xd_next_xact;
 	}
-	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread);
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
 	return FALSE;
 }
 
@@ -1253,6 +1262,10 @@ xtPublic xtBool xt_xn_begin(XTThreadPtr 
 #ifdef TRACE_TRANSACTION
 	xt_ttracef(self, "BEGIN T%lu\n", (u_long) self->st_xact_data->xd_start_xn_id);
 #endif
+#ifdef XT_TRACK_CONNECTIONS
+	xt_track_conn_info[self->t_id].ci_curr_xact_id = self->st_xact_data->xd_start_xn_id;
+	xt_track_conn_info[self->t_id].ci_xact_start = xt_trace_clock();
+#endif
 	return OK;
 }
 
@@ -1375,6 +1388,13 @@ static xtBool xn_end_xact(XTThreadPtr th
 
 		thread->st_xact_data = NULL;
 
+#ifdef XT_TRACK_CONNECTIONS
+		xt_track_conn_info[thread->t_id].ci_prev_xact_id = xt_track_conn_info[thread->t_id].ci_curr_xact_id;
+		xt_track_conn_info[thread->t_id].ci_prev_xact_time = xt_trace_clock() - xt_track_conn_info[thread->t_id].ci_xact_start;
+		xt_track_conn_info[thread->t_id].ci_curr_xact_id = 0;
+		xt_track_conn_info[thread->t_id].ci_xact_start = 0;
+#endif
+
 		xt_xn_wakeup_waiting_threads(thread);
 
 		/* {WAKE-SW} Waking the sweeper
@@ -1401,6 +1421,19 @@ static xtBool xn_end_xact(XTThreadPtr th
 
 		/* Don't get too far ahead of the sweeper! */
 		if (writer) {
+#ifdef XT_WAIT_FOR_CLEANUP
+			xtXactID	wait_xn_id;
+			
+			/* This is the transaction that was committed 3 transactions ago: */
+			wait_xn_id = thread->st_prev_xact[thread->st_last_xact];
+			thread->st_prev_xact[thread->st_last_xact] = xn_id;
+			/* This works because XT_MAX_XACT_BEHIND == 2! */
+			ASSERT_NS((thread->st_last_xact + 1) % XT_MAX_XACT_BEHIND == thread->st_last_xact ^ 1);
+			thread->st_last_xact ^= 1;
+			while (xt_xn_is_before(db->db_xn_to_clean_id, wait_xn_id) && (db->db_sw_faster & XT_SW_TOO_FAR_BEHIND)) {
+				xt_critical_wait();
+			}
+#else
 			if ((db->db_sw_faster & XT_SW_TOO_FAR_BEHIND) != 0) {
 				xtWord8	then = xt_trace_clock() + (xtWord8) 20000;
 
@@ -1412,6 +1445,7 @@ static xtBool xn_end_xact(XTThreadPtr th
 						break;
 				}
 			}
+#endif
 		}
 	}
 	return ok;
@@ -1854,7 +1888,7 @@ static xtBool xn_sw_cleanup_done(XTThrea
 	return FALSE;
 }
 
-static void xn_sw_clean_indices(XTThreadPtr self __attribute__((unused)), XTOpenTablePtr ot, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_data, xtWord1 *rec_buffer)
+static void xn_sw_clean_indices(XTThreadPtr XT_NDEBUG_UNUSED(self), XTOpenTablePtr ot, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_data, xtWord1 *rec_buffer)
 {
 	XTTableHPtr	tab = ot->ot_table;
 	u_int		cols_req;
@@ -2599,7 +2633,13 @@ xtPublic void xt_wait_for_sweeper(XTThre
 
 	if (db->db_sw_thread) {
 		then = time(NULL);
-		while (!xt_xn_is_before(xt_xn_get_curr_id(db), db->db_xn_to_clean_id)) { // was db->db_xn_to_clean_id <= xt_xn_get_curr_id(db)
+		/* Changed xt_xn_get_curr_id(db) to db->db_xn_curr_id,
+		 * This should work because we are not concerned about the difference
+		 * between xt_xn_get_curr_id(db) and db->db_xn_curr_id,
+		 * Which is just a matter of when transactions we can expect to find
+		 * in memory (see {GAP-INC-ADD-XACT})
+		 */
+		while (!xt_xn_is_before(db->db_xn_curr_id, db->db_xn_to_clean_id)) { // was db->db_xn_to_clean_id <= xt_xn_get_curr_id(db)
 			xt_lock_mutex(self, &db->db_sw_lock);
 			pushr_(xt_unlock_mutex, &db->db_sw_lock);
 			xt_wakeup_sweeper(db);

=== modified file 'storage/pbxt/src/xaction_xt.h'
--- a/storage/pbxt/src/xaction_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/xaction_xt.h	2009-08-17 11:12:36 +0000
@@ -36,28 +36,48 @@ struct XTOpenTable;
 
 #ifdef XT_USE_XACTION_DEBUG_SIZES
 
-#define XT_XN_DATA_ALLOC_COUNT	400
-#define XT_XN_SEGMENT_SHIFTS	1
-#define XT_XN_HASH_TABLE_SIZE	31
 #define XT_TN_NUMBER_INCREMENT	20
 #define XT_TN_MAX_TO_FREE		20
 #define XT_TN_MAX_TO_FREE_WASTE	3
 #define XT_TN_MAX_TO_FREE_CHECK	3
 #define XT_TN_MAX_TO_FREE_INC	3
 
+#define XT_XN_SEGMENT_SHIFTS	1
+
 #else
 
-#define XT_XN_DATA_ALLOC_COUNT	1250	// Number of pre-allocated transaction data structures per segment
-#define XT_XN_SEGMENT_SHIFTS	5		// (32)
-#define XT_XN_HASH_TABLE_SIZE	1279	// This is a prime number!
 #define XT_TN_NUMBER_INCREMENT	100		// The increment of the transaction number on restart
 #define XT_TN_MAX_TO_FREE		800		// The maximum size of the "to free" list
 #define XT_TN_MAX_TO_FREE_WASTE	400
 #define XT_TN_MAX_TO_FREE_CHECK	100		// Once we have exceeded the limit, we only try in intervals
 #define XT_TN_MAX_TO_FREE_INC	100
 
+//#define XT_XN_SEGMENT_SHIFTS	5		// (32)
+//#define XT_XN_SEGMENT_SHIFTS	6		// (64)
+//#define XT_XN_SEGMENT_SHIFTS	7		// (128)
+#define XT_XN_SEGMENT_SHIFTS	8		// (256)
+//#define XT_XN_SEGMENT_SHIFTS	9		// (512)
+
 #endif
 
+/* The hash table size (a prime number) */
+#if XT_XN_SEGMENT_SHIFTS == 1		// (1)
+#define XT_XN_HASH_TABLE_SIZE	1301
+#elif XT_XN_SEGMENT_SHIFTS == 5		// (32)
+#define XT_XN_HASH_TABLE_SIZE	1009
+#elif XT_XN_SEGMENT_SHIFTS == 6		// (64)
+#define XT_XN_HASH_TABLE_SIZE	503
+#elif XT_XN_SEGMENT_SHIFTS == 7		// (128)
+#define XT_XN_HASH_TABLE_SIZE	251
+#elif XT_XN_SEGMENT_SHIFTS == 8		// (256)
+#define XT_XN_HASH_TABLE_SIZE	127
+#elif XT_XN_SEGMENT_SHIFTS == 9		// (512)
+#define XT_XN_HASH_TABLE_SIZE	67
+#endif
+
+/* Number of pre-allocated transaction data structures per segment */
+#define XT_XN_DATA_ALLOC_COUNT	XT_XN_HASH_TABLE_SIZE
+
 #define XT_XN_NO_OF_SEGMENTS	(1 << XT_XN_SEGMENT_SHIFTS)
 #define XT_XN_SEGMENT_MASK		(XT_XN_NO_OF_SEGMENTS - 1)
 
@@ -94,36 +114,34 @@ typedef struct XTXactData {
 
 } XTXactDataRec, *XTXactDataPtr;
 
-#define XT_XACT_USE_SPINLOCK
+#ifdef XT_NO_ATOMICS
+#define XT_XACT_USE_PTHREAD_RW
+#else
+//#define XT_XACT_USE_SKEWRWLOCK
+#define XT_XACT_USE_SPINXSLOCK
+#endif
 
-#ifdef XT_XACT_USE_FASTWRLOCK
-#define XT_XACT_LOCK_TYPE				XTFastRWLockRec
-#define XT_XACT_INIT_LOCK(s, i)			xt_fastrwlock_init(s, i)
-#define XT_XACT_FREE_LOCK(s, i)			xt_fastrwlock_free(s, i)	
-#define XT_XACT_READ_LOCK(i, s)			xt_fastrwlock_slock(i, s)
-#define XT_XACT_WRITE_LOCK(i, s)		xt_fastrwlock_xlock(i, s)
-#define XT_XACT_UNLOCK(i, s)			xt_fastrwlock_unlock(i, s)
-#elif defined(XT_XACT_USE_PTHREAD_RW)
+#if defined(XT_XACT_USE_PTHREAD_RW)
 #define XT_XACT_LOCK_TYPE				xt_rwlock_type
 #define XT_XACT_INIT_LOCK(s, i)			xt_init_rwlock(s, i)
 #define XT_XACT_FREE_LOCK(s, i)			xt_free_rwlock(i)	
 #define XT_XACT_READ_LOCK(i, s)			xt_slock_rwlock_ns(i)
 #define XT_XACT_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
-#define XT_XACT_UNLOCK(i, s)			xt_unlock_rwlock_ns(i)
-#elif defined(XT_XACT_USE_RW_MUTEX)
-#define XT_XACT_LOCK_TYPE				XTRWMutexRec
-#define XT_XACT_INIT_LOCK(s, i)			xt_rwmutex_init(s, i)
-#define XT_XACT_FREE_LOCK(s, i)			xt_rwmutex_free(s, i)	
-#define XT_XACT_READ_LOCK(i, s)			xt_rwmutex_slock(i, (s)->t_id)
-#define XT_XACT_WRITE_LOCK(i, s)		xt_rwmutex_xlock(i, (s)->t_id)
-#define XT_XACT_UNLOCK(i, s)			xt_rwmutex_unlock(i, (s)->t_id)
+#define XT_XACT_UNLOCK(i, s, b)			xt_unlock_rwlock_ns(i)
+#elif defined(XT_XACT_USE_SPINXSLOCK)
+#define XT_XACT_LOCK_TYPE				XTSpinXSLockRec
+#define XT_XACT_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, i)
+#define XT_XACT_FREE_LOCK(s, i)			xt_spinxslock_free(s, i)	
+#define XT_XACT_READ_LOCK(i, s)			xt_spinxslock_slock(i)
+#define XT_XACT_WRITE_LOCK(i, s)		xt_spinxslock_xlock(i, (s)->t_id)
+#define XT_XACT_UNLOCK(i, s, b)			xt_spinxslock_unlock(i, b)
 #else
-#define XT_XACT_LOCK_TYPE				XTSpinLockRec
-#define XT_XACT_INIT_LOCK(s, i)			xt_spinlock_init_with_autoname(s, i)
-#define XT_XACT_FREE_LOCK(s, i)			xt_spinlock_free(s, i)	
-#define XT_XACT_READ_LOCK(i, s)			xt_spinlock_lock(i)
-#define XT_XACT_WRITE_LOCK(i, s)		xt_spinlock_lock(i)
-#define XT_XACT_UNLOCK(i, s)			xt_spinlock_unlock(i)
+#define XT_XACT_LOCK_TYPE				XTSkewRWLockRec
+#define XT_XACT_INIT_LOCK(s, i)			xt_skewrwlock_init_with_autoname(s, i)
+#define XT_XACT_FREE_LOCK(s, i)			xt_skewrwlock_free(s, i)	
+#define XT_XACT_READ_LOCK(i, s)			xt_skewrwlock_slock(i)
+#define XT_XACT_WRITE_LOCK(i, s)		xt_skewrwlock_xlock(i, (s)->t_id)
+#define XT_XACT_UNLOCK(i, s, b)			xt_skewrwlock_unlock(i, b)
 #endif
 
 /* We store the transactions in a number of segments, each

=== modified file 'storage/pbxt/src/xactlog_xt.cc'
--- a/storage/pbxt/src/xactlog_xt.cc	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/xactlog_xt.cc	2009-08-18 07:46:53 +0000
@@ -28,6 +28,10 @@
 
 #include "xt_config.h"
 
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
 #include <signal.h>
 
 #include "xactlog_xt.h"
@@ -600,7 +604,12 @@ void XTDatabaseLog::xlog_setup(XTThreadP
 	
 		xt_init_mutex_with_autoname(self, &xl_write_lock);
 		xt_init_cond(self, &xl_write_cond);
+#ifdef XT_XLOG_WAIT_SPINS
 		xt_writing = 0;
+		xt_waiting = 0;
+#else
+		xt_writing = FALSE;
+#endif
 		xl_log_id = 0;
 		xl_log_file = 0;
 	
@@ -752,6 +761,7 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 	xtLogOffset	req_flush_log_offset;
 	size_t		part_size;
 	xtWord8		flush_time;
+	xtWord2		sum;
 
 	if (!size1) {
 		/* Just flush the buffer... */
@@ -790,13 +800,13 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 		 * enough space in the buffer, or a flush
 		 * is required.
 		 */
+		xtWord8	then;
 		 
 		/*
 		 * The objective of the following code is to
 		 * pick one writer, out of all threads.
-		 * The others rest will wait for the writer.
+		 * The rest will wait for the writer.
 		 */
-		xtBool i_am_writer;
 
 		if (write_reason == WR_FLUSH) {
 			/* Before we flush, check if we should wait for running
@@ -805,8 +815,7 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 			if (xl_db->db_xn_writer_count - xl_db->db_xn_writer_wait_count - xl_db->db_xn_long_running_count > 0 && xl_last_flush_time) {
 				/* Wait for about as long as the last flush took,
 				 * the idea is to saturate the disk with flushing...: */
-				xtWord8	then = xt_trace_clock() + (xtWord8) xl_last_flush_time;
-
+				then = xt_trace_clock() + (xtWord8) xl_last_flush_time;
 				for (;;) {
 					xt_critical_wait();
 					/* If a thread leaves this loop because times up, or
@@ -831,6 +840,55 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 			}
 		}
 
+#ifdef XT_XLOG_WAIT_SPINS
+		/* Spin for 1/1000s: */
+		then = xt_trace_clock() + (xtWord8) 1000;
+		for (;;) {
+			if (!xt_atomic_tas4(&xt_writing, 1))
+				break;
+
+			/* If I am not the writer, then I just waited for the
+			 * writer. So it may be that my requirements have now
+			 * been met!
+			 */
+			if (write_reason == WR_FLUSH) {
+				/* If the reason was to flush, then
+				 * check the last flush sequence, maybe it is past
+				 * our required sequence.
+				 */
+				if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) <= 0) {
+					/* The required flush position of the log is before
+					 * or equal to the actual flush position. This means the condition
+					 * for this thread has been satisfied (via group commit).
+					 * Nothing more to do!
+					 */
+					ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+					return OK;
+				}
+			}
+			else {
+				/* It may be that there is now space in the append buffer: */
+				if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers)
+					goto copy_to_log_buffer;
+			}
+
+			if (xt_trace_clock() >= then) {
+				xt_lock_mutex_ns(&xl_write_lock);
+				xt_waiting++;
+				if (!xt_timed_wait_cond_ns(&xl_write_cond, &xl_write_lock, 500)) {
+					xt_waiting--;
+					xt_unlock_mutex_ns(&xl_write_lock);
+					return FALSE;
+				}
+				xt_waiting--;
+				xt_unlock_mutex_ns(&xl_write_lock);
+			}
+			else
+				xt_critical_wait();
+		}
+#else
+		xtBool i_am_writer;
+
 		i_am_writer = FALSE;
 		xt_lock_mutex_ns(&xl_write_lock);
 		if (xt_writing) {
@@ -873,6 +931,7 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 				
 			goto write_log_to_file;
 		}
+#endif
 
 		/* I am the writer, check the conditions, again: */
 		if (write_reason == WR_FLUSH) {
@@ -881,8 +940,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 				/* The writers required flush position is before or equal
 				 * to the actual position, so the writer is done...
 				 */
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
 				xt_writing = FALSE;
 				xt_cond_wakeall(&xl_write_cond);
+#endif
 				ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
 				return OK;
 			}
@@ -923,8 +988,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 						xt_unlock_mutex_ns(&xl_db->db_wr_lock);
 					}
 				}
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
 				xt_writing = FALSE;
 				xt_cond_wakeall(&xl_write_cond);
+#endif
 				ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
 				return ok;
 			}
@@ -934,8 +1005,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 			 * to copy our data into the buffer:
 			 */
 			if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers) {
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
 				xt_writing = FALSE;
 				xt_cond_wakeall(&xl_write_cond);
+#endif
 				goto copy_to_log_buffer;
 			}
 		}
@@ -1055,6 +1132,7 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 				/* [(8)] Flush the compactor log. */
 				xt_lock_mutex_ns(&xl_db->db_co_dlog_lock);
 				if (!xl_db->db_co_thread->st_dlog_buf.dlb_flush_log(TRUE, thread)) {
+					xl_log_bytes_written -= part_size;
 					xt_unlock_mutex_ns(&xl_db->db_co_dlog_lock);
 					goto write_failed;
 				}
@@ -1063,8 +1141,10 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 
 			/* And flush if required: */
 			flush_time = thread->st_statistics.st_xlog.ts_flush_time;
-			if (!xt_flush_file(xl_log_file, &thread->st_statistics.st_xlog, thread))
+			if (!xt_flush_file(xl_log_file, &thread->st_statistics.st_xlog, thread)) {
+				xl_log_bytes_written -= part_size;
 				goto write_failed;
+			}
 			xl_last_flush_time = (u_int) (thread->st_statistics.st_xlog.ts_flush_time - flush_time);
 
 			xl_log_bytes_flushed = xl_log_bytes_written;
@@ -1085,8 +1165,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 				 * position, continue writing: */
 				goto rewrite;
 
+#ifdef XT_XLOG_WAIT_SPINS
+			xt_writing = 0;
+			if (xt_waiting)
+				xt_cond_wakeall(&xl_write_cond);
+#else
 			xt_writing = FALSE;
 			xt_cond_wakeall(&xl_write_cond);
+#endif
 			ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
 			return OK;
 		}
@@ -1100,8 +1186,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 		if (xl_append_buf_pos + size1 + size2 > xl_size_of_buffers)
 			goto rewrite;
 
+#ifdef XT_XLOG_WAIT_SPINS
+		xt_writing = 0;
+		if (xt_waiting)
+			xt_cond_wakeall(&xl_write_cond);
+#else
 		xt_writing = FALSE;
 		xt_cond_wakeall(&xl_write_cond);
+#endif
 	}
 
 	copy_to_log_buffer:
@@ -1146,8 +1238,6 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 		case XT_LOG_ENT_DELETE_BG:
 		case XT_LOG_ENT_DELETE_FL:
 		case XT_LOG_ENT_DELETE_FL_BG:
-			xtWord2 sum;
-			
 			sum = XT_GET_DISK_2(record->xu.xu_checksum_2) ^ XT_CHECKSUM_2(xl_append_log_id);
 			XT_SET_DISK_2(record->xu.xu_checksum_2, sum);
 
@@ -1158,6 +1248,10 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 				xl_db->db_xn_total_writer_count++;
 			}
 			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			sum = XT_GET_DISK_2(record->xu.xu_checksum_2) ^ XT_CHECKSUM_2(xl_append_log_id);
+			XT_SET_DISK_2(record->xu.xu_checksum_2, sum);
+			break;
 		case XT_LOG_ENT_ROW_NEW:
 		case XT_LOG_ENT_ROW_NEW_FL:
 			record->xl.xl_checksum_1 ^= XT_CHECKSUM_1(xl_append_log_id);
@@ -1209,8 +1303,14 @@ xtBool XTDatabaseLog::xlog_append(XTThre
 	return OK;
 
 	write_failed:
+#ifdef XT_XLOG_WAIT_SPINS
+	xt_writing = 0;
+	if (xt_waiting)
+		xt_cond_wakeall(&xl_write_cond);
+#else
 	xt_writing = FALSE;
 	xt_cond_wakeall(&xl_write_cond);
+#endif
 	return FAILED;
 }
 
@@ -1595,7 +1695,7 @@ void XTDatabaseLog::xlog_seq_close(XTXac
 	seq->xseq_log_eof = 0;
 }
 
-xtBool XTDatabaseLog::xlog_seq_start(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok __attribute__((unused)))
+xtBool XTDatabaseLog::xlog_seq_start(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, xtBool XT_UNUSED(missing_ok))
 {
 	if (seq->xseq_rec_log_id != log_id) {
 		seq->xseq_rec_log_id = log_id;
@@ -2094,7 +2194,9 @@ xtBool XTDatabaseLog::xlog_seq_next(XTXa
 			goto return_empty;
 		}
 		default:
-			ASSERT_NS(FALSE);
+			/* It is possible to land here after a crash, if the
+			 * log was not completely written.
+			 */
 			seq->xseq_record_len = 0;
 			goto return_empty;
 	}
@@ -2304,7 +2406,13 @@ static void xlog_wr_wait_for_log_flush(X
 				 * the wait, and the sweeper has nothing to do, and the checkpointer.
 				 */
 				if (db->db_xn_curr_id == last_xn_id &&
-					xt_xn_is_before(xt_xn_get_curr_id(db), db->db_xn_to_clean_id) && // db->db_xn_curr_id < db->db_xn_to_clean_id
+					/* Changed xt_xn_get_curr_id(db) to db->db_xn_curr_id,
+					 * This should work because we are not concerned about the difference
+					 * between xt_xn_get_curr_id(db) and db->db_xn_curr_id,
+					 * Which is just a matter of when transactions we can expect to find
+					 * in memory (see {GAP-INC-ADD-XACT})
+					 */
+					xt_xn_is_before(db->db_xn_curr_id, db->db_xn_to_clean_id) && // db->db_xn_curr_id < db->db_xn_to_clean_id
 					!db->db_restart.xres_is_checkpoint_pending(db->db_xlog.xl_write_log_id, db->db_xlog.xl_write_log_offset)) {
 					/* There seems to be no activity at the moment.
 					 * this might be a good time to write the log data.
@@ -2409,9 +2517,6 @@ static void xlog_wr_main(XTThreadPtr sel
 				if (!record) {
 					break;
 				}
-				/* Count the number of bytes read from the log: */
-				db->db_xlog.xl_log_bytes_read += ws->ws_seqread.xseq_record_len;
-
 				switch (record->xl.xl_status_1) {
 					case XT_LOG_ENT_HEADER:
 						break;
@@ -2435,6 +2540,8 @@ static void xlog_wr_main(XTThreadPtr sel
 						xt_xres_apply_in_order(self, ws, ws->ws_seqread.xseq_rec_log_id, ws->ws_seqread.xseq_rec_log_offset, record);
 						break;
 				}
+				/* Count the number of bytes read from the log: */
+				db->db_xlog.xl_log_bytes_read += ws->ws_seqread.xseq_record_len;
 			}
 		}
 

=== modified file 'storage/pbxt/src/xactlog_xt.h'
--- a/storage/pbxt/src/xactlog_xt.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/xactlog_xt.h	2009-08-17 11:12:36 +0000
@@ -373,6 +373,13 @@ typedef struct XTXactLogFile {
 /*
  * The transaction log. Each database has one.
  */
+ 
+/* Does not seem to make much difference... */
+#ifndef XT_NO_ATOMICS
+/* This function uses atomic ops: */
+//#define XT_XLOG_WAIT_SPINS
+#endif
+
 typedef struct XTDatabaseLog {
 	struct XTDatabase		*xl_db;
 
@@ -390,7 +397,12 @@ typedef struct XTDatabaseLog {
 	/* The writer log buffer: */
 	xt_mutex_type			xl_write_lock;
 	xt_cond_type			xl_write_cond;
+#ifdef XT_XLOG_WAIT_SPINS
+	xtWord4					xt_writing;						/* 1 if a thread is writing. */
+	xtWord4					xt_waiting;						/* Count of the threads waiting on the xl_write_cond. */
+#else
 	xtBool					xt_writing;						/* TRUE if a thread is writing. */
+#endif
 	xtLogID					xl_log_id;						/* The number of the write log. */
 	XTOpenFilePtr			xl_log_file;					/* The open write log. */
 

=== modified file 'storage/pbxt/src/xt_config.h'
--- a/storage/pbxt/src/xt_config.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/xt_config.h	2009-08-31 11:07:44 +0000
@@ -81,7 +81,8 @@ const int max_connections = 500;
 #define DEBUG
 #endif // _DEBUG
 #else
-#define XT_STREAMING
+// Paul suggested to disable PBMS in MariaDB for now.
+// #define PBMS_ENABLED
 #endif
 
 #ifdef __FreeBSD__
@@ -96,4 +97,22 @@ const int max_connections = 500;
 #define XT_SOLARIS
 #endif
 
+/*
+ * Definition of which atomic operations to use:
+ */
+#ifdef XT_WIN
+/* MS Studio style embedded assembler for x86 */
+#define XT_ATOMIC_WIN32_X86
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* Use GNU style embedded assembler for x86 */
+#define XT_ATOMIC_GNUC_X86
+#elif defined(XT_SOLARIS)
+/* Use Sun atomic operations library
+ * http://docs.sun.com/app/docs/doc/816-5168/atomic-ops-3c?a=view
+ */
+#define XT_ATOMIC_SOLARIS_LIB
+#else
+#define XT_NO_ATOMICS
+#endif
+
 #endif

=== modified file 'storage/pbxt/src/xt_defs.h'
--- a/storage/pbxt/src/xt_defs.h	2009-03-26 12:18:01 +0000
+++ b/storage/pbxt/src/xt_defs.h	2009-08-17 11:12:36 +0000
@@ -187,7 +187,16 @@ typedef struct XTPathStr {
 	char				ps_path[XT_VAR_LENGTH];
 } *XTPathStrPtr;
 
-#define XT_UNUSED(x)	x __attribute__((__unused__))
+//#define XT_UNUSED(x)		x __attribute__((__unused__))
+#define XT_UNUSED(x)
+
+/* Only used when DEBUG is on: */
+#ifdef DEBUG
+#define XT_NDEBUG_UNUSED(x)	x
+#else
+//#define XT_NDEBUG_UNUSED(x)	x __attribute__((__unused__))
+#define XT_NDEBUG_UNUSED(x)
+#endif
 
 /* ----------------------------------------------------------------------
  * MAIN CONSTANTS
@@ -267,8 +276,10 @@ typedef struct XTPathStr {
  * the row list is scanned.
  *
  * For more details see [(9)].
+ * 223, 1019, 3613
  */
-#define XT_ROW_RWLOCKS					223
+#define XT_ROW_RWLOCKS					1019
+//#define XT_ROW_RWLOCKS					223
 
 /*
  * These are the number of row lock "slots" per table.
@@ -292,31 +303,20 @@ typedef struct XTPathStr {
  */
 #define XT_OPEN_TABLE_FREE_TIME			30
 
-#ifdef XT_USE_GLOBAL_DEBUG_SIZES
-/*
- * DEBUG SIZES!
- * Reduce the thresholds to make things happen faster.
+/* Define this in order to use memory mapped files
+ * (record and row pointer files only).
+ *
+ * This makes no difference in sysbench R/W performance
+ * test.
  */
+//#define XT_USE_ROW_REC_MMAP_FILES
 
-//#undef XT_ROW_RWLOCKS
-//#define XT_ROW_RWLOCKS				2
-
-//#undef XT_TAB_MIN_VAR_REC_LENGTH
-//#define XT_TAB_MIN_VAR_REC_LENGTH		20
-
-//#undef XT_ROW_LOCK_COUNT
-//#define XT_ROW_LOCK_COUNT				(XT_ROW_RWLOCKS * 2)
-
-//#undef XT_INDEX_PAGE_SHIFTS
-//#define XT_INDEX_PAGE_SHIFTS			12
-
-//#undef XT_INDEX_WRITE_BUFFER_SIZE
-//#define XT_INDEX_WRITE_BUFFER_SIZE	(40 * 1024)
-
-#endif
-
-/* Define this in order to use memory mapped files: */
-#define XT_USE_ROW_REC_MMAP_FILES
+/* Define this if sequential scan should load data into the 
+ * record cache.
+ *
+ * This is the way InnoDB behaves.
+ */
+#define XT_SEQ_SCAN_LOADS_CACHE
 
 /* Define this in order to use direct I/O on index files: */
 /* NOTE: DO NOT ENABLE!
@@ -326,32 +326,34 @@ typedef struct XTPathStr {
  */
 //#define XT_USE_DIRECT_IO_ON_INDEX
 
-#ifdef XT_USE_ROW_REC_MMAP_FILES
-
-#define XT_SEQ_SCAN_FROM_MEMORY
-#define XT_ROW_REC_FILE_PTR				XTMapFilePtr
-#define XT_PWRITE_RR_FILE				xt_pwrite_fmap
-#define XT_PREAD_RR_FILE				xt_pread_fmap
-#define XT_FLUSH_RR_FILE				xt_flush_fmap
-#define XT_CLOSE_RR_FILE_NS				xt_close_fmap_ns
-
-#else
-
-#define XT_ROW_REC_FILE_PTR				XTOpenFilePtr
-#define XT_PWRITE_RR_FILE				xt_pwrite_file
-#define XT_PREAD_RR_FILE				xt_pread_file
-#define XT_FLUSH_RR_FILE				xt_flush_file
-#define XT_CLOSE_RR_FILE_NS				xt_close_file_ns
+/*
+ * Define this variable if PBXT should do lazy deleting in indexes
+ * Note, even if the variable is not defined, PBXT will handle
+ * lazy deleted items in an index.
+ *
+ * NOTE: This can cause significant degradation of index scan speed.
+ * 25% on sysbench readonly index scan tests.
+ */
+//#define XT_USE_LAZY_DELETE
 
-#endif
+/*
+ * Define this variable if a connection should wait for the
+ * sweeper to clean up previous transactions executed by the
+ * connection, before continuing.
+ *
+ * The number of transactions that the sweeper is allowed to
+ * lag can be dynamic, but there is a limit (XT_MAX_XACT_BEHIND)
+ */
+#define XT_WAIT_FOR_CLEANUP
 
-#ifdef XT_SEQ_SCAN_FROM_MEMORY
-#define XT_LOCK_MEMORY_PTR(x, f, a, s, v, c)	do { x = xt_lock_fmap_ptr(f, a, s, v, c); } while (0)
-#define XT_UNLOCK_MEMORY_PTR(f, v)				xt_unlock_fmap_ptr(f, v);
-#else
-#define XT_LOCK_MEMORY_PTR(x, f, a, v, c)
-#define XT_UNLOCK_MEMORY_PTR(f, v)
-#endif
+/*
+ * This seems to be the optimal value, at least according to
+ * sysbench/sysbench run --test=oltp --num-threads=128 --max-requests=50000 --mysql-user=root 
+ * --oltp-table-size=100000 --oltp-table-name=sb_pbxt --mysql-engine-trx=yes
+ *
+ * Using 8, 16 and 128 threads.
+ */
+#define XT_MAX_XACT_BEHIND				2
 
 /* {NO-ACTION-BUG}
  * Define this to implement NO ACTION correctly
@@ -405,6 +407,60 @@ typedef struct XTPathStr {
 #define XT_ADD_PTR(p, l)				((void *) ((char *) (p) + (l)))
 
 /* ----------------------------------------------------------------------
+ * DEFINES DEPENDENT ON  CONSTANTS
+ */
+
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+
+#define XT_ROW_REC_FILE_PTR						XTMapFilePtr
+#define XT_PWRITE_RR_FILE						xt_pwrite_fmap
+#define XT_PREAD_RR_FILE						xt_pread_fmap
+#define XT_FLUSH_RR_FILE						xt_flush_fmap
+#define XT_CLOSE_RR_FILE_NS						xt_close_fmap_ns
+
+#define XT_LOCK_MEMORY_PTR(x, f, a, s, v, c)	do { x = xt_lock_fmap_ptr(f, a, s, v, c); } while (0)
+#define XT_UNLOCK_MEMORY_PTR(f, d, e, v)		do { xt_unlock_fmap_ptr(f, v); d = NULL; } while (0)
+
+#else
+
+#define XT_ROW_REC_FILE_PTR						XTOpenFilePtr
+#define XT_PWRITE_RR_FILE						xt_pwrite_file
+#define XT_PREAD_RR_FILE						xt_pread_file
+#define XT_FLUSH_RR_FILE						xt_flush_file
+#define XT_CLOSE_RR_FILE_NS						xt_close_file_ns
+
+#define XT_LOCK_MEMORY_PTR(x, f, a, s, v, c)	do { if (!xt_lock_file_ptr(f, &x, a, s, v, c)) x = NULL; } while (0)
+#define XT_UNLOCK_MEMORY_PTR(f, d, e, v)		do { if (e) { xt_unlock_file_ptr(f, d, v); d = NULL; } } while (0)
+
+#endif
+
+/* ----------------------------------------------------------------------
+ * DEBUG SIZES!
+ * Reduce the thresholds to make things happen faster.
+ */
+
+#ifdef XT_USE_GLOBAL_DEBUG_SIZES
+
+//#undef XT_ROW_RWLOCKS
+//#define XT_ROW_RWLOCKS				2
+
+//#undef XT_TAB_MIN_VAR_REC_LENGTH
+//#define XT_TAB_MIN_VAR_REC_LENGTH		20
+
+//#undef XT_ROW_LOCK_COUNT
+//#define XT_ROW_LOCK_COUNT				(XT_ROW_RWLOCKS * 2)
+
+//#undef XT_INDEX_PAGE_SHIFTS
+//#define XT_INDEX_PAGE_SHIFTS			8	// 256
+//#undef XT_BLOCK_SIZE_FOR_DIRECT_IO
+//#define XT_BLOCK_SIZE_FOR_DIRECT_IO	256
+
+//#undef XT_INDEX_WRITE_BUFFER_SIZE
+//#define XT_INDEX_WRITE_BUFFER_SIZE	(40 * 1024)
+
+#endif
+
+/* ----------------------------------------------------------------------
  * BYTE ORDER
  */
 
@@ -645,6 +701,14 @@ typedef struct xtIndexNodeID {
 #define XT_XACT_ID_SIZE			4
 #define XT_CHECKSUM4_XACT(x)	(x)
 
+#ifdef XT_WIN
+#define __FUNC__				__FUNCTION__
+#elif defined(XT_SOLARIS)
+#define __FUNC__				"__func__"
+#else
+#define __FUNC__				__PRETTY_FUNCTION__
+#endif
+
 /* ----------------------------------------------------------------------
  * GLOBAL VARIABLES
  */
@@ -669,6 +733,7 @@ extern xtBool				pbxt_crash_debug;
 #define MYSQL_THD							Session *
 #define THR_THD								THR_Session
 #define STRUCT_TABLE						class Table
+#define TABLE_SHARE							TableShare	/* alias: TABLE_SHARE expands to TableShare (presumably the DRIZZLED branch -- confirm against the enclosing #ifdef) */
 
 #define MYSQL_TYPE_STRING					DRIZZLE_TYPE_VARCHAR
 #define MYSQL_TYPE_VARCHAR					DRIZZLE_TYPE_VARCHAR
@@ -687,6 +752,7 @@ extern xtBool				pbxt_crash_debug;
 
 #define mx_tmp_use_all_columns(x, y)		(x)->use_all_columns(y)
 #define mx_tmp_restore_column_map(x, y)		(x)->restore_column_map(y)
+#define MX_BIT_FAST_TEST_AND_SET(x, y)		bitmap_test_and_set(x, y)	/* this definition forwards to bitmap_test_and_set(); the other definition in this header forwards to bitmap_fast_test_and_set() */
 
 #define MX_TABLE_TYPES_T					handler::Table_flags
 #define MX_UINT8_T							uint8_t
@@ -696,6 +762,7 @@ extern xtBool				pbxt_crash_debug;
 #define MX_CHARSET_INFO						struct charset_info_st
 #define MX_CONST_CHARSET_INFO				const struct charset_info_st			
 #define MX_CONST							const
+
 #define my_bool								bool
 #define int16								int16_t
 #define int32								int32_t
@@ -712,6 +779,9 @@ extern xtBool				pbxt_crash_debug;
 
 #define HA_CAN_SQL_HANDLER					0
 #define HA_CAN_INSERT_DELAYED				0
+#define HA_BINLOG_ROW_CAPABLE				0	/* these three flags are defined as 0 here, so OR-ing them into a flag word contributes no bits */
+#define HA_BINLOG_STMT_CAPABLE				0
+#define HA_CACHE_TBL_TRANSACT				0
 
 #define max									cmax
 #define min									cmin
@@ -734,6 +804,7 @@ extern xtBool				pbxt_crash_debug;
 #define thd_tablespace_op					session_tablespace_op
 #define thd_alloc							session_alloc
 #define thd_make_lex_string					session_make_lex_string
+#define column_bitmaps_signal()	/* empty expansion: an invocation of column_bitmaps_signal() is replaced by nothing */
 
 #define my_pthread_setspecific_ptr(T, V)	pthread_setspecific(T, (void*) (V))
 
@@ -750,6 +821,9 @@ extern xtBool				pbxt_crash_debug;
                                    (((uint32_t) (((const unsigned char*) (A))[1])) << 16) +\
                                    (((uint32_t) (((const unsigned char*) (A))[0])) << 24)))
 
+class PBXTStorageEngine;	/* forward declaration only; the class body is not defined here */
+typedef PBXTStorageEngine handlerton;	/* in this branch 'handlerton' is another name for PBXTStorageEngine */
+
 #else // DRIZZLED
 /* The MySQL case: */
 #if MYSQL_VERSION_ID >= 60008
@@ -760,6 +834,7 @@ extern xtBool				pbxt_crash_debug;
 
 #define mx_tmp_use_all_columns				dbug_tmp_use_all_columns
 #define mx_tmp_restore_column_map(x, y)		dbug_tmp_restore_column_map((x)->read_set, y)
+#define MX_BIT_FAST_TEST_AND_SET(x, y)		bitmap_fast_test_and_set(x, y)	/* this definition forwards to bitmap_fast_test_and_set(); the other definition in this header forwards to bitmap_test_and_set() */
 
 #define MX_TABLE_TYPES_T					ulonglong
 #define MX_UINT8_T							uint8
@@ -772,6 +847,11 @@ extern xtBool				pbxt_crash_debug;
 
 #endif // DRIZZLED
 
+#define MX_BITMAP							MY_BITMAP	/* defined after the DRIZZLED #endif, so these four aliases apply to both branches */
+#define MX_BIT_SIZE()						n_bits	/* expands to the bare token n_bits; presumably used as a member name, e.g. map->MX_BIT_SIZE() -- confirm against callers */
+#define MX_BIT_IS_SUBSET(x, y)				bitmap_is_subset(x, y)
+#define MX_BIT_SET(x, y)					bitmap_set_bit(x, y)
+
 #ifndef XT_SCAN_CORE_DEFINED
 #define XT_SCAN_CORE_DEFINED
 xtBool	xt_mm_scan_core(void);