← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~zorba-coders/zorba/hashmap into lp:zorba

 

Markos Zaharioudakis has proposed merging lp:~zorba-coders/zorba/hashmap into lp:zorba.

Requested reviews:
  Markos Zaharioudakis (markos-za)

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/hashmap/+merge/125187

HashMap optimizations
-- 
https://code.launchpad.net/~zorba-coders/zorba/hashmap/+merge/125187
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'src/context/static_context.h'
--- src/context/static_context.h	2012-09-17 00:36:37 +0000
+++ src/context/static_context.h	2012-09-19 12:44:24 +0000
@@ -42,6 +42,7 @@
 
 #include "zorbautils/hashmap_zstring.h"
 #include "zorbautils/hashmap_itemp.h"
+#include "zorbautils/checked_vector.h"
 
 #include "common/shared_types.h"
 

=== modified file 'src/store/naive/qname_pool.cpp'
--- src/store/naive/qname_pool.cpp	2012-09-17 00:36:37 +0000
+++ src/store/naive/qname_pool.cpp	2012-09-19 12:44:24 +0000
@@ -63,12 +63,12 @@
 ********************************************************************************/
 QNamePool::~QNamePool() 
 {
-  csize n = theHashSet.theHashTab.size();
+  csize n = theHashSet.capacity();
   for (csize i = 0; i < n; ++i)
   {
     if (!theHashSet.theHashTab[i].isFree() &&
-        theHashSet.theHashTab[i].theItem->isOverflow())
-      delete theHashSet.theHashTab[i].theItem;
+        theHashSet.theHashTab[i].key()->isOverflow())
+      delete theHashSet.theHashTab[i].key();
   }
 
   if (theCache != NULL)
@@ -297,12 +297,12 @@
 
       bool found;
       entry = theHashSet.hashInsert(qn, hval, found);
-      entry->theItem = qn;
+      entry->key() = qn;
       ZORBA_FATAL(!found, "");
     }
     else
     {
-      qn = entry->theItem;
+      qn = entry->key();
       cachePin(qn);
     }
 
@@ -389,12 +389,12 @@
 
       bool found;
       entry = theHashSet.hashInsert(qn, hval, found);
-      entry->theItem = qn;
+      entry->key() = qn;
       ZORBA_FATAL(!found, "");
     }
     else
     {
-      qn = entry->theItem;
+      qn = entry->key();
       cachePin(qn);
     }
 
@@ -478,7 +478,7 @@
 
   while (entry != NULL)
   {
-    QNameItem* qn = entry->theItem;
+    QNameItem* qn = entry->key();
 
     if (ztd::equals(qn->getLocalName(), ln, lnlen) &&
         ztd::equals(qn->getNamespace(), ns, nslen) &&

=== modified file 'src/store/naive/string_pool.cpp'
--- src/store/naive/string_pool.cpp	2012-09-17 00:36:37 +0000
+++ src/store/naive/string_pool.cpp	2012-09-19 12:44:24 +0000
@@ -28,13 +28,14 @@
 StringPool::~StringPool() 
 {
   csize count = 0;
-  csize n = theHashTab.size();
+  csize n = capacity();
 
   for (csize i = 0; i < n; ++i)
   {
-    if (theHashTab[i].theItem.is_shared())
+    if (!theHashTab[i].isFree() && theHashTab[i].key().is_shared())
     {
-      std::cerr << "ID: " << i << " Referenced URI: " << theHashTab[i].theItem << std::endl;
+      std::cerr << "ID: " << i << " Referenced URI: "
+                << theHashTab[i].key() << std::endl;
       //delete theHashTab[i].theString.getp();
       count++;
     }
@@ -57,7 +58,8 @@
   bool found = false;
 
   zstring::size_type len = strlen(str);
-  ulong hval = hashfun::h32(str, len, FNV_32_INIT) % theHashTabSize;
+
+  ulong hval = hashfun::h32(str, len, FNV_32_INIT) % bucket_count();
 
   {
     SYNC_CODE(AutoMutex lock(&theMutex);)
@@ -68,7 +70,7 @@
     {
       while (entry != NULL)
       {
-        if (ztd::equals(entry->theItem, str, len))
+        if (ztd::equals(entry->key(), str, len))
         {
           found = true;
           break;
@@ -79,7 +81,7 @@
 
     if (found)
     {
-      outStr = entry->theItem;
+      outStr = entry->key();
       return false;
     }
   }
@@ -97,76 +99,78 @@
 ********************************************************************************/
 void StringPool::garbageCollect()
 {
-  HashEntry<zstring, DummyHashValue>* entry;
+  HashEntry<zstring, DummyHashValue>* currEntry;
 
   HashEntry<zstring, DummyHashValue>* freeList = NULL;
 
-  zstring::size_type size = theHashTabSize;
+  csize size = bucket_count();
 
-  for (ulong i = 0; i < size; ++i)
+  for (csize i = 0; i < size; ++i)
   {
-    entry = &theHashTab[i];
+    currEntry = &theHashTab[i];
 
     // If the current hash bucket is empty, move to the next one
-    if (entry->isFree())
+    if (currEntry->isFree())
     {
-      ZORBA_FATAL(entry->theNext == 0, "");
+      ZORBA_FATAL(currEntry->theNext == 0, "");
       continue;
     }
 
     // Handle the 1st hash entry of the current hash bucket
-    while (!entry->theItem.is_shared())
+    while (!currEntry->key().is_shared())
     {
-      if (entry->theNext == 0)
+      if (currEntry->theNext == 0)
       {
-        entry->setFree();
-        theNumEntries--;
+        currEntry->setFree();
+        --theNumEntries;
         break;
       }
       else
       {
-        HashEntry<zstring, DummyHashValue>* nextEntry = entry->getNext();
-        *entry = *nextEntry;
-        entry->setNext(nextEntry->getNext());
+        HashEntry<zstring, DummyHashValue>* nextEntry = currEntry->getNext();
+        assert(!nextEntry->isFree());
+        *currEntry = *nextEntry;
+        currEntry->setNext(nextEntry->getNext());
         nextEntry->setFree();
         nextEntry->setNext(freeList);
         freeList = nextEntry;
-        theNumEntries--;
+        --theNumEntries;
       }
     }
 
     // Handle the overflow entries of the current hash bucket.
-    HashEntry<zstring, DummyHashValue>* prevEntry = entry;
-    entry = entry->getNext();
+    HashEntry<zstring, DummyHashValue>* prevEntry = currEntry;
+    currEntry = currEntry->getNext();
 
-    while (entry != NULL)
+    while (currEntry != NULL)
     {
-      if (!entry->theItem.is_shared())
+      if (!currEntry->key().is_shared())
       {
-        prevEntry->setNext(entry->getNext());
-        entry->setFree();
-        entry->setNext(freeList);
-        freeList = entry;
-        theNumEntries--;
+        prevEntry->setNext(currEntry->getNext());
+        currEntry->setFree();
+        currEntry->setNext(freeList);
+        freeList = currEntry;
+        --theNumEntries;
 
-        entry = prevEntry->getNext();
+        currEntry = prevEntry->getNext();
       }
       else
       {
-        prevEntry = entry;
-        entry = entry->getNext();
+        prevEntry = currEntry;
+        currEntry = currEntry->getNext();
       }
     }
   }
 
   if (freeList != NULL)
   {
-    entry = freeList;
-    while (entry->theNext != 0)
-      entry = entry->getNext();
-
-    entry->setNext(theHashTab[theHashTabSize].getNext());
-    theHashTab[theHashTabSize].setNext(freeList);
+    currEntry = freeList;
+    while (currEntry->theNext != 0)
+      currEntry = currEntry->getNext();
+
+    currEntry->setNext(theHashTab[bucket_count()].getNext());
+
+    theHashTab[bucket_count()].setNext(freeList);
   }
 }
 

=== modified file 'src/unit_tests/CMakeLists.txt'
--- src/unit_tests/CMakeLists.txt	2012-09-17 00:36:37 +0000
+++ src/unit_tests/CMakeLists.txt	2012-09-19 12:44:24 +0000
@@ -25,6 +25,7 @@
   test_uri.cpp
   test_json_parser.cpp
   test_fs_iterator.cpp
+  test_hashmaps.cpp
 #  memory_manager.cpp
 )
 

=== added file 'src/unit_tests/test_hashmaps.cpp'
--- src/unit_tests/test_hashmaps.cpp	1970-01-01 00:00:00 +0000
+++ src/unit_tests/test_hashmaps.cpp	2012-09-19 12:44:24 +0000
@@ -0,0 +1,257 @@
+
+#include <cstdlib>
+#include <stdio.h>
+#include <iostream>
+
+#include <zorba/util/time.h>
+
+#include "zorbautils/hashmap.h"
+#include "util/hashmap32.h"
+#include "util/hashmap.h"
+#include "util/unordered_map.h"
+
+
+namespace zorba {
+
+namespace UnitTests {
+
+
+class IntCompFunc
+{
+public:
+  static bool equal(uint64_t k1, uint64_t k2) 
+  {
+    return k1 == k2;
+  }
+
+  static uint32_t hash(uint64_t key)
+  {
+#if 1
+    return key;
+#else
+    char buf[9];
+    buf[0] = (unsigned char)(key>>56);
+    buf[1] = (unsigned char)(key>>48);
+    buf[2] = (unsigned char)(key>>40);
+    buf[3] = (unsigned char)(key>>32);
+    buf[4] = (unsigned char)(key>>24);
+    buf[5] = (unsigned char)(key>>16);
+    buf[6] = (unsigned char)(key>>8 );
+    buf[7] = (unsigned char)(key    );
+    buf[8] = 0;
+    return (uint32_t)hashfun::h64((void*)buf,8,FNV_64_INIT);
+#endif
+  }
+};
+
+
+class StrCompFunc
+{
+public:
+  static bool equal(const zstring& k1, const zstring& k2) 
+  {
+    return k1 == k2;
+  }
+
+  static uint32_t hash(const zstring& key)
+  { 
+    return hashfun::h32(key.c_str(), FNV_32_INIT);
+  }
+};
+
+
+int test_hashmaps(int argc, char* argv[])
+{
+  if (argc < 4)
+    return 1;
+
+  int test_id = atoi(argv[1]);
+
+  double load_factor = atof(argv[2]);
+
+  int num = atoi(argv[3]);
+
+  uint64_t* int_buf = new uint64_t[num];
+
+  for (int i = 0; i < num; ++i)
+  {
+    int_buf[i] = rand() % num;
+  }
+
+  zstring* str_buf = new zstring[num];
+
+  for (int i = 0; i < num; ++i)
+  {
+    char tmp[20];
+    sprintf(tmp, "%d", rand() % num);
+    str_buf[i] = tmp;
+
+    //std::cout << str_buf[i] << std::endl;
+  }
+
+
+  HashMap<uint64_t, int, IntCompFunc> map1(1024, false);
+  HashMap<zstring, int, StrCompFunc> map2(1024, false);
+
+  std::unordered_map<uint64_t, int> map3(1024);
+  std::unordered_map<zstring, int> map4(1024);
+
+  hash64map<int> map5(1024, load_factor);
+  hashmap<zstring, int> map6(1024, load_factor);
+
+  map1.set_load_factor(load_factor);
+  map2.set_load_factor(load_factor);
+
+  if (test_id == 1)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      uint64_t key = int_buf[i];
+      int value = 1;
+      (void)map1.insert(key, value);
+    }
+    
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num numeric insertions = " << num << std::endl
+              << "HashMap entries = " << map1.size() << std::endl
+              << "HashMap capacity = " << map1.capacity() << std::endl
+              << "HashMap colisions = " << map1.collisions() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else if (test_id == 2)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      zstring key = str_buf[i];
+      int value = 1;
+      (void)map2.insert(key, value);
+    }
+
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num zstring insertions = " << num << std::endl
+              << "HashMap entries = " << map2.size() << std::endl
+              << "HashMap buckets = " << map2.bucket_count() << std::endl
+              << "HashMap capacity = " << map2.capacity() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else if (test_id == 3)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      uint64_t key = int_buf[i];
+      int value = 1;
+      (void)map3.insert(std::pair<uint64_t, int>(key, value));
+    }
+    
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num numeric insertions = " << num << std::endl
+              << "UnorderedMap entries = " << map3.size() << std::endl
+              << "UnorderedMap buckets = " << map3.bucket_count() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else if (test_id == 4)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      zstring key = str_buf[i];
+      int value = 1;
+      (void)map4.insert(std::pair<zstring, int>(key, value));
+    }
+
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num zstring insertions = " << num << std::endl
+              << "UnorderedMap entries = " << map4.size() << std::endl
+              << "UnorderedMap buckets = " << map4.bucket_count() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else if (test_id == 5)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      uint64_t key = int_buf[i];
+      int value = 1;
+      (void)map5.put_unsync(key, value);
+    }
+
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num numeric insertions = " << num << std::endl
+              << "hashmap entries = " << map5.size() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else if (test_id == 6)
+  {
+    zorba::time::walltime startTime;
+    zorba::time::get_current_walltime(startTime);
+
+    for (int i = 0; i < num; ++i)
+    {
+      zstring key = str_buf[i];
+      int value = 1;
+      (void)map6.put(key, value, false);
+    }
+
+    zorba::time::walltime stopTime;
+    time::get_current_walltime(stopTime);
+
+    double time = time::get_walltime_elapsed(startTime, stopTime);  
+
+    std::cout << "Load factor = " << load_factor << std::endl
+              << "Num zstring insertions = " << num << std::endl
+              << "hashmap entries = " << map6.size() << std::endl
+              << "Time = " << time << std::endl;
+  }
+  else
+  {
+    std::cout << "Invalid test id" << std::endl;
+    return 2;
+  }
+
+  delete [] int_buf;
+  delete [] str_buf;
+
+  return 0;
+}
+
+
+}
+}

=== modified file 'src/unit_tests/unit_test_list.h'
--- src/unit_tests/unit_test_list.h	2012-09-17 00:36:37 +0000
+++ src/unit_tests/unit_test_list.h	2012-09-19 12:44:24 +0000
@@ -25,34 +25,51 @@
 namespace UnitTests {
 
   int runUriTest(int argc, char* argv[]);
+
   int runDebuggerProtocolTest(int argc, char* argv[]);
+
   int test_base64( int, char*[] );
+
   int test_base64_streambuf( int, char*[] );
+
   int test_fs_iterator( int, char*[] );
+
 #ifndef ZORBA_NO_ICU
   int test_icu_streambuf( int, char*[] );
 #endif /* ZORBA_NO_ICU */
+
   int test_json_parser( int, char*[] );
+
   int test_string( int, char*[] );
+
   int test_unique_ptr( int, char*[] );
+
   int test_fs_iterator( int, char*[] );
-  //int test_mem_manager( int, char*[] );
+
 #ifndef ZORBA_NO_FULL_TEXT
   int test_stemmer( int, char*[] );
   int test_thesaurus( int, char*[] );
   int test_tokenizer( int, char*[] );
 #endif /* ZORBA_NO_FULL_TEXT */
+
 #ifndef ZORBA_HAVE_UNIQUE_PTR
   int test_unique_ptr( int, char*[] );
 #endif /* ZORBA_HAVE_UNIQUE_PTR */
+<<<<<<< TREE
   int test_uuid( int, char*[] );
+=======
+
+>>>>>>> MERGE-SOURCE
 #ifndef ZORBA_HAVE_UNORDERED_MAP
   int test_unordered_map( int, char*[] );
 #endif /* ZORBA_HAVE_UNORDERED_MAP */
+
 #ifndef ZORBA_HAVE_UNORDERED_SET
   int test_unordered_set( int, char*[] );
 #endif /* ZORBA_HAVE_UNORDERED_SET */
 
+  int test_hashmaps(int argc, char* argv[]);
+
   void initializeTestList();
 
 } // namespace UnitTests

=== modified file 'src/unit_tests/unit_tests.cpp'
--- src/unit_tests/unit_tests.cpp	2012-09-17 00:36:37 +0000
+++ src/unit_tests/unit_tests.cpp	2012-09-19 12:44:24 +0000
@@ -38,29 +38,43 @@
 void initializeTestList() 
 {
   libunittests["base64"] = test_base64;
+
   libunittests["base64_streambuf"] = test_base64_streambuf;
+
   libunittests["fs_iterator"] = test_fs_iterator;
-  //libunittests["memory_manager"] = test_mem_manager;
+
 #ifndef ZORBA_NO_ICU
   libunittests["icu_streambuf"] = test_icu_streambuf;
 #endif /* ZORBA_NO_ICU */
+
   libunittests["json_parser"] = test_json_parser;
+
   libunittests["string"] = test_string;
+
 #ifndef ZORBA_NO_FULL_TEXT
   libunittests["stemmer"] = test_stemmer;
   libunittests["thesaurus"] = test_thesaurus;
   libunittests["tokenizer"] = test_tokenizer;
 #endif /* ZORBA_NO_FULL_TEXT */
+
 #ifndef ZORBA_HAVE_UNIQUE_PTR
   libunittests["unique_ptr"] = test_unique_ptr;
 #endif /* ZORBA_HAVE_UNIQUE_PTR */
+<<<<<<< TREE
   libunittests["uuid"] = test_uuid;
+=======
+
+>>>>>>> MERGE-SOURCE
 #ifndef ZORBA_HAVE_UNORDERED_MAP
   libunittests["unordered_map"] = test_unordered_map;
 #endif /* ZORBA_HAVE_UNORDERED_MAP */
+
 #ifndef ZORBA_HAVE_UNORDERED_SET
   libunittests["unordered_set"] = test_unordered_set;
 #endif /* ZORBA_HAVE_UNORDERED_SET */
+
+  libunittests["hashmaps"] = test_hashmaps;
+
   libunittests["uri"] = runUriTest;
 
 #ifdef ZORBA_WITH_DEBUGGER

=== modified file 'src/zorbaserialization/bin_archiver.cpp'
--- src/zorbaserialization/bin_archiver.cpp	2012-09-17 00:36:37 +0000
+++ src/zorbaserialization/bin_archiver.cpp	2012-09-19 12:44:24 +0000
@@ -137,7 +137,7 @@
 BinArchiver::BinArchiver(std::ostream* os)
   :
   Archiver(true),
-  theStringPool(1024, false, false),
+  theStringPool(1024, false),
   theFirstBinaryString(0)
 {
   this->is = NULL;

=== modified file 'src/zorbautils/hashmap.h'
--- src/zorbautils/hashmap.h	2012-09-17 00:36:37 +0000
+++ src/zorbautils/hashmap.h	2012-09-19 12:44:24 +0000
@@ -16,6 +16,7 @@
 #ifndef ZORBA_UTILS_HASHMAP_H
 #define ZORBA_UTILS_HASHMAP_H
 
+#include <vector>
 
 #include <cstddef>
 #include <zorba/config.h>
@@ -23,9 +24,10 @@
 #include "common/common.h"
 
 #include "zorbautils/fatal.h"
-#include "zorbautils/checked_vector.h"
 #include "zorbautils/mutex.h"
 
+#include "store/api/shared_types.h"
+
 
 namespace zorba
 {
@@ -42,18 +44,81 @@
 template <class T, class V>
 class HashEntry
 {
+  struct KeyHolder
+  {
+    char theKey[sizeof(T)];
+  };
+
+  struct ValueHolder
+  {
+    char theValue[sizeof(V)];
+  };
+
 public:
   bool         theIsFree;
-  T            theItem;
-  V            theValue;
+  KeyHolder    theKey;
+  ValueHolder  theValue;
   ptrdiff_t    theNext;  // offset from "this" to the next entry.
 
-  HashEntry() : theIsFree(true), theNext(0) { }
+  HashEntry() 
+    :
+    theIsFree(true),
+    theNext(0)
+  {
+  }
+
+  HashEntry(const HashEntry<T, V>& other)
+  {
+    theIsFree = other.theIsFree;
+    theNext = other.theNext;
+    if (!theIsFree)
+    {
+      new (&theKey) T(other.key());
+      new (&theValue) V(other.value());
+    }
+  }
 
   ~HashEntry()
   {
-    theIsFree = true;
-    theNext = 0;
+    if (!theIsFree)
+    {
+      key().~T();
+      value().~V();
+    }
+  }
+
+  HashEntry<T, V>& operator = (const HashEntry<T, V>& other)
+  {
+    if (theIsFree)
+    {
+      assert(false);
+
+      if (!other.theIsFree)
+      {
+        new (&theKey) T(other.key());
+        new (&theValue) V(other.value());
+      }
+    }
+    else
+    {
+      if (!other.theIsFree)
+      {
+        key() = other.key();
+        value() = other.value();
+      }
+      else
+      {
+        assert(false);
+
+        key().~T();
+        value().~V();
+      }
+    }
+
+    theIsFree = other.theIsFree;
+    theNext = other.theNext;
+
+    return *this;
   }
 
   bool isFree() const
@@ -63,15 +128,39 @@
 
   void setFree()
   {
-    theItem.~T();
+    key().~T();
+    value().~V();
     theIsFree = true;
+    theNext = 0;
   }
 
   void unsetFree()
   {
+    new (&theKey) T;
+    new (&theValue) V;
     theIsFree = false;
   }
 
+  T& key()
+  {
+    return *reinterpret_cast<T*>(&theKey);
+  }
+
+  const T& key() const
+  {
+    return *reinterpret_cast<const T*>(&theKey);
+  }
+
+  const V& value() const
+  {
+    return *reinterpret_cast<const V*>(&theValue);
+  }
+
+  V& value()
+  {
+    return *reinterpret_cast<V*>(&theValue);
+  }
+
   void setNext(HashEntry* nextEntry)
   {
     theNext = (nextEntry == NULL ? 0 : nextEntry - this);
@@ -105,17 +194,17 @@
 
   theHashTab     : The hash table. The table is implemented as a vector of hash
                    entries and is devided in 2 areas: Each entry between 0 and
-                   theHashTabSize - 1 is the head of a hash bucket. Each entry
-                   between theHashTabSize+1 and theHashTab.size()-1 is either
+                   theNumBuckets - 1 is the head of a hash bucket. Each entry
+                   between theNumBuckets+1 and theHashTab.size()-1 is either
                    a "collision" entry (i.e., it belongs to a hash bucket with
                    more than one entries) or a "free" entry (i.e. it does not
                    currently belong to any bucket, but is available for
                    allocation as a collision entry when needed). Free entries
                    in the collision area are linked in a free list. Entry
-                   theHashTab[theHashTabSize] is reserved as the head of this
+                   theHashTab[theNumBuckets] is reserved as the head of this
                    free list.
-  theHashTabSize : The current number of hash buckets in theHashTab.
-  theInitialSize : The initial number of hash buckets.
+  theNumBuckets : The current number of hash buckets in theHashTab.
+
   theLoadFactor  : The max fraction of non-empty hash buckets after which the
                    hash table is doubled in size.
 
@@ -130,11 +219,11 @@
     friend class HashMap;
 
   protected:
-    checked_vector<HashEntry<T, V> >*  theHashTab;
-    size_t                             thePos;
+    std::vector<HashEntry<T, V> >*  theHashTab;
+    csize                           thePos;
 
   protected:
-    iterator(checked_vector<HashEntry<T, V> >* ht, size_t pos)
+    iterator(std::vector<HashEntry<T, V> >* ht, csize pos)
       :
       theHashTab(ht),
       thePos(pos)
@@ -150,7 +239,7 @@
 
       HashEntry<T, V>& entry = (*theHashTab)[thePos];
 
-      return entry.theItem;
+      return entry.key();
     }
 
   public:
@@ -194,7 +283,7 @@
 
       const HashEntry<T, V>& entry = (*theHashTab)[thePos];
 
-      return std::pair<T, V>(entry.theItem, entry.theValue);
+      return std::pair<T, V>(entry.key(), entry.value());
     }
 
     const T& getKey() const
@@ -203,16 +292,25 @@
 
       const HashEntry<T, V>& entry = (*theHashTab)[thePos];
 
-      return entry.theItem;
-    }
-
-    V& getValue() const
-    {
-      ZORBA_FATAL(thePos < theHashTab->size(), "");
-
-      HashEntry<T, V>& entry = (*theHashTab)[thePos];
-
-      return entry.theValue;
+      return entry.key();
+    }
+
+    const V& getValue() const
+    {
+      ZORBA_FATAL(thePos < theHashTab->size(), "");
+
+      HashEntry<T, V>& entry = (*theHashTab)[thePos];
+
+      return entry.value();
+    }
+
+    V& getValue()
+    {
+      ZORBA_FATAL(thePos < theHashTab->size(), "");
+
+      HashEntry<T, V>& entry = (*theHashTab)[thePos];
+
+      return entry.value();
     }
 
     void setValue(const V& val)
@@ -221,7 +319,7 @@
 
       HashEntry<T, V>& entry = (*theHashTab)[thePos];
 
-      entry.theValue = val;
+      entry.value() = val;
     }
   };
 
@@ -230,20 +328,22 @@
   static const double DEFAULT_LOAD_FACTOR;
 
 protected:
-  ulong                             theNumEntries;
-
-  size_t                            theHashTabSize;
-  size_t                            theInitialSize;
-  checked_vector<HashEntry<T, V> >  theHashTab;
-  double                            theLoadFactor;
-  C                                 theCompareFunction;
-
-  bool                              theUseTransfer;
-
-  SYNC_CODE(mutable Mutex           theMutex;)
-  SYNC_CODE(Mutex                 * theMutexp;)
-
-  int                               numCollisions;
+  std::vector<HashEntry<T, V> >  theHashTab;
+
+  csize                          theNumBuckets;
+
+  csize                          theNumEntries;
+
+  double                         theLoadFactor;
+
+  double                         theMaxLoad;
+
+  C                              theCompareFunction;
+
+  SYNC_CODE(mutable Mutex        theMutex;)
+  SYNC_CODE(Mutex              * theMutexp;)
+
+  csize                          theNumCollisions;
 
 public:
 
@@ -257,19 +357,20 @@
   depends on some parametrs (e.g. the collation or timezone). These parameters
   are provided as data members of the given comparison-function obj.
 ********************************************************************************/
-HashMap(const C& compFunction, size_t size, bool sync, bool useTransfer = false)
+HashMap(const C& compFunction, csize size, bool sync)
   :
+  theNumBuckets(size),
   theNumEntries(0),
-  theHashTabSize(size),
-  theInitialSize(size),
-  theHashTab(computeTabSize(size)),
   theLoadFactor(DEFAULT_LOAD_FACTOR),
   theCompareFunction(compFunction),
-  theUseTransfer(useTransfer),
-  numCollisions(0)
+  theNumCollisions(0)
 {
+  theHashTab.resize(computeCapacity(size));
+
   formatCollisionArea();
 
+  theMaxLoad = theNumBuckets * theLoadFactor;
+
   SYNC_CODE(theMutexp = (sync ? &theMutex : NULL);)
 }
 
@@ -284,18 +385,19 @@
   theCompareFunction data member is initialized with the default constructor
   of the C class.
 ********************************************************************************/
-HashMap(size_t size, bool sync, bool useTransfer = false)
+HashMap(csize size, bool sync)
   :
+  theNumBuckets(size),
   theNumEntries(0),
-  theHashTabSize(size),
-  theInitialSize(size),
-  theHashTab(computeTabSize(size)),
   theLoadFactor(DEFAULT_LOAD_FACTOR),
-  theUseTransfer(useTransfer),
-  numCollisions(0)
+  theNumCollisions(0)
 {
+  theHashTab.resize(computeCapacity(size));
+
   formatCollisionArea();
 
+  theMaxLoad = theNumBuckets * theLoadFactor;
+
   SYNC_CODE(theMutexp = (sync ? &theMutex : NULL);)
 }
 
@@ -340,7 +442,7 @@
 /*******************************************************************************
 
 ********************************************************************************/
-ulong size() const
+csize size() const
 {
   return theNumEntries;
 }
@@ -349,7 +451,7 @@
 /*******************************************************************************
 
 ********************************************************************************/
-size_t capacity() const
+csize capacity() const
 {
   return theHashTab.size();
 }
@@ -358,6 +460,24 @@
 /*******************************************************************************
 
 ********************************************************************************/
+csize bucket_count() const
+{
+  return theNumBuckets;
+}
+
+
+/*******************************************************************************
+
+********************************************************************************/
+csize collisions() const
+{
+  return theNumCollisions;
+}
+
+
+/*******************************************************************************
+
+********************************************************************************/
 C get_compare_function()
 {
   return theCompareFunction;
@@ -388,13 +508,17 @@
 void clearNoSync()
 {
   theNumEntries = 0;
-  numCollisions = 0;
-
-  size_t n = theHashTab.size();
-
-  for (size_t i = 0; i < n; ++i)
+  theNumCollisions = 0;
+
+  csize n = theHashTab.size();
+
+  HashEntry<T, V>* entry = &theHashTab[0];
+  HashEntry<T, V>* lastentry = &theHashTab[n];
+
+  for (; entry < lastentry; ++entry)
   {
-    theHashTab[i].~HashEntry<T, V>();
+    if (!entry->isFree())
+      entry->setFree();
   }
 
   formatCollisionArea();
@@ -406,13 +530,13 @@
 ********************************************************************************/
 iterator begin() const
 {
-  return iterator(const_cast<checked_vector<HashEntry<T, V> >*>(&theHashTab), 0);
+  return iterator(const_cast<std::vector<HashEntry<T, V> >*>(&theHashTab), 0);
 }
 
 
 iterator end() const
 {
-  return iterator(const_cast<checked_vector<HashEntry<T, V> >*>(&theHashTab),
+  return iterator(const_cast<std::vector<HashEntry<T, V> >*>(&theHashTab),
                   theHashTab.size());
 }
 
@@ -437,7 +561,7 @@
 
   while (entry != NULL)
   {
-    if (equal(entry->theItem, item))
+    if (equal(entry->key(), item))
       return true;
 
     entry = entry->getNext();
@@ -468,7 +592,7 @@
 
   while (entry != NULL)
   {
-    if (equal(entry->theItem, item))
+    if (equal(entry->key(), item))
       return iterator(&theHashTab, entry - &theHashTab[0]);
 
     entry = entry->getNext();
@@ -498,9 +622,9 @@
 
   while (entry != NULL)
   {
-    if (equal(entry->theItem, item))
+    if (equal(entry->key(), item))
     {
-      value = entry->theValue;
+      value = entry->value();
       return true;
     }
 
@@ -527,8 +651,8 @@
 
   if (!found)
   {
-    entry->theItem = pair.first;
-    entry->theValue = pair.second;
+    entry->key() = pair.first;
+    entry->value() = pair.second;
   }
 
   return !found;
@@ -552,12 +676,12 @@
 
   if (!found)
   {
-    entry->theItem = item;
-    entry->theValue = value;
+    entry->key() = item;
+    entry->value() = value;
   }
   else
   {
-    value = entry->theValue;
+    value = entry->value();
   }
 
   return !found;
@@ -586,7 +710,7 @@
 
     while (entry != NULL)
     {
-      if (equal(entry->theItem, item))
+      if (equal(entry->key(), item))
       {
         found = true;
         break;
@@ -602,7 +726,7 @@
   }
   else
   {
-    entry->theValue = value;
+    entry->value() = value;
     return true;
   }
 }
@@ -616,13 +740,13 @@
 {
   SYNC_CODE(AutoMutex lock(theMutexp);)
 
-  if (ite.thePos < theHashTabSize)
+  if (ite.thePos < theNumBuckets)
   {
     eraseEntry(&theHashTab[ite.thePos], NULL);
   }
   else
   {
-    const T& item = theHashTab[ite.thePos].theItem;
+    const T& item = theHashTab[ite.thePos].key();
 
     ulong hval = hash(item);
 
@@ -665,7 +789,7 @@
   // If the item to remove is in the 1st entry of a bucket, then if the
   // bucket has no other entries, just call the destructor on that entry,
   // else copy the 2nd entry to the 1st entry and freeup the 2nd entry.
-  if (equal(entry->theItem, item))
+  if (equal(entry->key(), item))
   {
     eraseEntry(entry, NULL);
     return true;
@@ -678,7 +802,7 @@
 
   while (entry != NULL)
   {
-    if (equal(entry->theItem, item))
+    if (equal(entry->key(), item))
     {
       eraseEntry(entry, preventry);
       return true;
@@ -698,9 +822,9 @@
 /*******************************************************************************
 
 ********************************************************************************/
-size_t computeTabSize(size_t size) const
+csize computeCapacity(csize size) const
 {
-  return size + 32 + size/5;
+  return size + 32 + size / (5 - 10 * (theLoadFactor - 0.7)) ;
 }
 
 
@@ -724,7 +848,7 @@
 ********************************************************************************/
 HashEntry<T, V>* bucket(ulong hvalue)
 {
-  return &theHashTab[hvalue % theHashTabSize];
+  return &theHashTab[hvalue % theNumBuckets];
 }
 
 
@@ -733,7 +857,7 @@
 ********************************************************************************/
 const HashEntry<T, V>* bucket(ulong hvalue) const
 {
-  return &theHashTab[hvalue % theHashTabSize];
+  return &theHashTab[hvalue % theNumBuckets];
 }
 
 
@@ -742,7 +866,7 @@
 ********************************************************************************/
 HashEntry<T, V>* freelist()
 {
-  return &theHashTab[theHashTabSize];
+  return &theHashTab[theNumBuckets];
 }
 
 
@@ -755,41 +879,38 @@
   {
     if (entry->theNext == 0)
     {
-      entry->~HashEntry<T, V>();
+      entry->setFree();
     }
     else
     {
       HashEntry<T, V>* nextEntry = entry->getNext();
       *entry = *nextEntry;
       entry->setNext(nextEntry->getNext());
-      nextEntry->~HashEntry<T, V>();
+      nextEntry->setFree();
       nextEntry->setNext(freelist()->getNext());
       freelist()->setNext(nextEntry);
     }
 
-    theNumEntries--;
+    --theNumEntries;
 
-    if (theHashTabSize > theInitialSize &&
-        theNumEntries < (theHashTabSize / 2) * theLoadFactor)
+    if (theNumEntries < theMaxLoad / 2)
     {
-      resizeHashTab(theHashTabSize / 2);
+      resizeHashTab(theNumBuckets / 2);
     }
 
   }
   else
   {
     preventry->setNext(entry->getNext());
-    entry->~HashEntry<T, V>();
+    entry->setFree();
     entry->setNext(freelist()->getNext());
     freelist()->setNext(entry);
 
-    theNumEntries--;
-    numCollisions--;
+    --theNumEntries;
 
-    if (theHashTabSize > theInitialSize &&
-        theNumEntries < (theHashTabSize / 2) * theLoadFactor)
+    if (theNumEntries < theMaxLoad / 2)
     {
-      resizeHashTab(theHashTabSize / 2);
+      resizeHashTab(theNumBuckets / 2);
     }
   }
 }
@@ -812,7 +933,7 @@
   // If the hash bucket is empty, its 1st entry is used to store the new string.
   if (headEntry->isFree())
   {
-    theNumEntries++;
+    ++theNumEntries;
     headEntry->unsetFree();
     return headEntry;
   }
@@ -822,7 +943,7 @@
 
   while (currEntry != NULL)
   {
-    if (equal(currEntry->theItem, item))
+    if (equal(currEntry->key(), item))
     {
       found = true;
       return currEntry;
@@ -836,22 +957,22 @@
   // Do garbage collection if the hash table is more than 60% full. Note that
   // gc does NOT resize theHashTab, so after gc, the item still belongs to the
   // same bucket as before gc.
-  if (theNumEntries > theHashTabSize * theLoadFactor)
+  if (theNumEntries > theMaxLoad)
   {
     garbageCollect();
 
     if (headEntry->isFree())
     {
-      theNumEntries++;
+      ++theNumEntries;
       headEntry->unsetFree();
       return headEntry;
     }
   }
 
   // Double the size of the hash table if it is more than 60% full.
-  if (theNumEntries > theHashTabSize * theLoadFactor)
+  if (theNumEntries > theMaxLoad)
   {
-    resizeHashTab(theHashTabSize * 2);
+    resizeHashTab(theNumBuckets * 2);
     goto retry;
   }
 
@@ -859,8 +980,8 @@
   // collision list for its bucket. We place the new item right after the
   // headEntry for the bucket.
 
-  theNumEntries++;
-  numCollisions++;
+  ++theNumEntries;
+  ++theNumCollisions;
 
   // If no free entry exists, we extend the collision area of the hash table.
   if (freelist()->getNext() == 0)
@@ -886,9 +1007,13 @@
 ********************************************************************************/
 void extendCollisionArea()
 {
-  size_t oldSize = theHashTab.size();
-  size_t numCollisionEntries = oldSize - theHashTabSize;
-  size_t newSize = theHashTabSize + 2 * numCollisionEntries;
+  csize oldSize = theHashTab.size();
+  csize numCollisionEntries = oldSize - theNumBuckets;
+  csize newSize = theNumBuckets + 2 * numCollisionEntries;
+
+  std::cout << "Extending collision area" << std::endl
+            << "numBuckets = " << theNumBuckets << " numCollisionEntries = "
+            << numCollisionEntries << std::endl << std::endl;
 
   //foo();  for setting a breakpoint
 
@@ -909,7 +1034,8 @@
     firstentry = freelist();
 
   HashEntry<T, V>* lastentry = &theHashTab[theHashTab.size() - 1];
-  for (HashEntry<T, V>* entry = firstentry; entry < lastentry; entry++)
+
+  for (HashEntry<T, V>* entry = firstentry; entry < lastentry; ++entry)
     entry->theNext = 1;
 
   lastentry->theNext = 0;
@@ -919,31 +1045,34 @@
 /*******************************************************************************
 
 ********************************************************************************/
-void resizeHashTab(size_t newSize)
+void resizeHashTab(csize newSize)
 {
-  HashEntry<T, V>* entry;
-  HashEntry<T, V>* oldentry;
+  if (newSize == 0)
+    newSize = 3;
+
+  csize oldcap = theHashTab.size();
+  csize newcap = computeCapacity(newSize);
 
   // Create a new vector of new size and swap theHashTab with this new vector
-  checked_vector<HashEntry<T, V> > oldTab(computeTabSize(newSize));
+  std::vector<HashEntry<T, V> > oldTab(newcap);
   theHashTab.swap(oldTab);
 
-  size_t oldsize = oldTab.size();
-  theHashTabSize = newSize;
+  theNumBuckets = newSize;
+  theMaxLoad = theNumBuckets * theLoadFactor;
 
   formatCollisionArea();
 
-  numCollisions = 0;
+  HashEntry<T, V>* entry;
+  HashEntry<T, V>* oldentry = &oldTab[0];
+  HashEntry<T, V>* lastentry = &oldTab[oldcap];
 
   // Now rehash every entry
-  for (size_t i = 0; i < oldsize; i++)
+  for (; oldentry < lastentry; ++oldentry)
   {
-    oldentry = &oldTab[i];
-
     if (oldentry->isFree())
       continue;
 
-    entry = bucket (hash(oldentry->theItem));
+    entry = bucket(hash(oldentry->key()));
 
     if (!entry->isFree())
     {
@@ -962,12 +1091,11 @@
       freelist()->setNext(entry->getNext());
       entry->setNext(headEntry->getNext());
       headEntry->setNext(entry);
-      numCollisions++;
     }
 
-    entry->theItem = oldentry->theItem;
-    entry->theValue = oldentry->theValue;
     entry->unsetFree();
+    entry->key() = oldentry->key();
+    entry->value() = oldentry->value();
   }
 }
 

=== modified file 'src/zorbautils/hashset.h'
--- src/zorbautils/hashset.h	2012-09-17 00:36:37 +0000
+++ src/zorbautils/hashset.h	2012-09-19 12:44:24 +0000
@@ -45,16 +45,16 @@
 typedef typename HashMap<T, DummyHashValue, C>::iterator iterator;
 
 
-HashSet(const C& compFunction, ulong size, bool sync, bool useTransfer = false)
+HashSet(const C& compFunction, ulong size, bool sync)
   :
-  HashMap<T, DummyHashValue, C>(compFunction, size, sync, useTransfer) 
+  HashMap<T, DummyHashValue, C>(compFunction, size, sync) 
 {
 }
 
 
 HashSet(ulong size, bool sync)
   :
-  HashMap<T, DummyHashValue, C>(size, sync, false) 
+  HashMap<T, DummyHashValue, C>(size, sync) 
 {
 }
 
@@ -100,12 +100,7 @@
 
   if (!found)
   {
-    /*
-    if (this->theUseTransfer)
-      entry->theItem.transfer(item);
-    else
-    */
-      entry->theItem = item;
+    entry->key() = item;
   }
 
   return !found;
@@ -129,19 +124,13 @@
   entry = this->hashInsert(item, this->hash(item), found);
   if (!found)
   {
-    /*
-    if (this->theUseTransfer)
-      entry->theItem.transfer(item);
-    else
-    */
-      entry->theItem = item;
-
-    outItem = entry->theItem;
+    entry->key() = item;
+    outItem = entry->key();
     return true;
   }
   else
   {
-    outItem = entry->theItem;
+    outItem = entry->key();
     return false;
   }
 }
@@ -164,13 +153,13 @@
   entry = this->hashInsert(item, this->hash(item), found);
   if (!found)
   {
-    entry->theItem = item;
-    outItem = entry->theItem;
+    entry->key() = item;
+    outItem = entry->key();
     return true;
   }
   else
   {
-    outItem = entry->theItem;
+    outItem = entry->key();
     return false;
   }
 }


Follow ups