[svn] Improve documentation.

[wget] / src / hash.c
diff --git a/src/hash.c b/src/hash.c

index e54fb33a3696ad585ce6b2ffe6c801169a82c3d3..b51187f3bf1f5e1068a852029301eafa67499892 100644 (file)
--- a/src/hash.c
+++ b/src/hash.c
@@ -1,26 +1,41 @@
  /* Hash tables.
-   Copyright (C) 2000 Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
  
-This file is part of Wget.
+This file is part of GNU Wget.
  
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
  
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
  
  #ifdef HAVE_CONFIG_H
  # include <config.h>
  #endif
  
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
  #include <stdlib.h>
  #include <assert.h>
  
@@ -30,51 +45,153 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  #include "hash.h"
  
  #ifdef STANDALONE
+# undef xmalloc
+# undef xrealloc
+# undef xfree
+
  # define xmalloc malloc
  # define xrealloc realloc
-#endif
+# define xfree free
  
-/* This file implements simple hash tables based on linear probing.
-   The hash table stores key-value pairs in a contiguous array.  Both
-   key and value are void pointers that the hash and test functions
-   know how to handle.
-
-   Although Knuth & co. recommend double hashing over linear probing,
-   we use the latter because it accesses array elements sequentially
-   in case of a collision, yielding in better cache behaviour and
-   ultimately in better speed.  To avoid collision problems with
-   linear probing, we make sure that the table grows as soon as the
-   fullness/size ratio exceeds 75%.  */
+# undef TOLOWER
+# define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
+#endif
  
-struct ht_pair {
+/* INTERFACE:
+
+   Hash tables are a technique used to implement mapping between
+   objects with near-constant-time access and storage.  The table
+   associates keys to values, and a value can be very quickly
+   retrieved by providing the key.  Fast lookup tables are typically
+   implemented as hash tables.
+
+   The entry points are
+     hash_table_new       -- creates the table.
+     hash_table_destroy   -- destroys the table.
+     hash_table_put       -- establishes or updates key->value mapping.
+     hash_table_get       -- retrieves value of key.
+     hash_table_get_pair  -- get key/value pair for key.
+     hash_table_contains  -- test whether the table contains key.
+     hash_table_remove    -- remove the key->value mapping for key.
+     hash_table_map       -- iterate through table mappings.
+     hash_table_clear     -- clear hash table contents.
+     hash_table_count     -- return the number of entries in the table.
+
+   The hash table grows internally as new entries are added and is not
+   limited in size, except by available memory.  The table doubles
+   with each resize, which ensures that the amortized time per
+   operation remains constant.
+
+   By default, tables created by hash_table_new consider the keys to
+   be equal if their pointer values are the same.  You can use
+   make_string_hash_table to create tables whose keys are considered
+   equal if their string contents are the same.  In the general case,
+   the criterion of equality used to compare keys is specified at
+   table creation time with two callback functions, "hash" and "test".
+   The hash function transforms the key into an arbitrary number that
+   must be the same for two equal keys.  The test function accepts two
+   keys and returns non-zero if they are to be considered equal.
+
+   Note that neither keys nor values are copied when inserted into the
+   hash table, so they must exist for the lifetime of the table.  This
+   means that e.g. the use of static strings is OK, but objects with a
+   shorter life-time need to be copied (with strdup() or the like in
+   the case of strings) before being inserted.  */
+
+/* IMPLEMENTATION:
+
+   The hash table is implemented as an open-addressed table with
+   linear probing collision resolution.
+
+   For those not up to CS parlance, it means that all the hash entries
+   (pairs of pointers key and value) are stored in a contiguous array.
+   The position of each mapping is determined by the hash value of its
+   key and the size of the table: location := hash(key) % size.  If
+   two different keys end up on the same position (collide), the one
+   that came second is placed at the next empty position following the
+   occupied place.  This collision resolution technique is called
+   "linear probing".
+
+   There are more advanced collision resolution methods (quadratic
+   probing, double hashing), but we don't use them because they incur
+   more non-sequential access to the array, which results in worse CPU
+   cache behavior.  Linear probing works well as long as the
+   count/size ratio (fullness) is kept below 75%.  We make sure to
+   grow and rehash the table whenever this threshold is exceeded.
+
+   Collisions make deletion tricky because clearing a position
+   followed by a colliding entry would make the position seem empty
+   and the colliding entry not found.  One solution is to leave a
+   "tombstone" instead of clearing the entry, and another is to
+   carefully rehash the entries immediately following the deleted one.
+   We use the latter method because it results in less bookkeeping and
+   faster retrieval at the (slight) expense of deletion.  */
+
+/* Maximum allowed fullness: when hash table's fullness exceeds this
+   value, the table is resized.  */
+#define HASH_MAX_FULLNESS 0.75
+
+/* The hash table size is multiplied by this factor (and then rounded
+   to the next prime) with each resize.  This guarantees infrequent
+   resizes.  */
+#define HASH_RESIZE_FACTOR 2
+
+struct mapping {
    void *key;
    void *value;
  };
  
  struct hash_table {
-  unsigned long (*hash_function) (const void *);
-  int (*test_function) (const void *, const void *);
+  unsigned long (*hash_function) PARAMS ((const void *));
+  int (*test_function) PARAMS ((const void *, const void *));
  
-  int size;                    /* size of the array */
-  int fullness;                        /* number of non-empty fields */
-  int count;                   /* number of non-empty, non-deleted
-                                   fields. */
+  int size;                    /* size of the array. */
+  int count;                   /* number of non-empty entries. */
  
-  struct ht_pair *pairs;
+  int resize_threshold;                /* once count reaches this number of
+                                  entries, resize the table.  */
+  int prime_offset;            /* the offset of the current prime in
+                                  the prime table. */
+
+  struct mapping *mappings;    /* the array of mapping pairs. */
  };
  
-#define ENTRY_DELETED ((void *)0xdeadbeef)
+/* We use an all-bits-set marker to mean that a mapping is empty.  It
+   is (hopefully) illegal as a pointer, and it allows the users to use
+   NULL (as well as any non-negative integer) as key.  */
+#define NON_EMPTY(mp) (mp->key != (void *)~(unsigned long)0)
  
-#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
-#define EMPTY_ENTRY_P(ptr)   ((ptr) == NULL)
+/* "Next" mapping is the mapping after MP, but wrapping back to
+   MAPPINGS when MP would reach MAPPINGS+SIZE.  */
+#define NEXT_MAPPING(mp, mappings, size) (mp != mappings + (size - 1)  \
+                                         ? mp + 1 : mappings)
  
-/* Find a prime near, but greather than or equal to SIZE. */
+/* Loop over non-empty mappings starting at MP. */
+#define LOOP_NON_EMPTY(mp, mappings, size)                             \
+  for (; NON_EMPTY (mp); mp = NEXT_MAPPING (mp, mappings, size))
  
-int
-prime_size (int size)
+/* #### Some implementations multiply the hash with the "golden ratio"
+   of the table to get better spread for keys that do not come from a
+   good hashing source.  I'm not sure if that is necessary for the
+   hash functions we use.  */
+
+#define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
+
+/* Find a prime near, but greater than or equal to SIZE.  Of course,
+   the primes are not calculated, but looked up from a table.  The
+   table does not contain all primes in range, just a selection useful
+   for this purpose.
+
+   PRIME_OFFSET is a minor optimization: if specified, it starts the
+   search for the prime number beginning with the specific offset in
+   the prime number table.  The final offset is stored in the same
+   variable.  */
+
+static int
+prime_size (int size, int *prime_offset)
  {
    static const unsigned long primes [] = {
-    19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
+    13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
      1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
      19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
      204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
@@ -82,35 +199,80 @@ prime_size (int size)
      10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
      50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
      243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
-    1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
+    1174703521, 1527114613, 1985248999,
+    (unsigned long)0x99d43ea5, (unsigned long)0xc7fa5177
    };
-  int i;
-  for (i = 0; i < ARRAY_SIZE (primes); i++)
+  int i = *prime_offset;
+
+  for (; i < countof (primes); i++)
      if (primes[i] >= size)
-      return primes[i];
-  /* huh? */
-  return size;
+      {
+       /* Set the offset to the next prime.  That is safe because,
+          next time we are called, it will be with a larger SIZE,
+          which means we could never return the same prime anyway.
+          (If that is not the case, the caller can simply reset
+          *prime_offset.)  */
+       *prime_offset = i + 1;
+       return primes[i];
+      }
+
+  abort ();
+  return 0;
  }
  
-/* Create a hash table of INITIAL_SIZE with hash function
-   HASH_FUNCTION and test function TEST_FUNCTION.  If you wish to
-   start out with a "small" table which will be regrown as needed,
-   specify 0 as INITIAL_SIZE.  */
+static unsigned long ptrhash PARAMS ((const void *));
+static int ptrcmp PARAMS ((const void *, const void *));
+
+/* Create a hash table with hash function HASH_FUNCTION and test
+   function TEST_FUNCTION.  The table is empty (its count is 0), but
+   pre-allocated to store at least ITEMS items.
+
+   ITEMS is the number of items that the table can accept without
+   needing to resize.  It is useful when creating a table that is to
+   be immediately filled with a known number of items.  In that case,
+   the regrows are a waste of time, and specifying ITEMS correctly
+   will avoid them altogether.
+
+   Note that hash tables grow dynamically regardless of ITEMS.  The
+   only use of ITEMS is to preallocate the table and avoid unnecessary
+   dynamic regrows.  Don't bother making ITEMS prime, because it is
+   not used unchanged as the table size.  To start with a small table
+   that grows as needed, simply specify zero ITEMS.
+
+   If HASH_FUNCTION is not provided, an identity table is assumed,
+   i.e. key pointers are compared as keys.  If you want strings with
+   equal contents to hash the same, use make_string_hash_table.  */
  
  struct hash_table *
-hash_table_new (int initial_size,
+hash_table_new (int items,
                 unsigned long (*hash_function) (const void *),
                 int (*test_function) (const void *, const void *))
  {
-  struct hash_table *ht
-    = (struct hash_table *)xmalloc (sizeof (struct hash_table));
-  ht->hash_function = hash_function;
-  ht->test_function = test_function;
-  ht->size = prime_size (initial_size);
-  ht->fullness = 0;
-  ht->count    = 0;
-  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
-  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
+  int size;
+  struct hash_table *ht = xnew (struct hash_table);
+
+  ht->hash_function = hash_function ? hash_function : ptrhash;
+  ht->test_function = test_function ? test_function : ptrcmp;
+
+  /* If the size of struct hash_table ever becomes a concern, this
+     field can go.  (Wget doesn't create many hashes.)  */
+  ht->prime_offset = 0;
+
+  /* Calculate the size that ensures that the table will store at
+     least ITEMS keys without the need to resize.  */
+  size = 1 + items / HASH_MAX_FULLNESS;
+  size = prime_size (size, &ht->prime_offset);
+  ht->size = size;
+  ht->resize_threshold = size * HASH_MAX_FULLNESS;
+  /*assert (ht->resize_threshold >= items);*/
+
+  ht->mappings = xnew_array (struct mapping, ht->size);
+  /* Mark mappings as empty.  We use 0xff rather than 0 to mark empty
+     keys because it allows us to store NULL keys to the table.  */
+  memset (ht->mappings, 255, size * sizeof (struct mapping));
+
+  ht->count = 0;
+
    return ht;
  }
  
@@ -119,103 +281,113 @@ hash_table_new (int initial_size,
  void
  hash_table_destroy (struct hash_table *ht)
  {
-  free (ht->pairs);
-  free (ht);
+  xfree (ht->mappings);
+  xfree (ht);
+}
+
+/* The heart of most functions in this file -- find the mapping whose
+   KEY is equal to key, using linear probing.  Returns the mapping
+   that matches KEY, or the first empty mapping if none matches.  */
+
+static inline struct mapping *
+find_mapping (const struct hash_table *ht, const void *key)
+{
+  struct mapping *mappings = ht->mappings;
+  int size = ht->size;
+  struct mapping *mp = mappings + HASH_POSITION (ht, key);
+  int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
+
+  LOOP_NON_EMPTY (mp, mappings, size)
+    if (equals (key, mp->key))
+      break;
+  return mp;
  }
  
  /* Get the value that corresponds to the key KEY in the hash table HT.
     If no value is found, return NULL.  Note that NULL is a legal value
     for value; if you are storing NULLs in your hash table, you can use
-   hash_table_exists to be sure that a (possibly NULL) value exists in
-   the table.  */
+   hash_table_contains to be sure that a (possibly NULL) value exists
+   in the table.  Or, you can use hash_table_get_pair instead of this
+   function.  */
  
  void *
-hash_table_get (struct hash_table *ht, const void *key)
+hash_table_get (const struct hash_table *ht, const void *key)
  {
-  int location = ht->hash_function (key) % ht->size;
-  while (1)
-    {
-      struct ht_pair *the_pair = ht->pairs + location;
-      if (EMPTY_ENTRY_P (the_pair->key))
-       return NULL;
-      else if (DELETED_ENTRY_P (the_pair->key)
-              || !ht->test_function (key, the_pair->key))
-       {
-         ++location;
-         if (location == ht->size)
-           location = 0;
-       }
-      else
-       return the_pair->value;
-    }
+  struct mapping *mp = find_mapping (ht, key);
+  if (NON_EMPTY (mp))
+    return mp->value;
+  else
+    return NULL;
  }
  
-/* Return 1 if KEY exists in HT, 0 otherwise. */
+/* Like hash_table_get, but writes out the pointers to both key and
+   value.  Returns non-zero on success.  */
  
  int
-hash_table_exists (struct hash_table *ht, const void *key)
+hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
+                    void *orig_key, void *value)
  {
-  int location = ht->hash_function (key) % ht->size;
-  while (1)
+  struct mapping *mp = find_mapping (ht, lookup_key);
+  if (NON_EMPTY (mp))
      {
-      struct ht_pair *the_pair = ht->pairs + location;
-      if (EMPTY_ENTRY_P (the_pair->key))
-       return 0;
-      else if (DELETED_ENTRY_P (the_pair->key)
-              || !ht->test_function (key, the_pair->key))
-       {
-         ++location;
-         if (location == ht->size)
-           location = 0;
-       }
-      else
-       return 1;
+      if (orig_key)
+       *(void **)orig_key = mp->key;
+      if (value)
+       *(void **)value = mp->value;
+      return 1;
      }
+  else
+    return 0;
  }
  
-#define MAX(i, j) (((i) >= (j)) ? (i) : (j))
+/* Return 1 if HT contains KEY, 0 otherwise. */
+
+int
+hash_table_contains (const struct hash_table *ht, const void *key)
+{
+  struct mapping *mp = find_mapping (ht, key);
+  return NON_EMPTY (mp);
+}
  
  /* Grow hash table HT as necessary, and rehash all the key-value
-   pairs.  */
+   mappings.  */
  
  static void
  grow_hash_table (struct hash_table *ht)
  {
-  int i;
-  struct ht_pair *old_pairs = ht->pairs;
-  int old_count = ht->count;   /* for assert() below */
-  int old_size = ht->size;
-
-  /* Normally, the idea is to double ht->size (and round it to next
-     prime) on each regrow:
-
-         ht->size = prime_size (ht->size * 2);
-
-     But it is possible that the table has large fullness because of
-     the many deleted entries.  If that is the case, we don't want to
-     blindly grow the table; we just want to rehash it.  For that
-     reason, we use ht->count as the relevant parameter.  MAX is used
-     only because we don't want to actually shrink the table.  (But
-     maybe that's wrong.)  */
+  struct mapping *old_mappings = ht->mappings;
+  struct mapping *old_end      = ht->mappings + ht->size;
+  struct mapping *mp, *mappings;
+  int newsize;
  
-  int needed_size = prime_size (ht->count * 2);
-  ht->size = MAX (old_size, needed_size);
-
-  ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
-  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
+  newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
+#if 0
+  printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
+         ht->size, newsize,
+         100.0 * ht->count / ht->size,
+         100.0 * ht->count / newsize);
+#endif
  
-  /* Need to reset these two; hash_table_put will reinitialize them.  */
-  ht->fullness = 0;
-  ht->count    = 0;
-  for (i = 0; i < old_size; i++)
-    {
-      struct ht_pair *the_pair = old_pairs + i;
-      if (!EMPTY_ENTRY_P (the_pair->key)
-         && !DELETED_ENTRY_P (the_pair->key))
-       hash_table_put (ht, the_pair->key, the_pair->value);
-    }
-  assert (ht->count == old_count);
-  free (old_pairs);
+  ht->size = newsize;
+  ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
+
+  mappings = xnew_array (struct mapping, newsize);
+  memset (mappings, 255, newsize * sizeof (struct mapping));
+  ht->mappings = mappings;
+
+  for (mp = old_mappings; mp < old_end; mp++)
+    if (NON_EMPTY (mp))
+      {
+       struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
+       /* We don't need to test for uniqueness of keys because they
+          come from the hash table and are therefore known to be
+          unique.  */
+       LOOP_NON_EMPTY (new_mp, mappings, newsize)
+         ;
+       *new_mp = *mp;
+      }
+
+  xfree (old_mappings);
  }
  
  /* Put VALUE in the hash table HT under the key KEY.  This regrows the
@@ -224,139 +396,276 @@ grow_hash_table (struct hash_table *ht)
  void
  hash_table_put (struct hash_table *ht, const void *key, void *value)
  {
-  int location = ht->hash_function (key) % ht->size;
-  while (1)
+  struct mapping *mp = find_mapping (ht, key);
+  if (NON_EMPTY (mp))
      {
-      struct ht_pair *the_pair = ht->pairs + location;
-      if (EMPTY_ENTRY_P (the_pair->key))
-       {
-         ++ht->fullness;
-         ++ht->count;
-       just_insert:
-         the_pair->key = (void *)key; /* const? */
-         the_pair->value = value;
-         break;
-       }
-      else if (DELETED_ENTRY_P (the_pair->key))
-       {
-         /* We're replacing a deleteed entry, so ht->count gets
-             increased, but ht->fullness remains unchanged.  */
-         ++ht->count;
-         goto just_insert;
-       }
-      else if (ht->test_function (key, the_pair->key))
-       {
-         /* We're replacing an existing entry, so ht->count and
-             ht->fullness remain unchanged.  */
-         goto just_insert;
-       }
-      else
-       {
-         ++location;
-         if (location == ht->size)
-           location = 0;
-       }
+      /* update existing item */
+      mp->key   = (void *)key; /* const? */
+      mp->value = value;
+      return;
+    }
+
+  /* If adding the item would make the table exceed max. fullness,
+     grow the table first.  */
+  if (ht->count >= ht->resize_threshold)
+    {
+      grow_hash_table (ht);
+      mp = find_mapping (ht, key);
      }
-  if (ht->fullness * 4 > ht->size * 3)
-    /* When fullness exceeds 75% of size, regrow the table. */
-    grow_hash_table (ht);
+
+  /* add new item */
+  ++ht->count;
+  mp->key   = (void *)key;     /* const? */
+  mp->value = value;
  }
  
-/* Remove KEY from HT. */
+/* Remove a mapping that matches KEY from HT.  Return 0 if there was
+   no such entry; return 1 if an entry was removed.  */
  
  int
  hash_table_remove (struct hash_table *ht, const void *key)
  {
-  int location = ht->hash_function (key) % ht->size;
-  while (1)
+  struct mapping *mp = find_mapping (ht, key);
+  if (!NON_EMPTY (mp))
+    return 0;
+  else
      {
-      struct ht_pair *the_pair = ht->pairs + location;
-      if (EMPTY_ENTRY_P (the_pair->key))
-       return 0;
-      else if (DELETED_ENTRY_P (the_pair->key)
-              || !ht->test_function (key, the_pair->key))
-       {
-         ++location;
-         if (location == ht->size)
-           location = 0;
-       }
-      else
+      int size = ht->size;
+      struct mapping *mappings = ht->mappings;
+
+      mp->key = NULL;
+      --ht->count;
+
+      /* Rehash all the entries following MP.  The alternative
+        approach is to mark the entry as deleted, i.e. create a
+        "tombstone".  That makes remove faster, but leaves a lot of
+        garbage and slows down hash_table_get and hash_table_put.  */
+
+      mp = NEXT_MAPPING (mp, mappings, size);
+      LOOP_NON_EMPTY (mp, mappings, size)
         {
-         /* We don't really remove an entry from the hash table: we
-            just mark it as deleted.  This is because there may be
-            other entries located after this entry whose hash number
-            points to a location before this entry.  (Example: keys
-            A, B and C have the same hash.  If you were to really
-            *delete* B from the table, C could no longer be found.)
-
-            As an optimization, it might be worthwhile to check
-            whether the immediately preceding entry is empty and, if
-            so, really delete the pair (set it to empty and decrease
-            the fullness along with the count).  I *think* it should
-            be safe.  */
-         the_pair->key = ENTRY_DELETED;
-         --ht->count;
-         return 1;
+         const void *key2 = mp->key;
+         struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
+
+         /* Find the new location for the key. */
+
+         LOOP_NON_EMPTY (mp_new, mappings, size)
+           if (key2 == mp_new->key)
+             /* The mapping MP (key2) is already where we want it (in
+                MP_NEW's "chain" of keys.)  */
+             goto next_rehash;
+
+         *mp_new = *mp;
+         mp->key = NULL;
+
+       next_rehash:
+         ;
         }
+      return 1;
      }
  }
  
+/* Clear HT of all entries.  After calling this function, the count
+   and the fullness of the hash table will be zero.  The size will
+   remain unchanged.  */
+
  void
  hash_table_clear (struct hash_table *ht)
  {
-  memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
-  ht->fullness = 0;
-  ht->count    = 0;
+  memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
+  ht->count = 0;
  }
  
+/* Map MAPFUN over all the mappings in hash table HT.  MAPFUN is
+   called with three arguments: the key, the value, and MAPARG.
+
+   It is undefined what happens if you add or remove entries in the
+   hash table while hash_table_map is running.  The exception is the
+   entry you're currently mapping over; you may remove or change that
+   entry.  */
+
  void
  hash_table_map (struct hash_table *ht,
                 int (*mapfun) (void *, void *, void *),
-               void *closure)
+               void *maparg)
  {
-  int i;
-  for (i = 0; i < ht->size; i++)
-    {
-      struct ht_pair *the_pair = ht->pairs + i;
-      if (!EMPTY_ENTRY_P (the_pair->key)
-         && !DELETED_ENTRY_P (the_pair->key))
-       if (mapfun (the_pair->key, the_pair->value, closure))
+  struct mapping *mp  = ht->mappings;
+  struct mapping *end = ht->mappings + ht->size;
+
+  for (; mp < end; mp++)
+    if (NON_EMPTY (mp))
+      {
+       void *key;
+      repeat:
+       key = mp->key;
+       if (mapfun (key, mp->value, maparg))
           return;
-    }
+       /* hash_table_remove might have moved the adjacent
+          mappings. */
+       if (mp->key != key && NON_EMPTY (mp))
+         goto repeat;
+      }
+}
+
+/* Return the number of elements in the hash table.  This is not the
+   same as the physical size of the hash table, which is always
+   greater than the number of elements.  */
+
+int
+hash_table_count (const struct hash_table *ht)
+{
+  return ht->count;
  }
  \f
-/* Support for hash tables whose keys are strings.  */
+/* Functions from this point onward are meant for convenience and
+   don't strictly belong to this file.  However, this is as good a
+   place for them as any.  */
+
+/* Rules for creating custom hash and test functions:
+
+   - The test function returns non-zero for keys that are considered
+     "equal", zero otherwise.
+
+   - The hash function returns a number that represents the
+     "distinctness" of the object.  In more precise terms, it means
+     that for any two objects that test "equal" under the test
+     function, the hash function MUST produce the same result.
+
+     This does not mean that all different objects must produce
+     different values (that would be "perfect" hashing), only that
+     non-distinct objects must produce the same values!  For instance,
+     a hash function that returns 0 for any given object is a
+     perfectly valid (albeit extremely bad) hash function.  A hash
+     function that hashes a string by adding up all its characters is
+     another example of a valid (but quite bad) hash function.
+
+     It is not hard to make hash and test functions agree about
+     equality.  For example, if the test function compares strings
+     case-insensitively, the hash function can lower-case the
+     characters when calculating the hash value.  That ensures that
+     two strings differing only in case will hash the same.
+
+   - If you care about performance, choose a hash function with as
+     good "spreading" as possible.  A good hash function will use all
+     the bits of the input when calculating the hash, and will react
+     to even small changes in input with a completely different
+     output.  Finally, don't make the hash function itself overly
+     slow, because you'll be incurring a non-negligible overhead to
+     all hash table operations.  */
+
+/*
+ * Support for hash tables whose keys are strings.
+ *
+ */
+   
+/* 31 bit hash function.  Taken from Gnome's glib, modified to use
+   standard C types.
+
+   We used to use the popular hash function from the Dragon Book, but
+   this one seems to perform much better.  */
  
-/* supposedly from the Dragon Book P436. */
  unsigned long
-string_hash (const void *sv)
+string_hash (const void *key)
  {
-  unsigned int h = 0;
-  unsigned const char *x = (unsigned const char *) sv;
-
-  while (*x)
-    {
-      unsigned int g;
-      h = (h << 4) + *x++;
-      if ((g = h & 0xf0000000) != 0)
-       h = (h ^ (g >> 24)) ^ g;
-    }
-
+  const char *p = key;
+  unsigned int h = *p;
+  
+  if (h)
+    for (p += 1; *p != '\0'; p++)
+      h = (h << 5) - h + *p;
+  
    return h;
  }
  
+/* Frontend for strcmp usable for hash tables. */
+
  int
  string_cmp (const void *s1, const void *s2)
  {
    return !strcmp ((const char *)s1, (const char *)s2);
  }
  
+/* Return a hash table preallocated to store at least ITEMS items,
+   suitable for using strings as keys.  */
+
+struct hash_table *
+make_string_hash_table (int items)
+{
+  return hash_table_new (items, string_hash, string_cmp);
+}
+
+/*
+ * Support for hash tables whose keys are strings, but which are
+ * compared case-insensitively.
+ *
+ */
+
+/* Like string_hash, but produce the same hash regardless of the case. */
+
+static unsigned long
+string_hash_nocase (const void *key)
+{
+  const char *p = key;
+  unsigned int h = TOLOWER (*p);
+  
+  if (h)
+    for (p += 1; *p != '\0'; p++)
+      h = (h << 5) - h + TOLOWER (*p);
+  
+  return h;
+}
+
+/* Like string_cmp, but doing case-insensitive comparison. */
+
+static int
+string_cmp_nocase (const void *s1, const void *s2)
+{
+  return !strcasecmp ((const char *)s1, (const char *)s2);
+}
+
+/* Like make_string_hash_table, but uses string_hash_nocase and
+   string_cmp_nocase.  */
+
  struct hash_table *
-make_string_hash_table (int initial_size)
+make_nocase_string_hash_table (int items)
  {
-  return hash_table_new (initial_size, string_hash, string_cmp);
+  return hash_table_new (items, string_hash_nocase, string_cmp_nocase);
  }
  
+/* Hashing of pointers.  Used for hash tables that are keyed by
+   pointer identity.  (Common Lisp calls them EQ hash tables, and Java
+   calls them IdentityHashMaps.)  */
+
+static unsigned long
+ptrhash (const void *ptr)
+{
+  unsigned long key = (unsigned long)ptr;
+  key += (key << 12);
+  key ^= (key >> 22);
+  key += (key << 4);
+  key ^= (key >> 9);
+  key += (key << 10);
+  key ^= (key >> 2);
+  key += (key << 7);
+  key ^= (key >> 12);
+#if SIZEOF_LONG > 4
+  key += (key << 44);
+  key ^= (key >> 54);
+  key += (key << 36);
+  key ^= (key >> 41);
+  key += (key << 42);
+  key ^= (key >> 34);
+  key += (key << 39);
+  key ^= (key >> 44);
+#endif
+  return key;
+}
+
+static int
+ptrcmp (const void *ptr1, const void *ptr2)
+{
+  return ptr1 == ptr2;
+}
  \f
  #ifdef STANDALONE
  
@@ -364,7 +673,7 @@ make_string_hash_table (int initial_size)
  #include <string.h>
  
  int
-print_hash_table_mapper (const void *key, void *value, void *count)
+print_hash_table_mapper (void *key, void *value, void *count)
  {
    ++*(int *)count;
    printf ("%s: %s\n", (const char *)key, (char *)value);
@@ -390,13 +699,25 @@ main (void)
        if (len <= 1)
         continue;
        line[--len] = '\0';
-      hash_table_put (ht, strdup (line), "here I am!");
-      if (len % 2)
-       hash_table_remove (ht, line);
+      if (!hash_table_contains (ht, line))
+       hash_table_put (ht, strdup (line), "here I am!");
+#if 1
+      if (len % 5 == 0)
+       {
+         char *line_copy;
+         if (hash_table_get_pair (ht, line, &line_copy, NULL))
+           {
+             hash_table_remove (ht, line);
+             xfree (line_copy);
+           }
+       }
+#endif
      }
-  print_hash (ht);
  #if 0
-  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
+  print_hash (ht);
+#endif
+#if 1
+  printf ("%d %d\n", ht->count, ht->size);
  #endif
    return 0;
  }