[svn] Update copyright notices.

[wget] / src / hash.c
diff --git a/src/hash.c b/src/hash.c

index 0749f57bdb58cf35549c310a9078438ba3a3c0df..970b274beada4dc70a9a0e827482ff61eb6b80e3 100644 (file)
--- a/src/hash.c
+++ b/src/hash.c
@@ -1,26 +1,31 @@
  /* Hash tables.
-   Copyright (C) 2000 Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
  
-This file is part of Wget.
+This file is part of GNU Wget.
  
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or (at
  your option) any later version.
  
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
  #ifdef HAVE_CONFIG_H
  # include <config.h>
  #endif
  
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
  #include <stdlib.h>
  #include <assert.h>
  
@@ -30,8 +35,13 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  #include "hash.h"
  
  #ifdef STANDALONE
+# undef xmalloc
+# undef xrealloc
+# undef xfree
+
  # define xmalloc malloc
  # define xrealloc realloc
+# define xfree free
  #endif
  
  /* INTERFACE:
@@ -44,11 +54,11 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
     The basics are all covered.  hash_table_new creates a hash table,
     and hash_table_destroy deletes it.  hash_table_put establishes a
     mapping between a key and a value.  hash_table_get retrieves the
-   value that corresponds to a key.  hash_table_exists queries whether
-   a key is stored in a table at all.  hash_table_remove removes a
-   mapping that corresponds to a key.  hash_table_map allows you to
-   map through all the entries in a hash table.  hash_table_clear
-   clears all the entries from the hash table.
+   value that corresponds to a key.  hash_table_contains queries
+   whether a key is stored in a table at all.  hash_table_remove
+   removes a mapping that corresponds to a key.  hash_table_map allows
+   you to map through all the entries in a hash table.
+   hash_table_clear clears all the entries from the hash table.
  
     The number of mappings in a table is not limited, except by the
     amount of memory.  As you add new elements to a table, it regrows
@@ -57,9 +67,9 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
     The hashing and equality functions are normally provided by the
     user.  For the special (and frequent) case of hashing strings, you
-   can use the pre-canned make_string_hash_table(), which provides the
-   string hashing function from the Dragon Book, and a string equality
-   wrapper around strcmp().
+   can use the pre-canned make_string_hash_table(), which provides an
+   efficient string hashing function, and a string equality wrapper
+   around strcmp().
  
     When specifying your own hash and test functions, make sure the
     following holds true:
@@ -76,13 +86,15 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
       distinct value, only that non-distinct objects must produce the
       same values!  For instance, a hash function that returns 0 for
       any given object is a perfectly valid (albeit extremely bad) hash
+     function.  A hash function that hashes a string by adding up all
+     its characters is another example of a valid (but quite bad) hash
       function.
  
       The above stated rule is quite easy to enforce.  For example, if
       your testing function compares strings case-insensitively, all
       your function needs to do is lower-case the string characters
       before calculating a hash.  That way you have easily guaranteed
-     that changes in case will not result in a different hash.
+     that case differences will not result in a different hash.
  
     - (optional) Choose the hash function to get as good "spreading" as
       possible.  A good hash function will react to even a small change
@@ -115,8 +127,8 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
     Collisions make deletion tricky because finding collisions again
     relies on new empty spots not being created.  That's why
-   hash_table_remove only marks the spot as deleted rather than really
-   making it empty. */
+   hash_table_remove is careful to rehash the mappings that follow the
+   deleted one.  */
  
  struct mapping {
    void *key;
@@ -128,22 +140,27 @@ struct hash_table {
    int (*test_function) (const void *, const void *);
  
    int size;                    /* size of the array */
-  int fullness;                        /* number of non-empty fields */
    int count;                   /* number of non-empty, non-deleted
                                     fields. */
  
+  int resize_threshold;                /* after count exceeds this number of
+                                  entries, resize the table.  */
+
    struct mapping *mappings;
  };
  
-#define ENTRY_DELETED ((void *)0xdeadbeef)
-#define ENTRY_EMPTY   NULL
+#define EMPTY_MAPPING_P(mp)  ((mp)->key == NULL)
+#define NEXT_MAPPING(mp, mappings, size) (mp == mappings + (size - 1)  \
+                                         ? mappings : mp + 1)
  
-#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
-#define EMPTY_ENTRY_P(ptr)   ((ptr) == ENTRY_EMPTY)
+#define LOOP_NON_EMPTY(mp, mappings, size)                             \
+  for (; !EMPTY_MAPPING_P (mp); mp = NEXT_MAPPING (mp, mappings, size))
+
+#define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
  
  /* Find a prime near, but greater than or equal to SIZE. */
  
-int
+static int
  prime_size (int size)
  {
    static const unsigned long primes [] = {
@@ -166,9 +183,12 @@ prime_size (int size)
  }
  
  /* Create a hash table of INITIAL_SIZE with hash function
-   HASH_FUNCTION and test function TEST_FUNCTION.  If you wish to
-   start out with a "small" table which will be regrown as needed,
-   specify 0 as INITIAL_SIZE.  */
+   HASH_FUNCTION and test function TEST_FUNCTION.  INITIAL_SIZE will
+   be rounded to the next prime, so you don't have to worry about it
+   being a prime number.
+
+   Consequently, if you wish to start out with a "small" table which
+   will be regrown as needed, specify INITIAL_SIZE 0.  */
  
  struct hash_table *
  hash_table_new (int initial_size,
@@ -177,13 +197,18 @@ hash_table_new (int initial_size,
  {
    struct hash_table *ht
      = (struct hash_table *)xmalloc (sizeof (struct hash_table));
+
    ht->hash_function = hash_function;
    ht->test_function = test_function;
+
    ht->size = prime_size (initial_size);
-  ht->fullness = 0;
+  ht->resize_threshold = ht->size * 3 / 4;
+
    ht->count    = 0;
+
    ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
    memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
+
    return ht;
  }
  
@@ -192,54 +217,43 @@ hash_table_new (int initial_size,
  void
  hash_table_destroy (struct hash_table *ht)
  {
-  free (ht->mappings);
-  free (ht);
+  xfree (ht->mappings);
+  xfree (ht);
  }
  
  /* The heart of almost all functions in this file -- find the mapping
-   whose KEY is equal to key, using a linear probing loop.  Returns
-   the offset of the mapping in ht->mappings.  This should probably be
-   declared inline.  */
+   whose KEY is equal to key, using linear probing.  Returns the
+   mapping that matches KEY, or NULL if none matches.  */
  
-static int
+static inline struct mapping *
  find_mapping (struct hash_table *ht, const void *key)
  {
    struct mapping *mappings = ht->mappings;
    int size = ht->size;
-  int location = ht->hash_function (key) % size;
-  while (1)
-    {
-      struct mapping *mp = mappings + location;
-      void *mp_key = mp->key;
+  struct mapping *mp = mappings + HASH_POSITION (ht, key);
+  int (*equals) (const void *, const void *) = ht->test_function;
  
-      if (EMPTY_ENTRY_P (mp_key))
-       return -1;
-      else if (DELETED_ENTRY_P (mp_key)
-              || !ht->test_function (key, mp_key))
-       {
-         if (++location == size)
-           location = 0;
-       }
-      else
-       return location;
-    }
+  LOOP_NON_EMPTY (mp, mappings, size)
+    if (equals (key, mp->key))
+      return mp;
+  return NULL;
  }
  
  /* Get the value that corresponds to the key KEY in the hash table HT.
     If no value is found, return NULL.  Note that NULL is a legal value
     for value; if you are storing NULLs in your hash table, you can use
-   hash_table_exists to be sure that a (possibly NULL) value exists in
-   the table.  Or, you can use hash_table_get_pair instead of this
+   hash_table_contains to be sure that a (possibly NULL) value exists
+   in the table.  Or, you can use hash_table_get_pair instead of this
     function.  */
  
  void *
  hash_table_get (struct hash_table *ht, const void *key)
  {
-  int location = find_mapping (ht, key);
-  if (location < 0)
-    return NULL;
+  struct mapping *mp = find_mapping (ht, key);
+  if (mp)
+    return mp->value;
    else
-    return ht->mappings[location].value;
+    return NULL;
  }
  
  /* Like hash_table_get, but writes out the pointers to both key and
@@ -249,78 +263,64 @@ int
  hash_table_get_pair (struct hash_table *ht, const void *lookup_key,
                      void *orig_key, void *value)
  {
-  int location = find_mapping (ht, lookup_key);
-  if (location < 0)
-    return 0;
-  else
+  struct mapping *mp = find_mapping (ht, lookup_key);
+
+  if (mp)
      {
-      struct mapping *mp = ht->mappings + location;
        if (orig_key)
         *(void **)orig_key = mp->key;
        if (value)
         *(void **)value = mp->value;
        return 1;
      }
+  else
+    return 0;
  }
  
-/* Return 1 if KEY exists in HT, 0 otherwise. */
+/* Return 1 if HT contains KEY, 0 otherwise. */
  
  int
-hash_table_exists (struct hash_table *ht, const void *key)
+hash_table_contains (struct hash_table *ht, const void *key)
  {
-  return find_mapping (ht, key) >= 0;
+  return find_mapping (ht, key) != NULL;
  }
  
-#define MAX(i, j) (((i) >= (j)) ? (i) : (j))
-
  /* Grow hash table HT as necessary, and rehash all the key-value
     mappings.  */
  
  static void
  grow_hash_table (struct hash_table *ht)
  {
-  int i;
    struct mapping *old_mappings = ht->mappings;
-  int old_count = ht->count;   /* for assert() below */
-  int old_size = ht->size;
-
-  /* To minimize the number of regrowth, we'd like to resize the hash
-     table exponentially.  Normally, this would be done by doubling
-     ht->size (and round it to next prime) on each regrow:
-
-         ht->size = prime_size (ht->size * 2);
-
-     But it is possible that the table has large fullness because of
-     the many deleted entries.  If that is the case, we don't want to
-     blindly grow the table; we just want to rehash it.  For that
-     reason, we use ht->count as the relevant parameter.  MAX is used
-     only because we don't want to actually shrink the table.  (But
-     maybe that's wrong.)  */
-
-  int needed_size = prime_size (ht->count * 3);
-  ht->size = MAX (old_size, needed_size);
+  struct mapping *old_end      = ht->mappings + ht->size;
+  struct mapping *mp, *mappings;
+  int newsize;
  
+  newsize = prime_size (ht->size * 2);
  #if 0
-  printf ("growing from %d to %d\n", old_size, ht->size);
+  printf ("growing from %d to %d\n", ht->size, newsize);
  #endif
  
-  ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
-  memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
-
-  /* Need to reset these two; hash_table_put will reinitialize them.  */
-  ht->fullness = 0;
-  ht->count    = 0;
-  for (i = 0; i < old_size; i++)
-    {
-      struct mapping *mp = old_mappings + i;
-      void *mp_key = mp->key;
-
-      if (!EMPTY_ENTRY_P (mp_key)
-         && !DELETED_ENTRY_P (mp_key))
-       hash_table_put (ht, mp_key, mp->value);
-    }
-  assert (ht->count == old_count);
-  free (old_mappings);
+  ht->size = newsize;
+  ht->resize_threshold = newsize * 3 / 4;
+
+  mappings = xmalloc (ht->size * sizeof (struct mapping));
+  memset (mappings, '\0', ht->size * sizeof (struct mapping));
+  ht->mappings = mappings;
+
+  for (mp = old_mappings; mp < old_end; mp++)
+    if (!EMPTY_MAPPING_P (mp))
+      {
+       struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
+       /* We don't need to call test function and worry about
+          collisions because all the keys come from the hash table
+          and are therefore guaranteed to be unique.  */
+       LOOP_NON_EMPTY (new_mp, mappings, newsize)
+         ;
+       *new_mp = *mp;
+      }
+
+  xfree (old_mappings);
  }
  
  /* Put VALUE in the hash table HT under the key KEY.  This regrows the
@@ -329,92 +329,71 @@ grow_hash_table (struct hash_table *ht)
  void
  hash_table_put (struct hash_table *ht, const void *key, void *value)
  {
-  /* Cannot use find_mapping here because we treat deleted entries
-     specially.  */
-
    struct mapping *mappings = ht->mappings;
    int size = ht->size;
-  int location = ht->hash_function (key) % size;
-  while (1)
-    {
-      struct mapping *mp = mappings + location;
-      void *mp_key = mp->key;
+  int (*equals) (const void *, const void *) = ht->test_function;
  
-      if (EMPTY_ENTRY_P (mp_key))
-       {
-         ++ht->fullness;
-         ++ht->count;
-       just_insert:
-         mp->key = (void *)key; /* const? */
-         mp->value = value;
-         break;
-       }
-      else if (DELETED_ENTRY_P (mp_key))
-       {
-         /* We're replacing a deleteed entry, so ht->count gets
-             increased, but ht->fullness remains unchanged.  */
-         ++ht->count;
-         goto just_insert;
-       }
-      else if (ht->test_function (key, mp_key))
-       {
-         /* We're replacing an existing entry, so ht->count and
-             ht->fullness remain unchanged.  */
-         goto just_insert;
-       }
-      else
-       {
-         if (++location == size)
-           location = 0;
-       }
-    }
-  if (ht->fullness * 4 > ht->size * 3)
-    /* When fullness exceeds 75% of size, regrow the table. */
+  struct mapping *mp = mappings + HASH_POSITION (ht, key);
+
+  LOOP_NON_EMPTY (mp, mappings, size)
+    if (equals (key, mp->key))
+      {
+       mp->key   = (void *)key; /* const? */
+       mp->value = value;
+       return;
+      }
+
+  ++ht->count;
+  mp->key   = (void *)key;     /* const? */
+  mp->value = value;
+
+  if (ht->count > ht->resize_threshold)
+    /* When table is 75% full, regrow it. */
      grow_hash_table (ht);
  }
  
-/* Remove KEY from HT. */
+/* Remove a mapping that matches KEY from HT.  Return 0 if there was
+   no such entry; return 1 if an entry was removed.  */
  
  int
  hash_table_remove (struct hash_table *ht, const void *key)
  {
-  int location = find_mapping (ht, key);
-  if (location < 0)
+  struct mapping *mp = find_mapping (ht, key);
+  if (!mp)
      return 0;
    else
      {
+      int size = ht->size;
        struct mapping *mappings = ht->mappings;
-      struct mapping *mp = mappings + location;
-      /* We don't really remove an entry from the hash table: we just
-        mark it as deleted.  This is because there may be other
-        entries located after this entry whose hash points to a
-        location before this entry.  (Example: keys A, B and C have
-        the same hash.  If you were to really *delete* B from the
-        table, C could no longer be found.) */
-
-      /* Optimization addendum: if the mapping that follows LOCATION
-        is already empty, that is a sure sign that nobody depends on
-        LOCATION being non-empty.  (This is because we're using
-        linear probing.  This would not be the case with double
-        hashing.)  In that case, we may safely delete the mapping.  */
-
-      /* This could be generalized so that the all the non-empty
-        locations following LOCATION are simply shifted leftward.  It
-        would make deletion a bit slower, but it would remove the
-        ugly DELETED_ENTRY_P checks from all the rest of the code,
-        making the whole thing faster.  */
-      int location_after = (location + 1) == ht->size ? 0 : location + 1;
-      struct mapping *mp_after = mappings + location_after;
-
-      if (EMPTY_ENTRY_P (mp_after->key))
-       {
-         mp->key = ENTRY_EMPTY;
-         --ht->fullness;
-       }
-      else
-       mp->key = ENTRY_DELETED;
  
+      mp->key = NULL;
        --ht->count;
+
+      /* Rehash all the entries following MP.  The alternative
+        approach is to mark the entry as deleted, i.e. create a
+        "tombstone".  That makes remove faster, but leaves a lot of
+        garbage and slows down hash_table_get and hash_table_put.  */
+
+      mp = NEXT_MAPPING (mp, mappings, size);
+      LOOP_NON_EMPTY (mp, mappings, size)
+       {
+         const void *key2 = mp->key;
+         struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
+
+         /* Find the new location for the key. */
+
+         LOOP_NON_EMPTY (mp_new, mappings, size)
+           if (key2 == mp_new->key)
+             /* The mapping MP (key2) is already where we want it (in
+                MP_NEW's "chain" of keys.)  */
+             goto next_rehash;
+
+         *mp_new = *mp;
+         mp->key = NULL;
+
+       next_rehash:
+         ;
+       }
        return 1;
      }
  }
@@ -427,54 +406,131 @@ void
  hash_table_clear (struct hash_table *ht)
  {
    memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
-  ht->fullness = 0;
-  ht->count    = 0;
+  ht->count = 0;
  }
  
  /* Map MAPFUN over all the mappings in hash table HT.  MAPFUN is
     called with three arguments: the key, the value, and the CLOSURE.
-   Don't add or remove entries from HT while hash_table_map is being
-   called, or strange things may happen.  */
+
+   It is undefined what happens if you add or remove entries in the
+   hash table while hash_table_map is running.  The exception is the
+   entry you're currently mapping over; you may remove or change that
+   entry.  */
  
  void
  hash_table_map (struct hash_table *ht,
                 int (*mapfun) (void *, void *, void *),
                 void *closure)
  {
-  struct mapping *mappings = ht->mappings;
-  int i;
-  for (i = 0; i < ht->size; i++)
-    {
-      struct mapping *mp = mappings + i;
-      void *mp_key = mp->key;
-
-      if (!EMPTY_ENTRY_P (mp_key)
-         && !DELETED_ENTRY_P (mp_key))
-       if (mapfun (mp_key, mp->value, closure))
+  struct mapping *mp  = ht->mappings;
+  struct mapping *end = ht->mappings + ht->size;
+
+  for (; mp < end; mp++)
+    if (!EMPTY_MAPPING_P (mp))
+      {
+       void *key;
+      repeat:
+       key = mp->key;
+       if (mapfun (key, mp->value, closure))
           return;
-    }
+       /* hash_table_remove might have moved the adjacent
+          mappings. */
+       if (mp->key != key && !EMPTY_MAPPING_P (mp))
+         goto repeat;
+      }
+}
+
+/* Return the number of elements in the hash table.  This is not the
+   same as the physical size of the hash table, which is always
+   greater than the number of elements.  */
+
+int
+hash_table_count (struct hash_table *ht)
+{
+  return ht->count;
  }
  \f
-/* Support for hash tables whose keys are strings.  */
+/* Functions from this point onward are meant for convenience and
+   don't strictly belong to this file.  However, this is as good a
+   place for them as any.  */
+
+/* ========
+   Support for hash tables whose keys are strings.
+   ======== */
+
+/* 31 bit hash function.  Taken from Gnome's glib, modified to use
+   standard C types.
+
+   We used to use the popular hash function from the Dragon Book, but
+   this one seems to perform much better.  */
  
-/* supposedly from the Dragon Book P436. */
  unsigned long
-string_hash (const void *sv)
+string_hash (const void *key)
  {
-  unsigned int h = 0;
-  unsigned const char *x = (unsigned const char *) sv;
+  const char *p = key;
+  unsigned int h = *p;
+  
+  if (h)
+    for (p += 1; *p != '\0'; p++)
+      h = (h << 5) - h + *p;
+  
+  return h;
+}
  
-  while (*x)
-    {
-      unsigned int g;
-      h = (h << 4) + *x++;
-      if ((g = h & 0xf0000000) != 0)
-       h = (h ^ (g >> 24)) ^ g;
-    }
+/* Frontend for strcmp usable for hash tables. */
+
+int
+string_cmp (const void *s1, const void *s2)
+{
+  return !strcmp ((const char *)s1, (const char *)s2);
+}
  
+/* Return a hash table of initial size INITIAL_SIZE suitable to use
+   strings as keys.  */
+
+struct hash_table *
+make_string_hash_table (int initial_size)
+{
+  return hash_table_new (initial_size, string_hash, string_cmp);
+}
+
+/* ========
+   Support for hash tables whose keys are strings, but which are
+   compared case-insensitively.
+   ======== */
+
+/* Like string_hash, but produce the same hash regardless of the case. */
+
+static unsigned long
+string_hash_nocase (const void *key)
+{
+  const char *p = key;
+  unsigned int h = TOLOWER (*p);
+  
+  if (h)
+    for (p += 1; *p != '\0'; p++)
+      h = (h << 5) - h + TOLOWER (*p);
+  
    return h;
  }
  
+/* Like string_cmp, but doing case-insensitive comparison. */
+
+static int
+string_cmp_nocase (const void *s1, const void *s2)
+{
+  return !strcasecmp ((const char *)s1, (const char *)s2);
+}
+
+/* Like make_string_hash_table, but uses string_hash_nocase and
+   string_cmp_nocase.  */
+
+struct hash_table *
+make_nocase_string_hash_table (int initial_size)
+{
+  return hash_table_new (initial_size, string_hash_nocase, string_cmp_nocase);
+}
+
  #if 0
  /* If I ever need it: hashing of integers. */
  
@@ -492,22 +548,6 @@ inthash (unsigned int key)
    return key;
  }
  #endif
-
-int
-string_cmp (const void *s1, const void *s2)
-{
-  return !strcmp ((const char *)s1, (const char *)s2);
-}
-
-/* Return a hash table of initial size INITIAL_SIZE suitable to use
-   strings as keys.  */
-
-struct hash_table *
-make_string_hash_table (int initial_size)
-{
-  return hash_table_new (initial_size, string_hash, string_cmp);
-}
-
  \f
  #ifdef STANDALONE
  
@@ -541,16 +581,16 @@ main (void)
        if (len <= 1)
         continue;
        line[--len] = '\0';
-      if (!hash_table_exists (ht, line))
+      if (!hash_table_contains (ht, line))
         hash_table_put (ht, strdup (line), "here I am!");
  #if 1
-      if (len % 3)
+      if (len % 5 == 0)
         {
           char *line_copy;
           if (hash_table_get_pair (ht, line, &line_copy, NULL))
             {
               hash_table_remove (ht, line);
-             free (line_copy);
+             xfree (line_copy);
             }
         }
  #endif
@@ -559,7 +599,7 @@ main (void)
    print_hash (ht);
  #endif
  #if 1
-  printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
+  printf ("%d %d\n", ht->count, ht->size);
  #endif
    return 0;
  }