/* Hash tables.
- Copyright (C) 2000 Free Software Foundation, Inc.
+ Copyright (C) 2000, 2001 Free Software Foundation, Inc.
-This file is part of Wget.
+This file is part of GNU Wget.
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif /* HAVE_STRING_H */
#include <stdlib.h>
#include <assert.h>
#include "hash.h"
#ifdef STANDALONE
+# undef xmalloc
+# undef xrealloc
+# undef xfree
+
# define xmalloc malloc
# define xrealloc realloc
+# define xfree free
#endif
/* INTERFACE:
The basics are all covered. hash_table_new creates a hash table,
and hash_table_destroy deletes it. hash_table_put establishes a
mapping between a key and a value. hash_table_get retrieves the
- value that corresponds to a key. hash_table_exists queries whether
- a key is stored in a table at all. hash_table_remove removes a
- mapping that corresponds to a key. hash_table_map allows you to
- map through all the entries in a hash table. hash_table_clear
- clears all the entries from the hash table.
+ value that corresponds to a key. hash_table_contains queries
+ whether a key is stored in a table at all. hash_table_remove
+ removes a mapping that corresponds to a key. hash_table_map allows
+ you to map through all the entries in a hash table.
+ hash_table_clear clears all the entries from the hash table.
The number of mappings in a table is not limited, except by the
amount of memory. As you add new elements to a table, it regrows
The hashing and equality functions are normally provided by the
user. For the special (and frequent) case of hashing strings, you
- can use the pre-canned make_string_hash_table(), which provides the
- string hashing function from the Dragon Book, and a string equality
- wrapper around strcmp().
+ can use the pre-canned make_string_hash_table(), which provides an
+ efficient string hashing function, and a string equality wrapper
+ around strcmp().
When specifying your own hash and test functions, make sure the
following holds true:
distinct value, only that non-distinct objects must produce the
same values! For instance, a hash function that returns 0 for
any given object is a perfectly valid (albeit extremely bad) hash
+ function. A hash function that hashes a string by adding up all
+ its characters is another example of a valid (but quite bad) hash
function.
The above stated rule is quite easy to enforce. For example, if
your testing function compares strings case-insensitively, all
your function needs to do is lower-case the string characters
before calculating a hash. That way you have easily guaranteed
- that changes in case will not result in a different hash.
+ that case differences will not result in a different hash.
- (optional) Choose the hash function to get as good "spreading" as
possible. A good hash function will react to even a small change
Collisions make deletion tricky because finding collisions again
relies on new empty spots not being created. That's why
- hash_table_remove only marks the spot as deleted rather than really
- making it empty. */
+ hash_table_remove is careful to rehash the mappings that follow the
+ deleted one. */
struct mapping {
void *key;
int (*test_function) (const void *, const void *);
int size; /* size of the array */
- int fullness; /* number of non-empty fields */
int count; /* number of non-empty, non-deleted
fields. */
+ int resize_threshold; /* after count exceeds this number of
+ entries, resize the table. */
+
struct mapping *mappings;
};
-#define ENTRY_DELETED ((void *)0xdeadbeef)
-#define ENTRY_EMPTY NULL
+#define EMPTY_MAPPING_P(mp) ((mp)->key == NULL)
+#define NEXT_MAPPING(mp, mappings, size) (mp == mappings + (size - 1) \
+ ? mappings : mp + 1)
-#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
-#define EMPTY_ENTRY_P(ptr) ((ptr) == ENTRY_EMPTY)
+#define LOOP_NON_EMPTY(mp, mappings, size) \
+ for (; !EMPTY_MAPPING_P (mp); mp = NEXT_MAPPING (mp, mappings, size))
+
+#define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
/* Find a prime near, but greather than or equal to SIZE. */
-int
+static int
prime_size (int size)
{
static const unsigned long primes [] = {
}
/* Create a hash table of INITIAL_SIZE with hash function
- HASH_FUNCTION and test function TEST_FUNCTION. If you wish to
- start out with a "small" table which will be regrown as needed,
- specify 0 as INITIAL_SIZE. */
+ HASH_FUNCTION and test function TEST_FUNCTION. INITIAL_SIZE will
+ be rounded to the next prime, so you don't have to worry about it
+ being a prime number.
+
+ Consequently, if you wish to start out with a "small" table which
+ will be regrown as needed, specify INITIAL_SIZE 0. */
struct hash_table *
hash_table_new (int initial_size,
{
struct hash_table *ht
= (struct hash_table *)xmalloc (sizeof (struct hash_table));
+
ht->hash_function = hash_function;
ht->test_function = test_function;
+
ht->size = prime_size (initial_size);
- ht->fullness = 0;
+ ht->resize_threshold = ht->size * 3 / 4;
+
ht->count = 0;
+
ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
+
return ht;
}
void
hash_table_destroy (struct hash_table *ht)
{
- free (ht->mappings);
- free (ht);
+ xfree (ht->mappings);
+ xfree (ht);
}
/* The heart of almost all functions in this file -- find the mapping
- whose KEY is equal to key, using a linear probing loop. Returns
- the offset of the mapping in ht->mappings. This should probably be
- declared inline. */
+ whose KEY is equal to key, using linear probing. Returns the
+ mapping that matches KEY, or NULL if none matches. */
-static int
+static inline struct mapping *
find_mapping (struct hash_table *ht, const void *key)
{
struct mapping *mappings = ht->mappings;
int size = ht->size;
- int location = ht->hash_function (key) % size;
- while (1)
- {
- struct mapping *mp = mappings + location;
- void *mp_key = mp->key;
+ struct mapping *mp = mappings + HASH_POSITION (ht, key);
+ int (*equals) (const void *, const void *) = ht->test_function;
- if (EMPTY_ENTRY_P (mp_key))
- return -1;
- else if (DELETED_ENTRY_P (mp_key)
- || !ht->test_function (key, mp_key))
- {
- if (++location == size)
- location = 0;
- }
- else
- return location;
- }
+ LOOP_NON_EMPTY (mp, mappings, size)
+ if (equals (key, mp->key))
+ return mp;
+ return NULL;
}
/* Get the value that corresponds to the key KEY in the hash table HT.
If no value is found, return NULL. Note that NULL is a legal value
for value; if you are storing NULLs in your hash table, you can use
- hash_table_exists to be sure that a (possibly NULL) value exists in
- the table. Or, you can use hash_table_get_pair instead of this
+ hash_table_contains to be sure that a (possibly NULL) value exists
+ in the table. Or, you can use hash_table_get_pair instead of this
function. */
void *
hash_table_get (struct hash_table *ht, const void *key)
{
- int location = find_mapping (ht, key);
- if (location < 0)
- return NULL;
+ struct mapping *mp = find_mapping (ht, key);
+ if (mp)
+ return mp->value;
else
- return ht->mappings[location].value;
+ return NULL;
}
/* Like hash_table_get, but writes out the pointers to both key and
hash_table_get_pair (struct hash_table *ht, const void *lookup_key,
void *orig_key, void *value)
{
- int location = find_mapping (ht, lookup_key);
- if (location < 0)
- return 0;
- else
+ struct mapping *mp = find_mapping (ht, lookup_key);
+
+ if (mp)
{
- struct mapping *mp = ht->mappings + location;
if (orig_key)
*(void **)orig_key = mp->key;
if (value)
*(void **)value = mp->value;
return 1;
}
+ else
+ return 0;
}
-/* Return 1 if KEY exists in HT, 0 otherwise. */
+/* Return 1 if HT contains KEY, 0 otherwise. */
int
-hash_table_exists (struct hash_table *ht, const void *key)
+hash_table_contains (struct hash_table *ht, const void *key)
{
- return find_mapping (ht, key) >= 0;
+ return find_mapping (ht, key) != NULL;
}
-#define MAX(i, j) (((i) >= (j)) ? (i) : (j))
-
/* Grow hash table HT as necessary, and rehash all the key-value
mappings. */
static void
grow_hash_table (struct hash_table *ht)
{
- int i;
struct mapping *old_mappings = ht->mappings;
- int old_count = ht->count; /* for assert() below */
- int old_size = ht->size;
-
- /* To minimize the number of regrowth, we'd like to resize the hash
- table exponentially. Normally, this would be done by doubling
- ht->size (and round it to next prime) on each regrow:
-
- ht->size = prime_size (ht->size * 2);
-
- But it is possible that the table has large fullness because of
- the many deleted entries. If that is the case, we don't want to
- blindly grow the table; we just want to rehash it. For that
- reason, we use ht->count as the relevant parameter. MAX is used
- only because we don't want to actually shrink the table. (But
- maybe that's wrong.) */
-
- int needed_size = prime_size (ht->count * 3);
- ht->size = MAX (old_size, needed_size);
+ struct mapping *old_end = ht->mappings + ht->size;
+ struct mapping *mp, *mappings;
+ int newsize;
+ newsize = prime_size (ht->size * 2);
#if 0
- printf ("growing from %d to %d\n", old_size, ht->size);
+ printf ("growing from %d to %d\n", ht->size, newsize);
#endif
- ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
- memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
-
- /* Need to reset these two; hash_table_put will reinitialize them. */
- ht->fullness = 0;
- ht->count = 0;
- for (i = 0; i < old_size; i++)
- {
- struct mapping *mp = old_mappings + i;
- void *mp_key = mp->key;
-
- if (!EMPTY_ENTRY_P (mp_key)
- && !DELETED_ENTRY_P (mp_key))
- hash_table_put (ht, mp_key, mp->value);
- }
- assert (ht->count == old_count);
- free (old_mappings);
+ ht->size = newsize;
+ ht->resize_threshold = newsize * 3 / 4;
+
+ mappings = xmalloc (ht->size * sizeof (struct mapping));
+ memset (mappings, '\0', ht->size * sizeof (struct mapping));
+ ht->mappings = mappings;
+
+ for (mp = old_mappings; mp < old_end; mp++)
+ if (!EMPTY_MAPPING_P (mp))
+ {
+ struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
+ /* We don't need to call test function and worry about
+ collisions because all the keys come from the hash table
+ and are therefore guaranteed to be unique. */
+ LOOP_NON_EMPTY (new_mp, mappings, newsize)
+ ;
+ *new_mp = *mp;
+ }
+
+ xfree (old_mappings);
}
/* Put VALUE in the hash table HT under the key KEY. This regrows the
void
hash_table_put (struct hash_table *ht, const void *key, void *value)
{
- /* Cannot use find_mapping here because we treat deleted entries
- specially. */
-
struct mapping *mappings = ht->mappings;
int size = ht->size;
- int location = ht->hash_function (key) % size;
- while (1)
- {
- struct mapping *mp = mappings + location;
- void *mp_key = mp->key;
+ int (*equals) (const void *, const void *) = ht->test_function;
- if (EMPTY_ENTRY_P (mp_key))
- {
- ++ht->fullness;
- ++ht->count;
- just_insert:
- mp->key = (void *)key; /* const? */
- mp->value = value;
- break;
- }
- else if (DELETED_ENTRY_P (mp_key))
- {
- /* We're replacing a deleteed entry, so ht->count gets
- increased, but ht->fullness remains unchanged. */
- ++ht->count;
- goto just_insert;
- }
- else if (ht->test_function (key, mp_key))
- {
- /* We're replacing an existing entry, so ht->count and
- ht->fullness remain unchanged. */
- goto just_insert;
- }
- else
- {
- if (++location == size)
- location = 0;
- }
- }
- if (ht->fullness * 4 > ht->size * 3)
- /* When fullness exceeds 75% of size, regrow the table. */
+ struct mapping *mp = mappings + HASH_POSITION (ht, key);
+
+ LOOP_NON_EMPTY (mp, mappings, size)
+ if (equals (key, mp->key))
+ {
+ mp->key = (void *)key; /* const? */
+ mp->value = value;
+ return;
+ }
+
+ ++ht->count;
+ mp->key = (void *)key; /* const? */
+ mp->value = value;
+
+ if (ht->count > ht->resize_threshold)
+ /* When the table becomes more than 75% full, regrow it. */
grow_hash_table (ht);
}
-/* Remove KEY from HT. */
+/* Remove a mapping that matches KEY from HT. Return 0 if there was
+ no such entry; return 1 if an entry was removed. */
int
hash_table_remove (struct hash_table *ht, const void *key)
{
- int location = find_mapping (ht, key);
- if (location < 0)
+ struct mapping *mp = find_mapping (ht, key);
+ if (!mp)
return 0;
else
{
+ int size = ht->size;
struct mapping *mappings = ht->mappings;
- struct mapping *mp = mappings + location;
- /* We don't really remove an entry from the hash table: we just
- mark it as deleted. This is because there may be other
- entries located after this entry whose hash points to a
- location before this entry. (Example: keys A, B and C have
- the same hash. If you were to really *delete* B from the
- table, C could no longer be found.) */
-
- /* Optimization addendum: if the mapping that follows LOCATION
- is already empty, that is a sure sign that nobody depends on
- LOCATION being non-empty. (This is because we're using
- linear probing. This would not be the case with double
- hashing.) In that case, we may safely delete the mapping. */
-
- /* This could be generalized so that the all the non-empty
- locations following LOCATION are simply shifted leftward. It
- would make deletion a bit slower, but it would remove the
- ugly DELETED_ENTRY_P checks from all the rest of the code,
- making the whole thing faster. */
- int location_after = (location + 1) == ht->size ? 0 : location + 1;
- struct mapping *mp_after = mappings + location_after;
-
- if (EMPTY_ENTRY_P (mp_after->key))
- {
- mp->key = ENTRY_EMPTY;
- --ht->fullness;
- }
- else
- mp->key = ENTRY_DELETED;
+ mp->key = NULL;
--ht->count;
+
+ /* Rehash all the entries following MP. The alternative
+ approach is to mark the entry as deleted, i.e. create a
+ "tombstone". That makes remove faster, but leaves a lot of
+ garbage and slows down hash_table_get and hash_table_put. */
+
+ mp = NEXT_MAPPING (mp, mappings, size);
+ LOOP_NON_EMPTY (mp, mappings, size)
+ {
+ const void *key2 = mp->key;
+ struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
+
+ /* Find the new location for the key. */
+
+ LOOP_NON_EMPTY (mp_new, mappings, size)
+ if (key2 == mp_new->key)
+ /* The mapping MP (key2) is already where we want it (in
+ MP_NEW's "chain" of keys.) */
+ goto next_rehash;
+
+ *mp_new = *mp;
+ mp->key = NULL;
+
+ next_rehash:
+ ;
+ }
return 1;
}
}
hash_table_clear (struct hash_table *ht)
{
memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
- ht->fullness = 0;
- ht->count = 0;
+ ht->count = 0;
}
/* Map MAPFUN over all the mappings in hash table HT. MAPFUN is
called with three arguments: the key, the value, and the CLOSURE.
- Don't add or remove entries from HT while hash_table_map is being
- called, or strange things may happen. */
+
+ It is undefined what happens if you add or remove entries in the
+ hash table while hash_table_map is running. The exception is the
+ entry you're currently mapping over; you may remove or change that
+ entry. */
void
hash_table_map (struct hash_table *ht,
int (*mapfun) (void *, void *, void *),
void *closure)
{
- struct mapping *mappings = ht->mappings;
- int i;
- for (i = 0; i < ht->size; i++)
- {
- struct mapping *mp = mappings + i;
- void *mp_key = mp->key;
-
- if (!EMPTY_ENTRY_P (mp_key)
- && !DELETED_ENTRY_P (mp_key))
- if (mapfun (mp_key, mp->value, closure))
+ struct mapping *mp = ht->mappings;
+ struct mapping *end = ht->mappings + ht->size;
+
+ for (; mp < end; mp++)
+ if (!EMPTY_MAPPING_P (mp))
+ {
+ void *key;
+ repeat:
+ key = mp->key;
+ if (mapfun (key, mp->value, closure))
return;
- }
+ /* hash_table_remove might have moved the adjacent
+ mappings. */
+ if (mp->key != key && !EMPTY_MAPPING_P (mp))
+ goto repeat;
+ }
+}
+
+/* Return the number of elements in the hash table. This is not the
+ same as the physical size of the hash table, which is always
+ greater than the number of elements. */
+
+int
+hash_table_count (struct hash_table *ht)
+{
+ return ht->count;
}
\f
-/* Support for hash tables whose keys are strings. */
+/* Functions from this point onward are meant for convenience and
+ don't strictly belong to this file. However, this is as good a
+ place for them as any. */
+
+/* ========
+ Support for hash tables whose keys are strings.
+ ======== */
+
+/* 31 bit hash function. Taken from Gnome's glib, modified to use
+ standard C types.
+
+ We used to use the popular hash function from the Dragon Book, but
+ this one seems to perform much better. */
-/* supposedly from the Dragon Book P436. */
unsigned long
-string_hash (const void *sv)
+string_hash (const void *key)
{
- unsigned int h = 0;
- unsigned const char *x = (unsigned const char *) sv;
+ const char *p = key;
+ unsigned int h = *p;
+
+ if (h)
+ for (p += 1; *p != '\0'; p++)
+ h = (h << 5) - h + *p;
+
+ return h;
+}
- while (*x)
- {
- unsigned int g;
- h = (h << 4) + *x++;
- if ((g = h & 0xf0000000) != 0)
- h = (h ^ (g >> 24)) ^ g;
- }
+/* Frontend for strcmp usable for hash tables. */
+
+int
+string_cmp (const void *s1, const void *s2)
+{
+ return !strcmp ((const char *)s1, (const char *)s2);
+}
+/* Return a hash table of initial size INITIAL_SIZE suitable to use
+ strings as keys. */
+
+struct hash_table *
+make_string_hash_table (int initial_size)
+{
+ return hash_table_new (initial_size, string_hash, string_cmp);
+}
+
+/* ========
+ Support for hash tables whose keys are strings, but which are
+ compared case-insensitively.
+ ======== */
+
+/* Like string_hash, but produce the same hash regardless of the case. */
+
+static unsigned long
+string_hash_nocase (const void *key)
+{
+ const char *p = key;
+ unsigned int h = TOLOWER (*p);
+
+ if (h)
+ for (p += 1; *p != '\0'; p++)
+ h = (h << 5) - h + TOLOWER (*p);
+
return h;
}
+/* Like string_cmp, but doing case-insensitive comparison. */
+
+static int
+string_cmp_nocase (const void *s1, const void *s2)
+{
+ return !strcasecmp ((const char *)s1, (const char *)s2);
+}
+
+/* Like make_string_hash_table, but uses string_hash_nocase and
+ string_cmp_nocase. */
+
+struct hash_table *
+make_nocase_string_hash_table (int initial_size)
+{
+ return hash_table_new (initial_size, string_hash_nocase, string_cmp_nocase);
+}
+
#if 0
/* If I ever need it: hashing of integers. */
return key;
}
#endif
-
-int
-string_cmp (const void *s1, const void *s2)
-{
- return !strcmp ((const char *)s1, (const char *)s2);
-}
-
-/* Return a hash table of initial size INITIAL_SIZE suitable to use
- strings as keys. */
-
-struct hash_table *
-make_string_hash_table (int initial_size)
-{
- return hash_table_new (initial_size, string_hash, string_cmp);
-}
-
\f
#ifdef STANDALONE
if (len <= 1)
continue;
line[--len] = '\0';
- if (!hash_table_exists (ht, line))
+ if (!hash_table_contains (ht, line))
hash_table_put (ht, strdup (line), "here I am!");
#if 1
- if (len % 3)
+ if (len % 5 == 0)
{
char *line_copy;
if (hash_table_get_pair (ht, line, &line_copy, NULL))
{
hash_table_remove (ht, line);
- free (line_copy);
+ xfree (line_copy);
}
}
#endif
print_hash (ht);
#endif
#if 1
- printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
+ printf ("%d %d\n", ht->count, ht->size);
#endif
return 0;
}