file, but you are not obligated to do so. If you do not wish to do
so, delete this exception statement from your version. */
+/* With -DSTANDALONE, this file can be compiled outside Wget source
+ tree. To test, also use -DTEST. */
+
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include <string.h>
+#include <limits.h>
-#include "wget.h"
-#include "utils.h"
-
-#include "hash.h"
-
-#ifdef STANDALONE
-# undef xmalloc
-# undef xrealloc
-# undef xfree
-
-# define xmalloc malloc
-# define xrealloc realloc
+#ifndef STANDALONE
+/* Get Wget's utility headers. */
+# include "wget.h"
+# include "utils.h"
+#else
+/* Make do without them. */
+# define xnew(x) xmalloc (sizeof (x))
+# define xnew_array(type, x) xmalloc (sizeof (type) * (x))
+# define xmalloc malloc /* or something that exits
+ if not enough memory */
# define xfree free
-
-# undef TOLOWER
+# define countof(x) (sizeof (x) / sizeof ((x)[0]))
# define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
#endif
+#include "hash.h"
+
/* INTERFACE:
Hash tables are a technique used to implement mapping between
The hash table is implemented as an open-addressed table with
linear probing collision resolution.
- For those not up to CS parlance, it means that all the hash entries
- (pairs of pointers key and value) are stored in a contiguous array.
- The position of each mapping is determined by the hash value of its
- key and the size of the table: location := hash(key) % size. If
- two different keys end up on the same position (collide), the one
- that came second is placed at the next empty position following the
- occupied place. This collision resolution technique is called
- "linear probing".
+ The above means that all the hash entries (pairs of pointers, key
+ and value) are stored in a contiguous array. The position of each
+ mapping is determined by the hash value of its key and the size of
+ the table: location := hash(key) % size. If two different keys end
+ up on the same position (collide), the one that came second is
+ placed at the next empty position following the occupied place.
+ This collision resolution technique is called "linear probing".
There are more advanced collision resolution methods (quadratic
probing, double hashing), but we don't use them because they incur
void *value;
};
+typedef unsigned long (*hashfun_t) (const void *);
+typedef int (*testfun_t) (const void *, const void *);
+
struct hash_table {
- unsigned long (*hash_function) PARAMS ((const void *));
- int (*test_function) PARAMS ((const void *, const void *));
+ hashfun_t hash_function;
+ testfun_t test_function;
+ struct mapping *mappings; /* pointer to the table entries. */
int size; /* size of the array. */
- int count; /* number of non-empty entries. */
+ int count; /* number of non-empty entries. */
int resize_threshold; /* after size exceeds this number of
entries, resize the table. */
int prime_offset; /* the offset of the current prime in
the prime table. */
-
- struct mapping *mappings; /* the array of mapping pairs. */
};
-/* We use all-bit-set marker to mean that a mapping is empty. It is
- (hopefully) illegal as a pointer, and it allows the users to use
- NULL (as well as any non-negative integer) as key. */
-#define NON_EMPTY(mp) (mp->key != (void *)~(unsigned long)0)
+/* We use the all-bits-set constant (INVALID_PTR) marker to mean that
+ a mapping is empty. It is unaligned and therefore illegal as a
+ pointer. INVALID_PTR_BYTE (0xff) is the one-byte value used to
+ initialize the mappings array as empty.
+
+ The all-bits-set value is a better choice than NULL because it
+ allows the use of NULL/0 keys. Since the keys are either integers
+ or pointers, the only key that cannot be used is the integer value
+ -1. This is acceptable because it still allows the use of
+ nonnegative integer keys. */
+
+#define INVALID_PTR ((void *) ~0UL)
+#ifndef UCHAR_MAX
+# define UCHAR_MAX 0xff
+#endif
+#define INVALID_PTR_BYTE UCHAR_MAX
+
+#define NON_EMPTY(mp) ((mp)->key != INVALID_PTR)
+#define MARK_AS_EMPTY(mp) ((mp)->key = INVALID_PTR)
/* "Next" mapping is the mapping after MP, but wrapping back to
MAPPINGS when MP would reach MAPPINGS+SIZE. */
#define LOOP_NON_EMPTY(mp, mappings, size) \
for (; NON_EMPTY (mp); mp = NEXT_MAPPING (mp, mappings, size))
-/* #### Some implementations multiply the hash with the "golden ratio"
- of the table to get better spread for keys that do not come from a
- good hashing source. I'm not sure if that is necessary for the
- hash functions we use. */
+/* Return the position of KEY in hash table SIZE large, hash function
+ being HASHFUN. */
+#define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)
-#define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
-
-/* Find a prime near, but greather than or equal to SIZE. Of course,
- the primes are not calculated, but looked up from a table. The
- table does not contain all primes in range, just a selection useful
+/* Find a prime near, but greather than or equal to SIZE. The primes
+ are looked up from a table with a selection of primes convenient
for this purpose.
- PRIME_OFFSET is a minor optimization: if specified, it starts the
- search for the prime number beginning with the specific offset in
- the prime number table. The final offset is stored in the same
- variable. */
+ PRIME_OFFSET is a minor optimization: it specifies start position
+ for the search for the large enough prime. The final offset is
+ stored in the same variable. That way the list of primes does not
+ have to be scanned from the beginning each time around. */
static int
prime_size (int size, int *prime_offset)
{
- static const unsigned long primes [] = {
+ static const int primes[] = {
13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
- 1174703521, 1527114613, 1985248999,
- (unsigned long)0x99d43ea5, (unsigned long)0xc7fa5177
+ 1174703521, 1527114613, 1837299131, 2147483647
};
- int i = *prime_offset;
+ int i;
- for (; i < countof (primes); i++)
+ for (i = *prime_offset; i < countof (primes); i++)
if (primes[i] >= size)
{
/* Set the offset to the next prime. That is safe because,
}
abort ();
- return 0;
}
-static unsigned long ptrhash PARAMS ((const void *));
-static int ptrcmp PARAMS ((const void *, const void *));
+static int cmp_pointer (const void *, const void *);
/* Create a hash table with hash function HASH_FUNCTION and test
function TEST_FUNCTION. The table is empty (its count is 0), but
used as size unchanged. To start with a small table that grows as
needed, simply specify zero ITEMS.
- If HASH_FUNCTION is not provided, identity table is assumed,
- i.e. key pointers are compared as keys. If you want strings with
- equal contents to hash the same, use make_string_hash_table. */
+ If hash and test callbacks are not specified, identity mapping is
+ assumed, i.e. pointer values are used for key comparison. (Common
+ Lisp calls such tables EQ hash tables, and Java calls them
+ IdentityHashMaps.) If your keys require different comparison,
+ specify hash and test functions. For easy use of C strings as hash
+ keys, you can use the convenience functions make_string_hash_table
+ and make_nocase_string_hash_table. */
struct hash_table *
hash_table_new (int items,
int size;
struct hash_table *ht = xnew (struct hash_table);
- ht->hash_function = hash_function ? hash_function : ptrhash;
- ht->test_function = test_function ? test_function : ptrcmp;
+ ht->hash_function = hash_function ? hash_function : hash_pointer;
+ ht->test_function = test_function ? test_function : cmp_pointer;
/* If the size of struct hash_table ever becomes a concern, this
field can go. (Wget doesn't create many hashes.) */
/*assert (ht->resize_threshold >= items);*/
ht->mappings = xnew_array (struct mapping, ht->size);
+
/* Mark mappings as empty. We use 0xff rather than 0 to mark empty
- keys because it allows us to store NULL keys to the table. */
- memset (ht->mappings, 255, size * sizeof (struct mapping));
+ keys because it allows us to use NULL/0 as keys. */
+ memset (ht->mappings, INVALID_PTR_BYTE, size * sizeof (struct mapping));
ht->count = 0;
{
struct mapping *mappings = ht->mappings;
int size = ht->size;
- struct mapping *mp = mappings + HASH_POSITION (ht, key);
- int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
+ struct mapping *mp = mappings + HASH_POSITION (key, ht->hash_function, size);
+ testfun_t equals = ht->test_function;
LOOP_NON_EMPTY (mp, mappings, size)
if (equals (key, mp->key))
static void
grow_hash_table (struct hash_table *ht)
{
+ hashfun_t hasher = ht->hash_function;
struct mapping *old_mappings = ht->mappings;
struct mapping *old_end = ht->mappings + ht->size;
struct mapping *mp, *mappings;
ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
mappings = xnew_array (struct mapping, newsize);
- memset (mappings, 255, newsize * sizeof (struct mapping));
+ memset (mappings, INVALID_PTR_BYTE, newsize * sizeof (struct mapping));
ht->mappings = mappings;
for (mp = old_mappings; mp < old_end; mp++)
if (NON_EMPTY (mp))
{
- struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
+ struct mapping *new_mp;
/* We don't need to test for uniqueness of keys because they
come from the hash table and are therefore known to be
unique. */
+ new_mp = mappings + HASH_POSITION (mp->key, hasher, newsize);
LOOP_NON_EMPTY (new_mp, mappings, newsize)
;
*new_mp = *mp;
{
int size = ht->size;
struct mapping *mappings = ht->mappings;
+ hashfun_t hasher = ht->hash_function;
- mp->key = NULL;
+ MARK_AS_EMPTY (mp);
--ht->count;
/* Rehash all the entries following MP. The alternative
approach is to mark the entry as deleted, i.e. create a
- "tombstone". That makes remove faster, but leaves a lot of
+ "tombstone". That speeds up removal, but leaves a lot of
garbage and slows down hash_table_get and hash_table_put. */
mp = NEXT_MAPPING (mp, mappings, size);
LOOP_NON_EMPTY (mp, mappings, size)
{
const void *key2 = mp->key;
- struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
+ struct mapping *mp_new;
/* Find the new location for the key. */
-
+ mp_new = mappings + HASH_POSITION (key2, hasher, size);
LOOP_NON_EMPTY (mp_new, mappings, size)
if (key2 == mp_new->key)
/* The mapping MP (key2) is already where we want it (in
goto next_rehash;
*mp_new = *mp;
- mp->key = NULL;
+ MARK_AS_EMPTY (mp);
next_rehash:
;
void
hash_table_clear (struct hash_table *ht)
{
- memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
+ memset (ht->mappings, INVALID_PTR_BYTE, ht->size * sizeof (struct mapping));
ht->count = 0;
}
don't strictly belong to this file. However, this is as good a
place for them as any. */
-/* Rules for creating custom hash and test functions:
+/* Guidelines for creating custom hash and test functions:
- The test function returns non-zero for keys that are considered
"equal", zero otherwise.
We used to use the popular hash function from the Dragon Book, but
this one seems to perform much better. */
-unsigned long
-string_hash (const void *key)
+static unsigned long
+hash_string (const void *key)
{
const char *p = key;
unsigned int h = *p;
/* Frontend for strcmp usable for hash tables. */
-int
-string_cmp (const void *s1, const void *s2)
+static int
+cmp_string (const void *s1, const void *s2)
{
return !strcmp ((const char *)s1, (const char *)s2);
}
struct hash_table *
make_string_hash_table (int items)
{
- return hash_table_new (items, string_hash, string_cmp);
+ return hash_table_new (items, hash_string, cmp_string);
}
/*
*
*/
-/* Like string_hash, but produce the same hash regardless of the case. */
+/* Like hash_string, but produce the same hash regardless of the case. */
static unsigned long
-string_hash_nocase (const void *key)
+hash_string_nocase (const void *key)
{
const char *p = key;
unsigned int h = TOLOWER (*p);
struct hash_table *
make_nocase_string_hash_table (int items)
{
- return hash_table_new (items, string_hash_nocase, string_cmp_nocase);
+ return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
}
-/* Hashing of pointers. Used for hash tables that are keyed by
- pointer identity. (Common Lisp calls them EQ hash tables, and Java
- calls them IdentityHashMaps.) */
+/* Hashing of numeric values, such as pointers and integers.
-static unsigned long
-ptrhash (const void *ptr)
+ This implementation is the Robert Jenkins' 32 bit Mix Function,
+ with a simple adaptation for 64-bit values. It offers excellent
+ spreading of values and doesn't need to know the hash table size to
+ work (unlike the very popular Knuth's multiplication hash). */
+
+unsigned long
+hash_pointer (const void *ptr)
{
unsigned long key = (unsigned long)ptr;
key += (key << 12);
}
static int
-ptrcmp (const void *ptr1, const void *ptr2)
+cmp_pointer (const void *ptr1, const void *ptr2)
{
return ptr1 == ptr2;
}
\f
-#ifdef STANDALONE
+#ifdef TEST
#include <stdio.h>
#include <string.h>
#endif
return 0;
}
-#endif
+#endif /* TEST */