2 Copyright (C) 2000-2005 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
30 /* With -DSTANDALONE, this file can be compiled outside Wget source
31 tree. To test, also use -DTEST. */
44 /* Get Wget's utility headers. */
48 /* Make do without them. */
49 # define xnew(x) xmalloc (sizeof (x))
50 # define xnew_array(type, x) xmalloc (sizeof (type) * (x))
51 # define xmalloc malloc
54 # define countof(x) (sizeof (x) / sizeof ((x)[0]))
56 # define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
57 # if __STDC_VERSION__ >= 199901L
58 # include <stdint.h> /* for uintptr_t */
60 typedef unsigned long uintptr_t;
68 Hash tables are a technique used to implement mapping between
69 objects with near-constant-time access and storage. The table
70 associates keys to values, and a value can be very quickly
71 retrieved by providing the key. Fast lookup tables are typically
72 implemented as hash tables.
75 hash_table_new -- creates the table.
76 hash_table_destroy -- destroys the table.
77 hash_table_put -- establishes or updates key->value mapping.
78 hash_table_get -- retrieves value of key.
79 hash_table_get_pair -- get key/value pair for key.
80 hash_table_contains -- test whether the table contains key.
81 hash_table_remove -- remove key->value mapping for given key.
82 hash_table_for_each -- call function for each table entry.
83 hash_table_iterate -- iterate over entries in hash table.
84 hash_table_iter_next -- return next element during iteration.
85 hash_table_clear -- clear hash table contents.
86 hash_table_count -- return the number of entries in the table.
88 The hash table grows internally as new entries are added and is not
89 limited in size, except by available memory. The table doubles
90 with each resize, which ensures that the amortized time per
91 operation remains constant.
93 If not instructed otherwise, tables created by hash_table_new
94 consider the keys to be equal if their pointer values are the same.
95 You can use make_string_hash_table to create tables whose keys are
96 considered equal if their string contents are the same. In the
97 general case, the criterion of equality used to compare keys is
98 specified at table creation time with two callback functions,
99 "hash" and "test". The hash function transforms the key into an
100 arbitrary number that must be the same for two equal keys. The
101 test function accepts two keys and returns non-zero if they are to
104 Note that neither keys nor values are copied when inserted into the
105 hash table, so they must exist for the lifetime of the table. This
106 means that e.g. the use of static strings is OK, but objects with a
107 shorter life-time probably need to be copied (with strdup() or the
108 like in the case of strings) before being inserted. */
112 The hash table is implemented as an open-addressed table with
113 linear probing collision resolution.
115 The above means that all the cells (each cell containing a key and
116 a value pointer) are stored in a contiguous array. Array position
117 of each cell is determined by the hash value of its key and the
118 size of the table: location := hash(key) % size. If two different
119 keys end up on the same position (collide), the one that came
120 second is stored in the first unoccupied cell that follows it.
121 This collision resolution technique is called "linear probing".
123 There are more advanced collision resolution methods (quadratic
124 probing, double hashing), but we don't use them because they incur
125 more non-sequential access to the array, which results in worse CPU
126 cache behavior. Linear probing works well as long as the
127 count/size ratio (fullness) is kept below 75%. We make sure to
128 grow and rehash the table whenever this threshold is exceeded.
130 Collisions complicate deletion because simply clearing a cell
131 followed by previously collided entries would cause those neighbors
132 to not be picked up by find_cell later. One solution is to leave a
133 "tombstone" marker instead of clearing the cell, and another is to
134 recalculate the positions of adjacent cells. We take the latter
135 approach because it results in less bookkeeping garbage and faster
136 retrieval at the (slight) expense of deletion. */
138 /* Maximum allowed fullness: when hash table's fullness exceeds this
139 value, the table is resized. */
140 #define HASH_MAX_FULLNESS 0.75
142 /* The hash table size is multiplied by this factor (and then rounded
143 to the next prime) with each resize. This guarantees infrequent
145 #define HASH_RESIZE_FACTOR 2
152 typedef unsigned long (*hashfun_t) (const void *);
153 typedef int (*testfun_t) (const void *, const void *);
156 hashfun_t hash_function;
157 testfun_t test_function;
159 struct cell *cells; /* contiguous array of cells. */
160 int size; /* size of the array. */
162 int count; /* number of occupied entries. */
163 int resize_threshold; /* after size exceeds this number of
164 entries, resize the table. */
165 int prime_offset; /* the offset of the current prime in
169 /* We use the all-bits-set constant (INVALID_PTR) marker to mean that
170 a cell is empty. It is unaligned and therefore illegal as a
171 pointer. INVALID_PTR_CHAR (0xff) is the single-character constant
172 used to initialize the entire cells array as empty.
174 The all-bits-set value is a better choice than NULL because it
175 allows the use of NULL/0 keys. Since the keys are either integers
176 or pointers, the only key that cannot be used is the integer value
177 -1. This is acceptable because it still allows the use of
178 nonnegative integer keys. */
180 #define INVALID_PTR ((void *) ~(uintptr_t) 0)
182 # define UCHAR_MAX 0xff
184 #define INVALID_PTR_CHAR UCHAR_MAX
186 /* Whether the cell C is occupied (non-empty). */
187 #define CELL_OCCUPIED(c) ((c)->key != INVALID_PTR)
189 /* Clear the cell C, i.e. mark it as empty (unoccupied). */
190 #define CLEAR_CELL(c) ((c)->key = INVALID_PTR)
192 /* "Next" cell is the cell following C, but wrapping back to CELLS
193 when C would reach CELLS+SIZE. */
194 #define NEXT_CELL(c, cells, size) (c != cells + (size - 1) ? c + 1 : cells)
196 /* Loop over occupied cells starting at C, terminating the loop when
197 an empty cell is encountered. */
198 #define FOREACH_OCCUPIED_ADJACENT(c, cells, size) \
199 for (; CELL_OCCUPIED (c); c = NEXT_CELL (c, cells, size))
201 /* Return the position of KEY in hash table SIZE large, hash function
203 #define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)
205 /* Find a prime near, but greather than or equal to SIZE. The primes
206 are looked up from a table with a selection of primes convenient
209 PRIME_OFFSET is a minor optimization: it specifies start position
210 for the search for the large enough prime. The final offset is
211 stored in the same variable. That way the list of primes does not
212 have to be scanned from the beginning each time around. */
215 prime_size (int size, int *prime_offset)
217 static const int primes[] = {
218 13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
219 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
220 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
221 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
222 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
223 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
224 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
225 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
226 1174703521, 1527114613, 1837299131, 2147483647
230 for (i = *prime_offset; i < countof (primes); i++)
231 if (primes[i] >= size)
233 /* Set the offset to the next prime. That is safe because,
234 next time we are called, it will be with a larger SIZE,
235 which means we could never return the same prime anyway.
236 (If that is not the case, the caller can simply reset
238 *prime_offset = i + 1;
245 static int cmp_pointer (const void *, const void *);
247 /* Create a hash table with hash function HASH_FUNCTION and test
248 function TEST_FUNCTION. The table is empty (its count is 0), but
249 pre-allocated to store at least ITEMS items.
251 ITEMS is the number of items that the table can accept without
252 needing to resize. It is useful when creating a table that is to
253 be immediately filled with a known number of items. In that case,
254 the regrows are a waste of time, and specifying ITEMS correctly
255 will avoid them altogether.
257 Note that hash tables grow dynamically regardless of ITEMS. The
258 only use of ITEMS is to preallocate the table and avoid unnecessary
259 dynamic regrows. Don't bother making ITEMS prime because it's not
260 used as size unchanged. To start with a small table that grows as
261 needed, simply specify zero ITEMS.
263 If hash and test callbacks are not specified, identity mapping is
264 assumed, i.e. pointer values are used for key comparison. (Common
265 Lisp calls such tables EQ hash tables, and Java calls them
266 IdentityHashMaps.) If your keys require different comparison,
267 specify hash and test functions. For easy use of C strings as hash
268 keys, you can use the convenience functions make_string_hash_table
269 and make_nocase_string_hash_table. */
272 hash_table_new (int items,
273 unsigned long (*hash_function) (const void *),
274 int (*test_function) (const void *, const void *))
277 struct hash_table *ht = xnew (struct hash_table);
279 ht->hash_function = hash_function ? hash_function : hash_pointer;
280 ht->test_function = test_function ? test_function : cmp_pointer;
282 /* If the size of struct hash_table ever becomes a concern, this
283 field can go. (Wget doesn't create many hashes.) */
284 ht->prime_offset = 0;
286 /* Calculate the size that ensures that the table will store at
287 least ITEMS keys without the need to resize. */
288 size = 1 + items / HASH_MAX_FULLNESS;
289 size = prime_size (size, &ht->prime_offset);
291 ht->resize_threshold = size * HASH_MAX_FULLNESS;
292 /*assert (ht->resize_threshold >= items);*/
294 ht->cells = xnew_array (struct cell, ht->size);
296 /* Mark cells as empty. We use 0xff rather than 0 to mark empty
297 keys because it allows us to use NULL/0 as keys. */
298 memset (ht->cells, INVALID_PTR_CHAR, size * sizeof (struct cell));
305 /* Free the data associated with hash table HT. */
308 hash_table_destroy (struct hash_table *ht)
314 /* The heart of most functions in this file -- find the cell whose
315 KEY is equal to key, using linear probing. Returns the cell
316 that matches KEY, or the first empty cell if none matches. */
318 static inline struct cell *
319 find_cell (const struct hash_table *ht, const void *key)
321 struct cell *cells = ht->cells;
323 struct cell *c = cells + HASH_POSITION (key, ht->hash_function, size);
324 testfun_t equals = ht->test_function;
326 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
327 if (equals (key, c->key))
332 /* Get the value that corresponds to the key KEY in the hash table HT.
333 If no value is found, return NULL. Note that NULL is a legal value
334 for value; if you are storing NULLs in your hash table, you can use
335 hash_table_contains to be sure that a (possibly NULL) value exists
336 in the table. Or, you can use hash_table_get_pair instead of this
340 hash_table_get (const struct hash_table *ht, const void *key)
342 struct cell *c = find_cell (ht, key);
343 if (CELL_OCCUPIED (c))
349 /* Like hash_table_get, but writes out the pointers to both key and
350 value. Returns non-zero on success. */
353 hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
354 void *orig_key, void *value)
356 struct cell *c = find_cell (ht, lookup_key);
357 if (CELL_OCCUPIED (c))
360 *(void **)orig_key = c->key;
362 *(void **)value = c->value;
369 /* Return 1 if HT contains KEY, 0 otherwise. */
372 hash_table_contains (const struct hash_table *ht, const void *key)
374 struct cell *c = find_cell (ht, key);
375 return CELL_OCCUPIED (c);
378 /* Grow hash table HT as necessary, and rehash all the key-value
382 grow_hash_table (struct hash_table *ht)
384 hashfun_t hasher = ht->hash_function;
385 struct cell *old_cells = ht->cells;
386 struct cell *old_end = ht->cells + ht->size;
387 struct cell *c, *cells;
390 newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
392 printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
394 100.0 * ht->count / ht->size,
395 100.0 * ht->count / newsize);
399 ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
401 cells = xnew_array (struct cell, newsize);
402 memset (cells, INVALID_PTR_CHAR, newsize * sizeof (struct cell));
405 for (c = old_cells; c < old_end; c++)
406 if (CELL_OCCUPIED (c))
409 /* We don't need to test for uniqueness of keys because they
410 come from the hash table and are therefore known to be
412 new_c = cells + HASH_POSITION (c->key, hasher, newsize);
413 FOREACH_OCCUPIED_ADJACENT (new_c, cells, newsize)
421 /* Put VALUE in the hash table HT under the key KEY. This regrows the
422 table if necessary. */
425 hash_table_put (struct hash_table *ht, const void *key, void *value)
427 struct cell *c = find_cell (ht, key);
428 if (CELL_OCCUPIED (c))
430 /* update existing item */
431 c->key = (void *)key; /* const? */
436 /* If adding the item would make the table exceed max. fullness,
437 grow the table first. */
438 if (ht->count >= ht->resize_threshold)
440 grow_hash_table (ht);
441 c = find_cell (ht, key);
446 c->key = (void *)key; /* const? */
450 /* Remove KEY->value mapping from HT. Return 0 if there was no such
451 entry; return 1 if an entry was removed. */
454 hash_table_remove (struct hash_table *ht, const void *key)
456 struct cell *c = find_cell (ht, key);
457 if (!CELL_OCCUPIED (c))
462 struct cell *cells = ht->cells;
463 hashfun_t hasher = ht->hash_function;
468 /* Rehash all the entries following C. The alternative
469 approach is to mark the entry as deleted, i.e. create a
470 "tombstone". That speeds up removal, but leaves a lot of
471 garbage and slows down hash_table_get and hash_table_put. */
473 c = NEXT_CELL (c, cells, size);
474 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
476 const void *key2 = c->key;
479 /* Find the new location for the key. */
480 c_new = cells + HASH_POSITION (key2, hasher, size);
481 FOREACH_OCCUPIED_ADJACENT (c_new, cells, size)
482 if (key2 == c_new->key)
483 /* The cell C (key2) is already where we want it (in
484 C_NEW's "chain" of keys.) */
497 /* Clear HT of all entries. After calling this function, the count
498 and the fullness of the hash table will be zero. The size will
502 hash_table_clear (struct hash_table *ht)
504 memset (ht->cells, INVALID_PTR_CHAR, ht->size * sizeof (struct cell));
508 /* Call FN for each entry in HT. FN is called with three arguments:
509 the key, the value, and ARG. When FN returns a non-zero value, the
512 It is undefined what happens if you add or remove entries in the
513 hash table while hash_table_for_each is running. The exception is
514 the entry you're currently mapping over; you may call
515 hash_table_put or hash_table_remove on that entry's key. That is
516 also the reason why this function cannot be implemented in terms of
517 hash_table_iterate. */
520 hash_table_for_each (struct hash_table *ht,
521 int (*fn) (void *, void *, void *), void *arg)
523 struct cell *c = ht->cells;
524 struct cell *end = ht->cells + ht->size;
527 if (CELL_OCCUPIED (c))
532 if (fn (key, c->value, arg))
534 /* hash_table_remove might have moved the adjacent cells. */
535 if (c->key != key && CELL_OCCUPIED (c))
540 /* Initiate iteration over HT. Entries are obtained with
541 hash_table_iter_next, a typical iteration loop looking like this:
543 hash_table_iterator iter;
544 for (hash_table_iterate (ht, &iter); hash_table_iter_next (&iter); )
545 ... do something with iter.key and iter.value ...
547 The iterator does not need to be deallocated after use. The hash
548 table must not be modified while being iterated over. */
551 hash_table_iterate (struct hash_table *ht, hash_table_iterator *iter)
553 iter->pos = ht->cells;
554 iter->end = ht->cells + ht->size;
557 /* Get the next hash table entry. ITER is an iterator object
558 initialized using hash_table_iterate. While there are more
559 entries, the key and value pointers are stored to ITER->key and
560 ITER->value respectively and 1 is returned. When there are no more
561 entries, 0 is returned.
563 If the hash table is modified between calls to this function, the
564 result is undefined. */
567 hash_table_iter_next (hash_table_iterator *iter)
569 struct cell *c = iter->pos;
570 struct cell *end = iter->end;
572 if (CELL_OCCUPIED (c))
575 iter->value = c->value;
582 /* Return the number of elements in the hash table. This is not the
583 same as the physical size of the hash table, which is always
584 greater than the number of elements. */
587 hash_table_count (const struct hash_table *ht)
592 /* Functions from this point onward are meant for convenience and
593 don't strictly belong to this file. However, this is as good a
594 place for them as any. */
596 /* Guidelines for creating custom hash and test functions:
598 - The test function returns non-zero for keys that are considered
599 "equal", zero otherwise.
601 - The hash function returns a number that represents the
602 "distinctness" of the object. In more precise terms, it means
603 that for any two objects that test "equal" under the test
604 function, the hash function MUST produce the same result.
606 This does not mean that all different objects must produce
607 different values (that would be "perfect" hashing), only that
608 non-distinct objects must produce the same values! For instance,
609 a hash function that returns 0 for any given object is a
610 perfectly valid (albeit extremely bad) hash function. A hash
611 function that hashes a string by adding up all its characters is
612 another example of a valid (but still quite bad) hash function.
614 It is not hard to make hash and test functions agree about
615 equality. For example, if the test function compares strings
616 case-insensitively, the hash function can lower-case the
617 characters when calculating the hash value. That ensures that
618 two strings differing only in case will hash the same.
620 - To prevent performance degradation, choose a hash function with
621 as good "spreading" as possible. A good hash function will use
622 all the bits of the input when calculating the hash, and will
623 react to even small changes in input with a completely different
624 output. But don't make the hash function itself overly slow,
625 because you'll be incurring a non-negligible overhead to all hash
629 * Support for hash tables whose keys are strings.
633 /* Base 31 hash function. Taken from Gnome's glib, modified to use
636 We used to use the popular hash function from the Dragon Book, but
637 this one seems to perform much better, both by being faster and by
638 generating less collisions. */
641 hash_string (const void *key)
647 for (p += 1; *p != '\0'; p++)
648 h = (h << 5) - h + *p;
653 /* Frontend for strcmp usable for hash tables. */
656 cmp_string (const void *s1, const void *s2)
658 return !strcmp ((const char *)s1, (const char *)s2);
661 /* Return a hash table of preallocated to store at least ITEMS items
662 suitable to use strings as keys. */
665 make_string_hash_table (int items)
667 return hash_table_new (items, hash_string, cmp_string);
671 * Support for hash tables whose keys are strings, but which are
672 * compared case-insensitively.
676 /* Like hash_string, but produce the same hash regardless of the case. */
679 hash_string_nocase (const void *key)
682 unsigned int h = TOLOWER (*p);
685 for (p += 1; *p != '\0'; p++)
686 h = (h << 5) - h + TOLOWER (*p);
691 /* Like string_cmp, but doing case-insensitive compareison. */
694 string_cmp_nocase (const void *s1, const void *s2)
696 return !strcasecmp ((const char *)s1, (const char *)s2);
699 /* Like make_string_hash_table, but uses string_hash_nocase and
700 string_cmp_nocase. */
703 make_nocase_string_hash_table (int items)
705 return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
708 /* Hashing of numeric values, such as pointers and integers.
710 This implementation is the Robert Jenkins' 32 bit Mix Function,
711 with a simple adaptation for 64-bit values. According to Jenkins
712 it should offer excellent spreading of values. Unlike the popular
713 Knuth's multiplication hash, this function doesn't need to know the
714 hash table size to work. */
717 hash_pointer (const void *ptr)
719 uintptr_t key = (uintptr_t) ptr;
728 #if SIZEOF_VOID_P > 4
738 return (unsigned long) key;
742 cmp_pointer (const void *ptr1, const void *ptr2)
753 print_hash (struct hash_table *sht)
755 hash_table_iterator iter;
758 for (hash_table_iterate (sht, &iter); hash_table_iter_next (&iter);
760 printf ("%s: %s\n", iter.key, iter.value);
761 assert (count == sht->count);
767 struct hash_table *ht = make_string_hash_table (0);
769 while ((fgets (line, sizeof (line), stdin)))
771 int len = strlen (line);
775 if (!hash_table_contains (ht, line))
776 hash_table_put (ht, strdup (line), "here I am!");
781 if (hash_table_get_pair (ht, line, &line_copy, NULL))
783 hash_table_remove (ht, line);
793 printf ("%d %d\n", ht->count, ht->size);