2 Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
3 2008 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
31 /* With -DSTANDALONE, this file can be compiled outside Wget source
32 tree. To test, also use -DTEST. */
34 #define USE_GNULIB_ALLOC
47 /* Get Wget's utility headers. */
50 /* Make do without them. */
51 # define xnew(x) xmalloc (sizeof (x))
52 # define xnew_array(type, x) xmalloc (sizeof (type) * (x))
53 # define xmalloc malloc
56 # define countof(x) (sizeof (x) / sizeof ((x)[0]))
59 # define c_tolower(x) tolower ((unsigned char) (x))
60 # if __STDC_VERSION__ >= 199901L
61 # include <stdint.h> /* for uintptr_t */
63 typedef unsigned long uintptr_t;
71 Hash tables are a technique used to implement mapping between
72 objects with near-constant-time access and storage. The table
73 associates keys to values, and a value can be very quickly
74 retrieved by providing the key. Fast lookup tables are typically
75 implemented as hash tables.
78 hash_table_new -- creates the table.
79 hash_table_destroy -- destroys the table.
80 hash_table_put -- establishes or updates key->value mapping.
81 hash_table_get -- retrieves value of key.
82 hash_table_get_pair -- get key/value pair for key.
83 hash_table_contains -- test whether the table contains key.
84 hash_table_remove -- remove key->value mapping for given key.
85 hash_table_for_each -- call function for each table entry.
86 hash_table_iterate -- iterate over entries in hash table.
87 hash_table_iter_next -- return next element during iteration.
88 hash_table_clear -- clear hash table contents.
89 hash_table_count -- return the number of entries in the table.
91 The hash table grows internally as new entries are added and is not
92 limited in size, except by available memory. The table doubles
93 with each resize, which ensures that the amortized time per
94 operation remains constant.
96 If not instructed otherwise, tables created by hash_table_new
97 consider the keys to be equal if their pointer values are the same.
98 You can use make_string_hash_table to create tables whose keys are
99 considered equal if their string contents are the same. In the
100 general case, the criterion of equality used to compare keys is
101 specified at table creation time with two callback functions,
102 "hash" and "test". The hash function transforms the key into an
103 arbitrary number that must be the same for two equal keys. The
104 test function accepts two keys and returns non-zero if they are to be considered equal.
107 Note that neither keys nor values are copied when inserted into the
108 hash table, so they must exist for the lifetime of the table. This
109 means that e.g. the use of static strings is OK, but objects with a
110 shorter life-time probably need to be copied (with strdup() or the
111 like in the case of strings) before being inserted. */
115 The hash table is implemented as an open-addressed table with
116 linear probing collision resolution.
118 The above means that all the cells (each cell containing a key and
119 a value pointer) are stored in a contiguous array. Array position
120 of each cell is determined by the hash value of its key and the
121 size of the table: location := hash(key) % size. If two different
122 keys end up on the same position (collide), the one that came
123 second is stored in the first unoccupied cell that follows it.
124 This collision resolution technique is called "linear probing".
126 There are more advanced collision resolution methods (quadratic
127 probing, double hashing), but we don't use them because they incur
128 more non-sequential access to the array, which results in worse CPU
129 cache behavior. Linear probing works well as long as the
130 count/size ratio (fullness) is kept below 75%. We make sure to
131 grow and rehash the table whenever this threshold is exceeded.
133 Collisions complicate deletion because simply clearing a cell
134 followed by previously collided entries would cause those neighbors
135 to not be picked up by find_cell later. One solution is to leave a
136 "tombstone" marker instead of clearing the cell, and another is to
137 recalculate the positions of adjacent cells. We take the latter
138 approach because it results in less bookkeeping garbage and faster
139 retrieval at the (slight) expense of deletion. */
141 /* Maximum allowed fullness: when hash table's fullness exceeds this
142 value, the table is resized. */
143 #define HASH_MAX_FULLNESS 0.75
145 /* The hash table size is multiplied by this factor (and then rounded
146 to the next prime) with each resize. This guarantees infrequent resizes. */
148 #define HASH_RESIZE_FACTOR 2
155 typedef unsigned long (*hashfun_t) (const void *);
156 typedef int (*testfun_t) (const void *, const void *);
159 hashfun_t hash_function;
160 testfun_t test_function;
162 struct cell *cells; /* contiguous array of cells. */
163 int size; /* size of the array. */
165 int count; /* number of occupied entries. */
166 int resize_threshold; /* after size exceeds this number of
167 entries, resize the table. */
168 int prime_offset; /* the offset of the current prime in
172 /* We use the all-bits-set constant (INVALID_PTR) marker to mean that
173 a cell is empty. It is unaligned and therefore illegal as a
174 pointer. INVALID_PTR_CHAR (0xff) is the single-character constant
175 used to initialize the entire cells array as empty.
177 The all-bits-set value is a better choice than NULL because it
178 allows the use of NULL/0 keys. Since the keys are either integers
179 or pointers, the only key that cannot be used is the integer value
180 -1. This is acceptable because it still allows the use of
181 nonnegative integer keys. */
183 #define INVALID_PTR ((void *) ~(uintptr_t) 0)
185 # define UCHAR_MAX 0xff
187 #define INVALID_PTR_CHAR UCHAR_MAX
189 /* Whether the cell C is occupied (non-empty). */
190 #define CELL_OCCUPIED(c) ((c)->key != INVALID_PTR)
192 /* Clear the cell C, i.e. mark it as empty (unoccupied). */
193 #define CLEAR_CELL(c) ((c)->key = INVALID_PTR)
195 /* "Next" cell is the cell following C, but wrapping back to CELLS
196 when C would reach CELLS+SIZE. */
197 #define NEXT_CELL(c, cells, size) (c != cells + (size - 1) ? c + 1 : cells)
199 /* Loop over occupied cells starting at C, terminating the loop when
200 an empty cell is encountered. */
201 #define FOREACH_OCCUPIED_ADJACENT(c, cells, size) \
202 for (; CELL_OCCUPIED (c); c = NEXT_CELL (c, cells, size))
204 /* Return the position of KEY in hash table SIZE large, hash function being HASHFUN. */
206 #define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)
208 /* Find a prime near, but greater than or equal to SIZE. The primes
209 are looked up from a table with a selection of primes convenient for this purpose.
212 PRIME_OFFSET is a minor optimization: it specifies start position
213 for the search for the large enough prime. The final offset is
214 stored in the same variable. That way the list of primes does not
215 have to be scanned from the beginning each time around. */
218 prime_size (int size, int *prime_offset)
220 static const int primes[] = {
221 13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
222 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
223 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
224 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
225 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
226 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
227 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
228 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
229 1174703521, 1527114613, 1837299131, 2147483647
233 for (i = *prime_offset; i < countof (primes); i++)
234 if (primes[i] >= size)
236 /* Set the offset to the next prime. That is safe because,
237 next time we are called, it will be with a larger SIZE,
238 which means we could never return the same prime anyway.
239 (If that is not the case, the caller can simply reset *prime_offset.) */
241 *prime_offset = i + 1;
248 static int cmp_pointer (const void *, const void *);
250 /* Create a hash table with hash function HASH_FUNCTION and test
251 function TEST_FUNCTION. The table is empty (its count is 0), but
252 pre-allocated to store at least ITEMS items.
254 ITEMS is the number of items that the table can accept without
255 needing to resize. It is useful when creating a table that is to
256 be immediately filled with a known number of items. In that case,
257 the regrows are a waste of time, and specifying ITEMS correctly
258 will avoid them altogether.
260 Note that hash tables grow dynamically regardless of ITEMS. The
261 only use of ITEMS is to preallocate the table and avoid unnecessary
262 dynamic regrows. Don't bother making ITEMS prime because it's not
263 used as size unchanged. To start with a small table that grows as
264 needed, simply specify zero ITEMS.
266 If hash and test callbacks are not specified, identity mapping is
267 assumed, i.e. pointer values are used for key comparison. (Common
268 Lisp calls such tables EQ hash tables, and Java calls them
269 IdentityHashMaps.) If your keys require different comparison,
270 specify hash and test functions. For easy use of C strings as hash
271 keys, you can use the convenience functions make_string_hash_table
272 and make_nocase_string_hash_table. */
275 hash_table_new (int items,
276 unsigned long (*hash_function) (const void *),
277 int (*test_function) (const void *, const void *))
280 struct hash_table *ht = xnew (struct hash_table);
282 ht->hash_function = hash_function ? hash_function : hash_pointer;
283 ht->test_function = test_function ? test_function : cmp_pointer;
285 /* If the size of struct hash_table ever becomes a concern, this
286 field can go. (Wget doesn't create many hashes.) */
287 ht->prime_offset = 0;
289 /* Calculate the size that ensures that the table will store at
290 least ITEMS keys without the need to resize. */
291 size = 1 + items / HASH_MAX_FULLNESS;
292 size = prime_size (size, &ht->prime_offset);
294 ht->resize_threshold = size * HASH_MAX_FULLNESS;
295 /*assert (ht->resize_threshold >= items);*/
297 ht->cells = xnew_array (struct cell, ht->size);
299 /* Mark cells as empty. We use 0xff rather than 0 to mark empty
300 keys because it allows us to use NULL/0 as keys. */
301 memset (ht->cells, INVALID_PTR_CHAR, size * sizeof (struct cell));
308 /* Free the data associated with hash table HT. */
311 hash_table_destroy (struct hash_table *ht)
317 /* The heart of most functions in this file -- find the cell whose
318 KEY is equal to key, using linear probing. Returns the cell
319 that matches KEY, or the first empty cell if none matches. */
321 static inline struct cell *
322 find_cell (const struct hash_table *ht, const void *key)
324 struct cell *cells = ht->cells;
326 struct cell *c = cells + HASH_POSITION (key, ht->hash_function, size);
327 testfun_t equals = ht->test_function;
329 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
330 if (equals (key, c->key))
335 /* Get the value that corresponds to the key KEY in the hash table HT.
336 If no value is found, return NULL. Note that NULL is a legal value
337 for value; if you are storing NULLs in your hash table, you can use
338 hash_table_contains to be sure that a (possibly NULL) value exists
339 in the table. Or, you can use hash_table_get_pair instead of this function. */
343 hash_table_get (const struct hash_table *ht, const void *key)
345 struct cell *c = find_cell (ht, key);
346 if (CELL_OCCUPIED (c))
352 /* Like hash_table_get, but writes out the pointers to both key and
353 value. Returns non-zero on success. */
356 hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
357 void *orig_key, void *value)
359 struct cell *c = find_cell (ht, lookup_key);
360 if (CELL_OCCUPIED (c))
363 *(void **)orig_key = c->key;
365 *(void **)value = c->value;
372 /* Return 1 if HT contains KEY, 0 otherwise. */
375 hash_table_contains (const struct hash_table *ht, const void *key)
377 struct cell *c = find_cell (ht, key);
378 return CELL_OCCUPIED (c);
381 /* Grow hash table HT as necessary, and rehash all the key-value mappings. */
385 grow_hash_table (struct hash_table *ht)
387 hashfun_t hasher = ht->hash_function;
388 struct cell *old_cells = ht->cells;
389 struct cell *old_end = ht->cells + ht->size;
390 struct cell *c, *cells;
393 newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
395 printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
397 100.0 * ht->count / ht->size,
398 100.0 * ht->count / newsize);
402 ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
404 cells = xnew_array (struct cell, newsize);
405 memset (cells, INVALID_PTR_CHAR, newsize * sizeof (struct cell));
408 for (c = old_cells; c < old_end; c++)
409 if (CELL_OCCUPIED (c))
412 /* We don't need to test for uniqueness of keys because they
413 come from the hash table and are therefore known to be unique. */
415 new_c = cells + HASH_POSITION (c->key, hasher, newsize);
416 FOREACH_OCCUPIED_ADJACENT (new_c, cells, newsize)
424 /* Put VALUE in the hash table HT under the key KEY. This regrows the
425 table if necessary. */
428 hash_table_put (struct hash_table *ht, const void *key, void *value)
430 struct cell *c = find_cell (ht, key);
431 if (CELL_OCCUPIED (c))
433 /* update existing item */
434 c->key = (void *)key; /* const? */
439 /* If adding the item would make the table exceed max. fullness,
440 grow the table first. */
441 if (ht->count >= ht->resize_threshold)
443 grow_hash_table (ht);
444 c = find_cell (ht, key);
449 c->key = (void *)key; /* const? */
453 /* Remove KEY->value mapping from HT. Return 0 if there was no such
454 entry; return 1 if an entry was removed. */
457 hash_table_remove (struct hash_table *ht, const void *key)
459 struct cell *c = find_cell (ht, key);
460 if (!CELL_OCCUPIED (c))
465 struct cell *cells = ht->cells;
466 hashfun_t hasher = ht->hash_function;
471 /* Rehash all the entries following C. The alternative
472 approach is to mark the entry as deleted, i.e. create a
473 "tombstone". That speeds up removal, but leaves a lot of
474 garbage and slows down hash_table_get and hash_table_put. */
476 c = NEXT_CELL (c, cells, size);
477 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
479 const void *key2 = c->key;
482 /* Find the new location for the key. */
483 c_new = cells + HASH_POSITION (key2, hasher, size);
484 FOREACH_OCCUPIED_ADJACENT (c_new, cells, size)
485 if (key2 == c_new->key)
486 /* The cell C (key2) is already where we want it (in
487 C_NEW's "chain" of keys.) */
500 /* Clear HT of all entries. After calling this function, the count
501 and the fullness of the hash table will be zero. The size will remain unchanged. */
505 hash_table_clear (struct hash_table *ht)
507 memset (ht->cells, INVALID_PTR_CHAR, ht->size * sizeof (struct cell));
511 /* Call FN for each entry in HT. FN is called with three arguments:
512 the key, the value, and ARG. When FN returns a non-zero value, the mapping stops.
515 It is undefined what happens if you add or remove entries in the
516 hash table while hash_table_for_each is running. The exception is
517 the entry you're currently mapping over; you may call
518 hash_table_put or hash_table_remove on that entry's key. That is
519 also the reason why this function cannot be implemented in terms of
520 hash_table_iterate. */
523 hash_table_for_each (struct hash_table *ht,
524 int (*fn) (void *, void *, void *), void *arg)
526 struct cell *c = ht->cells;
527 struct cell *end = ht->cells + ht->size;
530 if (CELL_OCCUPIED (c))
535 if (fn (key, c->value, arg))
537 /* hash_table_remove might have moved the adjacent cells. */
538 if (c->key != key && CELL_OCCUPIED (c))
543 /* Initiate iteration over HT. Entries are obtained with
544 hash_table_iter_next, a typical iteration loop looking like this:
546 hash_table_iterator iter;
547 for (hash_table_iterate (ht, &iter); hash_table_iter_next (&iter); )
548 ... do something with iter.key and iter.value ...
550 The iterator does not need to be deallocated after use. The hash
551 table must not be modified while being iterated over. */
554 hash_table_iterate (struct hash_table *ht, hash_table_iterator *iter)
556 iter->pos = ht->cells;
557 iter->end = ht->cells + ht->size;
560 /* Get the next hash table entry. ITER is an iterator object
561 initialized using hash_table_iterate. While there are more
562 entries, the key and value pointers are stored to ITER->key and
563 ITER->value respectively and 1 is returned. When there are no more
564 entries, 0 is returned.
566 If the hash table is modified between calls to this function, the
567 result is undefined. */
570 hash_table_iter_next (hash_table_iterator *iter)
572 struct cell *c = iter->pos;
573 struct cell *end = iter->end;
575 if (CELL_OCCUPIED (c))
578 iter->value = c->value;
585 /* Return the number of elements in the hash table. This is not the
586 same as the physical size of the hash table, which is always
587 greater than the number of elements. */
590 hash_table_count (const struct hash_table *ht)
595 /* Functions from this point onward are meant for convenience and
596 don't strictly belong to this file. However, this is as good a
597 place for them as any. */
599 /* Guidelines for creating custom hash and test functions:
601 - The test function returns non-zero for keys that are considered
602 "equal", zero otherwise.
604 - The hash function returns a number that represents the
605 "distinctness" of the object. In more precise terms, it means
606 that for any two objects that test "equal" under the test
607 function, the hash function MUST produce the same result.
609 This does not mean that all different objects must produce
610 different values (that would be "perfect" hashing), only that
611 non-distinct objects must produce the same values! For instance,
612 a hash function that returns 0 for any given object is a
613 perfectly valid (albeit extremely bad) hash function. A hash
614 function that hashes a string by adding up all its characters is
615 another example of a valid (but still quite bad) hash function.
617 It is not hard to make hash and test functions agree about
618 equality. For example, if the test function compares strings
619 case-insensitively, the hash function can lower-case the
620 characters when calculating the hash value. That ensures that
621 two strings differing only in case will hash the same.
623 - To prevent performance degradation, choose a hash function with
624 as good "spreading" as possible. A good hash function will use
625 all the bits of the input when calculating the hash, and will
626 react to even small changes in input with a completely different
627 output. But don't make the hash function itself overly slow,
628 because you'll be incurring a non-negligible overhead to all hash table operations. */
632 * Support for hash tables whose keys are strings.
636 /* Base 31 hash function. Taken from Gnome's glib, modified to use standard C types.
639 We used to use the popular hash function from the Dragon Book, but
640 this one seems to perform much better, both by being faster and by
641 generating fewer collisions. */
644 hash_string (const void *key)
650 for (p += 1; *p != '\0'; p++)
651 h = (h << 5) - h + *p;
656 /* Frontend for strcmp usable for hash tables. */
659 cmp_string (const void *s1, const void *s2)
661 return !strcmp ((const char *)s1, (const char *)s2);
664 /* Return a hash table preallocated to store at least ITEMS items
665 suitable to use strings as keys. */
668 make_string_hash_table (int items)
670 return hash_table_new (items, hash_string, cmp_string);
674 * Support for hash tables whose keys are strings, but which are
675 * compared case-insensitively.
679 /* Like hash_string, but produce the same hash regardless of the case. */
682 hash_string_nocase (const void *key)
685 unsigned int h = c_tolower (*p);
688 for (p += 1; *p != '\0'; p++)
689 h = (h << 5) - h + c_tolower (*p);
694 /* Like cmp_string, but doing case-insensitive comparison. */
697 string_cmp_nocase (const void *s1, const void *s2)
699 return !strcasecmp ((const char *)s1, (const char *)s2);
702 /* Like make_string_hash_table, but uses hash_string_nocase and
703 string_cmp_nocase. */
706 make_nocase_string_hash_table (int items)
708 return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
711 /* Hashing of numeric values, such as pointers and integers.
713 This implementation is the Robert Jenkins' 32 bit Mix Function,
714 with a simple adaptation for 64-bit values. According to Jenkins
715 it should offer excellent spreading of values. Unlike the popular
716 Knuth's multiplication hash, this function doesn't need to know the
717 hash table size to work. */
720 hash_pointer (const void *ptr)
722 uintptr_t key = (uintptr_t) ptr;
731 #if SIZEOF_VOID_P > 4
741 return (unsigned long) key;
745 cmp_pointer (const void *ptr1, const void *ptr2)
756 print_hash (struct hash_table *sht)
758 hash_table_iterator iter;
761 for (hash_table_iterate (sht, &iter); hash_table_iter_next (&iter);
763 printf ("%s: %s\n", iter.key, iter.value);
764 assert (count == sht->count);
770 struct hash_table *ht = make_string_hash_table (0);
772 while ((fgets (line, sizeof (line), stdin)))
774 int len = strlen (line);
778 if (!hash_table_contains (ht, line))
779 hash_table_put (ht, strdup (line), "here I am!");
784 if (hash_table_get_pair (ht, line, &line_copy, NULL))
786 hash_table_remove (ht, line);
796 printf ("%d %d\n", ht->count, ht->size);