2 Copyright (C) 2000-2005 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
30 /* With -DSTANDALONE, this file can be compiled outside Wget source
31 tree. To test, also use -DTEST. */
44 /* Get Wget's utility headers. */
48 /* Make do without them. */
49 # define xnew(x) xmalloc (sizeof (x))
50 # define xnew_array(type, x) xmalloc (sizeof (type) * (x))
51 # define xmalloc malloc
54 # define countof(x) (sizeof (x) / sizeof ((x)[0]))
/* ASCII-only lower-casing fallback: the upper-case letters 'A'..'Z'
   map to 'a'..'z'; every other value is returned unchanged.  Note
   that X is evaluated more than once, so it must not have side
   effects.  */
# define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 'A' + 'a' : (x))
63 Hash tables are a technique used to implement mapping between
64 objects with near-constant-time access and storage. The table
65 associates keys to values, and a value can be very quickly
66 retrieved by providing the key. Fast lookup tables are typically
67 implemented as hash tables.
70 hash_table_new -- creates the table.
71 hash_table_destroy -- destroys the table.
72 hash_table_put -- establishes or updates key->value mapping.
73 hash_table_get -- retrieves value of key.
74 hash_table_get_pair -- get key/value pair for key.
75 hash_table_contains -- test whether the table contains key.
76 hash_table_remove -- remove key->value mapping for given key.
77 hash_table_for_each -- call function for each table entry.
78 hash_table_iterate -- iterate over entries in hash table.
79 hash_table_iter_next -- return next element during iteration.
80 hash_table_clear -- clear hash table contents.
81 hash_table_count -- return the number of entries in the table.
83 The hash table grows internally as new entries are added and is not
84 limited in size, except by available memory. The table doubles
85 with each resize, which ensures that the amortized time per
86 operation remains constant.
88 If not instructed otherwise, tables created by hash_table_new
89 consider the keys to be equal if their pointer values are the same.
90 You can use make_string_hash_table to create tables whose keys are
91 considered equal if their string contents are the same. In the
92 general case, the criterion of equality used to compare keys is
93 specified at table creation time with two callback functions,
94 "hash" and "test". The hash function transforms the key into an
95 arbitrary number that must be the same for two equal keys. The
96 test function accepts two keys and returns non-zero if they are to
99 Note that neither keys nor values are copied when inserted into the
100 hash table, so they must exist for the lifetime of the table. This
101 means that e.g. the use of static strings is OK, but objects with a
102 shorter life-time probably need to be copied (with strdup() or the
103 like in the case of strings) before being inserted. */
107 The hash table is implemented as an open-addressed table with
108 linear probing collision resolution.
110 The above means that all the cells (each cell containing a key and
111 a value pointer) are stored in a contiguous array. Array position
112 of each cell is determined by the hash value of its key and the
113 size of the table: location := hash(key) % size. If two different
114 keys end up on the same position (collide), the one that came
115 second is stored in the first unoccupied cell that follows it.
116 This collision resolution technique is called "linear probing".
118 There are more advanced collision resolution methods (quadratic
119 probing, double hashing), but we don't use them because they incur
120 more non-sequential access to the array, which results in worse CPU
121 cache behavior. Linear probing works well as long as the
122 count/size ratio (fullness) is kept below 75%. We make sure to
123 grow and rehash the table whenever this threshold is exceeded.
125 Collisions complicate deletion because simply clearing a cell
126 followed by previously collided entries would cause those neighbors
127 to not be picked up by find_cell later. One solution is to leave a
128 "tombstone" marker instead of clearing the cell, and another is to
129 recalculate the positions of adjacent cells. We take the latter
130 approach because it results in less bookkeeping garbage and faster
131 retrieval at the (slight) expense of deletion. */
133 /* Maximum allowed fullness: when hash table's fullness exceeds this
134 value, the table is resized. */
135 #define HASH_MAX_FULLNESS 0.75
137 /* The hash table size is multiplied by this factor (and then rounded
138 to the next prime) with each resize. This guarantees infrequent
140 #define HASH_RESIZE_FACTOR 2
147 typedef unsigned long (*hashfun_t) (const void *);
148 typedef int (*testfun_t) (const void *, const void *);
151 hashfun_t hash_function;
152 testfun_t test_function;
154 struct cell *cells; /* contiguous array of cells. */
155 int size; /* size of the array. */
157 int count; /* number of occupied entries. */
158 int resize_threshold; /* once count reaches this number of
159 entries, resize the table. */
160 int prime_offset; /* the offset of the current prime in
164 /* We use the all-bits-set constant (INVALID_PTR) marker to mean that
165 a cell is empty. It is unaligned and therefore illegal as a
166 pointer. INVALID_PTR_CHAR (0xff) is the single-character constant
167 used to initialize the entire cells array as empty.
169 The all-bits-set value is a better choice than NULL because it
170 allows the use of NULL/0 keys. Since the keys are either integers
171 or pointers, the only key that cannot be used is the integer value
172 -1. This is acceptable because it still allows the use of
173 nonnegative integer keys. */
175 #define INVALID_PTR ((void *) ~(uintptr_t) 0)
177 # define UCHAR_MAX 0xff
179 #define INVALID_PTR_CHAR UCHAR_MAX
181 /* Whether the cell C is occupied (non-empty). */
182 #define CELL_OCCUPIED(c) ((c)->key != INVALID_PTR)
184 /* Clear the cell C, i.e. mark it as empty (unoccupied). */
185 #define CLEAR_CELL(c) ((c)->key = INVALID_PTR)
/* Return the cell following C, wrapping around to CELLS once C is the
   last cell of the SIZE-element array.  Every argument is
   parenthesized so that arbitrary expressions can be passed safely;
   note that C and CELLS are evaluated more than once.  */
#define NEXT_CELL(c, cells, size) \
  ((c) != (cells) + ((size) - 1) ? (c) + 1 : (cells))
191 /* Loop over occupied cells starting at C, terminating the loop when
192 an empty cell is encountered. */
193 #define FOREACH_OCCUPIED_ADJACENT(c, cells, size) \
194 for (; CELL_OCCUPIED (c); c = NEXT_CELL (c, cells, size))
/* Return the position of KEY in a hash table SIZE cells large, using
   hash function HASHFUN: the hash value of KEY modulo SIZE.  SIZE is
   parenthesized so that an expression may be passed safely.  */
#define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % (size))
200 /* Find a prime near, but greater than or equal to SIZE. The primes
201 are looked up from a table with a selection of primes convenient
204 PRIME_OFFSET is a minor optimization: it specifies start position
205 for the search for the large enough prime. The final offset is
206 stored in the same variable. That way the list of primes does not
207 have to be scanned from the beginning each time around. */
210 prime_size (int size, int *prime_offset)
212 static const int primes[] = {
213 13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
214 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
215 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
216 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
217 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
218 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
219 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
220 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
221 1174703521, 1527114613, 1837299131, 2147483647
225 for (i = *prime_offset; i < countof (primes); i++)
226 if (primes[i] >= size)
228 /* Set the offset to the next prime. That is safe because,
229 next time we are called, it will be with a larger SIZE,
230 which means we could never return the same prime anyway.
231 (If that is not the case, the caller can simply reset
233 *prime_offset = i + 1;
240 static int cmp_pointer (const void *, const void *);
242 /* Create a hash table with hash function HASH_FUNCTION and test
243 function TEST_FUNCTION. The table is empty (its count is 0), but
244 pre-allocated to store at least ITEMS items.
246 ITEMS is the number of items that the table can accept without
247 needing to resize. It is useful when creating a table that is to
248 be immediately filled with a known number of items. In that case,
249 the regrows are a waste of time, and specifying ITEMS correctly
250 will avoid them altogether.
252 Note that hash tables grow dynamically regardless of ITEMS. The
253 only use of ITEMS is to preallocate the table and avoid unnecessary
254 dynamic regrows. Don't bother making ITEMS prime because it is not
255 used unchanged as the size. To start with a small table that grows as
256 needed, simply specify zero ITEMS.
258 If hash and test callbacks are not specified, identity mapping is
259 assumed, i.e. pointer values are used for key comparison. (Common
260 Lisp calls such tables EQ hash tables, and Java calls them
261 IdentityHashMaps.) If your keys require different comparison,
262 specify hash and test functions. For easy use of C strings as hash
263 keys, you can use the convenience functions make_string_hash_table
264 and make_nocase_string_hash_table. */
267 hash_table_new (int items,
268 unsigned long (*hash_function) (const void *),
269 int (*test_function) (const void *, const void *))
272 struct hash_table *ht = xnew (struct hash_table);
274 ht->hash_function = hash_function ? hash_function : hash_pointer;
275 ht->test_function = test_function ? test_function : cmp_pointer;
277 /* If the size of struct hash_table ever becomes a concern, this
278 field can go. (Wget doesn't create many hashes.) */
279 ht->prime_offset = 0;
281 /* Calculate the size that ensures that the table will store at
282 least ITEMS keys without the need to resize. */
283 size = 1 + items / HASH_MAX_FULLNESS;
284 size = prime_size (size, &ht->prime_offset);
286 ht->resize_threshold = size * HASH_MAX_FULLNESS;
287 /*assert (ht->resize_threshold >= items);*/
289 ht->cells = xnew_array (struct cell, ht->size);
291 /* Mark cells as empty. We use 0xff rather than 0 to mark empty
292 keys because it allows us to use NULL/0 as keys. */
293 memset (ht->cells, INVALID_PTR_CHAR, size * sizeof (struct cell));
300 /* Free the data associated with hash table HT. */
303 hash_table_destroy (struct hash_table *ht)
309 /* The heart of most functions in this file -- find the cell whose
310 KEY is equal to key, using linear probing. Returns the cell
311 that matches KEY, or the first empty cell if none matches. */
313 static inline struct cell *
314 find_cell (const struct hash_table *ht, const void *key)
316 struct cell *cells = ht->cells;
318 struct cell *c = cells + HASH_POSITION (key, ht->hash_function, size);
319 testfun_t equals = ht->test_function;
321 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
322 if (equals (key, c->key))
327 /* Get the value that corresponds to the key KEY in the hash table HT.
328 If no value is found, return NULL. Note that NULL is a legal value
329 for value; if you are storing NULLs in your hash table, you can use
330 hash_table_contains to be sure that a (possibly NULL) value exists
331 in the table. Or, you can use hash_table_get_pair instead of this
335 hash_table_get (const struct hash_table *ht, const void *key)
337 struct cell *c = find_cell (ht, key);
338 if (CELL_OCCUPIED (c))
344 /* Like hash_table_get, but writes out the pointers to both key and
345 value. Returns non-zero on success. */
348 hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
349 void *orig_key, void *value)
351 struct cell *c = find_cell (ht, lookup_key);
352 if (CELL_OCCUPIED (c))
355 *(void **)orig_key = c->key;
357 *(void **)value = c->value;
364 /* Return 1 if HT contains KEY, 0 otherwise. */
367 hash_table_contains (const struct hash_table *ht, const void *key)
369 struct cell *c = find_cell (ht, key);
370 return CELL_OCCUPIED (c);
373 /* Grow hash table HT as necessary, and rehash all the key-value
377 grow_hash_table (struct hash_table *ht)
379 hashfun_t hasher = ht->hash_function;
380 struct cell *old_cells = ht->cells;
381 struct cell *old_end = ht->cells + ht->size;
382 struct cell *c, *cells;
385 newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
387 printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
389 100.0 * ht->count / ht->size,
390 100.0 * ht->count / newsize);
394 ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
396 cells = xnew_array (struct cell, newsize);
397 memset (cells, INVALID_PTR_CHAR, newsize * sizeof (struct cell));
400 for (c = old_cells; c < old_end; c++)
401 if (CELL_OCCUPIED (c))
404 /* We don't need to test for uniqueness of keys because they
405 come from the hash table and are therefore known to be
407 new_c = cells + HASH_POSITION (c->key, hasher, newsize);
408 FOREACH_OCCUPIED_ADJACENT (new_c, cells, newsize)
416 /* Put VALUE in the hash table HT under the key KEY. This regrows the
417 table if necessary. */
420 hash_table_put (struct hash_table *ht, const void *key, void *value)
422 struct cell *c = find_cell (ht, key);
423 if (CELL_OCCUPIED (c))
425 /* update existing item */
426 c->key = (void *)key; /* const? */
431 /* If adding the item would make the table exceed max. fullness,
432 grow the table first. */
433 if (ht->count >= ht->resize_threshold)
435 grow_hash_table (ht);
436 c = find_cell (ht, key);
441 c->key = (void *)key; /* const? */
445 /* Remove KEY->value mapping from HT. Return 0 if there was no such
446 entry; return 1 if an entry was removed. */
449 hash_table_remove (struct hash_table *ht, const void *key)
451 struct cell *c = find_cell (ht, key);
452 if (!CELL_OCCUPIED (c))
457 struct cell *cells = ht->cells;
458 hashfun_t hasher = ht->hash_function;
463 /* Rehash all the entries following C. The alternative
464 approach is to mark the entry as deleted, i.e. create a
465 "tombstone". That speeds up removal, but leaves a lot of
466 garbage and slows down hash_table_get and hash_table_put. */
468 c = NEXT_CELL (c, cells, size);
469 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
471 const void *key2 = c->key;
474 /* Find the new location for the key. */
475 c_new = cells + HASH_POSITION (key2, hasher, size);
476 FOREACH_OCCUPIED_ADJACENT (c_new, cells, size)
477 if (key2 == c_new->key)
478 /* The cell C (key2) is already where we want it (in
479 C_NEW's "chain" of keys.) */
492 /* Clear HT of all entries. After calling this function, the count
493 and the fullness of the hash table will be zero. The size will
497 hash_table_clear (struct hash_table *ht)
499 memset (ht->cells, INVALID_PTR_CHAR, ht->size * sizeof (struct cell));
503 /* Call FN for each entry in HT. FN is called with three arguments:
504 the key, the value, and ARG. When FN returns a non-zero value, the
507 It is undefined what happens if you add or remove entries in the
508 hash table while hash_table_for_each is running. The exception is
509 the entry you're currently mapping over; you may call
510 hash_table_put or hash_table_remove on that entry's key. That is
511 also the reason why this function cannot be implemented in terms of
512 hash_table_iterate. */
515 hash_table_for_each (struct hash_table *ht,
516 int (*fn) (void *, void *, void *), void *arg)
518 struct cell *c = ht->cells;
519 struct cell *end = ht->cells + ht->size;
522 if (CELL_OCCUPIED (c))
527 if (fn (key, c->value, arg))
529 /* hash_table_remove might have moved the adjacent cells. */
530 if (c->key != key && CELL_OCCUPIED (c))
535 /* Initiate iteration over HT. Entries are obtained with
536 hash_table_iter_next, a typical iteration loop looking like this:
538 hash_table_iterator iter;
539 for (hash_table_iterate (ht, &iter); hash_table_iter_next (&iter); )
540 ... do something with iter.key and iter.value ...
542 The iterator does not need to be deallocated after use. The hash
543 table must not be modified while being iterated over. */
546 hash_table_iterate (struct hash_table *ht, hash_table_iterator *iter)
548 iter->pos = ht->cells;
549 iter->end = ht->cells + ht->size;
552 /* Get the next hash table entry. ITER is an iterator object
553 initialized using hash_table_iterate. While there are more
554 entries, the key and value pointers are stored to ITER->key and
555 ITER->value respectively and 1 is returned. When there are no more
556 entries, 0 is returned.
558 If the hash table is modified between calls to this function, the
559 result is undefined. */
562 hash_table_iter_next (hash_table_iterator *iter)
564 struct cell *c = iter->pos;
565 struct cell *end = iter->end;
567 if (CELL_OCCUPIED (c))
570 iter->value = c->value;
577 /* Return the number of elements in the hash table. This is not the
578 same as the physical size of the hash table, which is always
579 greater than the number of elements. */
582 hash_table_count (const struct hash_table *ht)
587 /* Functions from this point onward are meant for convenience and
588 don't strictly belong to this file. However, this is as good a
589 place for them as any. */
591 /* Guidelines for creating custom hash and test functions:
593 - The test function returns non-zero for keys that are considered
594 "equal", zero otherwise.
596 - The hash function returns a number that represents the
597 "distinctness" of the object. In more precise terms, it means
598 that for any two objects that test "equal" under the test
599 function, the hash function MUST produce the same result.
601 This does not mean that all different objects must produce
602 different values (that would be "perfect" hashing), only that
603 non-distinct objects must produce the same values! For instance,
604 a hash function that returns 0 for any given object is a
605 perfectly valid (albeit extremely bad) hash function. A hash
606 function that hashes a string by adding up all its characters is
607 another example of a valid (but still quite bad) hash function.
609 It is not hard to make hash and test functions agree about
610 equality. For example, if the test function compares strings
611 case-insensitively, the hash function can lower-case the
612 characters when calculating the hash value. That ensures that
613 two strings differing only in case will hash the same.
615 - To prevent performance degradation, choose a hash function with
616 as good "spreading" as possible. A good hash function will use
617 all the bits of the input when calculating the hash, and will
618 react to even small changes in input with a completely different
619 output. But don't make the hash function itself overly slow,
620 because you'll be incurring a non-negligible overhead to all hash
624 * Support for hash tables whose keys are strings.
628 /* Base 31 hash function. Taken from Gnome's glib, modified to use
631 We used to use the popular hash function from the Dragon Book, but
632 this one seems to perform much better, both by being faster and by
633 generating fewer collisions. */
636 hash_string (const void *key)
642 for (p += 1; *p != '\0'; p++)
643 h = (h << 5) - h + *p;
648 /* Frontend for strcmp usable for hash tables. */
651 cmp_string (const void *s1, const void *s2)
653 return !strcmp ((const char *)s1, (const char *)s2);
656 /* Return a hash table preallocated to store at least ITEMS items,
657 suitable for using strings as keys. */
660 make_string_hash_table (int items)
662 return hash_table_new (items, hash_string, cmp_string);
666 * Support for hash tables whose keys are strings, but which are
667 * compared case-insensitively.
671 /* Like hash_string, but produce the same hash regardless of the case. */
674 hash_string_nocase (const void *key)
677 unsigned int h = TOLOWER (*p);
680 for (p += 1; *p != '\0'; p++)
681 h = (h << 5) - h + TOLOWER (*p);
686 /* Like cmp_string, but doing case-insensitive comparison. */
689 string_cmp_nocase (const void *s1, const void *s2)
691 return !strcasecmp ((const char *)s1, (const char *)s2);
694 /* Like make_string_hash_table, but uses hash_string_nocase and
695 string_cmp_nocase. */
698 make_nocase_string_hash_table (int items)
700 return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
703 /* Hashing of numeric values, such as pointers and integers.
705 This implementation is Robert Jenkins' 32-bit Mix Function,
706 with a simple adaptation for 64-bit values. According to Jenkins
707 it should offer excellent spreading of values. Unlike the popular
708 Knuth's multiplication hash, this function doesn't need to know the
709 hash table size to work. */
712 hash_pointer (const void *ptr)
714 uintptr_t key = (uintptr_t) ptr;
723 #if SIZEOF_VOID_P > 4
733 return (unsigned long) key;
737 cmp_pointer (const void *ptr1, const void *ptr2)
748 print_hash (struct hash_table *sht)
750 hash_table_iterator iter;
753 for (hash_table_iterate (sht, &iter); hash_table_iter_next (&iter);
755 printf ("%s: %s\n", iter.key, iter.value);
756 assert (count == sht->count);
762 struct hash_table *ht = make_string_hash_table (0);
764 while ((fgets (line, sizeof (line), stdin)))
766 int len = strlen (line);
770 if (!hash_table_contains (ht, line))
771 hash_table_put (ht, strdup (line), "here I am!");
776 if (hash_table_get_pair (ht, line, &line_copy, NULL))
778 hash_table_remove (ht, line);
788 printf ("%d %d\n", ht->count, ht->size);