2 Copyright (C) 2000-2005 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software Foundation, Inc.,
18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
30 /* With -DSTANDALONE, this file can be compiled outside Wget source
31 tree. To test, also use -DTEST. */
44 /* Get Wget's utility headers. */
48 /* Make do without them. */
49 # define xnew(x) xmalloc (sizeof (x))
50 # define xnew_array(type, x) xmalloc (sizeof (type) * (x))
51 # define xmalloc malloc
53 # define countof(x) (sizeof (x) / sizeof ((x)[0]))
54 # define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
61 Hash tables are a technique used to implement mapping between
62 objects with near-constant-time access and storage. The table
63 associates keys to values, and a value can be very quickly
64 retrieved by providing the key. Fast lookup tables are typically
65 implemented as hash tables.
68 hash_table_new -- creates the table.
69 hash_table_destroy -- destroys the table.
70 hash_table_put -- establishes or updates key->value mapping.
71 hash_table_get -- retrieves value of key.
72 hash_table_get_pair -- get key/value pair for key.
73 hash_table_contains -- test whether the table contains key.
74 hash_table_remove -- remove key->value mapping for given key.
75 hash_table_map -- iterate through table entries.
76 hash_table_clear -- clear hash table contents.
77 hash_table_count -- return the number of entries in the table.
79 The hash table grows internally as new entries are added and is not
80 limited in size, except by available memory. The table doubles
81 with each resize, which ensures that the amortized time per
82 operation remains constant.
84 By default, tables created by hash_table_new consider the keys to
85 be equal if their pointer values are the same. You can use
86 make_string_hash_table to create tables whose keys are considered
87 equal if their string contents are the same. In the general case,
88 the criterion of equality used to compare keys is specified at
89 table creation time with two callback functions, "hash" and "test".
90 The hash function transforms the key into an arbitrary number that
91 must be the same for two equal keys. The test function accepts two
92 keys and returns non-zero if they are to be considered equal.
94 Note that neither keys nor values are copied when inserted into the
95 hash table, so they must exist for the lifetime of the table. This
96 means that e.g. the use of static strings is OK, but objects with a
97 shorter life-time need to be copied (with strdup() or the like in
98 the case of strings) before being inserted. */
102 The hash table is implemented as an open-addressed table with
103 linear probing collision resolution.
105 The above means that all the cells (each cell containing a key and
106 a value pointer) are stored in a contiguous array. Array position
107 of each cell is determined by the hash value of its key and the
108 size of the table: location := hash(key) % size. If two different
109 keys end up on the same position (collide), the one that came
110 second is stored in the first unoccupied cell that follows it.
111 This collision resolution technique is called "linear probing".
113 There are more advanced collision resolution methods (quadratic
114 probing, double hashing), but we don't use them because they incur
115 more non-sequential access to the array, which results in worse CPU
116 cache behavior. Linear probing works well as long as the
117 count/size ratio (fullness) is kept below 75%. We make sure to
118 grow and rehash the table whenever this threshold is exceeded.
120 Collisions complicate deletion because simply clearing a cell
121 followed by previously collided entries would cause those neighbors
122 to not be picked up by find_cell later. One solution is to leave a
123 "tombstone" marker instead of clearing the cell, and another is to
124 recalculate the positions of adjacent cells. We take the latter
125 approach because it results in less bookkeeping garbage and faster
126 retrieval at the (slight) expense of deletion. */
128 /* Maximum allowed fullness: when hash table's fullness exceeds this
129 value, the table is resized. */
130 #define HASH_MAX_FULLNESS 0.75
132 /* The hash table size is multiplied by this factor (and then rounded
133 to the next prime) with each resize. This guarantees infrequent
135 #define HASH_RESIZE_FACTOR 2
142 typedef unsigned long (*hashfun_t) (const void *);
143 typedef int (*testfun_t) (const void *, const void *);
146 hashfun_t hash_function;
147 testfun_t test_function;
149 struct cell *cells; /* contiguous array of cells. */
150 int size; /* size of the array. */
152 int count; /* number of occupied entries. */
153 int resize_threshold; /* after size exceeds this number of
154 entries, resize the table. */
155 int prime_offset; /* the offset of the current prime in
159 /* We use the all-bits-set constant (INVALID_PTR) marker to mean that
160 a cell is empty. It is unaligned and therefore illegal as a
161 pointer. INVALID_PTR_CHAR (0xff) is the single-character constant
162 used to initialize the entire cells array as empty.
164 The all-bits-set value is a better choice than NULL because it
165 allows the use of NULL/0 keys. Since the keys are either integers
166 or pointers, the only key that cannot be used is the integer value
167 -1. This is acceptable because it still allows the use of
168 nonnegative integer keys. */
170 #define INVALID_PTR ((void *) ~0UL)
172 # define UCHAR_MAX 0xff
174 #define INVALID_PTR_CHAR UCHAR_MAX
176 /* Whether the cell C is occupied (non-empty). */
177 #define CELL_OCCUPIED(c) ((c)->key != INVALID_PTR)
179 /* Clear the cell C, i.e. mark it as empty (unoccupied). */
180 #define CLEAR_CELL(c) ((c)->key = INVALID_PTR)
182 /* "Next" cell is the cell following C, but wrapping back to CELLS
183 when C would reach CELLS+SIZE. */
184 #define NEXT_CELL(c, cells, size) (c != cells + (size - 1) ? c + 1 : cells)
186 /* Loop over occupied cells starting at C, terminating the loop when
187 an empty cell is encountered. */
188 #define FOREACH_OCCUPIED_ADJACENT(c, cells, size) \
189 for (; CELL_OCCUPIED (c); c = NEXT_CELL (c, cells, size))
191 /* Return the position of KEY in hash table SIZE large, hash function
193 #define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)
195 /* Find a prime near, but greather than or equal to SIZE. The primes
196 are looked up from a table with a selection of primes convenient
199 PRIME_OFFSET is a minor optimization: it specifies start position
200 for the search for the large enough prime. The final offset is
201 stored in the same variable. That way the list of primes does not
202 have to be scanned from the beginning each time around. */
205 prime_size (int size, int *prime_offset)
207 static const int primes[] = {
208 13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
209 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
210 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
211 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
212 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
213 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
214 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
215 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
216 1174703521, 1527114613, 1837299131, 2147483647
220 for (i = *prime_offset; i < countof (primes); i++)
221 if (primes[i] >= size)
223 /* Set the offset to the next prime. That is safe because,
224 next time we are called, it will be with a larger SIZE,
225 which means we could never return the same prime anyway.
226 (If that is not the case, the caller can simply reset
228 *prime_offset = i + 1;
235 static int cmp_pointer (const void *, const void *);
237 /* Create a hash table with hash function HASH_FUNCTION and test
238 function TEST_FUNCTION. The table is empty (its count is 0), but
239 pre-allocated to store at least ITEMS items.
241 ITEMS is the number of items that the table can accept without
242 needing to resize. It is useful when creating a table that is to
243 be immediately filled with a known number of items. In that case,
244 the regrows are a waste of time, and specifying ITEMS correctly
245 will avoid them altogether.
247 Note that hash tables grow dynamically regardless of ITEMS. The
248 only use of ITEMS is to preallocate the table and avoid unnecessary
249 dynamic regrows. Don't bother making ITEMS prime because it's not
250 used as size unchanged. To start with a small table that grows as
251 needed, simply specify zero ITEMS.
253 If hash and test callbacks are not specified, identity mapping is
254 assumed, i.e. pointer values are used for key comparison. (Common
255 Lisp calls such tables EQ hash tables, and Java calls them
256 IdentityHashMaps.) If your keys require different comparison,
257 specify hash and test functions. For easy use of C strings as hash
258 keys, you can use the convenience functions make_string_hash_table
259 and make_nocase_string_hash_table. */
262 hash_table_new (int items,
263 unsigned long (*hash_function) (const void *),
264 int (*test_function) (const void *, const void *))
267 struct hash_table *ht = xnew (struct hash_table);
269 ht->hash_function = hash_function ? hash_function : hash_pointer;
270 ht->test_function = test_function ? test_function : cmp_pointer;
272 /* If the size of struct hash_table ever becomes a concern, this
273 field can go. (Wget doesn't create many hashes.) */
274 ht->prime_offset = 0;
276 /* Calculate the size that ensures that the table will store at
277 least ITEMS keys without the need to resize. */
278 size = 1 + items / HASH_MAX_FULLNESS;
279 size = prime_size (size, &ht->prime_offset);
281 ht->resize_threshold = size * HASH_MAX_FULLNESS;
282 /*assert (ht->resize_threshold >= items);*/
284 ht->cells = xnew_array (struct cell, ht->size);
286 /* Mark cells as empty. We use 0xff rather than 0 to mark empty
287 keys because it allows us to use NULL/0 as keys. */
288 memset (ht->cells, INVALID_PTR_CHAR, size * sizeof (struct cell));
295 /* Free the data associated with hash table HT. */
298 hash_table_destroy (struct hash_table *ht)
304 /* The heart of most functions in this file -- find the cell whose
305 KEY is equal to key, using linear probing. Returns the cell
306 that matches KEY, or the first empty cell if none matches. */
308 static inline struct cell *
309 find_cell (const struct hash_table *ht, const void *key)
311 struct cell *cells = ht->cells;
313 struct cell *c = cells + HASH_POSITION (key, ht->hash_function, size);
314 testfun_t equals = ht->test_function;
316 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
317 if (equals (key, c->key))
322 /* Get the value that corresponds to the key KEY in the hash table HT.
323 If no value is found, return NULL. Note that NULL is a legal value
324 for value; if you are storing NULLs in your hash table, you can use
325 hash_table_contains to be sure that a (possibly NULL) value exists
326 in the table. Or, you can use hash_table_get_pair instead of this
330 hash_table_get (const struct hash_table *ht, const void *key)
332 struct cell *c = find_cell (ht, key);
333 if (CELL_OCCUPIED (c))
339 /* Like hash_table_get, but writes out the pointers to both key and
340 value. Returns non-zero on success. */
343 hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
344 void *orig_key, void *value)
346 struct cell *c = find_cell (ht, lookup_key);
347 if (CELL_OCCUPIED (c))
350 *(void **)orig_key = c->key;
352 *(void **)value = c->value;
359 /* Return 1 if HT contains KEY, 0 otherwise. */
362 hash_table_contains (const struct hash_table *ht, const void *key)
364 struct cell *c = find_cell (ht, key);
365 return CELL_OCCUPIED (c);
368 /* Grow hash table HT as necessary, and rehash all the key-value
372 grow_hash_table (struct hash_table *ht)
374 hashfun_t hasher = ht->hash_function;
375 struct cell *old_cells = ht->cells;
376 struct cell *old_end = ht->cells + ht->size;
377 struct cell *c, *cells;
380 newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
382 printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
384 100.0 * ht->count / ht->size,
385 100.0 * ht->count / newsize);
389 ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
391 cells = xnew_array (struct cell, newsize);
392 memset (cells, INVALID_PTR_CHAR, newsize * sizeof (struct cell));
395 for (c = old_cells; c < old_end; c++)
396 if (CELL_OCCUPIED (c))
399 /* We don't need to test for uniqueness of keys because they
400 come from the hash table and are therefore known to be
402 new_c = cells + HASH_POSITION (c->key, hasher, newsize);
403 FOREACH_OCCUPIED_ADJACENT (new_c, cells, newsize)
411 /* Put VALUE in the hash table HT under the key KEY. This regrows the
412 table if necessary. */
415 hash_table_put (struct hash_table *ht, const void *key, void *value)
417 struct cell *c = find_cell (ht, key);
418 if (CELL_OCCUPIED (c))
420 /* update existing item */
421 c->key = (void *)key; /* const? */
426 /* If adding the item would make the table exceed max. fullness,
427 grow the table first. */
428 if (ht->count >= ht->resize_threshold)
430 grow_hash_table (ht);
431 c = find_cell (ht, key);
436 c->key = (void *)key; /* const? */
440 /* Remove KEY->value mapping from HT. Return 0 if there was no such
441 entry; return 1 if an entry was removed. */
444 hash_table_remove (struct hash_table *ht, const void *key)
446 struct cell *c = find_cell (ht, key);
447 if (!CELL_OCCUPIED (c))
452 struct cell *cells = ht->cells;
453 hashfun_t hasher = ht->hash_function;
458 /* Rehash all the entries following C. The alternative
459 approach is to mark the entry as deleted, i.e. create a
460 "tombstone". That speeds up removal, but leaves a lot of
461 garbage and slows down hash_table_get and hash_table_put. */
463 c = NEXT_CELL (c, cells, size);
464 FOREACH_OCCUPIED_ADJACENT (c, cells, size)
466 const void *key2 = c->key;
469 /* Find the new location for the key. */
470 c_new = cells + HASH_POSITION (key2, hasher, size);
471 FOREACH_OCCUPIED_ADJACENT (c_new, cells, size)
472 if (key2 == c_new->key)
473 /* The cell C (key2) is already where we want it (in
474 C_NEW's "chain" of keys.) */
487 /* Clear HT of all entries. After calling this function, the count
488 and the fullness of the hash table will be zero. The size will
492 hash_table_clear (struct hash_table *ht)
494 memset (ht->cells, INVALID_PTR_CHAR, ht->size * sizeof (struct cell));
498 /* Map MAPFUN over all entries in HT. MAPFUN is called with three
499 arguments: the key, the value, and MAPARG.
501 It is undefined what happens if you add or remove entries in the
502 hash table while hash_table_map is running. The exception is the
503 entry you're currently mapping over; you may remove or change that
507 hash_table_map (struct hash_table *ht,
508 int (*mapfun) (void *, void *, void *),
511 struct cell *c = ht->cells;
512 struct cell *end = ht->cells + ht->size;
515 if (CELL_OCCUPIED (c))
520 if (mapfun (key, c->value, maparg))
522 /* hash_table_remove might have moved the adjacent cells. */
523 if (c->key != key && CELL_OCCUPIED (c))
528 /* Return the number of elements in the hash table. This is not the
529 same as the physical size of the hash table, which is always
530 greater than the number of elements. */
533 hash_table_count (const struct hash_table *ht)
538 /* Functions from this point onward are meant for convenience and
539 don't strictly belong to this file. However, this is as good a
540 place for them as any. */
542 /* Guidelines for creating custom hash and test functions:
544 - The test function returns non-zero for keys that are considered
545 "equal", zero otherwise.
547 - The hash function returns a number that represents the
548 "distinctness" of the object. In more precise terms, it means
549 that for any two objects that test "equal" under the test
550 function, the hash function MUST produce the same result.
552 This does not mean that all different objects must produce
553 different values (that would be "perfect" hashing), only that
554 non-distinct objects must produce the same values! For instance,
555 a hash function that returns 0 for any given object is a
556 perfectly valid (albeit extremely bad) hash function. A hash
557 function that hashes a string by adding up all its characters is
558 another example of a valid (but quite bad) hash function.
560 It is not hard to make hash and test functions agree about
561 equality. For example, if the test function compares strings
562 case-insensitively, the hash function can lower-case the
563 characters when calculating the hash value. That ensures that
564 two strings differing only in case will hash the same.
566 - If you care about performance, choose a hash function with as
567 good "spreading" as possible. A good hash function will use all
568 the bits of the input when calculating the hash, and will react
569 to even small changes in input with a completely different
570 output. Finally, don't make the hash function itself overly
571 slow, because you'll be incurring a non-negligible overhead to
572 all hash table operations. */
575 * Support for hash tables whose keys are strings.
579 /* 31 bit hash function. Taken from Gnome's glib, modified to use
582 We used to use the popular hash function from the Dragon Book, but
583 this one seems to perform much better. */
586 hash_string (const void *key)
592 for (p += 1; *p != '\0'; p++)
593 h = (h << 5) - h + *p;
598 /* Frontend for strcmp usable for hash tables. */
601 cmp_string (const void *s1, const void *s2)
603 return !strcmp ((const char *)s1, (const char *)s2);
606 /* Return a hash table of preallocated to store at least ITEMS items
607 suitable to use strings as keys. */
610 make_string_hash_table (int items)
612 return hash_table_new (items, hash_string, cmp_string);
616 * Support for hash tables whose keys are strings, but which are
617 * compared case-insensitively.
621 /* Like hash_string, but produce the same hash regardless of the case. */
624 hash_string_nocase (const void *key)
627 unsigned int h = TOLOWER (*p);
630 for (p += 1; *p != '\0'; p++)
631 h = (h << 5) - h + TOLOWER (*p);
636 /* Like string_cmp, but doing case-insensitive compareison. */
639 string_cmp_nocase (const void *s1, const void *s2)
641 return !strcasecmp ((const char *)s1, (const char *)s2);
644 /* Like make_string_hash_table, but uses string_hash_nocase and
645 string_cmp_nocase. */
648 make_nocase_string_hash_table (int items)
650 return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
653 /* Hashing of numeric values, such as pointers and integers.
655 This implementation is the Robert Jenkins' 32 bit Mix Function,
656 with a simple adaptation for 64-bit values. It offers excellent
657 spreading of values and doesn't need to know the hash table size to
658 work (unlike the very popular Knuth's multiplication hash). */
661 hash_pointer (const void *ptr)
663 unsigned long key = (unsigned long)ptr;
686 cmp_pointer (const void *ptr1, const void *ptr2)
697 print_hash_table_mapper (void *key, void *value, void *count)
700 printf ("%s: %s\n", (const char *)key, (char *)value);
705 print_hash (struct hash_table *sht)
708 hash_table_map (sht, print_hash_table_mapper, &debug_count);
709 assert (debug_count == sht->count);
715 struct hash_table *ht = make_string_hash_table (0);
717 while ((fgets (line, sizeof (line), stdin)))
719 int len = strlen (line);
723 if (!hash_table_contains (ht, line))
724 hash_table_put (ht, strdup (line), "here I am!");
729 if (hash_table_get_pair (ht, line, &line_copy, NULL))
731 hash_table_remove (ht, line);
741 printf ("%d %d\n", ht->count, ht->size);