sjero.net Git - wget/blob - src/hash.c

   1 /* Hash tables.
   2    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #ifdef HAVE_CONFIG_H
  31 # include <config.h>
  32 #endif
  33
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif /* HAVE_STRING_H */
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #include "wget.h"
  43 #include "utils.h"
  44
  45 #include "hash.h"
  46
  47 #ifdef STANDALONE
  48 # undef xmalloc
  49 # undef xrealloc
  50 # undef xfree
  51
  52 # define xmalloc malloc
  53 # define xrealloc realloc
  54 # define xfree free
  55
  56 # undef TOLOWER
  57 # define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
  58 #endif
  59
  60 /* INTERFACE:
  61
  62    Hash tables are an implementation technique used to implement
  63    mapping between objects.  Assuming a good hashing function is used,
  64    they provide near-constant-time access and storing of information.
  65    Duplicate keys are not allowed.
  66
  67    This file defines the following entry points: hash_table_new
  68    creates a hash table, and hash_table_destroy deletes it.
  69    hash_table_put establishes a mapping between a key and a value.
  70    hash_table_get retrieves the value that corresponds to a key.
  71    hash_table_contains queries whether a key is stored in a table at
  72    all.  hash_table_remove removes a mapping that corresponds to a
  73    key.  hash_table_map allows you to map through all the entries in a
  74    hash table.  hash_table_clear clears all the entries from the hash
  75    table.
  76
  77    The number of mappings in a table is not limited, except by the
  78    amount of memory.  As you add new elements to a table, it regrows
  79    as necessary.  If you have an idea about how many elements you will
  80    store, you can provide a hint to hash_table_new().
  81
  82    The hashing and equality functions depend on the type of key and
  83    are normally provided by the user.  For the special (and frequent)
  84    case of using string keys, you can use the pre-canned
  85    make_string_hash_table(), which provides an efficient string
  86    hashing function, and a string equality wrapper around strcmp().
  87
  88    When specifying your hash and test functions, make sure the
  89    following holds true:
  90
  91    - The test function returns non-zero for keys that are considered
  92      "equal", zero otherwise.
  93
  94    - The hash function returns a number that represents the
  95      "distinctness" of the object.  In more precise terms, it means
  96      that for any two objects that test "equal" under the test
  97      function, the hash function MUST produce the same result.
  98
  99      This does not mean that each distinct object must produce a
 100      distinct value, only that non-distinct objects must produce the
 101      same values!  For instance, a hash function that returns 0 for
 102      any given object is a perfectly valid (albeit extremely bad) hash
 103      function.  A hash function that hashes a string by adding up all
 104      its characters is another example of a valid (but quite bad) hash
 105      function.
 106
 107      The above stated rule is quite easy to enforce.  For example, if
 108      your testing function compares strings case-insensitively, all
 109      your function needs to do is lower-case the string characters
 110      before calculating a hash.  That way you have easily guaranteed
 111      that case differences will not result in a different hash.
 112
 113    - If you care about performance, choose a hash function with as
 114      good "spreading" as possible.  A good hash function will react to
 115      even a small change in its input with a completely different
 116      resulting hash.  Finally, don't make the hash function itself
 117      overly slow, because you'll be incurring a non-negligible
 118      overhead to reads and writes to the hash table.
 119
 120    Note that neither keys nor values are copied when inserted into the
 121    hash table, so they must exist for the lifetime of the table.  This
 122    means that e.g. the use of static strings is OK, but objects with a
 123    shorter life-time need to be copied (with strdup() or the like in
 124    the case of strings) before being inserted.  */
 125
 126 /* IMPLEMENTATION:
 127
 128    All the hash mappings (key-value pairs of pointers) are stored in a
 129    contiguous array.  The position of each mapping is determined by
 130    the hash value of its key and the size of the table: location :=
 131    hash(key) % size.  If two different keys end up on the same
 132    position (hash collision), the one that came second is placed at
 133    the next empty position following the occupied place.  This
 134    collision resolution technique is called "linear probing".
 135
 136    There are more advanced collision resolution mechanisms (quadratic
 137    probing, double hashing), but we don't use them because they incur
 138    more non-sequential access to the array, which results in worse
 139    cache behavior.  Linear probing works well as long as the
 140    fullness/size ratio is kept below 75%.  We make sure to regrow or
 141    rehash the hash table whenever this threshold is exceeded.
 142
 143    Collisions make deletion tricky because finding collisions again
 144    relies on new empty spots not being created.  That's why
 145    hash_table_remove is careful to rehash the mappings that follow the
 146    deleted one.  */
 147
 148 struct mapping {
 149   void *key;
 150   void *value;
 151 };
 152
 153 struct hash_table {
 154   unsigned long (*hash_function) PARAMS ((const void *));
 155   int (*test_function) PARAMS ((const void *, const void *));
 156
 157   int size;                     /* size of the array */
 158   int count;                    /* number of non-empty, non-deleted
 159                                    fields. */
 160
 161   int resize_threshold;         /* after size exceeds this number of
 162                                    entries, resize the table.  */
 163   int prime_offset;             /* the offset of the current prime in
 164                                    the prime table. */
 165
 166   struct mapping *mappings;     /* the array of mapping pairs. */
 167 };
 168
 169 #define EMPTY_MAPPING_P(mp)  ((mp)->key == NULL)
 170 #define NEXT_MAPPING(mp, mappings, size) (mp == mappings + (size - 1)   \
 171                                           ? mappings : mp + 1)
 172
 173 #define LOOP_NON_EMPTY(mp, mappings, size)                              \
 174   for (; !EMPTY_MAPPING_P (mp); mp = NEXT_MAPPING (mp, mappings, size))
 175
 176 /* #### We might want to multiply with the "golden ratio" here to get
 177    better randomness for keys that do not result from a good hash
 178    function.  This is currently not a problem in Wget because we only
 179    use the string hash tables.  */
 180
 181 #define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
 182
 183 /* Find a prime near, but greather than or equal to SIZE.  Of course,
 184    the primes are not calculated, but looked up from a table.  The
 185    table does not contain all primes in range, just a selection useful
 186    for this purpose.
 187
 188    PRIME_OFFSET is a micro-optimization: if specified, it starts the
 189    search for the prime number beginning with the specific offset in
 190    the prime number table.  The final offset is stored in the same
 191    variable.  */
 192
 193 static int
 194 prime_size (int size, int *prime_offset)
 195 {
 196   static const unsigned long primes [] = {
 197     13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
 198     1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
 199     19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
 200     204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
 201     1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
 202     10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
 203     50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
 204     243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
 205     1174703521, 1527114613, 1985248999,
 206     (unsigned long)0x99d43ea5, (unsigned long)0xc7fa5177
 207   };
 208   int i = *prime_offset;
 209
 210   for (; i < countof (primes); i++)
 211     if (primes[i] >= size)
 212       {
 213         /* Set the offset to the next prime.  That is safe because,
 214            next time we are called, it will be with a larger SIZE,
 215            which means we could never return the same prime anyway.
 216            (If that is not the case, the caller can simply reset
 217            *prime_offset.)  */
 218         *prime_offset = i + 1;
 219         return primes[i];
 220       }
 221
 222   abort ();
 223   return 0;
 224 }
 225
 226 /* Create a hash table of INITIAL_SIZE with hash function
 227    HASH_FUNCTION and test function TEST_FUNCTION.  INITIAL_SIZE will
 228    be rounded to the next prime, so you don't have to worry about it
 229    being a prime number.
 230
 231    Consequently, if you wish to start out with a "small" table which
 232    will be regrown as needed, specify INITIAL_SIZE 0.
 233
 234    If HASH_FUNCTION is not provided, identity table is assumed,
 235    i.e. key pointers are compared as keys.  If you want strings with
 236    equal contents to hash the same, use make_string_hash_table.  */
 237
 238 struct hash_table *
 239 hash_table_new (int initial_size,
 240                 unsigned long (*hash_function) (const void *),
 241                 int (*test_function) (const void *, const void *))
 242 {
 243   struct hash_table *ht
 244     = (struct hash_table *)xmalloc (sizeof (struct hash_table));
 245
 246   ht->hash_function = hash_function ? hash_function : ptrhash;
 247   ht->test_function = test_function ? test_function : ptrcmp;
 248
 249   ht->prime_offset = 0;
 250   ht->size = prime_size (initial_size, &ht->prime_offset);
 251   ht->resize_threshold = ht->size * 3 / 4;
 252
 253   ht->count = 0;
 254
 255   ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
 256   memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
 257
 258   return ht;
 259 }
 260
 261 /* Free the data associated with hash table HT. */
 262
 263 void
 264 hash_table_destroy (struct hash_table *ht)
 265 {
 266   xfree (ht->mappings);
 267   xfree (ht);
 268 }
 269
 270 /* The heart of almost all functions in this file -- find the mapping
 271    whose KEY is equal to key, using linear probing.  Returns the
 272    mapping that matches KEY, or NULL if none matches.  */
 273
 274 static inline struct mapping *
 275 find_mapping (const struct hash_table *ht, const void *key)
 276 {
 277   struct mapping *mappings = ht->mappings;
 278   int size = ht->size;
 279   struct mapping *mp = mappings + HASH_POSITION (ht, key);
 280   int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
 281
 282   LOOP_NON_EMPTY (mp, mappings, size)
 283     if (equals (key, mp->key))
 284       return mp;
 285   return NULL;
 286 }
 287
 288 /* Get the value that corresponds to the key KEY in the hash table HT.
 289    If no value is found, return NULL.  Note that NULL is a legal value
 290    for value; if you are storing NULLs in your hash table, you can use
 291    hash_table_contains to be sure that a (possibly NULL) value exists
 292    in the table.  Or, you can use hash_table_get_pair instead of this
 293    function.  */
 294
 295 void *
 296 hash_table_get (const struct hash_table *ht, const void *key)
 297 {
 298   struct mapping *mp = find_mapping (ht, key);
 299   if (mp)
 300     return mp->value;
 301   else
 302     return NULL;
 303 }
 304
 305 /* Like hash_table_get, but writes out the pointers to both key and
 306    value.  Returns non-zero on success.  */
 307
 308 int
 309 hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
 310                      void *orig_key, void *value)
 311 {
 312   struct mapping *mp = find_mapping (ht, lookup_key);
 313
 314   if (mp)
 315     {
 316       if (orig_key)
 317         *(void **)orig_key = mp->key;
 318       if (value)
 319         *(void **)value = mp->value;
 320       return 1;
 321     }
 322   else
 323     return 0;
 324 }
 325
 326 /* Return 1 if HT contains KEY, 0 otherwise. */
 327
 328 int
 329 hash_table_contains (const struct hash_table *ht, const void *key)
 330 {
 331   return find_mapping (ht, key) != NULL;
 332 }
 333
 334 /* Grow hash table HT as necessary, and rehash all the key-value
 335    mappings.  */
 336
 337 static void
 338 grow_hash_table (struct hash_table *ht)
 339 {
 340   struct mapping *old_mappings = ht->mappings;
 341   struct mapping *old_end      = ht->mappings + ht->size;
 342   struct mapping *mp, *mappings;
 343   int newsize;
 344
 345   newsize = prime_size (ht->size * 2, &ht->prime_offset);
 346 #if 0
 347   printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
 348           ht->size, newsize,
 349           (double)100 * ht->count / ht->size,
 350           (double)100 * ht->count / newsize);
 351 #endif
 352
 353   ht->size = newsize;
 354   ht->resize_threshold = newsize * 3 / 4;
 355
 356   mappings = xmalloc (ht->size * sizeof (struct mapping));
 357   memset (mappings, '\0', ht->size * sizeof (struct mapping));
 358   ht->mappings = mappings;
 359
 360   for (mp = old_mappings; mp < old_end; mp++)
 361     if (!EMPTY_MAPPING_P (mp))
 362       {
 363         struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
 364         /* We don't need to call test function and worry about
 365            collisions because all the keys come from the hash table
 366            and are therefore guaranteed to be unique.  */
 367         LOOP_NON_EMPTY (new_mp, mappings, newsize)
 368           ;
 369         *new_mp = *mp;
 370       }
 371
 372   xfree (old_mappings);
 373 }
 374
 375 /* Put VALUE in the hash table HT under the key KEY.  This regrows the
 376    table if necessary.  */
 377
 378 void
 379 hash_table_put (struct hash_table *ht, const void *key, void *value)
 380 {
 381   struct mapping *mappings = ht->mappings;
 382   int size = ht->size;
 383   int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
 384
 385   struct mapping *mp = mappings + HASH_POSITION (ht, key);
 386
 387   LOOP_NON_EMPTY (mp, mappings, size)
 388     if (equals (key, mp->key))
 389       {
 390         mp->key   = (void *)key; /* const? */
 391         mp->value = value;
 392         return;
 393       }
 394
 395   ++ht->count;
 396   mp->key   = (void *)key;      /* const? */
 397   mp->value = value;
 398
 399   if (ht->count > ht->resize_threshold)
 400     /* When table is 75% full, regrow it. */
 401     grow_hash_table (ht);
 402 }
 403
 404 /* Remove a mapping that matches KEY from HT.  Return 0 if there was
 405    no such entry; return 1 if an entry was removed.  */
 406
 407 int
 408 hash_table_remove (struct hash_table *ht, const void *key)
 409 {
 410   struct mapping *mp = find_mapping (ht, key);
 411   if (!mp)
 412     return 0;
 413   else
 414     {
 415       int size = ht->size;
 416       struct mapping *mappings = ht->mappings;
 417
 418       mp->key = NULL;
 419       --ht->count;
 420
 421       /* Rehash all the entries following MP.  The alternative
 422          approach is to mark the entry as deleted, i.e. create a
 423          "tombstone".  That makes remove faster, but leaves a lot of
 424          garbage and slows down hash_table_get and hash_table_put.  */
 425
 426       mp = NEXT_MAPPING (mp, mappings, size);
 427       LOOP_NON_EMPTY (mp, mappings, size)
 428         {
 429           const void *key2 = mp->key;
 430           struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
 431
 432           /* Find the new location for the key. */
 433
 434           LOOP_NON_EMPTY (mp_new, mappings, size)
 435             if (key2 == mp_new->key)
 436               /* The mapping MP (key2) is already where we want it (in
 437                  MP_NEW's "chain" of keys.)  */
 438               goto next_rehash;
 439
 440           *mp_new = *mp;
 441           mp->key = NULL;
 442
 443         next_rehash:
 444           ;
 445         }
 446       return 1;
 447     }
 448 }
 449
 450 /* Clear HT of all entries.  After calling this function, the count
 451    and the fullness of the hash table will be zero.  The size will
 452    remain unchanged.  */
 453
 454 void
 455 hash_table_clear (struct hash_table *ht)
 456 {
 457   memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
 458   ht->count = 0;
 459 }
 460
 461 /* Map MAPFUN over all the mappings in hash table HT.  MAPFUN is
 462    called with three arguments: the key, the value, and the CLOSURE.
 463
 464    It is undefined what happens if you add or remove entries in the
 465    hash table while hash_table_map is running.  The exception is the
 466    entry you're currently mapping over; you may remove or change that
 467    entry.  */
 468
 469 void
 470 hash_table_map (struct hash_table *ht,
 471                 int (*mapfun) (void *, void *, void *),
 472                 void *closure)
 473 {
 474   struct mapping *mp  = ht->mappings;
 475   struct mapping *end = ht->mappings + ht->size;
 476
 477   for (; mp < end; mp++)
 478     if (!EMPTY_MAPPING_P (mp))
 479       {
 480         void *key;
 481       repeat:
 482         key = mp->key;
 483         if (mapfun (key, mp->value, closure))
 484           return;
 485         /* hash_table_remove might have moved the adjacent
 486            mappings. */
 487         if (mp->key != key && !EMPTY_MAPPING_P (mp))
 488           goto repeat;
 489       }
 490 }
 491
 492 /* Return the number of elements in the hash table.  This is not the
 493    same as the physical size of the hash table, which is always
 494    greater than the number of elements.  */
 495
 496 int
 497 hash_table_count (const struct hash_table *ht)
 498 {
 499   return ht->count;
 500 }
 501 \f
 502 /* Functions from this point onward are meant for convenience and
 503    don't strictly belong to this file.  However, this is as good a
 504    place for them as any.  */
 505
 506 /*
 507  * Support for hash tables whose keys are strings.
 508  *
 509  */
 510
 511 /* 31 bit hash function.  Taken from Gnome's glib, modified to use
 512    standard C types.
 513
 514    We used to use the popular hash function from the Dragon Book, but
 515    this one seems to perform much better.  */
 516
 517 unsigned long
 518 string_hash (const void *key)
 519 {
 520   const char *p = key;
 521   unsigned int h = *p;
 522
 523   if (h)
 524     for (p += 1; *p != '\0'; p++)
 525       h = (h << 5) - h + *p;
 526
 527   return h;
 528 }
 529
 530 /* Frontend for strcmp usable for hash tables. */
 531
 532 int
 533 string_cmp (const void *s1, const void *s2)
 534 {
 535   return !strcmp ((const char *)s1, (const char *)s2);
 536 }
 537
 538 /* Return a hash table of initial size INITIAL_SIZE suitable to use
 539    strings as keys.  */
 540
 541 struct hash_table *
 542 make_string_hash_table (int initial_size)
 543 {
 544   return hash_table_new (initial_size, string_hash, string_cmp);
 545 }
 546
 547 /*
 548  * Support for hash tables whose keys are strings, but which are
 549  * compared case-insensitively.
 550  *
 551  */
 552
 553 /* Like string_hash, but produce the same hash regardless of the case. */
 554
 555 static unsigned long
 556 string_hash_nocase (const void *key)
 557 {
 558   const char *p = key;
 559   unsigned int h = TOLOWER (*p);
 560
 561   if (h)
 562     for (p += 1; *p != '\0'; p++)
 563       h = (h << 5) - h + TOLOWER (*p);
 564
 565   return h;
 566 }
 567
 568 /* Like string_cmp, but doing case-insensitive compareison. */
 569
 570 static int
 571 string_cmp_nocase (const void *s1, const void *s2)
 572 {
 573   return !strcasecmp ((const char *)s1, (const char *)s2);
 574 }
 575
 576 /* Like make_string_hash_table, but uses string_hash_nocase and
 577    string_cmp_nocase.  */
 578
 579 struct hash_table *
 580 make_nocase_string_hash_table (int initial_size)
 581 {
 582   return hash_table_new (initial_size, string_hash_nocase, string_cmp_nocase);
 583 }
 584
 585 /* Hashing of pointers.  Used for hash tables that are keyed by
 586    pointer identity.  (Common Lisp calls them EQ hash tables, and Java
 587    calls them IdentityHashMaps.)  */
 588
 589 unsigned long
 590 ptrhash (const void *ptr)
 591 {
 592   unsigned long key = (unsigned long)ptr;
 593   key += (key << 12);
 594   key ^= (key >> 22);
 595   key += (key << 4);
 596   key ^= (key >> 9);
 597   key += (key << 10);
 598   key ^= (key >> 2);
 599   key += (key << 7);
 600   key ^= (key >> 12);
 601 #if SIZEOF_LONG > 4
 602   key += (key << 44);
 603   key ^= (key >> 54);
 604   key += (key << 36);
 605   key ^= (key >> 41);
 606   key += (key << 42);
 607   key ^= (key >> 34);
 608   key += (key << 39);
 609   key ^= (key >> 44);
 610 #endif
 611   return key;
 612 }
 613
 614 int
 615 ptrcmp (const void *ptr1, const void *ptr2)
 616 {
 617   return ptr1 == ptr2;
 618 }
 619
 620 #if 0
 621 /* Currently unused: hashing of integers. */
 622
 623 unsigned long
 624 inthash (unsigned int key)
 625 {
 626   key += (key << 12);
 627   key ^= (key >> 22);
 628   key += (key << 4);
 629   key ^= (key >> 9);
 630   key += (key << 10);
 631   key ^= (key >> 2);
 632   key += (key << 7);
 633   key ^= (key >> 12);
 634   return key;
 635 }
 636 #endif
 637 \f
 638 #ifdef STANDALONE
 639
 640 #include <stdio.h>
 641 #include <string.h>
 642
 643 int
 644 print_hash_table_mapper (void *key, void *value, void *count)
 645 {
 646   ++*(int *)count;
 647   printf ("%s: %s\n", (const char *)key, (char *)value);
 648   return 0;
 649 }
 650
 651 void
 652 print_hash (struct hash_table *sht)
 653 {
 654   int debug_count = 0;
 655   hash_table_map (sht, print_hash_table_mapper, &debug_count);
 656   assert (debug_count == sht->count);
 657 }
 658
 659 int
 660 main (void)
 661 {
 662   struct hash_table *ht = make_string_hash_table (0);
 663   char line[80];
 664   while ((fgets (line, sizeof (line), stdin)))
 665     {
 666       int len = strlen (line);
 667       if (len <= 1)
 668         continue;
 669       line[--len] = '\0';
 670       if (!hash_table_contains (ht, line))
 671         hash_table_put (ht, strdup (line), "here I am!");
 672 #if 1
 673       if (len % 5 == 0)
 674         {
 675           char *line_copy;
 676           if (hash_table_get_pair (ht, line, &line_copy, NULL))
 677             {
 678               hash_table_remove (ht, line);
 679               xfree (line_copy);
 680             }
 681         }
 682 #endif
 683     }
 684 #if 0
 685   print_hash (ht);
 686 #endif
 687 #if 1
 688   printf ("%d %d\n", ht->count, ht->size);
 689 #endif
 690   return 0;
 691 }
 692 #endif