sjero.net Git - wget/blob - src/hash.c

   1 /* Hash tables.
   2    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #include <stdlib.h>
  30 #include <assert.h>
  31
  32 #include "wget.h"
  33 #include "utils.h"
  34
  35 #include "hash.h"
  36
  37 #ifdef STANDALONE
  38 # undef xmalloc
  39 # undef xrealloc
  40 # undef xfree
  41
  42 # define xmalloc malloc
  43 # define xrealloc realloc
  44 # define xfree free
  45
  46 # undef TOLOWER
  47 # define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
  48 #endif
  49
  50 /* INTERFACE:
  51
  52    Hash tables are an implementation technique used to implement
  53    mapping between objects.  Provided a good hashing function is used,
  54    they guarantee constant-time access and storing of information.
  55    Duplicate keys are not allowed.
  56
  57    The basics are all covered.  hash_table_new creates a hash table,
  58    and hash_table_destroy deletes it.  hash_table_put establishes a
  59    mapping between a key and a value.  hash_table_get retrieves the
  60    value that corresponds to a key.  hash_table_contains queries
  61    whether a key is stored in a table at all.  hash_table_remove
  62    removes a mapping that corresponds to a key.  hash_table_map allows
  63    you to map through all the entries in a hash table.
  64    hash_table_clear clears all the entries from the hash table.
  65
  66    The number of mappings in a table is not limited, except by the
  67    amount of memory.  As you add new elements to a table, it regrows
  68    as necessary.  If you have an idea about how many elements you will
  69    store, you can provide a hint to hash_table_new().
  70
  71    The hashing and equality functions are normally provided by the
  72    user.  For the special (and frequent) case of hashing strings, you
  73    can use the pre-canned make_string_hash_table(), which provides an
  74    efficient string hashing function, and a string equality wrapper
  75    around strcmp().
  76
  77    When specifying your own hash and test functions, make sure the
  78    following holds true:
  79
  80    - The test function returns non-zero for keys that are considered
  81      "equal", zero otherwise.
  82
  83    - The hash function returns a number that represents the
  84      "distinctness" of the object.  In more precise terms, it means
  85      that for any two objects that test "equal" under the test
  86      function, the hash function MUST produce the same result.
  87
  88      This does not mean that each distinct object must produce a
  89      distinct value, only that non-distinct objects must produce the
  90      same values!  For instance, a hash function that returns 0 for
  91      any given object is a perfectly valid (albeit extremely bad) hash
  92      function.  A hash function that hashes a string by adding up all
  93      its characters is another example of a valid (but quite bad) hash
  94      function.
  95
  96      The above stated rule is quite easy to enforce.  For example, if
  97      your testing function compares strings case-insensitively, all
  98      your function needs to do is lower-case the string characters
  99      before calculating a hash.  That way you have easily guaranteed
 100      that case differences will not result in a different hash.
 101
 102    - (optional) Choose the hash function to get as good "spreading" as
 103      possible.  A good hash function will react to even a small change
 104      in its input with a completely different resulting hash.
 105      Finally, don't make your hash function extremely slow, because
 106      you're then defeating the purpose of hashing.
 107
 108    Note that neither keys nor values are copied when inserted into the
 109    hash table, so they must exist for the lifetime of the table.  This
 110    means that e.g. the use of static strings is OK, but objects with a
 111    shorter life-time need to be copied (with strdup() or the like in
 112    the case of strings) before being inserted.  */
 113
 114 /* IMPLEMENTATION:
 115
 116    All the hash mappings (key-value pairs of pointers) are stored in a
 117    contiguous array.  The position of each mapping is determined by
 118    applying the hash function to the key: location = hash(key) % size.
 119    If two different keys end up on the same position, the collision is
 120    resolved by placing the second mapping at the next empty place in
 121    the array following the occupied place.  This method of collision
 122    resolution is called "linear probing".
 123
 124    There are more advanced collision resolution mechanisms (quadratic
 125    probing, double hashing), but we don't use them because they
 126    involve more non-sequential access to the array, and therefore
 127    worse cache behavior.  Linear probing works well as long as the
 128    fullness/size ratio is kept below 75%.  We make sure to regrow or
 129    rehash the hash table whenever this threshold is exceeded.
 130
 131    Collisions make deletion tricky because finding collisions again
 132    relies on new empty spots not being created.  That's why
 133    hash_table_remove is careful to rehash the mappings that follow the
 134    deleted one.  */
 135
 136 struct mapping {
 137   void *key;
 138   void *value;
 139 };
 140
 141 struct hash_table {
 142   unsigned long (*hash_function) PARAMS ((const void *));
 143   int (*test_function) PARAMS ((const void *, const void *));
 144
 145   int size;                     /* size of the array */
 146   int count;                    /* number of non-empty, non-deleted
 147                                    fields. */
 148
 149   int resize_threshold;         /* after size exceeds this number of
 150                                    entries, resize the table.  */
 151   int prime_offset;             /* the offset of the current prime in
 152                                    the prime table. */
 153
 154   struct mapping *mappings;     /* the array of mapping pairs. */
 155 };
 156
 157 #define EMPTY_MAPPING_P(mp)  ((mp)->key == NULL)
 158 #define NEXT_MAPPING(mp, mappings, size) (mp == mappings + (size - 1)   \
 159                                           ? mappings : mp + 1)
 160
 161 #define LOOP_NON_EMPTY(mp, mappings, size)                              \
 162   for (; !EMPTY_MAPPING_P (mp); mp = NEXT_MAPPING (mp, mappings, size))
 163
 164 /* #### We might want to multiply with the "golden ratio" here to get
 165    better randomness for keys that do not result from a good hash
 166    function.  This is currently not a problem in Wget because we only
 167    use the string hash tables.  */
 168
 169 #define HASH_POSITION(ht, key) (ht->hash_function (key) % ht->size)
 170
 171 /* Find a prime near, but greather than or equal to SIZE.  Of course,
 172    the primes are not calculated, but looked up from a table.  The
 173    table does not contain all primes in range, just a selection useful
 174    for this purpose.
 175
 176    PRIME_OFFSET is a micro-optimization: if specified, it starts the
 177    search for the prime number beginning with the specific offset in
 178    the prime number table.  The final offset is stored in the same
 179    variable.  */
 180
 181 static int
 182 prime_size (int size, int *prime_offset)
 183 {
 184   static const unsigned long primes [] = {
 185     13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
 186     1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
 187     19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
 188     204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
 189     1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
 190     10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
 191     50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
 192     243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
 193     1174703521, 1527114613, 1985248999,
 194     (unsigned long)0x99d43ea5, (unsigned long)0xc7fa5177
 195   };
 196   int i = *prime_offset;
 197
 198   for (; i < ARRAY_SIZE (primes); i++)
 199     if (primes[i] >= size)
 200       {
 201         /* Set the offset to the next prime.  That is safe because,
 202            next time we are called, it will be with a larger SIZE,
 203            which means we could never return the same prime anyway.
 204            (If that is not the case, the caller can simply reset
 205            *prime_offset.)  */
 206         *prime_offset = i + 1;
 207         return primes[i];
 208       }
 209
 210   abort ();
 211   return 0;
 212 }
 213
 214 /* Create a hash table of INITIAL_SIZE with hash function
 215    HASH_FUNCTION and test function TEST_FUNCTION.  INITIAL_SIZE will
 216    be rounded to the next prime, so you don't have to worry about it
 217    being a prime number.
 218
 219    Consequently, if you wish to start out with a "small" table which
 220    will be regrown as needed, specify INITIAL_SIZE 0.  */
 221
 222 struct hash_table *
 223 hash_table_new (int initial_size,
 224                 unsigned long (*hash_function) (const void *),
 225                 int (*test_function) (const void *, const void *))
 226 {
 227   struct hash_table *ht
 228     = (struct hash_table *)xmalloc (sizeof (struct hash_table));
 229
 230   ht->hash_function = hash_function;
 231   ht->test_function = test_function;
 232
 233   ht->prime_offset = 0;
 234   ht->size = prime_size (initial_size, &ht->prime_offset);
 235   ht->resize_threshold = ht->size * 3 / 4;
 236
 237   ht->count = 0;
 238
 239   ht->mappings = xmalloc (ht->size * sizeof (struct mapping));
 240   memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
 241
 242   return ht;
 243 }
 244
 245 /* Free the data associated with hash table HT. */
 246
 247 void
 248 hash_table_destroy (struct hash_table *ht)
 249 {
 250   xfree (ht->mappings);
 251   xfree (ht);
 252 }
 253
 254 /* The heart of almost all functions in this file -- find the mapping
 255    whose KEY is equal to key, using linear probing.  Returns the
 256    mapping that matches KEY, or NULL if none matches.  */
 257
 258 static inline struct mapping *
 259 find_mapping (struct hash_table *ht, const void *key)
 260 {
 261   struct mapping *mappings = ht->mappings;
 262   int size = ht->size;
 263   struct mapping *mp = mappings + HASH_POSITION (ht, key);
 264   int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
 265
 266   LOOP_NON_EMPTY (mp, mappings, size)
 267     if (equals (key, mp->key))
 268       return mp;
 269   return NULL;
 270 }
 271
 272 /* Get the value that corresponds to the key KEY in the hash table HT.
 273    If no value is found, return NULL.  Note that NULL is a legal value
 274    for value; if you are storing NULLs in your hash table, you can use
 275    hash_table_contains to be sure that a (possibly NULL) value exists
 276    in the table.  Or, you can use hash_table_get_pair instead of this
 277    function.  */
 278
 279 void *
 280 hash_table_get (struct hash_table *ht, const void *key)
 281 {
 282   struct mapping *mp = find_mapping (ht, key);
 283   if (mp)
 284     return mp->value;
 285   else
 286     return NULL;
 287 }
 288
 289 /* Like hash_table_get, but writes out the pointers to both key and
 290    value.  Returns non-zero on success.  */
 291
 292 int
 293 hash_table_get_pair (struct hash_table *ht, const void *lookup_key,
 294                      void *orig_key, void *value)
 295 {
 296   struct mapping *mp = find_mapping (ht, lookup_key);
 297
 298   if (mp)
 299     {
 300       if (orig_key)
 301         *(void **)orig_key = mp->key;
 302       if (value)
 303         *(void **)value = mp->value;
 304       return 1;
 305     }
 306   else
 307     return 0;
 308 }
 309
 310 /* Return 1 if HT contains KEY, 0 otherwise. */
 311
 312 int
 313 hash_table_contains (struct hash_table *ht, const void *key)
 314 {
 315   return find_mapping (ht, key) != NULL;
 316 }
 317
 318 /* Grow hash table HT as necessary, and rehash all the key-value
 319    mappings.  */
 320
 321 static void
 322 grow_hash_table (struct hash_table *ht)
 323 {
 324   struct mapping *old_mappings = ht->mappings;
 325   struct mapping *old_end      = ht->mappings + ht->size;
 326   struct mapping *mp, *mappings;
 327   int newsize;
 328
 329   newsize = prime_size (ht->size * 2, &ht->prime_offset);
 330 #if 0
 331   printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
 332           ht->size, newsize,
 333           (double)100 * ht->count / ht->size,
 334           (double)100 * ht->count / newsize);
 335 #endif
 336
 337   ht->size = newsize;
 338   ht->resize_threshold = newsize * 3 / 4;
 339
 340   mappings = xmalloc (ht->size * sizeof (struct mapping));
 341   memset (mappings, '\0', ht->size * sizeof (struct mapping));
 342   ht->mappings = mappings;
 343
 344   for (mp = old_mappings; mp < old_end; mp++)
 345     if (!EMPTY_MAPPING_P (mp))
 346       {
 347         struct mapping *new_mp = mappings + HASH_POSITION (ht, mp->key);
 348         /* We don't need to call test function and worry about
 349            collisions because all the keys come from the hash table
 350            and are therefore guaranteed to be unique.  */
 351         LOOP_NON_EMPTY (new_mp, mappings, newsize)
 352           ;
 353         *new_mp = *mp;
 354       }
 355
 356   xfree (old_mappings);
 357 }
 358
 359 /* Put VALUE in the hash table HT under the key KEY.  This regrows the
 360    table if necessary.  */
 361
 362 void
 363 hash_table_put (struct hash_table *ht, const void *key, void *value)
 364 {
 365   struct mapping *mappings = ht->mappings;
 366   int size = ht->size;
 367   int (*equals) PARAMS ((const void *, const void *)) = ht->test_function;
 368
 369   struct mapping *mp = mappings + HASH_POSITION (ht, key);
 370
 371   LOOP_NON_EMPTY (mp, mappings, size)
 372     if (equals (key, mp->key))
 373       {
 374         mp->key   = (void *)key; /* const? */
 375         mp->value = value;
 376         return;
 377       }
 378
 379   ++ht->count;
 380   mp->key   = (void *)key;      /* const? */
 381   mp->value = value;
 382
 383   if (ht->count > ht->resize_threshold)
 384     /* When table is 75% full, regrow it. */
 385     grow_hash_table (ht);
 386 }
 387
 388 /* Remove a mapping that matches KEY from HT.  Return 0 if there was
 389    no such entry; return 1 if an entry was removed.  */
 390
 391 int
 392 hash_table_remove (struct hash_table *ht, const void *key)
 393 {
 394   struct mapping *mp = find_mapping (ht, key);
 395   if (!mp)
 396     return 0;
 397   else
 398     {
 399       int size = ht->size;
 400       struct mapping *mappings = ht->mappings;
 401
 402       mp->key = NULL;
 403       --ht->count;
 404
 405       /* Rehash all the entries following MP.  The alternative
 406          approach is to mark the entry as deleted, i.e. create a
 407          "tombstone".  That makes remove faster, but leaves a lot of
 408          garbage and slows down hash_table_get and hash_table_put.  */
 409
 410       mp = NEXT_MAPPING (mp, mappings, size);
 411       LOOP_NON_EMPTY (mp, mappings, size)
 412         {
 413           const void *key2 = mp->key;
 414           struct mapping *mp_new = mappings + HASH_POSITION (ht, key2);
 415
 416           /* Find the new location for the key. */
 417
 418           LOOP_NON_EMPTY (mp_new, mappings, size)
 419             if (key2 == mp_new->key)
 420               /* The mapping MP (key2) is already where we want it (in
 421                  MP_NEW's "chain" of keys.)  */
 422               goto next_rehash;
 423
 424           *mp_new = *mp;
 425           mp->key = NULL;
 426
 427         next_rehash:
 428           ;
 429         }
 430       return 1;
 431     }
 432 }
 433
 434 /* Clear HT of all entries.  After calling this function, the count
 435    and the fullness of the hash table will be zero.  The size will
 436    remain unchanged.  */
 437
 438 void
 439 hash_table_clear (struct hash_table *ht)
 440 {
 441   memset (ht->mappings, '\0', ht->size * sizeof (struct mapping));
 442   ht->count = 0;
 443 }
 444
 445 /* Map MAPFUN over all the mappings in hash table HT.  MAPFUN is
 446    called with three arguments: the key, the value, and the CLOSURE.
 447
 448    It is undefined what happens if you add or remove entries in the
 449    hash table while hash_table_map is running.  The exception is the
 450    entry you're currently mapping over; you may remove or change that
 451    entry.  */
 452
 453 void
 454 hash_table_map (struct hash_table *ht,
 455                 int (*mapfun) (void *, void *, void *),
 456                 void *closure)
 457 {
 458   struct mapping *mp  = ht->mappings;
 459   struct mapping *end = ht->mappings + ht->size;
 460
 461   for (; mp < end; mp++)
 462     if (!EMPTY_MAPPING_P (mp))
 463       {
 464         void *key;
 465       repeat:
 466         key = mp->key;
 467         if (mapfun (key, mp->value, closure))
 468           return;
 469         /* hash_table_remove might have moved the adjacent
 470            mappings. */
 471         if (mp->key != key && !EMPTY_MAPPING_P (mp))
 472           goto repeat;
 473       }
 474 }
 475
 476 /* Return the number of elements in the hash table.  This is not the
 477    same as the physical size of the hash table, which is always
 478    greater than the number of elements.  */
 479
 480 int
 481 hash_table_count (struct hash_table *ht)
 482 {
 483   return ht->count;
 484 }
 485 \f
 486 /* Functions from this point onward are meant for convenience and
 487    don't strictly belong to this file.  However, this is as good a
 488    place for them as any.  */
 489
 490 /* ========
 491    Support for hash tables whose keys are strings.
 492    ======== */
 493
 494 /* 31 bit hash function.  Taken from Gnome's glib, modified to use
 495    standard C types.
 496
 497    We used to use the popular hash function from the Dragon Book, but
 498    this one seems to perform much better.  */
 499
 500 unsigned long
 501 string_hash (const void *key)
 502 {
 503   const char *p = key;
 504   unsigned int h = *p;
 505
 506   if (h)
 507     for (p += 1; *p != '\0'; p++)
 508       h = (h << 5) - h + *p;
 509
 510   return h;
 511 }
 512
 513 /* Frontend for strcmp usable for hash tables. */
 514
 515 int
 516 string_cmp (const void *s1, const void *s2)
 517 {
 518   return !strcmp ((const char *)s1, (const char *)s2);
 519 }
 520
 521 /* Return a hash table of initial size INITIAL_SIZE suitable to use
 522    strings as keys.  */
 523
 524 struct hash_table *
 525 make_string_hash_table (int initial_size)
 526 {
 527   return hash_table_new (initial_size, string_hash, string_cmp);
 528 }
 529
 530 /* ========
 531    Support for hash tables whose keys are strings, but which are
 532    compared case-insensitively.
 533    ======== */
 534
 535 /* Like string_hash, but produce the same hash regardless of the case. */
 536
 537 static unsigned long
 538 string_hash_nocase (const void *key)
 539 {
 540   const char *p = key;
 541   unsigned int h = TOLOWER (*p);
 542
 543   if (h)
 544     for (p += 1; *p != '\0'; p++)
 545       h = (h << 5) - h + TOLOWER (*p);
 546
 547   return h;
 548 }
 549
 550 /* Like string_cmp, but doing case-insensitive compareison. */
 551
 552 static int
 553 string_cmp_nocase (const void *s1, const void *s2)
 554 {
 555   return !strcasecmp ((const char *)s1, (const char *)s2);
 556 }
 557
 558 /* Like make_string_hash_table, but uses string_hash_nocase and
 559    string_cmp_nocase.  */
 560
 561 struct hash_table *
 562 make_nocase_string_hash_table (int initial_size)
 563 {
 564   return hash_table_new (initial_size, string_hash_nocase, string_cmp_nocase);
 565 }
 566
 567 #if 0
 568 /* If I ever need it: hashing of integers. */
 569
 570 unsigned int
 571 inthash (unsigned int key)
 572 {
 573   key += (key << 12);
 574   key ^= (key >> 22);
 575   key += (key << 4);
 576   key ^= (key >> 9);
 577   key += (key << 10);
 578   key ^= (key >> 2);
 579   key += (key << 7);
 580   key ^= (key >> 12);
 581   return key;
 582 }
 583 #endif
 584 \f
 585 #ifdef STANDALONE
 586
 587 #include <stdio.h>
 588 #include <string.h>
 589
 590 int
 591 print_hash_table_mapper (void *key, void *value, void *count)
 592 {
 593   ++*(int *)count;
 594   printf ("%s: %s\n", (const char *)key, (char *)value);
 595   return 0;
 596 }
 597
 598 void
 599 print_hash (struct hash_table *sht)
 600 {
 601   int debug_count = 0;
 602   hash_table_map (sht, print_hash_table_mapper, &debug_count);
 603   assert (debug_count == sht->count);
 604 }
 605
 606 int
 607 main (void)
 608 {
 609   struct hash_table *ht = make_string_hash_table (0);
 610   char line[80];
 611   while ((fgets (line, sizeof (line), stdin)))
 612     {
 613       int len = strlen (line);
 614       if (len <= 1)
 615         continue;
 616       line[--len] = '\0';
 617       if (!hash_table_contains (ht, line))
 618         hash_table_put (ht, strdup (line), "here I am!");
 619 #if 1
 620       if (len % 5 == 0)
 621         {
 622           char *line_copy;
 623           if (hash_table_get_pair (ht, line, &line_copy, NULL))
 624             {
 625               hash_table_remove (ht, line);
 626               xfree (line_copy);
 627             }
 628         }
 629 #endif
 630     }
 631 #if 0
 632   print_hash (ht);
 633 #endif
 634 #if 1
 635   printf ("%d %d\n", ht->count, ht->size);
 636 #endif
 637   return 0;
 638 }
 639 #endif