sjero.net Git - wget/blob - src/utils.c

   1 /* Various functions of utilitarian nature.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else  /* not HAVE_STRING_H */
  38 # include <strings.h>
  39 #endif /* not HAVE_STRING_H */
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #ifdef HAVE_MMAP
  45 # include <sys/mman.h>
  46 #endif
  47 #ifdef HAVE_PWD_H
  48 # include <pwd.h>
  49 #endif
  50 #include <limits.h>
  51 #ifdef HAVE_UTIME_H
  52 # include <utime.h>
  53 #endif
  54 #ifdef HAVE_SYS_UTIME_H
  55 # include <sys/utime.h>
  56 #endif
  57 #include <errno.h>
  58 #ifdef NeXT
  59 # include <libc.h>              /* for access() */
  60 #endif
  61 #include <fcntl.h>
  62 #include <assert.h>
  63
  64 /* For TIOCGWINSZ and friends: */
  65 #ifdef HAVE_SYS_IOCTL_H
  66 # include <sys/ioctl.h>
  67 #endif
  68 #ifdef HAVE_TERMIOS_H
  69 # include <termios.h>
  70 #endif
  71
  72 /* Needed for run_with_timeout. */
  73 #undef USE_SIGNAL_TIMEOUT
  74 #ifdef HAVE_SIGNAL_H
  75 # include <signal.h>
  76 #endif
  77 #ifdef HAVE_SETJMP_H
  78 # include <setjmp.h>
  79 #endif
  80 /* If sigsetjmp is a macro, configure won't pick it up. */
  81 #ifdef sigsetjmp
  82 # define HAVE_SIGSETJMP
  83 #endif
  84 #ifdef HAVE_SIGNAL
  85 # ifdef HAVE_SIGSETJMP
  86 #  define USE_SIGNAL_TIMEOUT
  87 # endif
  88 # ifdef HAVE_SIGBLOCK
  89 #  define USE_SIGNAL_TIMEOUT
  90 # endif
  91 #endif
  92
  93 #include "wget.h"
  94 #include "utils.h"
  95 #include "hash.h"
  96
  97 #ifndef errno
  98 extern int errno;
  99 #endif
 100
 101 /* This section implements several wrappers around the basic
 102    allocation routines.  This is done for two reasons: first, so that
 103    the callers of these functions need not consistently check for
 104    errors.  If there is not enough virtual memory for running Wget,
 105    something is seriously wrong, and Wget exits with an appropriate
 106    error message.
 107
 108    The second reason why these are useful is that, if DEBUG_MALLOC is
 109    defined, they also provide a handy (if crude) malloc debugging
 110    interface that checks memory leaks.  */
 111
 112 /* Croak the fatal memory error and bail out with non-zero exit
 113    status.  */
 114 static void
 115 memfatal (const char *what)
 116 {
 117   /* Make sure we don't try to store part of the log line, and thus
 118      call malloc.  */
 119   log_set_save_context (0);
 120   logprintf (LOG_ALWAYS, _("%s: %s: Not enough memory.\n"), exec_name, what);
 121   exit (1);
 122 }
 123
 124 /* These functions end with _real because they need to be
 125    distinguished from the debugging functions, and from the macros.
 126    Explanation follows:
 127
 128    If memory debugging is not turned on, wget.h defines these:
 129
 130      #define xmalloc xmalloc_real
 131      #define xrealloc xrealloc_real
 132      #define xstrdup xstrdup_real
 133      #define xfree free
 134
 135    In case of memory debugging, the definitions are a bit more
 136    complex, because we want to provide more information, *and* we want
 137    to call the debugging code.  (The former is the reason why xmalloc
 138    and friends need to be macros in the first place.)  Then it looks
 139    like this:
 140
 141      #define xmalloc(a) xmalloc_debug (a, __FILE__, __LINE__)
 142      #define xfree(a)   xfree_debug (a, __FILE__, __LINE__)
 143      #define xrealloc(a, b) xrealloc_debug (a, b, __FILE__, __LINE__)
 144      #define xstrdup(a) xstrdup_debug (a, __FILE__, __LINE__)
 145
 146    Each of the *_debug function does its magic and calls the real one.  */
 147
 148 #ifdef DEBUG_MALLOC
 149 # define STATIC_IF_DEBUG static
 150 #else
 151 # define STATIC_IF_DEBUG
 152 #endif
 153
 154 STATIC_IF_DEBUG void *
 155 xmalloc_real (size_t size)
 156 {
 157   void *ptr = malloc (size);
 158   if (!ptr)
 159     memfatal ("malloc");
 160   return ptr;
 161 }
 162
 163 STATIC_IF_DEBUG void *
 164 xrealloc_real (void *ptr, size_t newsize)
 165 {
 166   void *newptr;
 167
 168   /* Not all Un*xes have the feature of realloc() that calling it with
 169      a NULL-pointer is the same as malloc(), but it is easy to
 170      simulate.  */
 171   if (ptr)
 172     newptr = realloc (ptr, newsize);
 173   else
 174     newptr = malloc (newsize);
 175   if (!newptr)
 176     memfatal ("realloc");
 177   return newptr;
 178 }
 179
 180 STATIC_IF_DEBUG char *
 181 xstrdup_real (const char *s)
 182 {
 183   char *copy;
 184
 185 #ifndef HAVE_STRDUP
 186   int l = strlen (s);
 187   copy = malloc (l + 1);
 188   if (!copy)
 189     memfatal ("strdup");
 190   memcpy (copy, s, l + 1);
 191 #else  /* HAVE_STRDUP */
 192   copy = strdup (s);
 193   if (!copy)
 194     memfatal ("strdup");
 195 #endif /* HAVE_STRDUP */
 196
 197   return copy;
 198 }
 199
 200 #ifdef DEBUG_MALLOC
 201
 202 /* Crude home-grown routines for debugging some malloc-related
 203    problems.  Featured:
 204
 205    * Counting the number of malloc and free invocations, and reporting
 206      the "balance", i.e. how many times more malloc was called than it
 207      was the case with free.
 208
 209    * Making malloc store its entry into a simple array and free remove
 210      stuff from that array.  At the end, print the pointers which have
 211      not been freed, along with the source file and the line number.
 212      This also has the side-effect of detecting freeing memory that
 213      was never allocated.
 214
 215    Note that this kind of memory leak checking strongly depends on
 216    every malloc() being followed by a free(), even if the program is
 217    about to finish.  Wget is careful to free the data structure it
 218    allocated in init.c.  */
 219
 220 static int malloc_count, free_count;
 221
 222 static struct {
 223   char *ptr;
 224   const char *file;
 225   int line;
 226 } malloc_debug[100000];
 227
 228 /* Both register_ptr and unregister_ptr take O(n) operations to run,
 229    which can be a real problem.  It would be nice to use a hash table
 230    for malloc_debug, but the functions in hash.c are not suitable
 231    because they can call malloc() themselves.  Maybe it would work if
 232    the hash table were preallocated to a huge size, and if we set the
 233    rehash threshold to 1.0.  */
 234
 235 /* Register PTR in malloc_debug.  Abort if this is not possible
 236    (presumably due to the number of current allocations exceeding the
 237    size of malloc_debug.)  */
 238
 239 static void
 240 register_ptr (void *ptr, const char *file, int line)
 241 {
 242   int i;
 243   for (i = 0; i < countof (malloc_debug); i++)
 244     if (malloc_debug[i].ptr == NULL)
 245       {
 246         malloc_debug[i].ptr = ptr;
 247         malloc_debug[i].file = file;
 248         malloc_debug[i].line = line;
 249         return;
 250       }
 251   abort ();
 252 }
 253
 254 /* Unregister PTR from malloc_debug.  Abort if PTR is not present in
 255    malloc_debug.  (This catches calling free() with a bogus pointer.)  */
 256
 257 static void
 258 unregister_ptr (void *ptr)
 259 {
 260   int i;
 261   for (i = 0; i < countof (malloc_debug); i++)
 262     if (malloc_debug[i].ptr == ptr)
 263       {
 264         malloc_debug[i].ptr = NULL;
 265         return;
 266       }
 267   abort ();
 268 }
 269
 270 /* Print the malloc debug stats that can be gathered from the above
 271    information.  Currently this is the count of mallocs, frees, the
 272    difference between the two, and the dump of the contents of
 273    malloc_debug.  The last part are the memory leaks.  */
 274
 275 void
 276 print_malloc_debug_stats (void)
 277 {
 278   int i;
 279   printf ("\nMalloc:  %d\nFree:    %d\nBalance: %d\n\n",
 280           malloc_count, free_count, malloc_count - free_count);
 281   for (i = 0; i < countof (malloc_debug); i++)
 282     if (malloc_debug[i].ptr != NULL)
 283       printf ("0x%08ld: %s:%d\n", (long)malloc_debug[i].ptr,
 284               malloc_debug[i].file, malloc_debug[i].line);
 285 }
 286
 287 void *
 288 xmalloc_debug (size_t size, const char *source_file, int source_line)
 289 {
 290   void *ptr = xmalloc_real (size);
 291   ++malloc_count;
 292   register_ptr (ptr, source_file, source_line);
 293   return ptr;
 294 }
 295
 296 void
 297 xfree_debug (void *ptr, const char *source_file, int source_line)
 298 {
 299   assert (ptr != NULL);
 300   ++free_count;
 301   unregister_ptr (ptr);
 302   free (ptr);
 303 }
 304
 305 void *
 306 xrealloc_debug (void *ptr, size_t newsize, const char *source_file, int source_line)
 307 {
 308   void *newptr = xrealloc_real (ptr, newsize);
 309   if (!ptr)
 310     {
 311       ++malloc_count;
 312       register_ptr (newptr, source_file, source_line);
 313     }
 314   else if (newptr != ptr)
 315     {
 316       unregister_ptr (ptr);
 317       register_ptr (newptr, source_file, source_line);
 318     }
 319   return newptr;
 320 }
 321
 322 char *
 323 xstrdup_debug (const char *s, const char *source_file, int source_line)
 324 {
 325   char *copy = xstrdup_real (s);
 326   ++malloc_count;
 327   register_ptr (copy, source_file, source_line);
 328   return copy;
 329 }
 330
 331 #endif /* DEBUG_MALLOC */
 332 \f
 333 /* Utility function: like xstrdup(), but also lowercases S.  */
 334
 335 char *
 336 xstrdup_lower (const char *s)
 337 {
 338   char *copy = xstrdup (s);
 339   char *p = copy;
 340   for (; *p; p++)
 341     *p = TOLOWER (*p);
 342   return copy;
 343 }
 344
 345 /* Return a count of how many times CHR occurs in STRING. */
 346
 347 int
 348 count_char (const char *string, char chr)
 349 {
 350   const char *p;
 351   int count = 0;
 352   for (p = string; *p; p++)
 353     if (*p == chr)
 354       ++count;
 355   return count;
 356 }
 357
 358 /* Copy the string formed by two pointers (one on the beginning, other
 359    on the char after the last char) to a new, malloc-ed location.
 360    0-terminate it.  */
 361 char *
 362 strdupdelim (const char *beg, const char *end)
 363 {
 364   char *res = (char *)xmalloc (end - beg + 1);
 365   memcpy (res, beg, end - beg);
 366   res[end - beg] = '\0';
 367   return res;
 368 }
 369
 370 /* Parse a string containing comma-separated elements, and return a
 371    vector of char pointers with the elements.  Spaces following the
 372    commas are ignored.  */
 373 char **
 374 sepstring (const char *s)
 375 {
 376   char **res;
 377   const char *p;
 378   int i = 0;
 379
 380   if (!s || !*s)
 381     return NULL;
 382   res = NULL;
 383   p = s;
 384   while (*s)
 385     {
 386       if (*s == ',')
 387         {
 388           res = (char **)xrealloc (res, (i + 2) * sizeof (char *));
 389           res[i] = strdupdelim (p, s);
 390           res[++i] = NULL;
 391           ++s;
 392           /* Skip the blanks following the ','.  */
 393           while (ISSPACE (*s))
 394             ++s;
 395           p = s;
 396         }
 397       else
 398         ++s;
 399     }
 400   res = (char **)xrealloc (res, (i + 2) * sizeof (char *));
 401   res[i] = strdupdelim (p, s);
 402   res[i + 1] = NULL;
 403   return res;
 404 }
 405 \f
 406 /* Return pointer to a static char[] buffer in which zero-terminated
 407    string-representation of TM (in form hh:mm:ss) is printed.
 408
 409    If TM is non-NULL, the current time-in-seconds will be stored
 410    there.
 411
 412    (#### This is misleading: one would expect TM would be used instead
 413    of the current time in that case.  This design was probably
 414    influenced by the design time(2), and should be changed at some
 415    points.  No callers use non-NULL TM anyway.)  */
 416
 417 char *
 418 time_str (time_t *tm)
 419 {
 420   static char output[15];
 421   struct tm *ptm;
 422   time_t secs = time (tm);
 423
 424   if (secs == -1)
 425     {
 426       /* In case of error, return the empty string.  Maybe we should
 427          just abort if this happens?  */
 428       *output = '\0';
 429       return output;
 430     }
 431   ptm = localtime (&secs);
 432   sprintf (output, "%02d:%02d:%02d", ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
 433   return output;
 434 }
 435
 436 /* Like the above, but include the date: YYYY-MM-DD hh:mm:ss.  */
 437
 438 char *
 439 datetime_str (time_t *tm)
 440 {
 441   static char output[20];       /* "YYYY-MM-DD hh:mm:ss" + \0 */
 442   struct tm *ptm;
 443   time_t secs = time (tm);
 444
 445   if (secs == -1)
 446     {
 447       /* In case of error, return the empty string.  Maybe we should
 448          just abort if this happens?  */
 449       *output = '\0';
 450       return output;
 451     }
 452   ptm = localtime (&secs);
 453   sprintf (output, "%04d-%02d-%02d %02d:%02d:%02d",
 454            ptm->tm_year + 1900, ptm->tm_mon + 1, ptm->tm_mday,
 455            ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
 456   return output;
 457 }
 458 \f
 459 /* The Windows versions of the following two functions are defined in
 460    mswindows.c.  */
 461
 462 #ifndef WINDOWS
 463 void
 464 fork_to_background (void)
 465 {
 466   pid_t pid;
 467   /* Whether we arrange our own version of opt.lfilename here.  */
 468   int changedp = 0;
 469
 470   if (!opt.lfilename)
 471     {
 472       opt.lfilename = unique_name (DEFAULT_LOGFILE, 0);
 473       changedp = 1;
 474     }
 475   pid = fork ();
 476   if (pid < 0)
 477     {
 478       /* parent, error */
 479       perror ("fork");
 480       exit (1);
 481     }
 482   else if (pid != 0)
 483     {
 484       /* parent, no error */
 485       printf (_("Continuing in background, pid %d.\n"), (int)pid);
 486       if (changedp)
 487         printf (_("Output will be written to `%s'.\n"), opt.lfilename);
 488       exit (0);                 /* #### should we use _exit()? */
 489     }
 490
 491   /* child: give up the privileges and keep running. */
 492   setsid ();
 493   freopen ("/dev/null", "r", stdin);
 494   freopen ("/dev/null", "w", stdout);
 495   freopen ("/dev/null", "w", stderr);
 496 }
 497 #endif /* not WINDOWS */
 498 \f
 499 /* "Touch" FILE, i.e. make its atime and mtime equal to the time
 500    specified with TM.  */
 501 void
 502 touch (const char *file, time_t tm)
 503 {
 504 #ifdef HAVE_STRUCT_UTIMBUF
 505   struct utimbuf times;
 506   times.actime = times.modtime = tm;
 507 #else
 508   time_t times[2];
 509   times[0] = times[1] = tm;
 510 #endif
 511
 512   if (utime (file, &times) == -1)
 513     logprintf (LOG_NOTQUIET, "utime(%s): %s\n", file, strerror (errno));
 514 }
 515
 516 /* Checks if FILE is a symbolic link, and removes it if it is.  Does
 517    nothing under MS-Windows.  */
 518 int
 519 remove_link (const char *file)
 520 {
 521   int err = 0;
 522   struct stat st;
 523
 524   if (lstat (file, &st) == 0 && S_ISLNK (st.st_mode))
 525     {
 526       DEBUGP (("Unlinking %s (symlink).\n", file));
 527       err = unlink (file);
 528       if (err != 0)
 529         logprintf (LOG_VERBOSE, _("Failed to unlink symlink `%s': %s\n"),
 530                    file, strerror (errno));
 531     }
 532   return err;
 533 }
 534
 535 /* Does FILENAME exist?  This is quite a lousy implementation, since
 536    it supplies no error codes -- only a yes-or-no answer.  Thus it
 537    will return that a file does not exist if, e.g., the directory is
 538    unreadable.  I don't mind it too much currently, though.  The
 539    proper way should, of course, be to have a third, error state,
 540    other than true/false, but that would introduce uncalled-for
 541    additional complexity to the callers.  */
 542 int
 543 file_exists_p (const char *filename)
 544 {
 545 #ifdef HAVE_ACCESS
 546   return access (filename, F_OK) >= 0;
 547 #else
 548   struct stat buf;
 549   return stat (filename, &buf) >= 0;
 550 #endif
 551 }
 552
 553 /* Returns 0 if PATH is a directory, 1 otherwise (any kind of file).
 554    Returns 0 on error.  */
 555 int
 556 file_non_directory_p (const char *path)
 557 {
 558   struct stat buf;
 559   /* Use lstat() rather than stat() so that symbolic links pointing to
 560      directories can be identified correctly.  */
 561   if (lstat (path, &buf) != 0)
 562     return 0;
 563   return S_ISDIR (buf.st_mode) ? 0 : 1;
 564 }
 565
 566 /* Return the size of file named by FILENAME, or -1 if it cannot be
 567    opened or seeked into. */
 568 long
 569 file_size (const char *filename)
 570 {
 571   long size;
 572   /* We use fseek rather than stat to determine the file size because
 573      that way we can also verify whether the file is readable.
 574      Inspired by the POST patch by Arnaud Wylie.  */
 575   FILE *fp = fopen (filename, "rb");
 576   if (!fp)
 577     return -1;
 578   fseek (fp, 0, SEEK_END);
 579   size = ftell (fp);
 580   fclose (fp);
 581   return size;
 582 }
 583
 584 /* stat file names named PREFIX.1, PREFIX.2, etc., until one that
 585    doesn't exist is found.  Return a freshly allocated copy of the
 586    unused file name.  */
 587
 588 static char *
 589 unique_name_1 (const char *prefix)
 590 {
 591   int count = 1;
 592   int plen = strlen (prefix);
 593   char *template = (char *)alloca (plen + 1 + 24);
 594   char *template_tail = template + plen;
 595
 596   memcpy (template, prefix, plen);
 597   *template_tail++ = '.';
 598
 599   do
 600     number_to_string (template_tail, count++);
 601   while (file_exists_p (template));
 602
 603   return xstrdup (template);
 604 }
 605
 606 /* Return a unique file name, based on FILE.
 607
 608    More precisely, if FILE doesn't exist, it is returned unmodified.
 609    If not, FILE.1 is tried, then FILE.2, etc.  The first FILE.<number>
 610    file name that doesn't exist is returned.
 611
 612    The resulting file is not created, only verified that it didn't
 613    exist at the point in time when the function was called.
 614    Therefore, where security matters, don't rely that the file created
 615    by this function exists until you open it with O_EXCL or
 616    something.
 617
 618    If ALLOW_PASSTHROUGH is 0, it always returns a freshly allocated
 619    string.  Otherwise, it may return FILE if the file doesn't exist
 620    (and therefore doesn't need changing).  */
 621
 622 char *
 623 unique_name (const char *file, int allow_passthrough)
 624 {
 625   /* If the FILE itself doesn't exist, return it without
 626      modification. */
 627   if (!file_exists_p (file))
 628     return allow_passthrough ? (char *)file : xstrdup (file);
 629
 630   /* Otherwise, find a numeric suffix that results in unused file name
 631      and return it.  */
 632   return unique_name_1 (file);
 633 }
 634 \f
 635 /* Create DIRECTORY.  If some of the pathname components of DIRECTORY
 636    are missing, create them first.  In case any mkdir() call fails,
 637    return its error status.  Returns 0 on successful completion.
 638
 639    The behaviour of this function should be identical to the behaviour
 640    of `mkdir -p' on systems where mkdir supports the `-p' option.  */
 641 int
 642 make_directory (const char *directory)
 643 {
 644   int quit = 0;
 645   int i;
 646   int ret = 0;
 647   char *dir;
 648
 649   /* Make a copy of dir, to be able to write to it.  Otherwise, the
 650      function is unsafe if called with a read-only char *argument.  */
 651   STRDUP_ALLOCA (dir, directory);
 652
 653   /* If the first character of dir is '/', skip it (and thus enable
 654      creation of absolute-pathname directories.  */
 655   for (i = (*dir == '/'); 1; ++i)
 656     {
 657       for (; dir[i] && dir[i] != '/'; i++)
 658         ;
 659       if (!dir[i])
 660         quit = 1;
 661       dir[i] = '\0';
 662       /* Check whether the directory already exists.  Allow creation of
 663          of intermediate directories to fail, as the initial path components
 664          are not necessarily directories!  */
 665       if (!file_exists_p (dir))
 666         ret = mkdir (dir, 0777);
 667       else
 668         ret = 0;
 669       if (quit)
 670         break;
 671       else
 672         dir[i] = '/';
 673     }
 674   return ret;
 675 }
 676
 677 /* Merge BASE with FILE.  BASE can be a directory or a file name, FILE
 678    should be a file name.
 679
 680    file_merge("/foo/bar", "baz")  => "/foo/baz"
 681    file_merge("/foo/bar/", "baz") => "/foo/bar/baz"
 682    file_merge("foo", "bar")       => "bar"
 683
 684    In other words, it's a simpler and gentler version of uri_merge_1.  */
 685
 686 char *
 687 file_merge (const char *base, const char *file)
 688 {
 689   char *result;
 690   const char *cut = (const char *)strrchr (base, '/');
 691
 692   if (!cut)
 693     return xstrdup (file);
 694
 695   result = (char *)xmalloc (cut - base + 1 + strlen (file) + 1);
 696   memcpy (result, base, cut - base);
 697   result[cut - base] = '/';
 698   strcpy (result + (cut - base) + 1, file);
 699
 700   return result;
 701 }
 702 \f
 703 static int in_acclist PARAMS ((const char *const *, const char *, int));
 704
 705 /* Determine whether a file is acceptable to be followed, according to
 706    lists of patterns to accept/reject.  */
 707 int
 708 acceptable (const char *s)
 709 {
 710   int l = strlen (s);
 711
 712   while (l && s[l] != '/')
 713     --l;
 714   if (s[l] == '/')
 715     s += (l + 1);
 716   if (opt.accepts)
 717     {
 718       if (opt.rejects)
 719         return (in_acclist ((const char *const *)opt.accepts, s, 1)
 720                 && !in_acclist ((const char *const *)opt.rejects, s, 1));
 721       else
 722         return in_acclist ((const char *const *)opt.accepts, s, 1);
 723     }
 724   else if (opt.rejects)
 725     return !in_acclist ((const char *const *)opt.rejects, s, 1);
 726   return 1;
 727 }
 728
 729 /* Compare S1 and S2 frontally; S2 must begin with S1.  E.g. if S1 is
 730    `/something', frontcmp() will return 1 only if S2 begins with
 731    `/something'.  Otherwise, 0 is returned.  */
 732 int
 733 frontcmp (const char *s1, const char *s2)
 734 {
 735   for (; *s1 && *s2 && (*s1 == *s2); ++s1, ++s2);
 736   return !*s1;
 737 }
 738
 739 /* Iterate through STRLIST, and return the first element that matches
 740    S, through wildcards or front comparison (as appropriate).  */
 741 static char *
 742 proclist (char **strlist, const char *s, enum accd flags)
 743 {
 744   char **x;
 745
 746   for (x = strlist; *x; x++)
 747     if (has_wildcards_p (*x))
 748       {
 749         if (fnmatch (*x, s, FNM_PATHNAME) == 0)
 750           break;
 751       }
 752     else
 753       {
 754         char *p = *x + ((flags & ALLABS) && (**x == '/')); /* Remove '/' */
 755         if (frontcmp (p, s))
 756           break;
 757       }
 758   return *x;
 759 }
 760
 761 /* Returns whether DIRECTORY is acceptable for download, wrt the
 762    include/exclude lists.
 763
 764    If FLAGS is ALLABS, the leading `/' is ignored in paths; relative
 765    and absolute paths may be freely intermixed.  */
 766 int
 767 accdir (const char *directory, enum accd flags)
 768 {
 769   /* Remove starting '/'.  */
 770   if (flags & ALLABS && *directory == '/')
 771     ++directory;
 772   if (opt.includes)
 773     {
 774       if (!proclist (opt.includes, directory, flags))
 775         return 0;
 776     }
 777   if (opt.excludes)
 778     {
 779       if (proclist (opt.excludes, directory, flags))
 780         return 0;
 781     }
 782   return 1;
 783 }
 784
 785 /* Return non-zero if STRING ends with TAIL.  For instance:
 786
 787    match_tail ("abc", "bc", 0)  -> 1
 788    match_tail ("abc", "ab", 0)  -> 0
 789    match_tail ("abc", "abc", 0) -> 1
 790
 791    If FOLD_CASE_P is non-zero, the comparison will be
 792    case-insensitive.  */
 793
 794 int
 795 match_tail (const char *string, const char *tail, int fold_case_p)
 796 {
 797   int i, j;
 798
 799   /* We want this to be fast, so we code two loops, one with
 800      case-folding, one without. */
 801
 802   if (!fold_case_p)
 803     {
 804       for (i = strlen (string), j = strlen (tail); i >= 0 && j >= 0; i--, j--)
 805         if (string[i] != tail[j])
 806           break;
 807     }
 808   else
 809     {
 810       for (i = strlen (string), j = strlen (tail); i >= 0 && j >= 0; i--, j--)
 811         if (TOLOWER (string[i]) != TOLOWER (tail[j]))
 812           break;
 813     }
 814
 815   /* If the tail was exhausted, the match was succesful.  */
 816   if (j == -1)
 817     return 1;
 818   else
 819     return 0;
 820 }
 821
 822 /* Checks whether string S matches each element of ACCEPTS.  A list
 823    element are matched either with fnmatch() or match_tail(),
 824    according to whether the element contains wildcards or not.
 825
 826    If the BACKWARD is 0, don't do backward comparison -- just compare
 827    them normally.  */
 828 static int
 829 in_acclist (const char *const *accepts, const char *s, int backward)
 830 {
 831   for (; *accepts; accepts++)
 832     {
 833       if (has_wildcards_p (*accepts))
 834         {
 835           /* fnmatch returns 0 if the pattern *does* match the
 836              string.  */
 837           if (fnmatch (*accepts, s, 0) == 0)
 838             return 1;
 839         }
 840       else
 841         {
 842           if (backward)
 843             {
 844               if (match_tail (s, *accepts, 0))
 845                 return 1;
 846             }
 847           else
 848             {
 849               if (!strcmp (s, *accepts))
 850                 return 1;
 851             }
 852         }
 853     }
 854   return 0;
 855 }
 856
 857 /* Return the location of STR's suffix (file extension).  Examples:
 858    suffix ("foo.bar")       -> "bar"
 859    suffix ("foo.bar.baz")   -> "baz"
 860    suffix ("/foo/bar")      -> NULL
 861    suffix ("/foo.bar/baz")  -> NULL  */
 862 char *
 863 suffix (const char *str)
 864 {
 865   int i;
 866
 867   for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
 868     ;
 869
 870   if (str[i++] == '.')
 871     return (char *)str + i;
 872   else
 873     return NULL;
 874 }
 875
 876 /* Return non-zero if S contains globbing wildcards (`*', `?', `[' or
 877    `]').  */
 878
 879 int
 880 has_wildcards_p (const char *s)
 881 {
 882   for (; *s; s++)
 883     if (*s == '*' || *s == '?' || *s == '[' || *s == ']')
 884       return 1;
 885   return 0;
 886 }
 887
 888 /* Return non-zero if FNAME ends with a typical HTML suffix.  The
 889    following (case-insensitive) suffixes are presumed to be HTML files:
 890
 891      html
 892      htm
 893      ?html (`?' matches one character)
 894
 895    #### CAVEAT.  This is not necessarily a good indication that FNAME
 896    refers to a file that contains HTML!  */
 897 int
 898 has_html_suffix_p (const char *fname)
 899 {
 900   char *suf;
 901
 902   if ((suf = suffix (fname)) == NULL)
 903     return 0;
 904   if (!strcasecmp (suf, "html"))
 905     return 1;
 906   if (!strcasecmp (suf, "htm"))
 907     return 1;
 908   if (suf[0] && !strcasecmp (suf + 1, "html"))
 909     return 1;
 910   return 0;
 911 }
 912
 913 /* Read a line from FP and return the pointer to freshly allocated
 914    storage.  The storage space is obtained through malloc() and should
 915    be freed with free() when it is no longer needed.
 916
 917    The length of the line is not limited, except by available memory.
 918    The newline character at the end of line is retained.  The line is
 919    terminated with a zero character.
 920
 921    After end-of-file is encountered without anything being read, NULL
 922    is returned.  NULL is also returned on error.  To distinguish
 923    between these two cases, use the stdio function ferror().  */
 924
 925 char *
 926 read_whole_line (FILE *fp)
 927 {
 928   int length = 0;
 929   int bufsize = 82;
 930   char *line = (char *)xmalloc (bufsize);
 931
 932   while (fgets (line + length, bufsize - length, fp))
 933     {
 934       length += strlen (line + length);
 935       if (length == 0)
 936         /* Possible for example when reading from a binary file where
 937            a line begins with \0.  */
 938         continue;
 939
 940       if (line[length - 1] == '\n')
 941         break;
 942
 943       /* fgets() guarantees to read the whole line, or to use up the
 944          space we've given it.  We can double the buffer
 945          unconditionally.  */
 946       bufsize <<= 1;
 947       line = xrealloc (line, bufsize);
 948     }
 949   if (length == 0 || ferror (fp))
 950     {
 951       xfree (line);
 952       return NULL;
 953     }
 954   if (length + 1 < bufsize)
 955     /* Relieve the memory from our exponential greediness.  We say
 956        `length + 1' because the terminating \0 is not included in
 957        LENGTH.  We don't need to zero-terminate the string ourselves,
 958        though, because fgets() does that.  */
 959     line = xrealloc (line, length + 1);
 960   return line;
 961 }
 962 \f
 963 /* Read FILE into memory.  A pointer to `struct file_memory' are
 964    returned; use struct element `content' to access file contents, and
 965    the element `length' to know the file length.  `content' is *not*
 966    zero-terminated, and you should *not* read or write beyond the [0,
 967    length) range of characters.
 968
 969    After you are done with the file contents, call read_file_free to
 970    release the memory.
 971
 972    Depending on the operating system and the type of file that is
 973    being read, read_file() either mmap's the file into memory, or
 974    reads the file into the core using read().
 975
 976    If file is named "-", fileno(stdin) is used for reading instead.
 977    If you want to read from a real file named "-", use "./-" instead.  */
 978
 979 struct file_memory *
 980 read_file (const char *file)
 981 {
 982   int fd;
 983   struct file_memory *fm;
 984   long size;
 985   int inhibit_close = 0;
 986
 987   /* Some magic in the finest tradition of Perl and its kin: if FILE
 988      is "-", just use stdin.  */
 989   if (HYPHENP (file))
 990     {
 991       fd = fileno (stdin);
 992       inhibit_close = 1;
 993       /* Note that we don't inhibit mmap() in this case.  If stdin is
 994          redirected from a regular file, mmap() will still work.  */
 995     }
 996   else
 997     fd = open (file, O_RDONLY);
 998   if (fd < 0)
 999     return NULL;
1000   fm = xmalloc (sizeof (struct file_memory));
1001
1002 #ifdef HAVE_MMAP
1003   {
1004     struct stat buf;
1005     if (fstat (fd, &buf) < 0)
1006       goto mmap_lose;
1007     fm->length = buf.st_size;
1008     /* NOTE: As far as I know, the callers of this function never
1009        modify the file text.  Relying on this would enable us to
1010        specify PROT_READ and MAP_SHARED for a marginal gain in
1011        efficiency, but at some cost to generality.  */
1012     fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
1013                         MAP_PRIVATE, fd, 0);
1014     if (fm->content == (char *)MAP_FAILED)
1015       goto mmap_lose;
1016     if (!inhibit_close)
1017       close (fd);
1018
1019     fm->mmap_p = 1;
1020     return fm;
1021   }
1022
1023  mmap_lose:
1024   /* The most common reason why mmap() fails is that FD does not point
1025      to a plain file.  However, it's also possible that mmap() doesn't
1026      work for a particular type of file.  Therefore, whenever mmap()
1027      fails, we just fall back to the regular method.  */
1028 #endif /* HAVE_MMAP */
1029
1030   fm->length = 0;
1031   size = 512;                   /* number of bytes fm->contents can
1032                                    hold at any given time. */
1033   fm->content = xmalloc (size);
1034   while (1)
1035     {
1036       long nread;
1037       if (fm->length > size / 2)
1038         {
1039           /* #### I'm not sure whether the whole exponential-growth
1040              thing makes sense with kernel read.  On Linux at least,
1041              read() refuses to read more than 4K from a file at a
1042              single chunk anyway.  But other Unixes might optimize it
1043              better, and it doesn't *hurt* anything, so I'm leaving
1044              it.  */
1045
1046           /* Normally, we grow SIZE exponentially to make the number
1047              of calls to read() and realloc() logarithmic in relation
1048              to file size.  However, read() can read an amount of data
1049              smaller than requested, and it would be unreasonable to
1050              double SIZE every time *something* was read.  Therefore,
1051              we double SIZE only when the length exceeds half of the
1052              entire allocated size.  */
1053           size <<= 1;
1054           fm->content = xrealloc (fm->content, size);
1055         }
1056       nread = read (fd, fm->content + fm->length, size - fm->length);
1057       if (nread > 0)
1058         /* Successful read. */
1059         fm->length += nread;
1060       else if (nread < 0)
1061         /* Error. */
1062         goto lose;
1063       else
1064         /* EOF */
1065         break;
1066     }
1067   if (!inhibit_close)
1068     close (fd);
1069   if (size > fm->length && fm->length != 0)
1070     /* Due to exponential growth of fm->content, the allocated region
1071        might be much larger than what is actually needed.  */
1072     fm->content = xrealloc (fm->content, fm->length);
1073   fm->mmap_p = 0;
1074   return fm;
1075
1076  lose:
1077   if (!inhibit_close)
1078     close (fd);
1079   xfree (fm->content);
1080   xfree (fm);
1081   return NULL;
1082 }
1083
1084 /* Release the resources held by FM.  Specifically, this calls
1085    munmap() or xfree() on fm->content, depending whether mmap or
1086    malloc/read were used to read in the file.  It also frees the
1087    memory needed to hold the FM structure itself.  */
1088
1089 void
1090 read_file_free (struct file_memory *fm)
1091 {
1092 #ifdef HAVE_MMAP
1093   if (fm->mmap_p)
1094     {
1095       munmap (fm->content, fm->length);
1096     }
1097   else
1098 #endif
1099     {
1100       xfree (fm->content);
1101     }
1102   xfree (fm);
1103 }
1104 \f
1105 /* Free the pointers in a NULL-terminated vector of pointers, then
1106    free the pointer itself.  */
1107 void
1108 free_vec (char **vec)
1109 {
1110   if (vec)
1111     {
1112       char **p = vec;
1113       while (*p)
1114         xfree (*p++);
1115       xfree (vec);
1116     }
1117 }
1118
1119 /* Append vector V2 to vector V1.  The function frees V2 and
1120    reallocates V1 (thus you may not use the contents of neither
1121    pointer after the call).  If V1 is NULL, V2 is returned.  */
1122 char **
1123 merge_vecs (char **v1, char **v2)
1124 {
1125   int i, j;
1126
1127   if (!v1)
1128     return v2;
1129   if (!v2)
1130     return v1;
1131   if (!*v2)
1132     {
1133       /* To avoid j == 0 */
1134       xfree (v2);
1135       return v1;
1136     }
1137   /* Count v1.  */
1138   for (i = 0; v1[i]; i++);
1139   /* Count v2.  */
1140   for (j = 0; v2[j]; j++);
1141   /* Reallocate v1.  */
1142   v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char **));
1143   memcpy (v1 + i, v2, (j + 1) * sizeof (char *));
1144   xfree (v2);
1145   return v1;
1146 }
1147
1148 /* A set of simple-minded routines to store strings in a linked list.
1149    This used to also be used for searching, but now we have hash
1150    tables for that.  */
1151
1152 /* It's a shame that these simple things like linked lists and hash
1153    tables (see hash.c) need to be implemented over and over again.  It
1154    would be nice to be able to use the routines from glib -- see
1155    www.gtk.org for details.  However, that would make Wget depend on
1156    glib, and I want to avoid dependencies to external libraries for
1157    reasons of convenience and portability (I suspect Wget is more
1158    portable than anything ever written for Gnome).  */
1159
1160 /* Append an element to the list.  If the list has a huge number of
1161    elements, this can get slow because it has to find the list's
1162    ending.  If you think you have to call slist_append in a loop,
1163    think about calling slist_prepend() followed by slist_nreverse().  */
1164
1165 slist *
1166 slist_append (slist *l, const char *s)
1167 {
1168   slist *newel = (slist *)xmalloc (sizeof (slist));
1169   slist *beg = l;
1170
1171   newel->string = xstrdup (s);
1172   newel->next = NULL;
1173
1174   if (!l)
1175     return newel;
1176   /* Find the last element.  */
1177   while (l->next)
1178     l = l->next;
1179   l->next = newel;
1180   return beg;
1181 }
1182
1183 /* Prepend S to the list.  Unlike slist_append(), this is O(1).  */
1184
1185 slist *
1186 slist_prepend (slist *l, const char *s)
1187 {
1188   slist *newel = (slist *)xmalloc (sizeof (slist));
1189   newel->string = xstrdup (s);
1190   newel->next = l;
1191   return newel;
1192 }
1193
1194 /* Destructively reverse L. */
1195
1196 slist *
1197 slist_nreverse (slist *l)
1198 {
1199   slist *prev = NULL;
1200   while (l)
1201     {
1202       slist *next = l->next;
1203       l->next = prev;
1204       prev = l;
1205       l = next;
1206     }
1207   return prev;
1208 }
1209
1210 /* Is there a specific entry in the list?  */
1211 int
1212 slist_contains (slist *l, const char *s)
1213 {
1214   for (; l; l = l->next)
1215     if (!strcmp (l->string, s))
1216       return 1;
1217   return 0;
1218 }
1219
1220 /* Free the whole slist.  */
1221 void
1222 slist_free (slist *l)
1223 {
1224   while (l)
1225     {
1226       slist *n = l->next;
1227       xfree (l->string);
1228       xfree (l);
1229       l = n;
1230     }
1231 }
1232 \f
1233 /* Sometimes it's useful to create "sets" of strings, i.e. special
1234    hash tables where you want to store strings as keys and merely
1235    query for their existence.  Here is a set of utility routines that
1236    makes that transparent.  */
1237
1238 void
1239 string_set_add (struct hash_table *ht, const char *s)
1240 {
1241   /* First check whether the set element already exists.  If it does,
1242      do nothing so that we don't have to free() the old element and
1243      then strdup() a new one.  */
1244   if (hash_table_contains (ht, s))
1245     return;
1246
1247   /* We use "1" as value.  It provides us a useful and clear arbitrary
1248      value, and it consumes no memory -- the pointers to the same
1249      string "1" will be shared by all the key-value pairs in all `set'
1250      hash tables.  */
1251   hash_table_put (ht, xstrdup (s), "1");
1252 }
1253
1254 /* Synonym for hash_table_contains... */
1255
1256 int
1257 string_set_contains (struct hash_table *ht, const char *s)
1258 {
1259   return hash_table_contains (ht, s);
1260 }
1261
1262 static int
1263 string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
1264 {
1265   xfree (key);
1266   return 0;
1267 }
1268
1269 void
1270 string_set_free (struct hash_table *ht)
1271 {
1272   hash_table_map (ht, string_set_free_mapper, NULL);
1273   hash_table_destroy (ht);
1274 }
1275
1276 static int
1277 free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
1278 {
1279   xfree (key);
1280   xfree (value);
1281   return 0;
1282 }
1283
1284 /* Another utility function: call free() on all keys and values of HT.  */
1285
1286 void
1287 free_keys_and_values (struct hash_table *ht)
1288 {
1289   hash_table_map (ht, free_keys_and_values_mapper, NULL);
1290 }
1291
1292 \f
1293 /* Engine for legible and legible_very_long; this function works on
1294    strings.  */
1295
1296 static char *
1297 legible_1 (const char *repr)
1298 {
1299   static char outbuf[128];
1300   int i, i1, mod;
1301   char *outptr;
1302   const char *inptr;
1303
1304   /* Reset the pointers.  */
1305   outptr = outbuf;
1306   inptr = repr;
1307   /* If the number is negative, shift the pointers.  */
1308   if (*inptr == '-')
1309     {
1310       *outptr++ = '-';
1311       ++inptr;
1312     }
1313   /* How many digits before the first separator?  */
1314   mod = strlen (inptr) % 3;
1315   /* Insert them.  */
1316   for (i = 0; i < mod; i++)
1317     *outptr++ = inptr[i];
1318   /* Now insert the rest of them, putting separator before every
1319      third digit.  */
1320   for (i1 = i, i = 0; inptr[i1]; i++, i1++)
1321     {
1322       if (i % 3 == 0 && i1 != 0)
1323         *outptr++ = ',';
1324       *outptr++ = inptr[i1];
1325     }
1326   /* Zero-terminate the string.  */
1327   *outptr = '\0';
1328   return outbuf;
1329 }
1330
1331 /* Legible -- return a static pointer to the legibly printed long.  */
1332 char *
1333 legible (long l)
1334 {
1335   char inbuf[24];
1336   /* Print the number into the buffer.  */
1337   number_to_string (inbuf, l);
1338   return legible_1 (inbuf);
1339 }
1340
1341 /* Write a string representation of NUMBER into the provided buffer.
1342    We cannot use sprintf() because we cannot be sure whether the
1343    platform supports printing of what we chose for VERY_LONG_TYPE.
1344
1345    Example: Gcc supports `long long' under many platforms, but on many
1346    of those the native libc knows nothing of it and therefore cannot
1347    print it.
1348
1349    How long BUFFER needs to be depends on the platform and the content
1350    of NUMBER.  For 64-bit VERY_LONG_TYPE (the most common case), 24
1351    bytes are sufficient.  Using more might be a good idea.
1352
1353    This function does not go through the hoops that long_to_string
1354    goes to because it doesn't aspire to be fast.  (It's called perhaps
1355    once in a Wget run.)  */
1356
1357 static void
1358 very_long_to_string (char *buffer, VERY_LONG_TYPE number)
1359 {
1360   int i = 0;
1361   int j;
1362
1363   /* Print the number backwards... */
1364   do
1365     {
1366       buffer[i++] = '0' + number % 10;
1367       number /= 10;
1368     }
1369   while (number);
1370
1371   /* ...and reverse the order of the digits. */
1372   for (j = 0; j < i / 2; j++)
1373     {
1374       char c = buffer[j];
1375       buffer[j] = buffer[i - 1 - j];
1376       buffer[i - 1 - j] = c;
1377     }
1378   buffer[i] = '\0';
1379 }
1380
1381 /* The same as legible(), but works on VERY_LONG_TYPE.  See sysdep.h.  */
1382 char *
1383 legible_very_long (VERY_LONG_TYPE l)
1384 {
1385   char inbuf[128];
1386   /* Print the number into the buffer.  */
1387   very_long_to_string (inbuf, l);
1388   return legible_1 (inbuf);
1389 }
1390
1391 /* Count the digits in a (long) integer.  */
1392 int
1393 numdigit (long number)
1394 {
1395   int cnt = 1;
1396   if (number < 0)
1397     {
1398       number = -number;
1399       ++cnt;
1400     }
1401   while ((number /= 10) > 0)
1402     ++cnt;
1403   return cnt;
1404 }
1405
1406 /* A half-assed implementation of INT_MAX on machines that don't
1407    bother to define one. */
1408 #ifndef INT_MAX
1409 # define INT_MAX ((int) ~((unsigned)1 << 8 * sizeof (int) - 1))
1410 #endif
1411
1412 #define ONE_DIGIT(figure) *p++ = n / (figure) + '0'
1413 #define ONE_DIGIT_ADVANCE(figure) (ONE_DIGIT (figure), n %= (figure))
1414
1415 #define DIGITS_1(figure) ONE_DIGIT (figure)
1416 #define DIGITS_2(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_1 ((figure) / 10)
1417 #define DIGITS_3(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_2 ((figure) / 10)
1418 #define DIGITS_4(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_3 ((figure) / 10)
1419 #define DIGITS_5(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_4 ((figure) / 10)
1420 #define DIGITS_6(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_5 ((figure) / 10)
1421 #define DIGITS_7(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_6 ((figure) / 10)
1422 #define DIGITS_8(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_7 ((figure) / 10)
1423 #define DIGITS_9(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_8 ((figure) / 10)
1424 #define DIGITS_10(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_9 ((figure) / 10)
1425
1426 /* DIGITS_<11-20> are only used on machines with 64-bit longs. */
1427
1428 #define DIGITS_11(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_10 ((figure) / 10)
1429 #define DIGITS_12(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_11 ((figure) / 10)
1430 #define DIGITS_13(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_12 ((figure) / 10)
1431 #define DIGITS_14(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_13 ((figure) / 10)
1432 #define DIGITS_15(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_14 ((figure) / 10)
1433 #define DIGITS_16(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_15 ((figure) / 10)
1434 #define DIGITS_17(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_16 ((figure) / 10)
1435 #define DIGITS_18(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_17 ((figure) / 10)
1436 #define DIGITS_19(figure) ONE_DIGIT_ADVANCE (figure); DIGITS_18 ((figure) / 10)
1437
1438 /* Print NUMBER to BUFFER in base 10.  This should be completely
1439    equivalent to `sprintf(buffer, "%ld", number)', only much faster.
1440
1441    The speedup may make a difference in programs that frequently
1442    convert numbers to strings.  Some implementations of sprintf,
1443    particularly the one in GNU libc, have been known to be extremely
1444    slow compared to this function.
1445
1446    Return the pointer to the location where the terminating zero was
1447    printed.  (Equivalent to calling buffer+strlen(buffer) after the
1448    function is done.)
1449
1450    BUFFER should be big enough to accept as many bytes as you expect
1451    the number to take up.  On machines with 64-bit longs the maximum
1452    needed size is 24 bytes.  That includes the digits needed for the
1453    largest 64-bit number, the `-' sign in case it's negative, and the
1454    terminating '\0'.  */
1455
1456 char *
1457 number_to_string (char *buffer, long number)
1458 {
1459   char *p = buffer;
1460   long n = number;
1461
1462 #if (SIZEOF_LONG != 4) && (SIZEOF_LONG != 8)
1463   /* We are running in a strange or misconfigured environment.  Let
1464      sprintf cope with it.  */
1465   sprintf (buffer, "%ld", n);
1466   p += strlen (buffer);
1467 #else  /* (SIZEOF_LONG == 4) || (SIZEOF_LONG == 8) */
1468
1469   if (n < 0)
1470     {
1471       if (n < -INT_MAX)
1472         {
1473           /* We cannot print a '-' and assign -n to n because -n would
1474              overflow.  Let sprintf deal with this border case.  */
1475           sprintf (buffer, "%ld", n);
1476           p += strlen (buffer);
1477           return p;
1478         }
1479
1480       *p++ = '-';
1481       n = -n;
1482     }
1483
1484   if      (n < 10)                   { DIGITS_1 (1); }
1485   else if (n < 100)                  { DIGITS_2 (10); }
1486   else if (n < 1000)                 { DIGITS_3 (100); }
1487   else if (n < 10000)                { DIGITS_4 (1000); }
1488   else if (n < 100000)               { DIGITS_5 (10000); }
1489   else if (n < 1000000)              { DIGITS_6 (100000); }
1490   else if (n < 10000000)             { DIGITS_7 (1000000); }
1491   else if (n < 100000000)            { DIGITS_8 (10000000); }
1492   else if (n < 1000000000)           { DIGITS_9 (100000000); }
1493 #if SIZEOF_LONG == 4
1494   /* ``if (1)'' serves only to preserve editor indentation. */
1495   else if (1)                        { DIGITS_10 (1000000000); }
1496 #else  /* SIZEOF_LONG != 4 */
1497   else if (n < 10000000000L)         { DIGITS_10 (1000000000L); }
1498   else if (n < 100000000000L)        { DIGITS_11 (10000000000L); }
1499   else if (n < 1000000000000L)       { DIGITS_12 (100000000000L); }
1500   else if (n < 10000000000000L)      { DIGITS_13 (1000000000000L); }
1501   else if (n < 100000000000000L)     { DIGITS_14 (10000000000000L); }
1502   else if (n < 1000000000000000L)    { DIGITS_15 (100000000000000L); }
1503   else if (n < 10000000000000000L)   { DIGITS_16 (1000000000000000L); }
1504   else if (n < 100000000000000000L)  { DIGITS_17 (10000000000000000L); }
1505   else if (n < 1000000000000000000L) { DIGITS_18 (100000000000000000L); }
1506   else                               { DIGITS_19 (1000000000000000000L); }
1507 #endif /* SIZEOF_LONG != 4 */
1508
1509   *p = '\0';
1510 #endif /* (SIZEOF_LONG == 4) || (SIZEOF_LONG == 8) */
1511
1512   return p;
1513 }
1514
1515 #undef ONE_DIGIT
1516 #undef ONE_DIGIT_ADVANCE
1517
1518 #undef DIGITS_1
1519 #undef DIGITS_2
1520 #undef DIGITS_3
1521 #undef DIGITS_4
1522 #undef DIGITS_5
1523 #undef DIGITS_6
1524 #undef DIGITS_7
1525 #undef DIGITS_8
1526 #undef DIGITS_9
1527 #undef DIGITS_10
1528 #undef DIGITS_11
1529 #undef DIGITS_12
1530 #undef DIGITS_13
1531 #undef DIGITS_14
1532 #undef DIGITS_15
1533 #undef DIGITS_16
1534 #undef DIGITS_17
1535 #undef DIGITS_18
1536 #undef DIGITS_19
1537 \f
1538 /* Support for timers. */
1539
1540 #undef TIMER_WINDOWS
1541 #undef TIMER_GETTIMEOFDAY
1542 #undef TIMER_TIME
1543
1544 /* Depending on the OS and availability of gettimeofday(), one and
1545    only one of the above constants will be defined.  Virtually all
1546    modern Unix systems will define TIMER_GETTIMEOFDAY; Windows will
1547    use TIMER_WINDOWS.  TIMER_TIME is a catch-all method for
1548    non-Windows systems without gettimeofday.
1549
1550    #### Perhaps we should also support ftime(), which exists on old
1551    BSD 4.2-influenced systems?  (It also existed under MS DOS Borland
1552    C, if memory serves me.)  */
1553
1554 #ifdef WINDOWS
1555 # define TIMER_WINDOWS
1556 #else  /* not WINDOWS */
1557 # ifdef HAVE_GETTIMEOFDAY
1558 #  define TIMER_GETTIMEOFDAY
1559 # else
1560 #  define TIMER_TIME
1561 # endif
1562 #endif /* not WINDOWS */
1563
1564 #ifdef TIMER_GETTIMEOFDAY
1565 typedef struct timeval wget_sys_time;
1566 #endif
1567
1568 #ifdef TIMER_TIME
1569 typedef time_t wget_sys_time;
1570 #endif
1571
1572 #ifdef TIMER_WINDOWS
1573 typedef ULARGE_INTEGER wget_sys_time;
1574 #endif
1575
1576 struct wget_timer {
1577   /* The starting point in time which, subtracted from the current
1578      time, yields elapsed time. */
1579   wget_sys_time start;
1580
1581   /* The most recent elapsed time, calculated by wtimer_elapsed().
1582      Measured in milliseconds.  */
1583   double elapsed_last;
1584
1585   /* Approximately, the time elapsed between the true start of the
1586      measurement and the time represented by START.  */
1587   double elapsed_pre_start;
1588 };
1589
1590 /* Allocate a timer.  It is not legal to do anything with a freshly
1591    allocated timer, except call wtimer_reset() or wtimer_delete().  */
1592
1593 struct wget_timer *
1594 wtimer_allocate (void)
1595 {
1596   struct wget_timer *wt =
1597     (struct wget_timer *)xmalloc (sizeof (struct wget_timer));
1598   return wt;
1599 }
1600
1601 /* Allocate a new timer and reset it.  Return the new timer. */
1602
1603 struct wget_timer *
1604 wtimer_new (void)
1605 {
1606   struct wget_timer *wt = wtimer_allocate ();
1607   wtimer_reset (wt);
1608   return wt;
1609 }
1610
1611 /* Free the resources associated with the timer.  Its further use is
1612    prohibited.  */
1613
1614 void
1615 wtimer_delete (struct wget_timer *wt)
1616 {
1617   xfree (wt);
1618 }
1619
1620 /* Store system time to WST.  */
1621
1622 static void
1623 wtimer_sys_set (wget_sys_time *wst)
1624 {
1625 #ifdef TIMER_GETTIMEOFDAY
1626   gettimeofday (wst, NULL);
1627 #endif
1628
1629 #ifdef TIMER_TIME
1630   time (wst);
1631 #endif
1632
1633 #ifdef TIMER_WINDOWS
1634   /* We use GetSystemTime to get the elapsed time.  MSDN warns that
1635      system clock adjustments can skew the output of GetSystemTime
1636      when used as a timer and gives preference to GetTickCount and
1637      high-resolution timers.  But GetTickCount can overflow, and hires
1638      timers are typically used for profiling, not for regular time
1639      measurement.  Since we handle clock skew anyway, we just use
1640      GetSystemTime.  */
1641   FILETIME ft;
1642   SYSTEMTIME st;
1643   GetSystemTime (&st);
1644
1645   /* As recommended by MSDN, we convert SYSTEMTIME to FILETIME, copy
1646      FILETIME to ULARGE_INTEGER, and use regular 64-bit integer
1647      arithmetic on that.  */
1648   SystemTimeToFileTime (&st, &ft);
1649   wst->HighPart = ft.dwHighDateTime;
1650   wst->LowPart  = ft.dwLowDateTime;
1651 #endif
1652 }
1653
1654 /* Reset timer WT.  This establishes the starting point from which
1655    wtimer_elapsed() will return the number of elapsed
1656    milliseconds.  It is allowed to reset a previously used timer.  */
1657
1658 void
1659 wtimer_reset (struct wget_timer *wt)
1660 {
1661   /* Set the start time to the current time. */
1662   wtimer_sys_set (&wt->start);
1663   wt->elapsed_last = 0;
1664   wt->elapsed_pre_start = 0;
1665 }
1666
1667 static double
1668 wtimer_sys_diff (wget_sys_time *wst1, wget_sys_time *wst2)
1669 {
1670 #ifdef TIMER_GETTIMEOFDAY
1671   return ((double)(wst1->tv_sec - wst2->tv_sec) * 1000
1672           + (double)(wst1->tv_usec - wst2->tv_usec) / 1000);
1673 #endif
1674
1675 #ifdef TIMER_TIME
1676   return 1000 * (*wst1 - *wst2);
1677 #endif
1678
1679 #ifdef WINDOWS
1680   /* VC++ 6 doesn't support direct cast of uint64 to double.  To work
1681      around this, we subtract, then convert to signed, then finally to
1682      double.  */
1683   return (double)(signed __int64)(wst1->QuadPart - wst2->QuadPart) / 10000;
1684 #endif
1685 }
1686
1687 /* Return the number of milliseconds elapsed since the timer was last
1688    reset.  It is allowed to call this function more than once to get
1689    increasingly higher elapsed values.  These timers handle clock
1690    skew.  */
1691
1692 double
1693 wtimer_elapsed (struct wget_timer *wt)
1694 {
1695   wget_sys_time now;
1696   double elapsed;
1697
1698   wtimer_sys_set (&now);
1699   elapsed = wt->elapsed_pre_start + wtimer_sys_diff (&now, &wt->start);
1700
1701   /* Ideally we'd just return the difference between NOW and
1702      wt->start.  However, the system timer can be set back, and we
1703      could return a value smaller than when we were last called, even
1704      a negative value.  Both of these would confuse the callers, which
1705      expect us to return monotonically nondecreasing values.
1706
1707      Therefore: if ELAPSED is smaller than its previous known value,
1708      we reset wt->start to the current time and effectively start
1709      measuring from this point.  But since we don't want the elapsed
1710      value to start from zero, we set elapsed_pre_start to the last
1711      elapsed time and increment all future calculations by that
1712      amount.  */
1713
1714   if (elapsed < wt->elapsed_last)
1715     {
1716       wt->start = now;
1717       wt->elapsed_pre_start = wt->elapsed_last;
1718       elapsed = wt->elapsed_last;
1719     }
1720
1721   wt->elapsed_last = elapsed;
1722   return elapsed;
1723 }
1724
1725 /* Return the assessed granularity of the timer implementation, in
1726    milliseconds.  This is used by code that tries to substitute a
1727    better value for timers that have returned zero.  */
1728
1729 double
1730 wtimer_granularity (void)
1731 {
1732 #ifdef TIMER_GETTIMEOFDAY
1733   /* Granularity of gettimeofday varies wildly between architectures.
1734      However, it appears that on modern machines it tends to be better
1735      than 1ms.  Assume 100 usecs.  (Perhaps the configure process
1736      could actually measure this?)  */
1737   return 0.1;
1738 #endif
1739
1740 #ifdef TIMER_TIME
1741   return 1000;
1742 #endif
1743
1744 #ifdef TIMER_WINDOWS
1745   /* According to MSDN, GetSystemTime returns a broken-down time
1746      structure the smallest member of which are milliseconds.  */
1747   return 1;
1748 #endif
1749 }
1750 \f
1751 /* This should probably be at a better place, but it doesn't really
1752    fit into html-parse.c.  */
1753
1754 /* The function returns the pointer to the malloc-ed quoted version of
1755    string s.  It will recognize and quote numeric and special graphic
1756    entities, as per RFC1866:
1757
1758    `&' -> `&amp;'
1759    `<' -> `&lt;'
1760    `>' -> `&gt;'
1761    `"' -> `&quot;'
1762    SP  -> `&#32;'
1763
1764    No other entities are recognized or replaced.  */
1765 char *
1766 html_quote_string (const char *s)
1767 {
1768   const char *b = s;
1769   char *p, *res;
1770   int i;
1771
1772   /* Pass through the string, and count the new size.  */
1773   for (i = 0; *s; s++, i++)
1774     {
1775       if (*s == '&')
1776         i += 4;                 /* `amp;' */
1777       else if (*s == '<' || *s == '>')
1778         i += 3;                 /* `lt;' and `gt;' */
1779       else if (*s == '\"')
1780         i += 5;                 /* `quot;' */
1781       else if (*s == ' ')
1782         i += 4;                 /* #32; */
1783     }
1784   res = (char *)xmalloc (i + 1);
1785   s = b;
1786   for (p = res; *s; s++)
1787     {
1788       switch (*s)
1789         {
1790         case '&':
1791           *p++ = '&';
1792           *p++ = 'a';
1793           *p++ = 'm';
1794           *p++ = 'p';
1795           *p++ = ';';
1796           break;
1797         case '<': case '>':
1798           *p++ = '&';
1799           *p++ = (*s == '<' ? 'l' : 'g');
1800           *p++ = 't';
1801           *p++ = ';';
1802           break;
1803         case '\"':
1804           *p++ = '&';
1805           *p++ = 'q';
1806           *p++ = 'u';
1807           *p++ = 'o';
1808           *p++ = 't';
1809           *p++ = ';';
1810           break;
1811         case ' ':
1812           *p++ = '&';
1813           *p++ = '#';
1814           *p++ = '3';
1815           *p++ = '2';
1816           *p++ = ';';
1817           break;
1818         default:
1819           *p++ = *s;
1820         }
1821     }
1822   *p = '\0';
1823   return res;
1824 }
1825
1826 /* Determine the width of the terminal we're running on.  If that's
1827    not possible, return 0.  */
1828
1829 int
1830 determine_screen_width (void)
1831 {
1832   /* If there's a way to get the terminal size using POSIX
1833      tcgetattr(), somebody please tell me.  */
1834 #ifndef TIOCGWINSZ
1835   return 0;
1836 #else  /* TIOCGWINSZ */
1837   int fd;
1838   struct winsize wsz;
1839
1840   if (opt.lfilename != NULL)
1841     return 0;
1842
1843   fd = fileno (stderr);
1844   if (ioctl (fd, TIOCGWINSZ, &wsz) < 0)
1845     return 0;                   /* most likely ENOTTY */
1846
1847   return wsz.ws_col;
1848 #endif /* TIOCGWINSZ */
1849 }
1850
1851 /* Return a random number between 0 and MAX-1, inclusive.
1852
1853    If MAX is greater than the value of RAND_MAX+1 on the system, the
1854    returned value will be in the range [0, RAND_MAX].  This may be
1855    fixed in a future release.
1856
1857    The random number generator is seeded automatically the first time
1858    it is called.
1859
1860    This uses rand() for portability.  It has been suggested that
1861    random() offers better randomness, but this is not required for
1862    Wget, so I chose to go for simplicity and use rand
1863    unconditionally.
1864
1865    DO NOT use this for cryptographic purposes.  It is only meant to be
1866    used in situations where quality of the random numbers returned
1867    doesn't really matter.  */
1868
1869 int
1870 random_number (int max)
1871 {
1872   static int seeded;
1873   double bounded;
1874   int rnd;
1875
1876   if (!seeded)
1877     {
1878       srand (time (NULL));
1879       seeded = 1;
1880     }
1881   rnd = rand ();
1882
1883   /* On systems that don't define RAND_MAX, assume it to be 2**15 - 1,
1884      and enforce that assumption by masking other bits.  */
1885 #ifndef RAND_MAX
1886 # define RAND_MAX 32767
1887   rnd &= RAND_MAX;
1888 #endif
1889
1890   /* This is equivalent to rand() % max, but uses the high-order bits
1891      for better randomness on architecture where rand() is implemented
1892      using a simple congruential generator.  */
1893
1894   bounded = (double)max * rnd / (RAND_MAX + 1.0);
1895   return (int)bounded;
1896 }
1897
1898 /* Return a random uniformly distributed floating point number in the
1899    [0, 1) range.  The precision of returned numbers is 9 digits.
1900
1901    Modify this to use erand48() where available!  */
1902
1903 double
1904 random_float (void)
1905 {
1906   /* We can't rely on any specific value of RAND_MAX, but I'm pretty
1907      sure it's greater than 1000.  */
1908   int rnd1 = random_number (1000);
1909   int rnd2 = random_number (1000);
1910   int rnd3 = random_number (1000);
1911   return rnd1 / 1000.0 + rnd2 / 1000000.0 + rnd3 / 1000000000.0;
1912 }
1913
1914 #if 0
1915 /* A debugging function for checking whether an MD5 library works. */
1916
1917 #include "gen-md5.h"
1918
1919 char *
1920 debug_test_md5 (char *buf)
1921 {
1922   unsigned char raw[16];
1923   static char res[33];
1924   unsigned char *p1;
1925   char *p2;
1926   int cnt;
1927   ALLOCA_MD5_CONTEXT (ctx);
1928
1929   gen_md5_init (ctx);
1930   gen_md5_update ((unsigned char *)buf, strlen (buf), ctx);
1931   gen_md5_finish (ctx, raw);
1932
1933   p1 = raw;
1934   p2 = res;
1935   cnt = 16;
1936   while (cnt--)
1937     {
1938       *p2++ = XNUM_TO_digit (*p1 >> 4);
1939       *p2++ = XNUM_TO_digit (*p1 & 0xf);
1940       ++p1;
1941     }
1942   *p2 = '\0';
1943
1944   return res;
1945 }
1946 #endif
1947 \f
1948 /* Implementation of run_with_timeout, a generic timeout-forcing
1949    routine for systems with Unix-like signal handling.  */
1950
1951 #ifdef USE_SIGNAL_TIMEOUT
1952 # ifdef HAVE_SIGSETJMP
1953 #  define SETJMP(env) sigsetjmp (env, 1)
1954
1955 static sigjmp_buf run_with_timeout_env;
1956
1957 static RETSIGTYPE
1958 abort_run_with_timeout (int sig)
1959 {
1960   assert (sig == SIGALRM);
1961   siglongjmp (run_with_timeout_env, -1);
1962 }
1963 # else /* not HAVE_SIGSETJMP */
1964 #  define SETJMP(env) setjmp (env)
1965
1966 static jmp_buf run_with_timeout_env;
1967
1968 static RETSIGTYPE
1969 abort_run_with_timeout (int sig)
1970 {
1971   assert (sig == SIGALRM);
1972   /* We don't have siglongjmp to preserve the set of blocked signals;
1973      if we longjumped out of the handler at this point, SIGALRM would
1974      remain blocked.  We must unblock it manually. */
1975   int mask = siggetmask ();
1976   mask &= ~sigmask (SIGALRM);
1977   sigsetmask (mask);
1978
1979   /* Now it's safe to longjump. */
1980   longjmp (run_with_timeout_env, -1);
1981 }
1982 # endif /* not HAVE_SIGSETJMP */
1983
1984 /* Arrange for SIGALRM to be delivered in TIMEOUT seconds.  This uses
1985    setitimer where available, alarm otherwise.
1986
1987    TIMEOUT should be non-zero.  If the timeout value is so small that
1988    it would be rounded to zero, it is rounded to the least legal value
1989    instead (1us for setitimer, 1s for alarm).  That ensures that
1990    SIGALRM will be delivered in all cases.  */
1991
1992 static void
1993 alarm_set (double timeout)
1994 {
1995 #ifdef ITIMER_REAL
1996   /* Use the modern itimer interface. */
1997   struct itimerval itv;
1998   memset (&itv, 0, sizeof (itv));
1999   itv.it_value.tv_sec = (long) timeout;
2000   itv.it_value.tv_usec = 1000000L * (timeout - (long)timeout);
2001   if (itv.it_value.tv_sec == 0 && itv.it_value.tv_usec == 0)
2002     /* Ensure that we wait for at least the minimum interval.
2003        Specifying zero would mean "wait forever".  */
2004     itv.it_value.tv_usec = 1;
2005   setitimer (ITIMER_REAL, &itv, NULL);
2006 #else  /* not ITIMER_REAL */
2007   /* Use the old alarm() interface. */
2008   int secs = (int) timeout;
2009   if (secs == 0)
2010     /* Round TIMEOUTs smaller than 1 to 1, not to zero.  This is
2011        because alarm(0) means "never deliver the alarm", i.e. "wait
2012        forever", which is not what someone who specifies a 0.5s
2013        timeout would expect.  */
2014     secs = 1;
2015   alarm (secs);
2016 #endif /* not ITIMER_REAL */
2017 }
2018
2019 /* Cancel the alarm set with alarm_set. */
2020
2021 static void
2022 alarm_cancel (void)
2023 {
2024 #ifdef ITIMER_REAL
2025   struct itimerval disable;
2026   memset (&disable, 0, sizeof (disable));
2027   setitimer (ITIMER_REAL, &disable, NULL);
2028 #else  /* not ITIMER_REAL */
2029   alarm (0);
2030 #endif /* not ITIMER_REAL */
2031 }
2032
2033 /* Call FUN(ARG), but don't allow it to run for more than TIMEOUT
2034    seconds.  Returns non-zero if the function was interrupted with a
2035    timeout, zero otherwise.
2036
2037    This works by setting up SIGALRM to be delivered in TIMEOUT seconds
2038    using setitimer() or alarm().  The timeout is enforced by
2039    longjumping out of the SIGALRM handler.  This has several
2040    advantages compared to the traditional approach of relying on
2041    signals causing system calls to exit with EINTR:
2042
2043      * The callback function is *forcibly* interrupted after the
2044        timeout expires, (almost) regardless of what it was doing and
2045        whether it was in a syscall.  For example, a calculation that
2046        takes a long time is interrupted as reliably as an IO
2047        operation.
2048
2049      * It works with both SYSV and BSD signals because it doesn't
2050        depend on the default setting of SA_RESTART.
2051
2052      * It doesn't special handler setup beyond a simple call to
2053        signal().  (It does use sigsetjmp/siglongjmp, but they're
2054        optional.)
2055
2056    The only downside is that, if FUN allocates internal resources that
2057    are normally freed prior to exit from the functions, they will be
2058    lost in case of timeout.  */
2059
2060 int
2061 run_with_timeout (double timeout, void (*fun) (void *), void *arg)
2062 {
2063   int saved_errno;
2064
2065   if (timeout == 0)
2066     {
2067       fun (arg);
2068       return 0;
2069     }
2070
2071   signal (SIGALRM, abort_run_with_timeout);
2072   if (SETJMP (run_with_timeout_env) != 0)
2073     {
2074       /* Longjumped out of FUN with a timeout. */
2075       signal (SIGALRM, SIG_DFL);
2076       return 1;
2077     }
2078   alarm_set (timeout);
2079   fun (arg);
2080
2081   /* Preserve errno in case alarm() or signal() modifies it. */
2082   saved_errno = errno;
2083   alarm_cancel ();
2084   signal (SIGALRM, SIG_DFL);
2085   errno = saved_errno;
2086
2087   return 0;
2088 }
2089
2090 #else  /* not USE_SIGNAL_TIMEOUT */
2091
2092 #ifndef WINDOWS
2093 /* A stub version of run_with_timeout that just calls FUN(ARG).  Don't
2094    define it under Windows, because Windows has its own version of
2095    run_with_timeout that uses threads.  */
2096
2097 int
2098 run_with_timeout (double timeout, void (*fun) (void *), void *arg)
2099 {
2100   fun (arg);
2101   return 0;
2102 }
2103 #endif /* not WINDOWS */
2104 #endif /* not USE_SIGNAL_TIMEOUT */