sjero.net Git - wget/blob - src/utils.c

   1 /* Various functions of utilitarian nature.
   2    Copyright (C) 1995, 1996, 1997, 1998, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else  /* not HAVE_STRING_H */
  27 # include <strings.h>
  28 #endif /* not HAVE_STRING_H */
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #ifdef HAVE_MMAP
  35 # include <sys/mman.h>
  36 #endif
  37 #ifdef HAVE_PWD_H
  38 # include <pwd.h>
  39 #endif
  40 #include <limits.h>
  41 #ifdef HAVE_UTIME_H
  42 # include <utime.h>
  43 #endif
  44 #ifdef HAVE_SYS_UTIME_H
  45 # include <sys/utime.h>
  46 #endif
  47 #include <errno.h>
  48 #ifdef NeXT
  49 # include <libc.h>              /* for access() */
  50 #endif
  51 #include <fcntl.h>
  52 #include <assert.h>
  53
  54 #include "wget.h"
  55 #include "utils.h"
  56 #include "fnmatch.h"
  57 #include "hash.h"
  58
  59 #ifndef errno
  60 extern int errno;
  61 #endif
  62
  63
  64 /* Croak the fatal memory error and bail out with non-zero exit
  65    status.  */
  66 static void
  67 memfatal (const char *s)
  68 {
  69   /* HACK: expose save_log_p from log.c, so we can turn it off in
  70      order to prevent saving the log.  Saving the log is dangerous
  71      because logprintf() and logputs() can call malloc(), so this
  72      could infloop.  When logging is turned off, infloop can no longer
  73      happen.  */
  74   extern int save_log_p;
  75
  76   save_log_p = 0;
  77   logprintf (LOG_ALWAYS, _("%s: %s: Not enough memory.\n"), exec_name, s);
  78   exit (1);
  79 }
  80
/* xmalloc, xrealloc and xstrdup exit the program if there is not
   enough memory.  xstrdup also implements strdup on systems that do
   not have it.  xfree is provided to make leak-tracking easier;
   currently it is just a thin wrapper around free().  */
  85
  86 void *
  87 xmalloc (size_t size)
  88 {
  89   void *res;
  90
  91   res = malloc (size);
  92   if (!res)
  93     memfatal ("malloc");
  94   return res;
  95 }
  96
  97 void
  98 xfree (void *ptr)
  99 {
 100   free (ptr);
 101 }
 102
 103 void *
 104 xrealloc (void *obj, size_t size)
 105 {
 106   void *res;
 107
 108   /* Not all Un*xes have the feature of realloc() that calling it with
 109      a NULL-pointer is the same as malloc(), but it is easy to
 110      simulate.  */
 111   if (obj)
 112     res = realloc (obj, size);
 113   else
 114     res = malloc (size);
 115   if (!res)
 116     memfatal ("realloc");
 117   return res;
 118 }
 119
 120 char *
 121 xstrdup (const char *s)
 122 {
 123 #ifndef HAVE_STRDUP
 124   int l = strlen (s);
 125   char *s1 = malloc (l + 1);
 126   if (!s1)
 127     memfatal ("strdup");
 128   memcpy (s1, s, l + 1);
 129   return s1;
 130 #else  /* HAVE_STRDUP */
 131   char *s1 = strdup (s);
 132   if (!s1)
 133     memfatal ("strdup");
 134   return s1;
 135 #endif /* HAVE_STRDUP */
 136 }
 137 \f
 138 /* Copy the string formed by two pointers (one on the beginning, other
 139    on the char after the last char) to a new, malloc-ed location.
 140    0-terminate it.  */
 141 char *
 142 strdupdelim (const char *beg, const char *end)
 143 {
 144   char *res = (char *)xmalloc (end - beg + 1);
 145   memcpy (res, beg, end - beg);
 146   res[end - beg] = '\0';
 147   return res;
 148 }
 149
 150 /* Parse a string containing comma-separated elements, and return a
 151    vector of char pointers with the elements.  Spaces following the
 152    commas are ignored.  */
 153 char **
 154 sepstring (const char *s)
 155 {
 156   char **res;
 157   const char *p;
 158   int i = 0;
 159
 160   if (!s || !*s)
 161     return NULL;
 162   res = NULL;
 163   p = s;
 164   while (*s)
 165     {
 166       if (*s == ',')
 167         {
 168           res = (char **)xrealloc (res, (i + 2) * sizeof (char *));
 169           res[i] = strdupdelim (p, s);
 170           res[++i] = NULL;
 171           ++s;
 172           /* Skip the blanks following the ','.  */
 173           while (ISSPACE (*s))
 174             ++s;
 175           p = s;
 176         }
 177       else
 178         ++s;
 179     }
 180   res = (char **)xrealloc (res, (i + 2) * sizeof (char *));
 181   res[i] = strdupdelim (p, s);
 182   res[i + 1] = NULL;
 183   return res;
 184 }
 185 \f
 186 /* Return pointer to a static char[] buffer in which zero-terminated
 187    string-representation of TM (in form hh:mm:ss) is printed.  It is
 188    shamelessly non-reentrant, but it doesn't matter, really.
 189
 190    If TM is non-NULL, the time_t of the current time will be stored
 191    there.  */
 192 char *
 193 time_str (time_t *tm)
 194 {
 195   static char tms[15];
 196   struct tm *ptm;
 197   time_t tim;
 198
 199   *tms = '\0';
 200   tim = time (tm);
 201   if (tim == -1)
 202     return tms;
 203   ptm = localtime (&tim);
 204   sprintf (tms, "%02d:%02d:%02d", ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
 205   return tms;
 206 }
 207
 208 /* Returns an error message for ERRNUM.  #### This requires more work.
 209    This function, as well as the whole error system, is very
 210    ill-conceived.  */
 211 const char *
 212 uerrmsg (uerr_t errnum)
 213 {
 214   switch (errnum)
 215     {
 216     case URLUNKNOWN:
 217       return _("Unknown/unsupported protocol");
 218       break;
 219     case URLBADPORT:
 220       return _("Invalid port specification");
 221       break;
 222     case URLBADHOST:
 223       return _("Invalid host name");
 224       break;
 225     default:
 226       abort ();
 227       /* $@#@#$ compiler.  */
 228       return NULL;
 229     }
 230 }
 231 \f
 232 /* The Windows versions of the following two functions are defined in
 233    mswindows.c.  */
 234
/* A cuserid() imitation using getpwuid(), to avoid hassling with
   utmp.  Besides, not all systems have cuserid().  Under Windows, it
   is defined in mswindows.c.

   If WHERE is non-NULL, the username will be stored there.
   Otherwise, it will be returned as a static buffer (as returned by
   getpwuid()).  In the latter case, the buffer should be copied
   before calling getpwuid() or pwd_cuserid() again.  */
 243 #ifndef WINDOWS
 244 char *
 245 pwd_cuserid (char *where)
 246 {
 247   struct passwd *pwd;
 248
 249   if (!(pwd = getpwuid (getuid ())) || !pwd->pw_name)
 250     return NULL;
 251   if (where)
 252     {
 253       strcpy (where, pwd->pw_name);
 254       return where;
 255     }
 256   else
 257     return pwd->pw_name;
 258 }
 259
 260 void
 261 fork_to_background (void)
 262 {
 263   pid_t pid;
 264   /* Whether we arrange our own version of opt.lfilename here.  */
 265   int changedp = 0;
 266
 267   if (!opt.lfilename)
 268     {
 269       opt.lfilename = unique_name (DEFAULT_LOGFILE);
 270       changedp = 1;
 271     }
 272   pid = fork ();
 273   if (pid < 0)
 274     {
 275       /* parent, error */
 276       perror ("fork");
 277       exit (1);
 278     }
 279   else if (pid != 0)
 280     {
 281       /* parent, no error */
 282       printf (_("Continuing in background.\n"));
 283       if (changedp)
 284         printf (_("Output will be written to `%s'.\n"), opt.lfilename);
 285       exit (0);
 286     }
 287   /* child: keep running */
 288 }
 289 #endif /* not WINDOWS */
 290 \f
 291 /* Canonicalize PATH, and return a new path.  The new path differs from PATH
 292    in that:
 293         Multple `/'s are collapsed to a single `/'.
 294         Leading `./'s and trailing `/.'s are removed.
 295         Trailing `/'s are removed.
 296         Non-leading `../'s and trailing `..'s are handled by removing
 297         portions of the path.
 298
 299    E.g. "a/b/c/./../d/.." will yield "a/b".  This function originates
 300    from GNU Bash.
 301
 302    Changes for Wget:
 303         Always use '/' as stub_char.
 304         Don't check for local things using canon_stat.
 305         Change the original string instead of strdup-ing.
 306         React correctly when beginning with `./' and `../'.  */
 307 void
 308 path_simplify (char *path)
 309 {
 310   register int i, start, ddot;
 311   char stub_char;
 312
 313   if (!*path)
 314     return;
 315
 316   /*stub_char = (*path == '/') ? '/' : '.';*/
 317   stub_char = '/';
 318
 319   /* Addition: Remove all `./'-s preceding the string.  If `../'-s
 320      precede, put `/' in front and remove them too.  */
 321   i = 0;
 322   ddot = 0;
 323   while (1)
 324     {
 325       if (path[i] == '.' && path[i + 1] == '/')
 326         i += 2;
 327       else if (path[i] == '.' && path[i + 1] == '.' && path[i + 2] == '/')
 328         {
 329           i += 3;
 330           ddot = 1;
 331         }
 332       else
 333         break;
 334     }
 335   if (i)
 336     strcpy (path, path + i - ddot);
 337
 338   /* Replace single `.' or `..' with `/'.  */
 339   if ((path[0] == '.' && path[1] == '\0')
 340       || (path[0] == '.' && path[1] == '.' && path[2] == '\0'))
 341     {
 342       path[0] = stub_char;
 343       path[1] = '\0';
 344       return;
 345     }
 346   /* Walk along PATH looking for things to compact.  */
 347   i = 0;
 348   while (1)
 349     {
 350       if (!path[i])
 351         break;
 352
 353       while (path[i] && path[i] != '/')
 354         i++;
 355
 356       start = i++;
 357
 358       /* If we didn't find any slashes, then there is nothing left to do.  */
 359       if (!path[start])
 360         break;
 361
 362       /* Handle multiple `/'s in a row.  */
 363       while (path[i] == '/')
 364         i++;
 365
 366       if ((start + 1) != i)
 367         {
 368           strcpy (path + start + 1, path + i);
 369           i = start + 1;
 370         }
 371
 372       /* Check for trailing `/'.  */
 373       if (start && !path[i])
 374         {
 375         zero_last:
 376           path[--i] = '\0';
 377           break;
 378         }
 379
 380       /* Check for `../', `./' or trailing `.' by itself.  */
 381       if (path[i] == '.')
 382         {
 383           /* Handle trailing `.' by itself.  */
 384           if (!path[i + 1])
 385             goto zero_last;
 386
 387           /* Handle `./'.  */
 388           if (path[i + 1] == '/')
 389             {
 390               strcpy (path + i, path + i + 1);
 391               i = (start < 0) ? 0 : start;
 392               continue;
 393             }
 394
 395           /* Handle `../' or trailing `..' by itself.  */
 396           if (path[i + 1] == '.' &&
 397               (path[i + 2] == '/' || !path[i + 2]))
 398             {
 399               while (--start > -1 && path[start] != '/');
 400               strcpy (path + start + 1, path + i + 2);
 401               i = (start < 0) ? 0 : start;
 402               continue;
 403             }
 404         }       /* path == '.' */
 405     } /* while */
 406
 407   if (!*path)
 408     {
 409       *path = stub_char;
 410       path[1] = '\0';
 411     }
 412 }
 413 \f
 414 /* "Touch" FILE, i.e. make its atime and mtime equal to the time
 415    specified with TM.  */
 416 void
 417 touch (const char *file, time_t tm)
 418 {
 419 #ifdef HAVE_STRUCT_UTIMBUF
 420   struct utimbuf times;
 421   times.actime = times.modtime = tm;
 422 #else
 423   time_t times[2];
 424   times[0] = times[1] = tm;
 425 #endif
 426
 427   if (utime (file, &times) == -1)
 428     logprintf (LOG_NOTQUIET, "utime(%s): %s\n", file, strerror (errno));
 429 }
 430
 431 /* Checks if FILE is a symbolic link, and removes it if it is.  Does
 432    nothing under MS-Windows.  */
 433 int
 434 remove_link (const char *file)
 435 {
 436   int err = 0;
 437   struct stat st;
 438
 439   if (lstat (file, &st) == 0 && S_ISLNK (st.st_mode))
 440     {
 441       DEBUGP (("Unlinking %s (symlink).\n", file));
 442       err = unlink (file);
 443       if (err != 0)
 444         logprintf (LOG_VERBOSE, _("Failed to unlink symlink `%s': %s\n"),
 445                    file, strerror (errno));
 446     }
 447   return err;
 448 }
 449
 450 /* Does FILENAME exist?  This is quite a lousy implementation, since
 451    it supplies no error codes -- only a yes-or-no answer.  Thus it
 452    will return that a file does not exist if, e.g., the directory is
 453    unreadable.  I don't mind it too much currently, though.  The
 454    proper way should, of course, be to have a third, error state,
 455    other than true/false, but that would introduce uncalled-for
 456    additional complexity to the callers.  */
 457 int
 458 file_exists_p (const char *filename)
 459 {
 460 #ifdef HAVE_ACCESS
 461   return access (filename, F_OK) >= 0;
 462 #else
 463   struct stat buf;
 464   return stat (filename, &buf) >= 0;
 465 #endif
 466 }
 467
 468 /* Returns 0 if PATH is a directory, 1 otherwise (any kind of file).
 469    Returns 0 on error.  */
 470 int
 471 file_non_directory_p (const char *path)
 472 {
 473   struct stat buf;
 474   /* Use lstat() rather than stat() so that symbolic links pointing to
 475      directories can be identified correctly.  */
 476   if (lstat (path, &buf) != 0)
 477     return 0;
 478   return S_ISDIR (buf.st_mode) ? 0 : 1;
 479 }
 480
 481 /* Return a unique filename, given a prefix and count */
 482 static char *
 483 unique_name_1 (const char *fileprefix, int count)
 484 {
 485   char *filename;
 486
 487   if (count)
 488     {
 489       filename = (char *)xmalloc (strlen (fileprefix) + numdigit (count) + 2);
 490       sprintf (filename, "%s.%d", fileprefix, count);
 491     }
 492   else
 493     filename = xstrdup (fileprefix);
 494
 495   if (!file_exists_p (filename))
 496     return filename;
 497   else
 498     {
 499       xfree (filename);
 500       return NULL;
 501     }
 502 }
 503
 504 /* Return a unique file name, based on PREFIX.  */
 505 char *
 506 unique_name (const char *prefix)
 507 {
 508   char *file = NULL;
 509   int count = 0;
 510
 511   while (!file)
 512     file = unique_name_1 (prefix, count++);
 513   return file;
 514 }
 515 \f
 516 /* Create DIRECTORY.  If some of the pathname components of DIRECTORY
 517    are missing, create them first.  In case any mkdir() call fails,
 518    return its error status.  Returns 0 on successful completion.
 519
 520    The behaviour of this function should be identical to the behaviour
 521    of `mkdir -p' on systems where mkdir supports the `-p' option.  */
 522 int
 523 make_directory (const char *directory)
 524 {
 525   int quit = 0;
 526   int i;
 527   char *dir;
 528
 529   /* Make a copy of dir, to be able to write to it.  Otherwise, the
 530      function is unsafe if called with a read-only char *argument.  */
 531   STRDUP_ALLOCA (dir, directory);
 532
 533   /* If the first character of dir is '/', skip it (and thus enable
 534      creation of absolute-pathname directories.  */
 535   for (i = (*dir == '/'); 1; ++i)
 536     {
 537       for (; dir[i] && dir[i] != '/'; i++)
 538         ;
 539       if (!dir[i])
 540         quit = 1;
 541       dir[i] = '\0';
 542       /* Check whether the directory already exists.  */
 543       if (!file_exists_p (dir))
 544         {
 545           if (mkdir (dir, 0777) < 0)
 546             return -1;
 547         }
 548       if (quit)
 549         break;
 550       else
 551         dir[i] = '/';
 552     }
 553   return 0;
 554 }
 555 \f
 556 static int in_acclist PARAMS ((const char *const *, const char *, int));
 557
 558 /* Determine whether a file is acceptable to be followed, according to
 559    lists of patterns to accept/reject.  */
 560 int
 561 acceptable (const char *s)
 562 {
 563   int l = strlen (s);
 564
 565   while (l && s[l] != '/')
 566     --l;
 567   if (s[l] == '/')
 568     s += (l + 1);
 569   if (opt.accepts)
 570     {
 571       if (opt.rejects)
 572         return (in_acclist ((const char *const *)opt.accepts, s, 1)
 573                 && !in_acclist ((const char *const *)opt.rejects, s, 1));
 574       else
 575         return in_acclist ((const char *const *)opt.accepts, s, 1);
 576     }
 577   else if (opt.rejects)
 578     return !in_acclist ((const char *const *)opt.rejects, s, 1);
 579   return 1;
 580 }
 581
 582 /* Compare S1 and S2 frontally; S2 must begin with S1.  E.g. if S1 is
 583    `/something', frontcmp() will return 1 only if S2 begins with
 584    `/something'.  Otherwise, 0 is returned.  */
 585 int
 586 frontcmp (const char *s1, const char *s2)
 587 {
 588   for (; *s1 && *s2 && (*s1 == *s2); ++s1, ++s2);
 589   return !*s1;
 590 }
 591
 592 /* Iterate through STRLIST, and return the first element that matches
 593    S, through wildcards or front comparison (as appropriate).  */
 594 static char *
 595 proclist (char **strlist, const char *s, enum accd flags)
 596 {
 597   char **x;
 598
 599   for (x = strlist; *x; x++)
 600     if (has_wildcards_p (*x))
 601       {
 602         if (fnmatch (*x, s, FNM_PATHNAME) == 0)
 603           break;
 604       }
 605     else
 606       {
 607         char *p = *x + ((flags & ALLABS) && (**x == '/')); /* Remove '/' */
 608         if (frontcmp (p, s))
 609           break;
 610       }
 611   return *x;
 612 }
 613
 614 /* Returns whether DIRECTORY is acceptable for download, wrt the
 615    include/exclude lists.
 616
 617    If FLAGS is ALLABS, the leading `/' is ignored in paths; relative
 618    and absolute paths may be freely intermixed.  */
 619 int
 620 accdir (const char *directory, enum accd flags)
 621 {
 622   /* Remove starting '/'.  */
 623   if (flags & ALLABS && *directory == '/')
 624     ++directory;
 625   if (opt.includes)
 626     {
 627       if (!proclist (opt.includes, directory, flags))
 628         return 0;
 629     }
 630   if (opt.excludes)
 631     {
 632       if (proclist (opt.excludes, directory, flags))
 633         return 0;
 634     }
 635   return 1;
 636 }
 637
 638 /* Match the end of STRING against PATTERN.  For instance:
 639
 640    match_backwards ("abc", "bc") -> 1
 641    match_backwards ("abc", "ab") -> 0
 642    match_backwards ("abc", "abc") -> 1 */
 643 static int
 644 match_backwards (const char *string, const char *pattern)
 645 {
 646   int i, j;
 647
 648   for (i = strlen (string), j = strlen (pattern); i >= 0 && j >= 0; i--, j--)
 649     if (string[i] != pattern[j])
 650       break;
 651   /* If the pattern was exhausted, the match was succesful.  */
 652   if (j == -1)
 653     return 1;
 654   else
 655     return 0;
 656 }
 657
 658 /* Checks whether string S matches each element of ACCEPTS.  A list
 659    element are matched either with fnmatch() or match_backwards(),
 660    according to whether the element contains wildcards or not.
 661
 662    If the BACKWARD is 0, don't do backward comparison -- just compare
 663    them normally.  */
 664 static int
 665 in_acclist (const char *const *accepts, const char *s, int backward)
 666 {
 667   for (; *accepts; accepts++)
 668     {
 669       if (has_wildcards_p (*accepts))
 670         {
 671           /* fnmatch returns 0 if the pattern *does* match the
 672              string.  */
 673           if (fnmatch (*accepts, s, 0) == 0)
 674             return 1;
 675         }
 676       else
 677         {
 678           if (backward)
 679             {
 680               if (match_backwards (s, *accepts))
 681                 return 1;
 682             }
 683           else
 684             {
 685               if (!strcmp (s, *accepts))
 686                 return 1;
 687             }
 688         }
 689     }
 690   return 0;
 691 }
 692
 693 /* Return the malloc-ed suffix of STR.  For instance:
 694    suffix ("foo.bar")       -> "bar"
 695    suffix ("foo.bar.baz")   -> "baz"
 696    suffix ("/foo/bar")      -> NULL
 697    suffix ("/foo.bar/baz")  -> NULL  */
 698 char *
 699 suffix (const char *str)
 700 {
 701   int i;
 702
 703   for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--);
 704   if (str[i++] == '.')
 705     return xstrdup (str + i);
 706   else
 707     return NULL;
 708 }
 709
 710 /* Read a line from FP.  The function reallocs the storage as needed
 711    to accomodate for any length of the line.  Reallocs are done
 712    storage exponentially, doubling the storage after each overflow to
 713    minimize the number of calls to realloc() and fgets().  The newline
 714    character at the end of line is retained.
 715
 716    After end-of-file is encountered without anything being read, NULL
 717    is returned.  NULL is also returned on error.  To distinguish
 718    between these two cases, use the stdio function ferror().  */
 719
 720 char *
 721 read_whole_line (FILE *fp)
 722 {
 723   int length = 0;
 724   int bufsize = 81;
 725   char *line = (char *)xmalloc (bufsize);
 726
 727   while (fgets (line + length, bufsize - length, fp))
 728     {
 729       length += strlen (line + length);
 730       assert (length > 0);
 731       if (line[length - 1] == '\n')
 732         break;
 733       /* fgets() guarantees to read the whole line, or to use up the
 734          space we've given it.  We can double the buffer
 735          unconditionally.  */
 736       bufsize <<= 1;
 737       line = xrealloc (line, bufsize);
 738     }
 739   if (length == 0 || ferror (fp))
 740     {
 741       xfree (line);
 742       return NULL;
 743     }
 744   if (length + 1 < bufsize)
 745     /* Relieve the memory from our exponential greediness.  We say
 746        `length + 1' because the terminating \0 is not included in
 747        LENGTH.  We don't need to zero-terminate the string ourselves,
 748        though, because fgets() does that.  */
 749     line = xrealloc (line, length + 1);
 750   return line;
 751 }
 752 \f
 753 /* Read FILE into memory.  A pointer to `struct file_memory' are
 754    returned; use struct element `content' to access file contents, and
 755    the element `length' to know the file length.  `content' is *not*
 756    zero-terminated, and you should *not* read or write beyond the [0,
 757    length) range of characters.
 758
 759    After you are done with the file contents, call read_file_free to
 760    release the memory.
 761
 762    Depending on the operating system and the type of file that is
 763    being read, read_file() either mmap's the file into memory, or
 764    reads the file into the core using read().
 765
 766    If file is named "-", fileno(stdin) is used for reading instead.
 767    If you want to read from a real file named "-", use "./-" instead.  */
 768
 769 struct file_memory *
 770 read_file (const char *file)
 771 {
 772   int fd;
 773   struct file_memory *fm;
 774   long size;
 775   int inhibit_close = 0;
 776
 777   /* Some magic in the finest tradition of Perl and its kin: if FILE
 778      is "-", just use stdin.  */
 779   if (HYPHENP (file))
 780     {
 781       fd = fileno (stdin);
 782       inhibit_close = 1;
 783       /* Note that we don't inhibit mmap() in this case.  If stdin is
 784          redirected from a regular file, mmap() will still work.  */
 785     }
 786   else
 787     fd = open (file, O_RDONLY);
 788   if (fd < 0)
 789     return NULL;
 790   fm = xmalloc (sizeof (struct file_memory));
 791
 792 #ifdef HAVE_MMAP
 793   {
 794     struct stat buf;
 795     if (fstat (fd, &buf) < 0)
 796       goto mmap_lose;
 797     fm->length = buf.st_size;
 798     /* NOTE: As far as I know, the callers of this function never
 799        modify the file text.  Relying on this would enable us to
 800        specify PROT_READ and MAP_SHARED for a marginal gain in
 801        efficiency, but at some cost to generality.  */
 802     fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
 803                         MAP_PRIVATE, fd, 0);
 804     if (fm->content == MAP_FAILED)
 805       goto mmap_lose;
 806     if (!inhibit_close)
 807       close (fd);
 808
 809     fm->mmap_p = 1;
 810     return fm;
 811   }
 812
 813  mmap_lose:
 814   /* The most common reason why mmap() fails is that FD does not point
 815      to a plain file.  However, it's also possible that mmap() doesn't
 816      work for a particular type of file.  Therefore, whenever mmap()
 817      fails, we just fall back to the regular method.  */
 818 #endif /* HAVE_MMAP */
 819
 820   fm->length = 0;
 821   size = 512;                   /* number of bytes fm->contents can
 822                                    hold at any given time. */
 823   fm->content = xmalloc (size);
 824   while (1)
 825     {
 826       long nread;
 827       if (fm->length > size / 2)
 828         {
 829           /* #### I'm not sure whether the whole exponential-growth
 830              thing makes sense with kernel read.  On Linux at least,
 831              read() refuses to read more than 4K from a file at a
 832              single chunk anyway.  But other Unixes might optimize it
 833              better, and it doesn't *hurt* anything, so I'm leaving
 834              it.  */
 835
 836           /* Normally, we grow SIZE exponentially to make the number
 837              of calls to read() and realloc() logarithmic in relation
 838              to file size.  However, read() can read an amount of data
 839              smaller than requested, and it would be unreasonably to
 840              double SIZE every time *something* was read.  Therefore,
 841              we double SIZE only when the length exceeds half of the
 842              entire allocated size.  */
 843           size <<= 1;
 844           fm->content = xrealloc (fm->content, size);
 845         }
 846       nread = read (fd, fm->content + fm->length, size - fm->length);
 847       if (nread > 0)
 848         /* Successful read. */
 849         fm->length += nread;
 850       else if (nread < 0)
 851         /* Error. */
 852         goto lose;
 853       else
 854         /* EOF */
 855         break;
 856     }
 857   if (!inhibit_close)
 858     close (fd);
 859   if (size > fm->length && fm->length != 0)
 860     /* Due to exponential growth of fm->content, the allocated region
 861        might be much larger than what is actually needed.  */
 862     fm->content = xrealloc (fm->content, fm->length);
 863   fm->mmap_p = 0;
 864   return fm;
 865
 866  lose:
 867   if (!inhibit_close)
 868     close (fd);
 869   xfree (fm->content);
 870   xfree (fm);
 871   return NULL;
 872 }
 873
 874 /* Release the resources held by FM.  Specifically, this calls
 875    munmap() or xfree() on fm->content, depending whether mmap or
 876    malloc/read were used to read in the file.  It also frees the
 877    memory needed to hold the FM structure itself.  */
 878
 879 void
 880 read_file_free (struct file_memory *fm)
 881 {
 882 #ifdef HAVE_MMAP
 883   if (fm->mmap_p)
 884     {
 885       munmap (fm->content, fm->length);
 886     }
 887   else
 888 #endif
 889     {
 890       xfree (fm->content);
 891     }
 892   xfree (fm);
 893 }
 894 \f
 895 /* Free the pointers in a NULL-terminated vector of pointers, then
 896    free the pointer itself.  */
 897 void
 898 free_vec (char **vec)
 899 {
 900   if (vec)
 901     {
 902       char **p = vec;
 903       while (*p)
 904         xfree (*p++);
 905       xfree (vec);
 906     }
 907 }
 908
 909 /* Append vector V2 to vector V1.  The function frees V2 and
 910    reallocates V1 (thus you may not use the contents of neither
 911    pointer after the call).  If V1 is NULL, V2 is returned.  */
 912 char **
 913 merge_vecs (char **v1, char **v2)
 914 {
 915   int i, j;
 916
 917   if (!v1)
 918     return v2;
 919   if (!v2)
 920     return v1;
 921   if (!*v2)
 922     {
 923       /* To avoid j == 0 */
 924       xfree (v2);
 925       return v1;
 926     }
 927   /* Count v1.  */
 928   for (i = 0; v1[i]; i++);
 929   /* Count v2.  */
 930   for (j = 0; v2[j]; j++);
 931   /* Reallocate v1.  */
 932   v1 = (char **)xrealloc (v1, (i + j + 1) * sizeof (char **));
 933   memcpy (v1 + i, v2, (j + 1) * sizeof (char *));
 934   xfree (v2);
 935   return v1;
 936 }
 937
 938 /* A set of simple-minded routines to store strings in a linked list.
 939    This used to also be used for searching, but now we have hash
 940    tables for that.  */
 941
 942 /* It's a shame that these simple things like linked lists and hash
 943    tables (see hash.c) need to be implemented over and over again.  It
 944    would be nice to be able to use the routines from glib -- see
 945    www.gtk.org for details.  However, that would make Wget depend on
 946    glib, and I want to avoid dependencies to external libraries for
 947    reasons of convenience and portability (I suspect Wget is more
 948    portable than anything ever written for Gnome).  */
 949
 950 /* Append an element to the list.  If the list has a huge number of
 951    elements, this can get slow because it has to find the list's
 952    ending.  If you think you have to call slist_append in a loop,
 953    think about calling slist_prepend() followed by slist_nreverse().  */
 954
 955 slist *
 956 slist_append (slist *l, const char *s)
 957 {
 958   slist *newel = (slist *)xmalloc (sizeof (slist));
 959   slist *beg = l;
 960
 961   newel->string = xstrdup (s);
 962   newel->next = NULL;
 963
 964   if (!l)
 965     return newel;
 966   /* Find the last element.  */
 967   while (l->next)
 968     l = l->next;
 969   l->next = newel;
 970   return beg;
 971 }
 972
 973 /* Prepend S to the list.  Unlike slist_append(), this is O(1).  */
 974
 975 slist *
 976 slist_prepend (slist *l, const char *s)
 977 {
 978   slist *newel = (slist *)xmalloc (sizeof (slist));
 979   newel->string = xstrdup (s);
 980   newel->next = l;
 981   return newel;
 982 }
 983
 984 /* Destructively reverse L. */
 985
 986 slist *
 987 slist_nreverse (slist *l)
 988 {
 989   slist *prev = NULL;
 990   while (l)
 991     {
 992       slist *next = l->next;
 993       l->next = prev;
 994       prev = l;
 995       l = next;
 996     }
 997   return prev;
 998 }
 999
1000 /* Is there a specific entry in the list?  */
1001 int
1002 slist_contains (slist *l, const char *s)
1003 {
1004   for (; l; l = l->next)
1005     if (!strcmp (l->string, s))
1006       return 1;
1007   return 0;
1008 }
1009
1010 /* Free the whole slist.  */
1011 void
1012 slist_free (slist *l)
1013 {
1014   while (l)
1015     {
1016       slist *n = l->next;
1017       xfree (l->string);
1018       xfree (l);
1019       l = n;
1020     }
1021 }
1022 \f
1023 /* Sometimes it's useful to create "sets" of strings, i.e. special
1024    hash tables where you want to store strings as keys and merely
1025    query for their existence.  Here is a set of utility routines that
1026    makes that transparent.  */
1027
/* Add the string S to the set HT, unless it is already there.  */
void
string_set_add (struct hash_table *ht, const char *s)
{
  /* An element that already exists is left alone, which saves
     free()-ing the old key and strdup()-ing a new one.

     New keys are stored with the value "1".  It is a useful and clear
     arbitrary value that consumes no memory: the same string "1" is
     shared by all the key-value pairs in all `set' hash tables.  */
  if (!hash_table_exists (ht, s))
    hash_table_put (ht, xstrdup (s), "1");
}
1043
/* Membership test for a string set: simply forwards to
   hash_table_exists and returns its result.  */

int
string_set_exists (struct hash_table *ht, const char *s)
{
  int present = hash_table_exists (ht, s);
  return present;
}
1051
/* hash_table_map callback used by string_set_free: releases the key
   and ignores the other two arguments.  Always returns 0.  */
static int
string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
{
  (void) value_ignored;
  (void) arg_ignored;

  xfree (key);
  return 0;
}
1058
1059 void
1060 string_set_free (struct hash_table *ht)
1061 {
1062   hash_table_map (ht, string_set_free_mapper, NULL);
1063   hash_table_destroy (ht);
1064 }
1065
/* hash_table_map callback used by free_keys_and_values: releases the
   key and then the value of one entry.  The third argument is unused.
   Always returns 0.  */
static int
free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
{
  (void) arg_ignored;

  xfree (key);
  xfree (value);
  return 0;
}
1073
1074 /* Another utility function: call free() on all keys and values of HT.  */
1075
1076 void
1077 free_keys_and_values (struct hash_table *ht)
1078 {
1079   hash_table_map (ht, free_keys_and_values_mapper, NULL);
1080 }
1081
1082 \f
/* Engine for legible and legible_very_long; this function works on
   strings.

   REPR is the textual form of a number, optionally starting with `-'.
   The result is the same text with a `,' inserted between every group
   of three characters, counted from the right (e.g. "1234567" becomes
   "1,234,567").  A leading `-' is copied through and does not take
   part in the grouping.

   The result lives in a static buffer that is overwritten by the next
   call.  Output that would not fit in the buffer is truncated rather
   than written past its end.  */

static char *
legible_1 (const char *repr)
{
  static char outbuf[128];
  /* Last position that may hold a character; one byte is kept for the
     terminating \0.  */
  char *const limit = outbuf + sizeof (outbuf) - 1;
  char *outptr = outbuf;
  const char *inptr = repr;
  size_t remaining;

  /* If the number is negative, copy the sign and skip it.  */
  if (*inptr == '-')
    {
      *outptr++ = '-';
      ++inptr;
    }

  /* REMAINING counts the characters not yet copied, including the
     current one.  A separator follows the current character whenever
     the characters left after it form whole groups of three.  */
  for (remaining = strlen (inptr);
       remaining > 0 && outptr < limit;
       remaining--)
    {
      *outptr++ = *inptr++;
      if (remaining > 1 && remaining % 3 == 1 && outptr < limit)
        *outptr++ = ',';
    }

  /* Zero-terminate the string.  */
  *outptr = '\0';
  return outbuf;
}
1120
/* Return the long L printed with thousands separators.  The result is
   whatever legible_1 returns for the decimal form of L, i.e. a pointer
   to its static buffer.  */
char *
legible (long l)
{
  /* long_to_string documents 24 bytes as sufficient for its output.  */
  char digits[24];

  long_to_string (digits, l);
  return legible_1 (digits);
}
1130
1131 /* The same as legible(), but works on VERY_LONG_TYPE.  See sysdep.h.  */
1132 char *
1133 legible_very_long (VERY_LONG_TYPE l)
1134 {
1135   char inbuf[128];
1136   /* Print the number into the buffer.  */
1137   sprintf (inbuf, VERY_LONG_FORMAT, l);
1138   return legible_1 (inbuf);
1139 }
1140
/* Return the number of decimal digits needed to print A.  The sign is
   not counted, and zero counts as one digit.  */
int
numdigit (long a)
{
  int count = 0;

  /* Strip one digit per iteration; the body runs at least once so
     that 0 yields a count of 1.  */
  do
    {
      ++count;
      a /= 10;
    }
  while (a != 0);

  return count;
}
1150
/* Print NUMBER to BUFFER.  This is equivalent to sprintf(buffer,
   "%ld", number), without going through the printf machinery.

   BUFFER should accept 24 bytes.  This should suffice for the longest
   numbers on 64-bit machines, including the `-' sign and the trailing
   \0.

   The digits are produced from an unsigned copy of the magnitude, so
   the most negative long is handled correctly; negating it as a
   signed value would overflow.  */
void
long_to_string (char *buffer, long number)
{
  /* Scratch space for the digits, generated least significant first.
     One decimal digit per three bits is a safe upper bound.  */
  char digits[sizeof (long) * CHAR_BIT / 3 + 1];
  char *d = digits;
  char *p = buffer;
  unsigned long magnitude;

  if (number < 0)
    {
      *p++ = '-';
      /* Unsigned arithmetic wraps, so this yields |number| even for
         LONG_MIN.  */
      magnitude = 0UL - (unsigned long) number;
    }
  else
    magnitude = (unsigned long) number;

  /* Always emit at least one digit so that 0 prints as "0".  */
  do
    {
      *d++ = (char) ('0' + magnitude % 10);
      magnitude /= 10;
    }
  while (magnitude != 0);

  /* Copy the digits out in the correct (most significant first)
     order.  */
  while (d != digits)
    *p++ = *--d;
  *p = '\0';
}
1202 \f
/* This should probably be at a better place, but it doesn't really
   fit into html-parse.c.  */

/* Return a freshly allocated (xmalloc) copy of S in which the
   following characters are replaced by HTML entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   Every other character is copied unchanged.  The caller owns the
   returned string.  */
char *
html_quote_string (const char *s)
{
  const char *scan;
  char *res, *out;
  int size = 0;

  /* First pass: work out how long the quoted string will be.  */
  for (scan = s; *scan; scan++)
    switch (*scan)
      {
      case '&':
      case ' ':
        size += 5;              /* `&amp;' and `&#32;' */
        break;
      case '<':
      case '>':
        size += 4;              /* `&lt;' and `&gt;' */
        break;
      case '\"':
        size += 6;              /* `&quot;' */
        break;
      default:
        size += 1;
        break;
      }

  res = (char *)xmalloc (size + 1);

  /* Second pass: emit either the entity or the character itself.  */
  out = res;
  for (scan = s; *scan; scan++)
    {
      const char *entity;

      switch (*scan)
        {
        case '&':
          entity = "&amp;";
          break;
        case '<':
          entity = "&lt;";
          break;
        case '>':
          entity = "&gt;";
          break;
        case '\"':
          entity = "&quot;";
          break;
        case ' ':
          entity = "&#32;";
          break;
        default:
          entity = NULL;
          break;
        }

      if (entity != NULL)
        {
          size_t len = strlen (entity);
          memcpy (out, entity, len);
          out += len;
        }
      else
        *out++ = *scan;
    }
  *out = '\0';
  return res;
}