sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  53 struct scheme_data
  54 {
  55   char *leading_string;
  56   int default_port;
  57 };
  58
  59 /* Supported schemes: */
  60 static struct scheme_data supported_schemes[] =
  61 {
  62   { "http://",  DEFAULT_HTTP_PORT },
  63 #ifdef HAVE_SSL
  64   { "https://", DEFAULT_HTTPS_PORT },
  65 #endif
  66   { "ftp://",   DEFAULT_FTP_PORT },
  67
  68   /* SCHEME_INVALID */
  69   { NULL,       -1 }
  70 };
  71
  72 static char *construct_relative PARAMS ((const char *, const char *));
  73
  74 \f
  75 /* Support for encoding and decoding of URL strings.  We determine
  76    whether a character is unsafe through static table lookup.  This
  77    code assumes ASCII character set and 8-bit chars.  */
  78
  79 enum {
  80   urlchr_reserved = 1,
  81   urlchr_unsafe   = 2
  82 };
  83
  84 #define R  urlchr_reserved
  85 #define U  urlchr_unsafe
  86 #define RU R|U
  87
  88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  89
  90 /* rfc1738 reserved chars, preserved from encoding.  */
  91
  92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93
  94 /* rfc1738 unsafe chars, plus some more.  */
  95
  96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  97
  98 const static unsigned char urlchr_table[256] =
  99 {
 100   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 104   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 105   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 106   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 107   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 108  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 109   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 111   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 112   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 113   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 115   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 116
 117   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126 };
 127
 128 /* Decodes the forms %xy in a URL to the character the hexadecimal
 129    code of which is xy.  xy are hexadecimal digits from
 130    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 131    hex-digits or `%' precedes `\0', the sequence is inserted
 132    literally.  */
 133
 134 static void
 135 decode_string (char *s)
 136 {
 137   char *t = s;                  /* t - tortoise */
 138   char *h = s;                  /* h - hare     */
 139
 140   for (; *h; h++, t++)
 141     {
 142       if (*h != '%')
 143         {
 144         copychar:
 145           *t = *h;
 146         }
 147       else
 148         {
 149           /* Do nothing if '%' is not followed by two hex digits. */
 150           if (!*(h + 1) || !*(h + 2)
 151               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 152             goto copychar;
 153           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 154           h += 2;
 155         }
 156     }
 157   *t = '\0';
 158 }
 159
 160 /* Like encode_string, but return S if there are no unsafe chars.  */
 161
 162 static char *
 163 encode_string_maybe (const char *s)
 164 {
 165   const char *p1;
 166   char *p2, *newstr;
 167   int newlen;
 168   int addition = 0;
 169
 170   for (p1 = s; *p1; p1++)
 171     if (UNSAFE_CHAR (*p1))
 172       addition += 2;            /* Two more characters (hex digits) */
 173
 174   if (!addition)
 175     return (char *)s;
 176
 177   newlen = (p1 - s) + addition;
 178   newstr = (char *)xmalloc (newlen + 1);
 179
 180   p1 = s;
 181   p2 = newstr;
 182   while (*p1)
 183     {
 184       if (UNSAFE_CHAR (*p1))
 185         {
 186           unsigned char c = *p1++;
 187           *p2++ = '%';
 188           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 189           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 190         }
 191       else
 192         *p2++ = *p1++;
 193     }
 194   *p2 = '\0';
 195   assert (p2 - newstr == newlen);
 196
 197   return newstr;
 198 }
 199
 200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 201    given string, returning a malloc-ed %XX encoded string.  */
 202
 203 char *
 204 encode_string (const char *s)
 205 {
 206   char *encoded = encode_string_maybe (s);
 207   if (encoded != s)
 208     return encoded;
 209   else
 210     return xstrdup (s);
 211 }
 212
 213 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 214    the old value of PTR is freed and PTR is made to point to the newly
 215    allocated storage.  */
 216
 217 #define ENCODE(ptr) do {                        \
 218   char *e_new = encode_string_maybe (ptr);      \
 219   if (e_new != ptr)                             \
 220     {                                           \
 221       xfree (ptr);                              \
 222       ptr = e_new;                              \
 223     }                                           \
 224 } while (0)
 225 \f
 226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 227
 228 /* Decide whether to encode, decode, or pass through the char at P.
 229    This used to be a macro, but it got a little too convoluted.  */
 230 static inline enum copy_method
 231 decide_copy_method (const char *p)
 232 {
 233   if (*p == '%')
 234     {
 235       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 236         {
 237           /* %xx sequence: decode it, unless it would decode to an
 238              unsafe or a reserved char; in that case, leave it as
 239              is. */
 240           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 241             XCHAR_TO_XDIGIT (*(p + 2));
 242
 243           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 244             return CM_PASSTHROUGH;
 245           else
 246             return CM_DECODE;
 247         }
 248       else
 249         /* Garbled %.. sequence: encode `%'. */
 250         return CM_ENCODE;
 251     }
 252   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 253     return CM_ENCODE;
 254   else
 255     return CM_PASSTHROUGH;
 256 }
 257
 258 /* Translate a %-quoting (but possibly non-conformant) input string S
 259    into a %-quoting (and conformant) output string.  If no characters
 260    are encoded or decoded, return the same string S; otherwise, return
 261    a freshly allocated string with the new contents.
 262
 263    After a URL has been run through this function, the protocols that
 264    use `%' as the quote character can use the resulting string as-is,
 265    while those that don't call decode_string() to get to the intended
 266    data.  This function is also stable: after an input string is
 267    transformed the first time, all further transformations of the
 268    result yield the same result string.
 269
 270    Let's discuss why this function is needed.
 271
 272    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 273    space character would mess up the HTTP request, it needs to be
 274    quoted, like this:
 275
 276        GET /abc%20def HTTP/1.0
 277
 278    So it appears that the unsafe chars need to be quoted, as with
 279    encode_string.  But what if we're requested to download
 280    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 281    the user meant was a literal space, and he was kind enough to quote
 282    it.  In that case, Wget should obviously leave the `%20' as is, and
 283    send the same request as above.  So in this case we may not call
 284    encode_string.
 285
 286    But what if the requested URI is `abc%20 def'?  If we call
 287    encode_string, we end up with `/abc%2520%20def', which is almost
 288    certainly not intended.  If we don't call encode_string, we are
 289    left with the embedded space and cannot send the request.  What the
 290    user meant was for Wget to request `/abc%20%20def', and this is
 291    where reencode_string kicks in.
 292
 293    Wget used to solve this by first decoding %-quotes, and then
 294    encoding all the "unsafe" characters found in the resulting string.
 295    This was wrong because it didn't preserve certain URL special
 296    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 297    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 298    whether we considered `+' reserved (it is).  One of these results
 299    is inevitable because by the second step we would lose information
 300    on whether the `+' was originally encoded or not.  Both results
 301    were wrong because in CGI parameters + means space, while %2B means
 302    literal plus.  reencode_string correctly translates the above to
 303    "a%2B+b", i.e. returns the original string.
 304
 305    This function uses an algorithm proposed by Anon Sricharoenchai:
 306
 307    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 308       hexdigits.
 309
 310    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 311       "+".
 312
 313    ...except that this code conflates the two steps, and decides
 314    whether to encode, decode, or pass through each character in turn.
 315    The function still uses two passes, but their logic is the same --
 316    the first pass exists merely for the sake of allocation.  Another
 317    small difference is that we include `+' to URL_RESERVED.
 318
 319    Anon's test case:
 320
 321    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 322    ->
 323    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 324
 325    Simpler test cases:
 326
 327    "foo bar"         -> "foo%20bar"
 328    "foo%20bar"       -> "foo%20bar"
 329    "foo %20bar"      -> "foo%20%20bar"
 330    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 331    "foo%25%20bar"    -> "foo%25%20bar"
 332    "foo%2%20bar"     -> "foo%252%20bar"
 333    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 334    "foo%2b+bar"      -> "foo%2b+bar"  */
 335
 336 char *
 337 reencode_string (const char *s)
 338 {
 339   const char *p1;
 340   char *newstr, *p2;
 341   int oldlen, newlen;
 342
 343   int encode_count = 0;
 344   int decode_count = 0;
 345
 346   /* First, pass through the string to see if there's anything to do,
 347      and to calculate the new length.  */
 348   for (p1 = s; *p1; p1++)
 349     {
 350       switch (decide_copy_method (p1))
 351         {
 352         case CM_ENCODE:
 353           ++encode_count;
 354           break;
 355         case CM_DECODE:
 356           ++decode_count;
 357           break;
 358         case CM_PASSTHROUGH:
 359           break;
 360         }
 361     }
 362
 363   if (!encode_count && !decode_count)
 364     /* The string is good as it is. */
 365     return (char *)s;           /* C const model sucks. */
 366
 367   oldlen = p1 - s;
 368   /* Each encoding adds two characters (hex digits), while each
 369      decoding removes two characters.  */
 370   newlen = oldlen + 2 * (encode_count - decode_count);
 371   newstr = xmalloc (newlen + 1);
 372
 373   p1 = s;
 374   p2 = newstr;
 375
 376   while (*p1)
 377     {
 378       switch (decide_copy_method (p1))
 379         {
 380         case CM_ENCODE:
 381           {
 382             unsigned char c = *p1++;
 383             *p2++ = '%';
 384             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 385             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 386           }
 387           break;
 388         case CM_DECODE:
 389           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 390                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 391           p1 += 3;              /* skip %xx */
 392           break;
 393         case CM_PASSTHROUGH:
 394           *p2++ = *p1++;
 395         }
 396     }
 397   *p2 = '\0';
 398   assert (p2 - newstr == newlen);
 399   return newstr;
 400 }
 401
 402 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 403    free PTR_VAR and make it point to the new storage.  Obviously,
 404    PTR_VAR needs to be an lvalue.  */
 405
 406 #define REENCODE(ptr_var) do {                  \
 407   char *rf_new = reencode_string (ptr_var);     \
 408   if (rf_new != ptr_var)                        \
 409     {                                           \
 410       xfree (ptr_var);                          \
 411       ptr_var = rf_new;                         \
 412     }                                           \
 413 } while (0)
 414 \f
 415 /* Returns the scheme type if the scheme is supported, or
 416    SCHEME_INVALID if not.  */
 417 enum url_scheme
 418 url_scheme (const char *url)
 419 {
 420   int i;
 421
 422   for (i = 0; supported_schemes[i].leading_string; i++)
 423     if (!strncasecmp (url, supported_schemes[i].leading_string,
 424                       strlen (supported_schemes[i].leading_string)))
 425       return (enum url_scheme)i;
 426   return SCHEME_INVALID;
 427 }
 428
 429 /* Return the number of characters needed to skip the scheme part of
 430    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 431 int
 432 url_skip_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 437      etc. */
 438   while (ISALNUM (*p) || *p == '-' || *p == '+')
 439     ++p;
 440   if (*p != ':')
 441     return 0;
 442   /* Skip ':'. */
 443   ++p;
 444
 445   /* Skip "//" if found. */
 446   if (*p == '/' && *(p + 1) == '/')
 447     p += 2;
 448
 449   return p - url;
 450 }
 451
 452 /* Returns 1 if the URL begins with a scheme (supported or
 453    unsupported), 0 otherwise.  */
 454 int
 455 url_has_scheme (const char *url)
 456 {
 457   const char *p = url;
 458   while (ISALNUM (*p) || *p == '-' || *p == '+')
 459     ++p;
 460   return *p == ':';
 461 }
 462
 463 int
 464 scheme_default_port (enum url_scheme scheme)
 465 {
 466   return supported_schemes[scheme].default_port;
 467 }
 468
 469 /* Skip the username and password, if present here.  The function
 470    should be called *not* with the complete URL, but with the part
 471    right after the scheme.
 472
 473    If no username and password are found, return 0.  */
 474 int
 475 url_skip_uname (const char *url)
 476 {
 477   const char *p;
 478
 479   /* Look for '@' that comes before '/' or '?'. */
 480   p = (const char *)strpbrk (url, "/?@");
 481   if (!p || *p != '@')
 482     return 0;
 483
 484   return p - url + 1;
 485 }
 486
 487 static int
 488 parse_uname (const char *str, int len, char **user, char **passwd)
 489 {
 490   char *colon;
 491
 492   if (len == 0)
 493     /* Empty user name not allowed. */
 494     return 0;
 495
 496   colon = memchr (str, ':', len);
 497   if (colon == str)
 498     /* Empty user name again. */
 499     return 0;
 500
 501   if (colon)
 502     {
 503       int pwlen = len - (colon + 1 - str);
 504       *passwd = xmalloc (pwlen + 1);
 505       memcpy (*passwd, colon + 1, pwlen);
 506       (*passwd)[pwlen] = '\0';
 507       len -= pwlen + 1;
 508     }
 509   else
 510     *passwd = NULL;
 511
 512   *user = xmalloc (len + 1);
 513   memcpy (*user, str, len);
 514   (*user)[len] = '\0';
 515
 516   return 1;
 517 }
 518
 519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 520    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 521
 522    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 523    www.foo.com[:port]            -> http://www.foo.com[:port]
 524
 525    FTP shorthands look like this:
 526
 527    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 528    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 529
 530    If the URL needs not or cannot be rewritten, return NULL.  */
 531 char *
 532 rewrite_shorthand_url (const char *url)
 533 {
 534   const char *p;
 535
 536   if (url_has_scheme (url))
 537     return NULL;
 538
 539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 540      latter Netscape.  */
 541   for (p = url; *p && *p != ':' && *p != '/'; p++)
 542     ;
 543
 544   if (p == url)
 545     return NULL;
 546
 547   if (*p == ':')
 548     {
 549       const char *pp, *path;
 550       char *res;
 551       /* If the characters after the colon and before the next slash
 552          or end of string are all digits, it's HTTP.  */
 553       int digits = 0;
 554       for (pp = p + 1; ISDIGIT (*pp); pp++)
 555         ++digits;
 556       if (digits > 0
 557           && (*pp == '/' || *pp == '\0'))
 558         goto http;
 559
 560       /* Prepend "ftp://" to the entire URL... */
 561       path = p + 1;
 562       res = xmalloc (6 + strlen (url) + 1);
 563       sprintf (res, "ftp://%s", url);
 564       /* ...and replace ':' with '/'. */
 565       res[6 + (p - url)] = '/';
 566       return res;
 567     }
 568   else
 569     {
 570       char *res;
 571     http:
 572       /* Just prepend "http://" to what we have. */
 573       res = xmalloc (7 + strlen (url) + 1);
 574       sprintf (res, "http://%s", url);
 575       return res;
 576     }
 577 }
 578 \f
 579 static void parse_path PARAMS ((const char *, char **, char **));
 580
 581 static char *
 582 strpbrk_or_eos (const char *s, const char *accept)
 583 {
 584   char *p = strpbrk (s, accept);
 585   if (!p)
 586     p = (char *)s + strlen (s);
 587   return p;
 588 }
 589
 590 /* Turn STR into lowercase; return non-zero if a character was
 591    actually changed. */
 592
 593 static int
 594 lowercase_str (char *str)
 595 {
 596   int change = 0;
 597   for (; *str; str++)
 598     if (ISUPPER (*str))
 599       {
 600         change = 1;
 601         *str = TOLOWER (*str);
 602       }
 603   return change;
 604 }
 605
 606 static char *parse_errors[] = {
 607 #define PE_NO_ERROR            0
 608   "No error",
 609 #define PE_UNRECOGNIZED_SCHEME 1
 610   "Unrecognized scheme",
 611 #define PE_EMPTY_HOST          2
 612   "Empty host",
 613 #define PE_BAD_PORT_NUMBER     3
 614   "Bad port number",
 615 #define PE_INVALID_USER_NAME   4
 616   "Invalid user name"
 617 };
 618
 619 #define SETERR(p, v) do {                       \
 620   if (p)                                        \
 621     *(p) = (v);                                 \
 622 } while (0)
 623
 624 /* Parse a URL.
 625
 626    Return a new struct url if successful, NULL on error.  In case of
 627    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 628    error code. */
 629 struct url *
 630 url_parse (const char *url, int *error)
 631 {
 632   struct url *u;
 633   const char *p;
 634   int path_modified, host_modified;
 635
 636   enum url_scheme scheme;
 637
 638   const char *uname_b,     *uname_e;
 639   const char *host_b,      *host_e;
 640   const char *path_b,      *path_e;
 641   const char *params_b,    *params_e;
 642   const char *query_b,     *query_e;
 643   const char *fragment_b,  *fragment_e;
 644
 645   int port;
 646   char *user = NULL, *passwd = NULL;
 647
 648   char *url_encoded;
 649
 650   scheme = url_scheme (url);
 651   if (scheme == SCHEME_INVALID)
 652     {
 653       SETERR (error, PE_UNRECOGNIZED_SCHEME);
 654       return NULL;
 655     }
 656
 657   url_encoded = reencode_string (url);
 658   p = url_encoded;
 659
 660   p += strlen (supported_schemes[scheme].leading_string);
 661   uname_b = p;
 662   p += url_skip_uname (p);
 663   uname_e = p;
 664
 665   /* scheme://user:pass@host[:port]... */
 666   /*                    ^              */
 667
 668   /* We attempt to break down the URL into the components path,
 669      params, query, and fragment.  They are ordered like this:
 670
 671        scheme://host[:port][/path][;params][?query][#fragment]  */
 672
 673   params_b   = params_e   = NULL;
 674   query_b    = query_e    = NULL;
 675   fragment_b = fragment_e = NULL;
 676
 677   host_b = p;
 678   p = strpbrk_or_eos (p, ":/;?#");
 679   host_e = p;
 680
 681   if (host_b == host_e)
 682     {
 683       SETERR (error, PE_EMPTY_HOST);
 684       return NULL;
 685     }
 686
 687   port = scheme_default_port (scheme);
 688   if (*p == ':')
 689     {
 690       const char *port_b, *port_e, *pp;
 691
 692       /* scheme://host:port/tralala */
 693       /*              ^             */
 694       ++p;
 695       port_b = p;
 696       p = strpbrk_or_eos (p, "/;?#");
 697       port_e = p;
 698
 699       if (port_b == port_e)
 700         {
 701           /* http://host:/whatever */
 702           /*             ^         */
 703           SETERR (error, PE_BAD_PORT_NUMBER);
 704           return NULL;
 705         }
 706
 707       for (port = 0, pp = port_b; pp < port_e; pp++)
 708         {
 709           if (!ISDIGIT (*pp))
 710             {
 711               /* http://host:12randomgarbage/blah */
 712               /*               ^                  */
 713               SETERR (error, PE_BAD_PORT_NUMBER);
 714               return NULL;
 715             }
 716           port = 10 * port + (*pp - '0');
 717         }
 718     }
 719
 720   if (*p == '/')
 721     {
 722       ++p;
 723       path_b = p;
 724       p = strpbrk_or_eos (p, ";?#");
 725       path_e = p;
 726     }
 727   else
 728     {
 729       /* Path is not allowed not to exist. */
 730       path_b = path_e = p;
 731     }
 732
 733   if (*p == ';')
 734     {
 735       ++p;
 736       params_b = p;
 737       p = strpbrk_or_eos (p, "?#");
 738       params_e = p;
 739     }
 740   if (*p == '?')
 741     {
 742       ++p;
 743       query_b = p;
 744       p = strpbrk_or_eos (p, "#");
 745       query_e = p;
 746     }
 747   if (*p == '#')
 748     {
 749       ++p;
 750       fragment_b = p;
 751       p += strlen (p);
 752       fragment_e = p;
 753     }
 754   assert (*p == 0);
 755
 756   if (uname_b != uname_e)
 757     {
 758       /* http://user:pass@host */
 759       /*        ^         ^    */
 760       /*     uname_b   uname_e */
 761       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 762         {
 763           SETERR (error, PE_INVALID_USER_NAME);
 764           return NULL;
 765         }
 766     }
 767
 768   u = (struct url *)xmalloc (sizeof (struct url));
 769   memset (u, 0, sizeof (*u));
 770
 771   u->scheme = scheme;
 772   u->host   = strdupdelim (host_b, host_e);
 773   u->port   = port;
 774   u->user   = user;
 775   u->passwd = passwd;
 776
 777   u->path = strdupdelim (path_b, path_e);
 778   path_modified = path_simplify (u->path);
 779   parse_path (u->path, &u->dir, &u->file);
 780
 781   host_modified = lowercase_str (u->host);
 782
 783   if (params_b)
 784     u->params = strdupdelim (params_b, params_e);
 785   if (query_b)
 786     u->query = strdupdelim (query_b, query_e);
 787   if (fragment_b)
 788     u->fragment = strdupdelim (fragment_b, fragment_e);
 789
 790   if (path_modified || u->fragment || host_modified || path_b == path_e)
 791     {
 792       /* If we suspect that a transformation has rendered what
 793          url_string might return different from URL_ENCODED, rebuild
 794          u->url using url_string.  */
 795       u->url = url_string (u, 0);
 796
 797       if (url_encoded != url)
 798         xfree ((char *) url_encoded);
 799     }
 800   else
 801     {
 802       if (url_encoded == url)
 803         u->url    = xstrdup (url);
 804       else
 805         u->url    = url_encoded;
 806     }
 807   url_encoded = NULL;
 808
 809   return u;
 810 }
 811
 812 const char *
 813 url_error (int error_code)
 814 {
 815   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 816   return parse_errors[error_code];
 817 }
 818
 819 static void
 820 parse_path (const char *quoted_path, char **dir, char **file)
 821 {
 822   char *path, *last_slash;
 823
 824   STRDUP_ALLOCA (path, quoted_path);
 825   decode_string (path);
 826
 827   last_slash = strrchr (path, '/');
 828   if (!last_slash)
 829     {
 830       *dir = xstrdup ("");
 831       *file = xstrdup (path);
 832     }
 833   else
 834     {
 835       *dir = strdupdelim (path, last_slash);
 836       *file = xstrdup (last_slash + 1);
 837     }
 838 }
 839
 840 /* Note: URL's "full path" is the path with the query string and
 841    params appended.  The "fragment" (#foo) is intentionally ignored,
 842    but that might be changed.  For example, if the original URL was
 843    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 844    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 845
 846 /* Return the length of the full path, without the terminating
 847    zero.  */
 848
 849 static int
 850 full_path_length (const struct url *url)
 851 {
 852   int len = 0;
 853
 854 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 855
 856   FROB (path);
 857   FROB (params);
 858   FROB (query);
 859
 860 #undef FROB
 861
 862   return len;
 863 }
 864
 865 /* Write out the full path. */
 866
 867 static void
 868 full_path_write (const struct url *url, char *where)
 869 {
 870 #define FROB(el, chr) do {                      \
 871   char *f_el = url->el;                         \
 872   if (f_el) {                                   \
 873     int l = strlen (f_el);                      \
 874     *where++ = chr;                             \
 875     memcpy (where, f_el, l);                    \
 876     where += l;                                 \
 877   }                                             \
 878 } while (0)
 879
 880   FROB (path, '/');
 881   FROB (params, ';');
 882   FROB (query, '?');
 883
 884 #undef FROB
 885 }
 886
 887 /* Public function for getting the "full path".  E.g. if u->path is
 888    "foo/bar" and u->query is "param=value", full_path will be
 889    "/foo/bar?param=value". */
 890
 891 char *
 892 url_full_path (const struct url *url)
 893 {
 894   int length = full_path_length (url);
 895   char *full_path = (char *)xmalloc(length + 1);
 896
 897   full_path_write (url, full_path);
 898   full_path[length] = '\0';
 899
 900   return full_path;
 901 }
 902
 903 /* Sync u->path and u->url with u->dir and u->file. */
 904
 905 static void
 906 sync_path (struct url *url)
 907 {
 908   char *newpath;
 909
 910   xfree (url->path);
 911
 912   if (!*url->dir)
 913     {
 914       newpath = xstrdup (url->file);
 915       REENCODE (newpath);
 916     }
 917   else
 918     {
 919       int dirlen = strlen (url->dir);
 920       int filelen = strlen (url->file);
 921
 922       newpath = xmalloc (dirlen + 1 + filelen + 1);
 923       memcpy (newpath, url->dir, dirlen);
 924       newpath[dirlen] = '/';
 925       memcpy (newpath + dirlen + 1, url->file, filelen);
 926       newpath[dirlen + 1 + filelen] = '\0';
 927       REENCODE (newpath);
 928     }
 929
 930   url->path = newpath;
 931
 932   /* Synchronize u->url. */
 933   xfree (url->url);
 934   url->url = url_string (url, 0);
 935 }
 936
 937 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 938    This way we can sync u->path and u->url when they get changed.  */
 939
 940 void
 941 url_set_dir (struct url *url, const char *newdir)
 942 {
 943   xfree (url->dir);
 944   url->dir = xstrdup (newdir);
 945   sync_path (url);
 946 }
 947
 948 void
 949 url_set_file (struct url *url, const char *newfile)
 950 {
 951   xfree (url->file);
 952   url->file = xstrdup (newfile);
 953   sync_path (url);
 954 }
 955
 956 void
 957 url_free (struct url *url)
 958 {
 959   xfree (url->host);
 960   xfree (url->path);
 961   xfree (url->url);
 962
 963   FREE_MAYBE (url->params);
 964   FREE_MAYBE (url->query);
 965   FREE_MAYBE (url->fragment);
 966   FREE_MAYBE (url->user);
 967   FREE_MAYBE (url->passwd);
 968
 969   xfree (url->dir);
 970   xfree (url->file);
 971
 972   xfree (url);
 973 }
 974 \f
 975 struct urlpos *
 976 get_urls_file (const char *file)
 977 {
 978   struct file_memory *fm;
 979   struct urlpos *head, *tail;
 980   const char *text, *text_end;
 981
 982   /* Load the file.  */
 983   fm = read_file (file);
 984   if (!fm)
 985     {
 986       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 987       return NULL;
 988     }
 989   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 990   head = tail = NULL;
 991   text = fm->content;
 992   text_end = fm->content + fm->length;
 993   while (text < text_end)
 994     {
 995       const char *line_beg = text;
 996       const char *line_end = memchr (text, '\n', text_end - text);
 997       if (!line_end)
 998         line_end = text_end;
 999       else
1000         ++line_end;
1001       text = line_end;
1002       while (line_beg < line_end
1003              && ISSPACE (*line_beg))
1004         ++line_beg;
1005       while (line_end > line_beg + 1
1006              && ISSPACE (*(line_end - 1)))
1007         --line_end;
1008       if (line_end > line_beg)
1009         {
1010           /* URL is in the [line_beg, line_end) region. */
1011
1012           int up_error_code;
1013           char *url_text;
1014           struct urlpos *entry;
1015           struct url *url;
1016
1017           /* We must copy the URL to a zero-terminated string, and we
1018              can't use alloca because we're in a loop.  *sigh*.  */
1019           url_text = strdupdelim (line_beg, line_end);
1020
1021           if (opt.base_href)
1022             {
1023               /* Merge opt.base_href with URL. */
1024               char *merged = uri_merge (opt.base_href, url_text);
1025               xfree (url_text);
1026               url_text = merged;
1027             }
1028
1029           url = url_parse (url_text, &up_error_code);
1030           if (!url)
1031             {
1032               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1033                          file, url_text, url_error (up_error_code));
1034               xfree (url_text);
1035               continue;
1036             }
1037           xfree (url_text);
1038
1039           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1040           memset (entry, 0, sizeof (*entry));
1041           entry->next = NULL;
1042           entry->url = url;
1043
1044           if (!head)
1045             head = entry;
1046           else
1047             tail->next = entry;
1048           tail = entry;
1049         }
1050     }
1051   read_file_free (fm);
1052   return head;
1053 }
1054 \f
1055 /* Free the linked list of urlpos.  */
1056 void
1057 free_urlpos (struct urlpos *l)
1058 {
1059   while (l)
1060     {
1061       struct urlpos *next = l->next;
1062       if (l->url)
1063         url_free (l->url);
1064       FREE_MAYBE (l->local_name);
1065       xfree (l);
1066       l = next;
1067     }
1068 }
1069
1070 /* Rotate FNAME opt.backups times */
1071 void
1072 rotate_backups(const char *fname)
1073 {
1074   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1075   char *from = (char *)alloca (maxlen);
1076   char *to = (char *)alloca (maxlen);
1077   struct stat sb;
1078   int i;
1079
1080   if (stat (fname, &sb) == 0)
1081     if (S_ISREG (sb.st_mode) == 0)
1082       return;
1083
1084   for (i = opt.backups; i > 1; i--)
1085     {
1086       sprintf (from, "%s.%d", fname, i - 1);
1087       sprintf (to, "%s.%d", fname, i);
1088       /* #### This will fail on machines without the rename() system
1089          call.  */
1090       rename (from, to);
1091     }
1092
1093   sprintf (to, "%s.%d", fname, 1);
1094   rename(fname, to);
1095 }
1096
1097 /* Create all the necessary directories for PATH (a file).  Calls
1098    mkdirhier() internally.  */
1099 int
1100 mkalldirs (const char *path)
1101 {
1102   const char *p;
1103   char *t;
1104   struct stat st;
1105   int res;
1106
1107   p = path + strlen (path);
1108   for (; *p != '/' && p != path; p--);
1109   /* Don't create if it's just a file.  */
1110   if ((p == path) && (*p != '/'))
1111     return 0;
1112   t = strdupdelim (path, p);
1113   /* Check whether the directory exists.  */
1114   if ((stat (t, &st) == 0))
1115     {
1116       if (S_ISDIR (st.st_mode))
1117         {
1118           xfree (t);
1119           return 0;
1120         }
1121       else
1122         {
1123           /* If the dir exists as a file name, remove it first.  This
1124              is *only* for Wget to work with buggy old CERN http
1125              servers.  Here is the scenario: When Wget tries to
1126              retrieve a directory without a slash, e.g.
1127              http://foo/bar (bar being a directory), CERN server will
1128              not redirect it too http://foo/bar/ -- it will generate a
1129              directory listing containing links to bar/file1,
1130              bar/file2, etc.  Wget will lose because it saves this
1131              HTML listing to a file `bar', so it cannot create the
1132              directory.  To work around this, if the file of the same
1133              name exists, we just remove it and create the directory
1134              anyway.  */
1135           DEBUGP (("Removing %s because of directory danger!\n", t));
1136           unlink (t);
1137         }
1138     }
1139   res = make_directory (t);
1140   if (res != 0)
1141     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1142   xfree (t);
1143   return res;
1144 }
1145
1146 static int
1147 count_slashes (const char *s)
1148 {
1149   int i = 0;
1150   while (*s)
1151     if (*s++ == '/')
1152       ++i;
1153   return i;
1154 }
1155
1156 /* Return the path name of the URL-equivalent file name, with a
1157    remote-like structure of directories.  */
1158 static char *
1159 mkstruct (const struct url *u)
1160 {
1161   char *dir, *dir_preencoding;
1162   char *file, *res, *dirpref;
1163   char *query = u->query && *u->query ? u->query : NULL;
1164   int l;
1165
1166   if (opt.cut_dirs)
1167     {
1168       char *ptr = u->dir + (*u->dir == '/');
1169       int slash_count = 1 + count_slashes (ptr);
1170       int cut = MINVAL (opt.cut_dirs, slash_count);
1171       for (; cut && *ptr; ptr++)
1172         if (*ptr == '/')
1173           --cut;
1174       STRDUP_ALLOCA (dir, ptr);
1175     }
1176   else
1177     dir = u->dir + (*u->dir == '/');
1178
1179   /* Check for the true name (or at least a consistent name for saving
1180      to directory) of HOST, reusing the hlist if possible.  */
1181   if (opt.add_hostdir)
1182     {
1183       /* Add dir_prefix and hostname (if required) to the beginning of
1184          dir.  */
1185       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1186                                 + strlen (u->host)
1187                                 + 1 + numdigit (u->port)
1188                                 + 1);
1189       if (!DOTP (opt.dir_prefix))
1190         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1191       else
1192         strcpy (dirpref, u->host);
1193
1194       if (u->port != scheme_default_port (u->scheme))
1195         {
1196           int len = strlen (dirpref);
1197           dirpref[len] = ':';
1198           long_to_string (dirpref + len + 1, u->port);
1199         }
1200     }
1201   else                          /* not add_hostdir */
1202     {
1203       if (!DOTP (opt.dir_prefix))
1204         dirpref = opt.dir_prefix;
1205       else
1206         dirpref = "";
1207     }
1208
1209   /* If there is a prefix, prepend it.  */
1210   if (*dirpref)
1211     {
1212       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1213       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1214       dir = newdir;
1215     }
1216
1217   dir_preencoding = dir;
1218   dir = reencode_string (dir_preencoding);
1219
1220   l = strlen (dir);
1221   if (l && dir[l - 1] == '/')
1222     dir[l - 1] = '\0';
1223
1224   if (!*u->file)
1225     file = "index.html";
1226   else
1227     file = u->file;
1228
1229   /* Finally, construct the full name.  */
1230   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1231                          + (query ? (1 + strlen (query)) : 0)
1232                          + 1);
1233   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1234   if (query)
1235     {
1236       strcat (res, "?");
1237       strcat (res, query);
1238     }
1239   if (dir != dir_preencoding)
1240     xfree (dir);
1241   return res;
1242 }
1243
1244 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1245    an escaped query string.  The trick is to make sure that unsafe
1246    characters in BASE are escaped, and that slashes in QUERY are also
1247    escaped.  */
1248
1249 static char *
1250 compose_file_name (char *base, char *query)
1251 {
1252   char result[256];
1253   char *from;
1254   char *to = result;
1255
1256   /* Copy BASE to RESULT and encode all unsafe characters.  */
1257   from = base;
1258   while (*from && to - result < sizeof (result))
1259     {
1260       if (UNSAFE_CHAR (*from))
1261         {
1262           unsigned char c = *from++;
1263           *to++ = '%';
1264           *to++ = XDIGIT_TO_XCHAR (c >> 4);
1265           *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1266         }
1267       else
1268         *to++ = *from++;
1269     }
1270
1271   if (query && to - result < sizeof (result))
1272     {
1273       *to++ = '?';
1274
1275       /* Copy QUERY to RESULT and encode all '/' characters. */
1276       from = query;
1277       while (*from && to - result < sizeof (result))
1278         {
1279           if (*from == '/')
1280             {
1281               *to++ = '%';
1282               *to++ = '2';
1283               *to++ = 'F';
1284               ++from;
1285             }
1286           else
1287             *to++ = *from++;
1288         }
1289     }
1290
1291   if (to - result < sizeof (result))
1292     *to = '\0';
1293   else
1294     /* Truncate input which is too long, presumably due to a huge
1295        query string.  */
1296     result[sizeof (result) - 1] = '\0';
1297
1298   return xstrdup (result);
1299 }
1300
1301 /* Create a unique filename, corresponding to a given URL.  Calls
1302    mkstruct if necessary.  Does *not* actually create any directories.  */
1303 char *
1304 url_filename (const struct url *u)
1305 {
1306   char *file, *name;
1307   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1308
1309   if (opt.dirstruct)
1310     {
1311       file = mkstruct (u);
1312       have_prefix = 1;
1313     }
1314   else
1315     {
1316       char *base = *u->file ? u->file : "index.html";
1317       char *query = u->query && *u->query ? u->query : NULL;
1318       file = compose_file_name (base, query);
1319     }
1320
1321   if (!have_prefix)
1322     {
1323       /* Check whether the prefix directory is something other than "."
1324          before prepending it.  */
1325       if (!DOTP (opt.dir_prefix))
1326         {
1327           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1328                                          + 1 + strlen (file) + 1);
1329           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1330           xfree (file);
1331           file = nfile;
1332         }
1333     }
1334   /* DOS-ish file systems don't like `%' signs in them; we change it
1335      to `@'.  */
1336 #ifdef WINDOWS
1337   {
1338     char *p = file;
1339     for (p = file; *p; p++)
1340       if (*p == '%')
1341         *p = '@';
1342   }
1343 #endif /* WINDOWS */
1344
1345   /* Check the cases in which the unique extensions are not used:
1346      1) Clobbering is turned off (-nc).
1347      2) Retrieval with regetting.
1348      3) Timestamping is used.
1349      4) Hierarchy is built.
1350
1351      The exception is the case when file does exist and is a
1352      directory (actually support for bad httpd-s).  */
1353   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1354       && !(file_exists_p (file) && !file_non_directory_p (file)))
1355     return file;
1356
1357   /* Find a unique name.  */
1358   name = unique_name (file);
1359   xfree (file);
1360   return name;
1361 }
1362
1363 /* Like strlen(), but allow the URL to be ended with '?'.  */
1364 static int
1365 urlpath_length (const char *url)
1366 {
1367   const char *q = strpbrk_or_eos (url, "?;#");
1368   return q - url;
1369 }
1370
1371 /* Find the last occurrence of character C in the range [b, e), or
1372    NULL, if none are present.  This is almost completely equivalent to
1373    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1374    the contents of the string.  */
1375 static const char *
1376 find_last_char (const char *b, const char *e, char c)
1377 {
1378   for (; e > b; e--)
1379     if (*e == c)
1380       return e;
1381   return NULL;
1382 }
1383
1384 /* Resolve the result of "linking" a base URI (BASE) to a
1385    link-specified URI (LINK).
1386
1387    Either of the URIs may be absolute or relative, complete with the
1388    host name, or path only.  This tries to behave "reasonably" in all
1389    foreseeable cases.  It employs little specific knowledge about
1390    schemes or URL-specific stuff -- it just works on strings.
1391
1392    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1393    See uri_merge for a gentler interface to this functionality.
1394
1395    Perhaps this function should handle `./' and `../' so that the evil
1396    path_simplify can go.  */
1397 static char *
1398 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1399 {
1400   char *constr;
1401
1402   if (no_scheme)
1403     {
1404       const char *end = base + urlpath_length (base);
1405
1406       if (!*link)
1407         {
1408           /* Empty LINK points back to BASE, query string and all. */
1409           constr = xstrdup (base);
1410         }
1411       else if (*link == '?')
1412         {
1413           /* LINK points to the same location, but changes the query
1414              string.  Examples: */
1415           /* uri_merge("path",         "?new") -> "path?new"     */
1416           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1417           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1418           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1419           int baselength = end - base;
1420           constr = xmalloc (baselength + linklength + 1);
1421           memcpy (constr, base, baselength);
1422           memcpy (constr + baselength, link, linklength);
1423           constr[baselength + linklength] = '\0';
1424         }
1425       else if (*link == '#')
1426         {
1427           /* uri_merge("path",         "#new") -> "path#new"     */
1428           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1429           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1430           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1431           int baselength;
1432           const char *end1 = strchr (base, '#');
1433           if (!end1)
1434             end1 = base + strlen (base);
1435           baselength = end1 - base;
1436           constr = xmalloc (baselength + linklength + 1);
1437           memcpy (constr, base, baselength);
1438           memcpy (constr + baselength, link, linklength);
1439           constr[baselength + linklength] = '\0';
1440         }
1441       else if (*link == '/')
1442         {
1443           /* LINK is an absolute path: we need to replace everything
1444              after (and including) the FIRST slash with LINK.
1445
1446              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1447              "/qux/xyzzy", our result should be
1448              "http://host/qux/xyzzy".  */
1449           int span;
1450           const char *slash;
1451           const char *start_insert = NULL; /* for gcc to shut up. */
1452           const char *pos = base;
1453           int seen_slash_slash = 0;
1454           /* We're looking for the first slash, but want to ignore
1455              double slash. */
1456         again:
1457           slash = memchr (pos, '/', end - pos);
1458           if (slash && !seen_slash_slash)
1459             if (*(slash + 1) == '/')
1460               {
1461                 pos = slash + 2;
1462                 seen_slash_slash = 1;
1463                 goto again;
1464               }
1465
1466           /* At this point, SLASH is the location of the first / after
1467              "//", or the first slash altogether.  START_INSERT is the
1468              pointer to the location where LINK will be inserted.  When
1469              examining the last two examples, keep in mind that LINK
1470              begins with '/'. */
1471
1472           if (!slash && !seen_slash_slash)
1473             /* example: "foo" */
1474             /*           ^    */
1475             start_insert = base;
1476           else if (!slash && seen_slash_slash)
1477             /* example: "http://foo" */
1478             /*                     ^ */
1479             start_insert = end;
1480           else if (slash && !seen_slash_slash)
1481             /* example: "foo/bar" */
1482             /*           ^        */
1483             start_insert = base;
1484           else if (slash && seen_slash_slash)
1485             /* example: "http://something/" */
1486             /*                           ^  */
1487             start_insert = slash;
1488
1489           span = start_insert - base;
1490           constr = (char *)xmalloc (span + linklength + 1);
1491           if (span)
1492             memcpy (constr, base, span);
1493           if (linklength)
1494             memcpy (constr + span, link, linklength);
1495           constr[span + linklength] = '\0';
1496         }
1497       else
1498         {
1499           /* LINK is a relative URL: we need to replace everything
1500              after last slash (possibly empty) with LINK.
1501
1502              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1503              our result should be "whatever/foo/qux/xyzzy".  */
1504           int need_explicit_slash = 0;
1505           int span;
1506           const char *start_insert;
1507           const char *last_slash = find_last_char (base, end, '/');
1508           if (!last_slash)
1509             {
1510               /* No slash found at all.  Append LINK to what we have,
1511                  but we'll need a slash as a separator.
1512
1513                  Example: if base == "foo" and link == "qux/xyzzy", then
1514                  we cannot just append link to base, because we'd get
1515                  "fooqux/xyzzy", whereas what we want is
1516                  "foo/qux/xyzzy".
1517
1518                  To make sure the / gets inserted, we set
1519                  need_explicit_slash to 1.  We also set start_insert
1520                  to end + 1, so that the length calculations work out
1521                  correctly for one more (slash) character.  Accessing
1522                  that character is fine, since it will be the
1523                  delimiter, '\0' or '?'.  */
1524               /* example: "foo?..." */
1525               /*               ^    ('?' gets changed to '/') */
1526               start_insert = end + 1;
1527               need_explicit_slash = 1;
1528             }
1529           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1530             {
1531               /* example: http://host"  */
1532               /*                      ^ */
1533               start_insert = end + 1;
1534               need_explicit_slash = 1;
1535             }
1536           else
1537             {
1538               /* example: "whatever/foo/bar" */
1539               /*                        ^    */
1540               start_insert = last_slash + 1;
1541             }
1542
1543           span = start_insert - base;
1544           constr = (char *)xmalloc (span + linklength + 1);
1545           if (span)
1546             memcpy (constr, base, span);
1547           if (need_explicit_slash)
1548             constr[span - 1] = '/';
1549           if (linklength)
1550             memcpy (constr + span, link, linklength);
1551           constr[span + linklength] = '\0';
1552         }
1553     }
1554   else /* !no_scheme */
1555     {
1556       constr = strdupdelim (link, link + linklength);
1557     }
1558   return constr;
1559 }
1560
1561 /* Merge BASE with LINK and return the resulting URI.  This is an
1562    interface to uri_merge_1 that assumes that LINK is a
1563    zero-terminated string.  */
1564 char *
1565 uri_merge (const char *base, const char *link)
1566 {
1567   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1568 }
1569 \f
1570 #define APPEND(p, s) do {                       \
1571   int len = strlen (s);                         \
1572   memcpy (p, s, len);                           \
1573   p += len;                                     \
1574 } while (0)
1575
1576 /* Use this instead of password when the actual password is supposed
1577    to be hidden.  We intentionally use a generic string without giving
1578    away the number of characters in the password, like previous
1579    versions did.  */
1580 #define HIDDEN_PASSWORD "*password*"
1581
1582 /* Recreate the URL string from the data in URL.
1583
1584    If HIDE is non-zero (as it is when we're calling this on a URL we
1585    plan to print, but not when calling it to canonicalize a URL for
1586    use within the program), password will be hidden.  Unsafe
1587    characters in the URL will be quoted.  */
1588
1589 char *
1590 url_string (const struct url *url, int hide_password)
1591 {
1592   int size;
1593   char *result, *p;
1594   char *quoted_user = NULL, *quoted_passwd = NULL;
1595
1596   int scheme_port  = supported_schemes[url->scheme].default_port;
1597   char *scheme_str = supported_schemes[url->scheme].leading_string;
1598   int fplen = full_path_length (url);
1599
1600   assert (scheme_str != NULL);
1601
1602   /* Make sure the user name and password are quoted. */
1603   if (url->user)
1604     {
1605       quoted_user = encode_string_maybe (url->user);
1606       if (url->passwd)
1607         {
1608           if (hide_password)
1609             quoted_passwd = HIDDEN_PASSWORD;
1610           else
1611             quoted_passwd = encode_string_maybe (url->passwd);
1612         }
1613     }
1614
1615   size = (strlen (scheme_str)
1616           + strlen (url->host)
1617           + fplen
1618           + 1);
1619   if (url->port != scheme_port)
1620     size += 1 + numdigit (url->port);
1621   if (quoted_user)
1622     {
1623       size += 1 + strlen (quoted_user);
1624       if (quoted_passwd)
1625         size += 1 + strlen (quoted_passwd);
1626     }
1627
1628   p = result = xmalloc (size);
1629
1630   APPEND (p, scheme_str);
1631   if (quoted_user)
1632     {
1633       APPEND (p, quoted_user);
1634       if (quoted_passwd)
1635         {
1636           *p++ = ':';
1637           APPEND (p, quoted_passwd);
1638         }
1639       *p++ = '@';
1640     }
1641
1642   APPEND (p, url->host);
1643   if (url->port != scheme_port)
1644     {
1645       *p++ = ':';
1646       long_to_string (p, url->port);
1647       p += strlen (p);
1648     }
1649
1650   full_path_write (url, p);
1651   p += fplen;
1652   *p++ = '\0';
1653
1654   assert (p - result == size);
1655
1656   if (quoted_user && quoted_user != url->user)
1657     xfree (quoted_user);
1658   if (quoted_passwd && !hide_password
1659       && quoted_passwd != url->passwd)
1660     xfree (quoted_passwd);
1661
1662   return result;
1663 }
1664 \f
1665 /* Returns proxy host address, in accordance with SCHEME.  */
1666 char *
1667 getproxy (enum url_scheme scheme)
1668 {
1669   char *proxy = NULL;
1670   char *rewritten_url;
1671   static char rewritten_storage[1024];
1672
1673   switch (scheme)
1674     {
1675     case SCHEME_HTTP:
1676       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1677       break;
1678 #ifdef HAVE_SSL
1679     case SCHEME_HTTPS:
1680       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1681       break;
1682 #endif
1683     case SCHEME_FTP:
1684       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1685       break;
1686     case SCHEME_INVALID:
1687       break;
1688     }
1689   if (!proxy || !*proxy)
1690     return NULL;
1691
1692   /* Handle shorthands. */
1693   rewritten_url = rewrite_shorthand_url (proxy);
1694   if (rewritten_url)
1695     {
1696       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1697       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1698       proxy = rewritten_storage;
1699     }
1700
1701   return proxy;
1702 }
1703
1704 /* Should a host be accessed through proxy, concerning no_proxy?  */
1705 int
1706 no_proxy_match (const char *host, const char **no_proxy)
1707 {
1708   if (!no_proxy)
1709     return 1;
1710   else
1711     return !sufmatch (no_proxy, host);
1712 }
1713 \f
1714 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1715 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1716                                          const char *));
1717 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1718                                                       const char *, int));
1719 static char *local_quote_string PARAMS ((const char *));
1720
1721 /* Change the links in one HTML file.  LINKS is a list of links in the
1722    document, along with their positions and the desired direction of
1723    the conversion.  */
1724 void
1725 convert_links (const char *file, struct urlpos *links)
1726 {
1727   struct file_memory *fm;
1728   FILE *fp;
1729   const char *p;
1730   downloaded_file_t downloaded_file_return;
1731
1732   struct urlpos *link;
1733   int to_url_count = 0, to_file_count = 0;
1734
1735   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1736
1737   {
1738     /* First we do a "dry run": go through the list L and see whether
1739        any URL needs to be converted in the first place.  If not, just
1740        leave the file alone.  */
1741     int dry_count = 0;
1742     struct urlpos *dry = links;
1743     for (dry = links; dry; dry = dry->next)
1744       if (dry->convert != CO_NOCONVERT)
1745         ++dry_count;
1746     if (!dry_count)
1747       {
1748         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1749         return;
1750       }
1751   }
1752
1753   fm = read_file (file);
1754   if (!fm)
1755     {
1756       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1757                  file, strerror (errno));
1758       return;
1759     }
1760
1761   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1762   if (opt.backup_converted && downloaded_file_return)
1763     write_backup_file (file, downloaded_file_return);
1764
1765   /* Before opening the file for writing, unlink the file.  This is
1766      important if the data in FM is mmaped.  In such case, nulling the
1767      file, which is what fopen() below does, would make us read all
1768      zeroes from the mmaped region.  */
1769   if (unlink (file) < 0 && errno != ENOENT)
1770     {
1771       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1772                  file, strerror (errno));
1773       read_file_free (fm);
1774       return;
1775     }
1776   /* Now open the file for writing.  */
1777   fp = fopen (file, "wb");
1778   if (!fp)
1779     {
1780       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1781                  file, strerror (errno));
1782       read_file_free (fm);
1783       return;
1784     }
1785
1786   /* Here we loop through all the URLs in file, replacing those of
1787      them that are downloaded with relative references.  */
1788   p = fm->content;
1789   for (link = links; link; link = link->next)
1790     {
1791       char *url_start = fm->content + link->pos;
1792
1793       if (link->pos >= fm->length)
1794         {
1795           DEBUGP (("Something strange is going on.  Please investigate."));
1796           break;
1797         }
1798       /* If the URL is not to be converted, skip it.  */
1799       if (link->convert == CO_NOCONVERT)
1800         {
1801           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1802           continue;
1803         }
1804
1805       /* Echo the file contents, up to the offending URL's opening
1806          quote, to the outfile.  */
1807       fwrite (p, 1, url_start - p, fp);
1808       p = url_start;
1809
1810       switch (link->convert)
1811         {
1812         case CO_CONVERT_TO_RELATIVE:
1813           /* Convert absolute URL to relative. */
1814           {
1815             char *newname = construct_relative (file, link->local_name);
1816             char *quoted_newname = local_quote_string (newname);
1817
1818             if (!link->link_refresh_p)
1819               p = replace_attr (p, link->size, fp, quoted_newname);
1820             else
1821               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1822                                              link->refresh_timeout);
1823
1824             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1825                      link->url->url, newname, link->pos, file));
1826             xfree (newname);
1827             xfree (quoted_newname);
1828             ++to_file_count;
1829             break;
1830           }
1831         case CO_CONVERT_TO_COMPLETE:
1832           /* Convert the link to absolute URL. */
1833           {
1834             char *newlink = link->url->url;
1835             char *quoted_newlink = html_quote_string (newlink);
1836
1837             if (!link->link_refresh_p)
1838               p = replace_attr (p, link->size, fp, quoted_newlink);
1839             else
1840               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1841                                              link->refresh_timeout);
1842
1843             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1844                      newlink, link->pos, file));
1845             xfree (quoted_newlink);
1846             ++to_url_count;
1847             break;
1848           }
1849         case CO_NULLIFY_BASE:
1850           /* Change the base href to "". */
1851           p = replace_attr (p, link->size, fp, "");
1852           break;
1853         case CO_NOCONVERT:
1854           abort ();
1855           break;
1856         }
1857     }
1858
1859   /* Output the rest of the file. */
1860   if (p - fm->content < fm->length)
1861     fwrite (p, 1, fm->length - (p - fm->content), fp);
1862   fclose (fp);
1863   read_file_free (fm);
1864
1865   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1866 }
1867
1868 /* Construct and return a malloced copy of the relative link from two
1869    pieces of information: local name S1 of the referring file and
1870    local name S2 of the referred file.
1871
1872    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1873    "jagor.srce.hr/images/news.gif", the function will return
1874    "images/news.gif".
1875
1876    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1877    "fly.cc.fer.hr/images/fly.gif", the function will return
1878    "../images/fly.gif".
1879
1880    Caveats: S1 should not begin with `/', unless S2 also begins with
1881    '/'.  S1 should not contain things like ".." and such --
1882    construct_relative ("fly/ioccc/../index.html",
1883    "fly/images/fly.gif") will fail.  (A workaround is to call
1884    something like path_simplify() on S1).  */
1885 static char *
1886 construct_relative (const char *s1, const char *s2)
1887 {
1888   int i, cnt, sepdirs1;
1889   char *res;
1890
1891   if (*s2 == '/')
1892     return xstrdup (s2);
1893   /* S1 should *not* be absolute, if S2 wasn't.  */
1894   assert (*s1 != '/');
1895   i = cnt = 0;
1896   /* Skip the directories common to both strings.  */
1897   while (1)
1898     {
1899       while (s1[i] && s2[i]
1900              && (s1[i] == s2[i])
1901              && (s1[i] != '/')
1902              && (s2[i] != '/'))
1903         ++i;
1904       if (s1[i] == '/' && s2[i] == '/')
1905         cnt = ++i;
1906       else
1907         break;
1908     }
1909   for (sepdirs1 = 0; s1[i]; i++)
1910     if (s1[i] == '/')
1911       ++sepdirs1;
1912   /* Now, construct the file as of:
1913      - ../ repeated sepdirs1 time
1914      - all the non-mutual directories of S2.  */
1915   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1916   for (i = 0; i < sepdirs1; i++)
1917     memcpy (res + 3 * i, "../", 3);
1918   strcpy (res + 3 * i, s2 + cnt);
1919   return res;
1920 }
1921 \f
1922 static void
1923 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1924 {
1925   /* Rather than just writing over the original .html file with the
1926      converted version, save the former to *.orig.  Note we only do
1927      this for files we've _successfully_ downloaded, so we don't
1928      clobber .orig files sitting around from previous invocations. */
1929
1930   /* Construct the backup filename as the original name plus ".orig". */
1931   size_t         filename_len = strlen(file);
1932   char*          filename_plus_orig_suffix;
1933   boolean        already_wrote_backup_file = FALSE;
1934   slist*         converted_file_ptr;
1935   static slist*  converted_files = NULL;
1936
1937   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1938     {
1939       /* Just write "orig" over "html".  We need to do it this way
1940          because when we're checking to see if we've downloaded the
1941          file before (to see if we can skip downloading it), we don't
1942          know if it's a text/html file.  Therefore we don't know yet
1943          at that stage that -E is going to cause us to tack on
1944          ".html", so we need to compare vs. the original URL plus
1945          ".orig", not the original URL plus ".html.orig". */
1946       filename_plus_orig_suffix = alloca (filename_len + 1);
1947       strcpy(filename_plus_orig_suffix, file);
1948       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1949     }
1950   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1951     {
1952       /* Append ".orig" to the name. */
1953       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1954       strcpy(filename_plus_orig_suffix, file);
1955       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1956     }
1957
1958   /* We can get called twice on the same URL thanks to the
1959      convert_all_links() call in main().  If we write the .orig file
1960      each time in such a case, it'll end up containing the first-pass
1961      conversion, not the original file.  So, see if we've already been
1962      called on this file. */
1963   converted_file_ptr = converted_files;
1964   while (converted_file_ptr != NULL)
1965     if (strcmp(converted_file_ptr->string, file) == 0)
1966       {
1967         already_wrote_backup_file = TRUE;
1968         break;
1969       }
1970     else
1971       converted_file_ptr = converted_file_ptr->next;
1972
1973   if (!already_wrote_backup_file)
1974     {
1975       /* Rename <file> to <file>.orig before former gets written over. */
1976       if (rename(file, filename_plus_orig_suffix) != 0)
1977         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1978                    file, filename_plus_orig_suffix, strerror (errno));
1979
1980       /* Remember that we've already written a .orig backup for this file.
1981          Note that we never free this memory since we need it till the
1982          convert_all_links() call, which is one of the last things the
1983          program does before terminating.  BTW, I'm not sure if it would be
1984          safe to just set 'converted_file_ptr->string' to 'file' below,
1985          rather than making a copy of the string...  Another note is that I
1986          thought I could just add a field to the urlpos structure saying
1987          that we'd written a .orig file for this URL, but that didn't work,
1988          so I had to make this separate list.
1989          -- Dan Harkless <wget@harkless.org>
1990
1991          This [adding a field to the urlpos structure] didn't work
1992          because convert_file() is called from convert_all_links at
1993          the end of the retrieval with a freshly built new urlpos
1994          list.
1995          -- Hrvoje Niksic <hniksic@arsdigita.com>
1996       */
1997       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1998       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1999       converted_file_ptr->next = converted_files;
2000       converted_files = converted_file_ptr;
2001     }
2002 }
2003
2004 static int find_fragment PARAMS ((const char *, int, const char **,
2005                                   const char **));
2006
2007 /* Replace an attribute's original text with NEW_TEXT. */
2008
2009 static const char *
2010 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2011 {
2012   int quote_flag = 0;
2013   char quote_char = '\"';       /* use "..." for quoting, unless the
2014                                    original value is quoted, in which
2015                                    case reuse its quoting char. */
2016   const char *frag_beg, *frag_end;
2017
2018   /* Structure of our string is:
2019        "...old-contents..."
2020        <---    size    --->  (with quotes)
2021      OR:
2022        ...old-contents...
2023        <---    size   -->    (no quotes)   */
2024
2025   if (*p == '\"' || *p == '\'')
2026     {
2027       quote_char = *p;
2028       quote_flag = 1;
2029       ++p;
2030       size -= 2;                /* disregard opening and closing quote */
2031     }
2032   putc (quote_char, fp);
2033   fputs (new_text, fp);
2034
2035   /* Look for fragment identifier, if any. */
2036   if (find_fragment (p, size, &frag_beg, &frag_end))
2037     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2038   p += size;
2039   if (quote_flag)
2040     ++p;
2041   putc (quote_char, fp);
2042
2043   return p;
2044 }
2045
2046 /* The same as REPLACE_ATTR, but used when replacing
2047    <meta http-equiv=refresh content="new_text"> because we need to
2048    append "timeout_value; URL=" before the next_text.  */
2049
2050 static const char *
2051 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2052                            const char *new_text, int timeout)
2053 {
2054   /* "0; URL=..." */
2055   char *new_with_timeout = (char *)alloca (numdigit (timeout)
2056                                            + 6 /* "; URL=" */
2057                                            + strlen (new_text)
2058                                            + 1);
2059   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2060
2061   return replace_attr (p, size, fp, new_with_timeout);
2062 }
2063
2064 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2065    preceded by '&'.  If the character is not found, return zero.  If
2066    the character is found, return 1 and set BP and EP to point to the
2067    beginning and end of the region.
2068
2069    This is used for finding the fragment indentifiers in URLs.  */
2070
2071 static int
2072 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2073 {
2074   const char *end = beg + size;
2075   int saw_amp = 0;
2076   for (; beg < end; beg++)
2077     {
2078       switch (*beg)
2079         {
2080         case '&':
2081           saw_amp = 1;
2082           break;
2083         case '#':
2084           if (!saw_amp)
2085             {
2086               *bp = beg;
2087               *ep = end;
2088               return 1;
2089             }
2090           /* fallthrough */
2091         default:
2092           saw_amp = 0;
2093         }
2094     }
2095   return 0;
2096 }
2097
2098 /* Quote FILE for use as local reference to an HTML file.
2099
2100    We quote ? as %3F to avoid passing part of the file name as the
2101    parameter when browsing the converted file through HTTP.  However,
2102    it is safe to do this only when `--html-extension' is turned on.
2103    This is because converting "index.html?foo=bar" to
2104    "index.html%3Ffoo=bar" would break local browsing, as the latter
2105    isn't even recognized as an HTML file!  However, converting
2106    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2107    safe for both local and HTTP-served browsing.  */
2108
2109 static char *
2110 local_quote_string (const char *file)
2111 {
2112   const char *file_sans_qmark;
2113   int qm;
2114
2115   if (!opt.html_extension)
2116     return html_quote_string (file);
2117
2118   qm = count_char (file, '?');
2119
2120   if (qm)
2121     {
2122       const char *from = file;
2123       char *to, *newname;
2124
2125       /* qm * 2 because we replace each question mark with "%3F",
2126          i.e. replace one char with three, hence two more.  */
2127       int fsqlen = strlen (file) + qm * 2;
2128
2129       to = newname = (char *)alloca (fsqlen + 1);
2130       for (; *from; from++)
2131         {
2132           if (*from != '?')
2133             *to++ = *from;
2134           else
2135             {
2136               *to++ = '%';
2137               *to++ = '3';
2138               *to++ = 'F';
2139             }
2140         }
2141       assert (to - newname == fsqlen);
2142       *to = '\0';
2143
2144       file_sans_qmark = newname;
2145     }
2146   else
2147     file_sans_qmark = file;
2148
2149   return html_quote_string (file_sans_qmark);
2150 }
2151
2152 /* We're storing "modes" of type downloaded_file_t in the hash table.
2153    However, our hash tables only accept pointers for keys and values.
2154    So when we need a pointer, we use the address of a
2155    downloaded_file_t variable of static storage.  */
2156
2157 static downloaded_file_t *
2158 downloaded_mode_to_ptr (downloaded_file_t mode)
2159 {
2160   static downloaded_file_t
2161     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2162     v2 = FILE_DOWNLOADED_NORMALLY,
2163     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2164     v4 = CHECK_FOR_FILE;
2165
2166   switch (mode)
2167     {
2168     case FILE_NOT_ALREADY_DOWNLOADED:
2169       return &v1;
2170     case FILE_DOWNLOADED_NORMALLY:
2171       return &v2;
2172     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2173       return &v3;
2174     case CHECK_FOR_FILE:
2175       return &v4;
2176     }
2177   return NULL;
2178 }
2179
2180 /* This should really be merged with dl_file_url_map and
2181    downloaded_html_files in recur.c.  This was originally a list, but
2182    I changed it to a hash table beause it was actually taking a lot of
2183    time to find things in it.  */
2184
2185 static struct hash_table *downloaded_files_hash;
2186
2187 /* Remembers which files have been downloaded.  In the standard case, should be
2188    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2189    download successfully (i.e. not for ones we have failures on or that we skip
2190    due to -N).
2191
2192    When we've downloaded a file and tacked on a ".html" extension due to -E,
2193    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2194    FILE_DOWNLOADED_NORMALLY.
2195
2196    If you just want to check if a file has been previously added without adding
2197    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2198    with local filenames, not remote URLs. */
2199 downloaded_file_t
2200 downloaded_file (downloaded_file_t mode, const char *file)
2201 {
2202   downloaded_file_t *ptr;
2203
2204   if (mode == CHECK_FOR_FILE)
2205     {
2206       if (!downloaded_files_hash)
2207         return FILE_NOT_ALREADY_DOWNLOADED;
2208       ptr = hash_table_get (downloaded_files_hash, file);
2209       if (!ptr)
2210         return FILE_NOT_ALREADY_DOWNLOADED;
2211       return *ptr;
2212     }
2213
2214   if (!downloaded_files_hash)
2215     downloaded_files_hash = make_string_hash_table (0);
2216
2217   ptr = hash_table_get (downloaded_files_hash, file);
2218   if (ptr)
2219     return *ptr;
2220
2221   ptr = downloaded_mode_to_ptr (mode);
2222   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2223
2224   return FILE_NOT_ALREADY_DOWNLOADED;
2225 }
2226
2227 static int
2228 df_free_mapper (void *key, void *value, void *ignored)
2229 {
2230   xfree (key);
2231   return 0;
2232 }
2233
2234 void
2235 downloaded_files_free (void)
2236 {
2237   if (downloaded_files_hash)
2238     {
2239       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2240       hash_table_destroy (downloaded_files_hash);
2241       downloaded_files_hash = NULL;
2242     }
2243 }