sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  53 struct scheme_data
  54 {
  55   char *leading_string;
  56   int default_port;
  57 };
  58
  59 /* Supported schemes: */
  60 static struct scheme_data supported_schemes[] =
  61 {
  62   { "http://",  DEFAULT_HTTP_PORT },
  63 #ifdef HAVE_SSL
  64   { "https://", DEFAULT_HTTPS_PORT },
  65 #endif
  66   { "ftp://",   DEFAULT_FTP_PORT },
  67
  68   /* SCHEME_INVALID */
  69   { NULL,       -1 }
  70 };
  71
  72 static char *construct_relative PARAMS ((const char *, const char *));
  73
  74 \f
  75 /* Support for encoding and decoding of URL strings.  We determine
  76    whether a character is unsafe through static table lookup.  This
  77    code assumes ASCII character set and 8-bit chars.  */
  78
  79 enum {
  80   urlchr_reserved = 1,
  81   urlchr_unsafe   = 2
  82 };
  83
  84 #define R  urlchr_reserved
  85 #define U  urlchr_unsafe
  86 #define RU R|U
  87
  88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  89
  90 /* rfc1738 reserved chars, preserved from encoding.  */
  91
  92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93
  94 /* rfc1738 unsafe chars, plus some more.  */
  95
  96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  97
  98 const static unsigned char urlchr_table[256] =
  99 {
 100   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 104   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 105   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 106   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 107   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 108  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 109   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 111   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 112   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 113   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 115   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 116
 117   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126 };
 127
 128 /* Decodes the forms %xy in a URL to the character the hexadecimal
 129    code of which is xy.  xy are hexadecimal digits from
 130    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 131    hex-digits or `%' precedes `\0', the sequence is inserted
 132    literally.  */
 133
 134 static void
 135 decode_string (char *s)
 136 {
 137   char *t = s;                  /* t - tortoise */
 138   char *h = s;                  /* h - hare     */
 139
 140   for (; *h; h++, t++)
 141     {
 142       if (*h != '%')
 143         {
 144         copychar:
 145           *t = *h;
 146         }
 147       else
 148         {
 149           /* Do nothing if '%' is not followed by two hex digits. */
 150           if (!*(h + 1) || !*(h + 2)
 151               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 152             goto copychar;
 153           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 154           h += 2;
 155         }
 156     }
 157   *t = '\0';
 158 }
 159
 160 /* Like encode_string, but return S if there are no unsafe chars.  */
 161
 162 static char *
 163 encode_string_maybe (const char *s)
 164 {
 165   const char *p1;
 166   char *p2, *newstr;
 167   int newlen;
 168   int addition = 0;
 169
 170   for (p1 = s; *p1; p1++)
 171     if (UNSAFE_CHAR (*p1))
 172       addition += 2;            /* Two more characters (hex digits) */
 173
 174   if (!addition)
 175     return (char *)s;
 176
 177   newlen = (p1 - s) + addition;
 178   newstr = (char *)xmalloc (newlen + 1);
 179
 180   p1 = s;
 181   p2 = newstr;
 182   while (*p1)
 183     {
 184       if (UNSAFE_CHAR (*p1))
 185         {
 186           unsigned char c = *p1++;
 187           *p2++ = '%';
 188           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 189           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 190         }
 191       else
 192         *p2++ = *p1++;
 193     }
 194   *p2 = '\0';
 195   assert (p2 - newstr == newlen);
 196
 197   return newstr;
 198 }
 199
 200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 201    given string, returning a malloc-ed %XX encoded string.  */
 202
 203 char *
 204 encode_string (const char *s)
 205 {
 206   char *encoded = encode_string_maybe (s);
 207   if (encoded != s)
 208     return encoded;
 209   else
 210     return xstrdup (s);
 211 }
 212
 213 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 214    the old value of PTR is freed and PTR is made to point to the newly
 215    allocated storage.  */
 216
 217 #define ENCODE(ptr) do {                        \
 218   char *e_new = encode_string_maybe (ptr);      \
 219   if (e_new != ptr)                             \
 220     {                                           \
 221       xfree (ptr);                              \
 222       ptr = e_new;                              \
 223     }                                           \
 224 } while (0)
 225 \f
 226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 227
 228 /* Decide whether to encode, decode, or pass through the char at P.
 229    This used to be a macro, but it got a little too convoluted.  */
 230 static inline enum copy_method
 231 decide_copy_method (const char *p)
 232 {
 233   if (*p == '%')
 234     {
 235       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 236         {
 237           /* %xx sequence: decode it, unless it would decode to an
 238              unsafe or a reserved char; in that case, leave it as
 239              is. */
 240           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 241             XCHAR_TO_XDIGIT (*(p + 2));
 242
 243           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 244             return CM_PASSTHROUGH;
 245           else
 246             return CM_DECODE;
 247         }
 248       else
 249         /* Garbled %.. sequence: encode `%'. */
 250         return CM_ENCODE;
 251     }
 252   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 253     return CM_ENCODE;
 254   else
 255     return CM_PASSTHROUGH;
 256 }
 257
 258 /* Translate a %-quoting (but possibly non-conformant) input string S
 259    into a %-quoting (and conformant) output string.  If no characters
 260    are encoded or decoded, return the same string S; otherwise, return
 261    a freshly allocated string with the new contents.
 262
 263    After a URL has been run through this function, the protocols that
 264    use `%' as the quote character can use the resulting string as-is,
 265    while those that don't call decode_string() to get to the intended
 266    data.  This function is also stable: after an input string is
 267    transformed the first time, all further transformations of the
 268    result yield the same result string.
 269
 270    Let's discuss why this function is needed.
 271
 272    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 273    space character would mess up the HTTP request, it needs to be
 274    quoted, like this:
 275
 276        GET /abc%20def HTTP/1.0
 277
 278    So it appears that the unsafe chars need to be quoted, as with
 279    encode_string.  But what if we're requested to download
 280    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 281    the user meant was a literal space, and he was kind enough to quote
 282    it.  In that case, Wget should obviously leave the `%20' as is, and
 283    send the same request as above.  So in this case we may not call
 284    encode_string.
 285
 286    But what if the requested URI is `abc%20 def'?  If we call
 287    encode_string, we end up with `/abc%2520%20def', which is almost
 288    certainly not intended.  If we don't call encode_string, we are
 289    left with the embedded space and cannot send the request.  What the
 290    user meant was for Wget to request `/abc%20%20def', and this is
 291    where reencode_string kicks in.
 292
 293    Wget used to solve this by first decoding %-quotes, and then
 294    encoding all the "unsafe" characters found in the resulting string.
 295    This was wrong because it didn't preserve certain URL special
 296    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 297    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 298    whether we considered `+' reserved (it is).  One of these results
 299    is inevitable because by the second step we would lose information
 300    on whether the `+' was originally encoded or not.  Both results
 301    were wrong because in CGI parameters + means space, while %2B means
 302    literal plus.  reencode_string correctly translates the above to
 303    "a%2B+b", i.e. returns the original string.
 304
 305    This function uses an algorithm proposed by Anon Sricharoenchai:
 306
 307    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 308       hexdigits.
 309
 310    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 311       "+".
 312
 313    ...except that this code conflates the two steps, and decides
 314    whether to encode, decode, or pass through each character in turn.
 315    The function still uses two passes, but their logic is the same --
 316    the first pass exists merely for the sake of allocation.  Another
 317    small difference is that we include `+' to URL_RESERVED.
 318
 319    Anon's test case:
 320
 321    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 322    ->
 323    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 324
 325    Simpler test cases:
 326
 327    "foo bar"         -> "foo%20bar"
 328    "foo%20bar"       -> "foo%20bar"
 329    "foo %20bar"      -> "foo%20%20bar"
 330    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 331    "foo%25%20bar"    -> "foo%25%20bar"
 332    "foo%2%20bar"     -> "foo%252%20bar"
 333    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 334    "foo%2b+bar"      -> "foo%2b+bar"  */
 335
 336 char *
 337 reencode_string (const char *s)
 338 {
 339   const char *p1;
 340   char *newstr, *p2;
 341   int oldlen, newlen;
 342
 343   int encode_count = 0;
 344   int decode_count = 0;
 345
 346   /* First, pass through the string to see if there's anything to do,
 347      and to calculate the new length.  */
 348   for (p1 = s; *p1; p1++)
 349     {
 350       switch (decide_copy_method (p1))
 351         {
 352         case CM_ENCODE:
 353           ++encode_count;
 354           break;
 355         case CM_DECODE:
 356           ++decode_count;
 357           break;
 358         case CM_PASSTHROUGH:
 359           break;
 360         }
 361     }
 362
 363   if (!encode_count && !decode_count)
 364     /* The string is good as it is. */
 365     return (char *)s;           /* C const model sucks. */
 366
 367   oldlen = p1 - s;
 368   /* Each encoding adds two characters (hex digits), while each
 369      decoding removes two characters.  */
 370   newlen = oldlen + 2 * (encode_count - decode_count);
 371   newstr = xmalloc (newlen + 1);
 372
 373   p1 = s;
 374   p2 = newstr;
 375
 376   while (*p1)
 377     {
 378       switch (decide_copy_method (p1))
 379         {
 380         case CM_ENCODE:
 381           {
 382             unsigned char c = *p1++;
 383             *p2++ = '%';
 384             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 385             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 386           }
 387           break;
 388         case CM_DECODE:
 389           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 390                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 391           p1 += 3;              /* skip %xx */
 392           break;
 393         case CM_PASSTHROUGH:
 394           *p2++ = *p1++;
 395         }
 396     }
 397   *p2 = '\0';
 398   assert (p2 - newstr == newlen);
 399   return newstr;
 400 }
 401
 402 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 403    free PTR_VAR and make it point to the new storage.  Obviously,
 404    PTR_VAR needs to be an lvalue.  */
 405
 406 #define REENCODE(ptr_var) do {                  \
 407   char *rf_new = reencode_string (ptr_var);     \
 408   if (rf_new != ptr_var)                        \
 409     {                                           \
 410       xfree (ptr_var);                          \
 411       ptr_var = rf_new;                         \
 412     }                                           \
 413 } while (0)
 414 \f
 415 /* Returns the scheme type if the scheme is supported, or
 416    SCHEME_INVALID if not.  */
 417 enum url_scheme
 418 url_scheme (const char *url)
 419 {
 420   int i;
 421
 422   for (i = 0; supported_schemes[i].leading_string; i++)
 423     if (!strncasecmp (url, supported_schemes[i].leading_string,
 424                       strlen (supported_schemes[i].leading_string)))
 425       return (enum url_scheme)i;
 426   return SCHEME_INVALID;
 427 }
 428
 429 /* Return the number of characters needed to skip the scheme part of
 430    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 431 int
 432 url_skip_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 437      etc. */
 438   while (ISALNUM (*p) || *p == '-' || *p == '+')
 439     ++p;
 440   if (*p != ':')
 441     return 0;
 442   /* Skip ':'. */
 443   ++p;
 444
 445   /* Skip "//" if found. */
 446   if (*p == '/' && *(p + 1) == '/')
 447     p += 2;
 448
 449   return p - url;
 450 }
 451
 452 /* Returns 1 if the URL begins with a scheme (supported or
 453    unsupported), 0 otherwise.  */
 454 int
 455 url_has_scheme (const char *url)
 456 {
 457   const char *p = url;
 458   while (ISALNUM (*p) || *p == '-' || *p == '+')
 459     ++p;
 460   return *p == ':';
 461 }
 462
 463 int
 464 scheme_default_port (enum url_scheme scheme)
 465 {
 466   return supported_schemes[scheme].default_port;
 467 }
 468
 469 /* Skip the username and password, if present here.  The function
 470    should be called *not* with the complete URL, but with the part
 471    right after the scheme.
 472
 473    If no username and password are found, return 0.  */
 474 int
 475 url_skip_uname (const char *url)
 476 {
 477   const char *p;
 478
 479   /* Look for '@' that comes before '/' or '?'. */
 480   p = (const char *)strpbrk (url, "/?@");
 481   if (!p || *p != '@')
 482     return 0;
 483
 484   return p - url + 1;
 485 }
 486
 487 static int
 488 parse_uname (const char *str, int len, char **user, char **passwd)
 489 {
 490   char *colon;
 491
 492   if (len == 0)
 493     /* Empty user name not allowed. */
 494     return 0;
 495
 496   colon = memchr (str, ':', len);
 497   if (colon == str)
 498     /* Empty user name again. */
 499     return 0;
 500
 501   if (colon)
 502     {
 503       int pwlen = len - (colon + 1 - str);
 504       *passwd = xmalloc (pwlen + 1);
 505       memcpy (*passwd, colon + 1, pwlen);
 506       (*passwd)[pwlen] = '\0';
 507       len -= pwlen + 1;
 508     }
 509   else
 510     *passwd = NULL;
 511
 512   *user = xmalloc (len + 1);
 513   memcpy (*user, str, len);
 514   (*user)[len] = '\0';
 515
 516   return 1;
 517 }
 518
 519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 520    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 521
 522    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 523    www.foo.com[:port]            -> http://www.foo.com[:port]
 524
 525    FTP shorthands look like this:
 526
 527    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 528    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 529
 530    If the URL needs not or cannot be rewritten, return NULL.  */
 531 char *
 532 rewrite_shorthand_url (const char *url)
 533 {
 534   const char *p;
 535
 536   if (url_has_scheme (url))
 537     return NULL;
 538
 539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 540      latter Netscape.  */
 541   for (p = url; *p && *p != ':' && *p != '/'; p++)
 542     ;
 543
 544   if (p == url)
 545     return NULL;
 546
 547   if (*p == ':')
 548     {
 549       const char *pp, *path;
 550       char *res;
 551       /* If the characters after the colon and before the next slash
 552          or end of string are all digits, it's HTTP.  */
 553       int digits = 0;
 554       for (pp = p + 1; ISDIGIT (*pp); pp++)
 555         ++digits;
 556       if (digits > 0
 557           && (*pp == '/' || *pp == '\0'))
 558         goto http;
 559
 560       /* Prepend "ftp://" to the entire URL... */
 561       path = p + 1;
 562       res = xmalloc (6 + strlen (url) + 1);
 563       sprintf (res, "ftp://%s", url);
 564       /* ...and replace ':' with '/'. */
 565       res[6 + (p - url)] = '/';
 566       return res;
 567     }
 568   else
 569     {
 570       char *res;
 571     http:
 572       /* Just prepend "http://" to what we have. */
 573       res = xmalloc (7 + strlen (url) + 1);
 574       sprintf (res, "http://%s", url);
 575       return res;
 576     }
 577 }
 578 \f
 579 static void parse_path PARAMS ((const char *, char **, char **));
 580
 581 static char *
 582 strpbrk_or_eos (const char *s, const char *accept)
 583 {
 584   char *p = strpbrk (s, accept);
 585   if (!p)
 586     p = (char *)s + strlen (s);
 587   return p;
 588 }
 589
 590 /* Turn STR into lowercase; return non-zero if a character was
 591    actually changed. */
 592
 593 static int
 594 lowercase_str (char *str)
 595 {
 596   int change = 0;
 597   for (; *str; str++)
 598     if (!ISLOWER (*str))
 599       {
 600         change = 1;
 601         *str = TOLOWER (*str);
 602       }
 603   return change;
 604 }
 605
 606 static char *parse_errors[] = {
 607 #define PE_NO_ERROR            0
 608   "No error",
 609 #define PE_UNRECOGNIZED_SCHEME 1
 610   "Unrecognized scheme",
 611 #define PE_EMPTY_HOST          2
 612   "Empty host",
 613 #define PE_BAD_PORT_NUMBER     3
 614   "Bad port number",
 615 #define PE_INVALID_USER_NAME   4
 616   "Invalid user name"
 617 };
 618
 619 #define SETERR(p, v) do {                       \
 620   if (p)                                        \
 621     *(p) = (v);                                 \
 622 } while (0)
 623
 624 /* Parse a URL.
 625
 626    Return a new struct url if successful, NULL on error.  In case of
 627    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 628    error code. */
 629 struct url *
 630 url_parse (const char *url, int *error)
 631 {
 632   struct url *u;
 633   const char *p;
 634   int path_modified, host_modified;
 635
 636   enum url_scheme scheme;
 637
 638   const char *uname_b,     *uname_e;
 639   const char *host_b,      *host_e;
 640   const char *path_b,      *path_e;
 641   const char *params_b,    *params_e;
 642   const char *query_b,     *query_e;
 643   const char *fragment_b,  *fragment_e;
 644
 645   int port;
 646   char *user = NULL, *passwd = NULL;
 647
 648   char *url_encoded;
 649
 650   scheme = url_scheme (url);
 651   if (scheme == SCHEME_INVALID)
 652     {
 653       SETERR (error, PE_UNRECOGNIZED_SCHEME);
 654       return NULL;
 655     }
 656
 657   url_encoded = reencode_string (url);
 658   p = url_encoded;
 659
 660   p += strlen (supported_schemes[scheme].leading_string);
 661   uname_b = p;
 662   p += url_skip_uname (p);
 663   uname_e = p;
 664
 665   /* scheme://user:pass@host[:port]... */
 666   /*                    ^              */
 667
 668   /* We attempt to break down the URL into the components path,
 669      params, query, and fragment.  They are ordered like this:
 670
 671        scheme://host[:port][/path][;params][?query][#fragment]  */
 672
 673   params_b   = params_e   = NULL;
 674   query_b    = query_e    = NULL;
 675   fragment_b = fragment_e = NULL;
 676
 677   host_b = p;
 678   p = strpbrk_or_eos (p, ":/;?#");
 679   host_e = p;
 680
 681   if (host_b == host_e)
 682     {
 683       SETERR (error, PE_EMPTY_HOST);
 684       return NULL;
 685     }
 686
 687   port = scheme_default_port (scheme);
 688   if (*p == ':')
 689     {
 690       const char *port_b, *port_e, *pp;
 691
 692       /* scheme://host:port/tralala */
 693       /*              ^             */
 694       ++p;
 695       port_b = p;
 696       p = strpbrk_or_eos (p, "/;?#");
 697       port_e = p;
 698
 699       if (port_b == port_e)
 700         {
 701           /* http://host:/whatever */
 702           /*             ^         */
 703           SETERR (error, PE_BAD_PORT_NUMBER);
 704           return NULL;
 705         }
 706
 707       for (port = 0, pp = port_b; pp < port_e; pp++)
 708         {
 709           if (!ISDIGIT (*pp))
 710             {
 711               /* http://host:12randomgarbage/blah */
 712               /*               ^                  */
 713               SETERR (error, PE_BAD_PORT_NUMBER);
 714               return NULL;
 715             }
 716           port = 10 * port + (*pp - '0');
 717         }
 718     }
 719
 720   if (*p == '/')
 721     {
 722       ++p;
 723       path_b = p;
 724       p = strpbrk_or_eos (p, ";?#");
 725       path_e = p;
 726     }
 727   else
 728     {
 729       /* Path is not allowed not to exist. */
 730       path_b = path_e = p;
 731     }
 732
 733   if (*p == ';')
 734     {
 735       ++p;
 736       params_b = p;
 737       p = strpbrk_or_eos (p, "?#");
 738       params_e = p;
 739     }
 740   if (*p == '?')
 741     {
 742       ++p;
 743       query_b = p;
 744       p = strpbrk_or_eos (p, "#");
 745       query_e = p;
 746     }
 747   if (*p == '#')
 748     {
 749       ++p;
 750       fragment_b = p;
 751       p += strlen (p);
 752       fragment_e = p;
 753     }
 754   assert (*p == 0);
 755
 756   if (uname_b != uname_e)
 757     {
 758       /* http://user:pass@host */
 759       /*        ^         ^    */
 760       /*     uname_b   uname_e */
 761       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 762         {
 763           SETERR (error, PE_INVALID_USER_NAME);
 764           return NULL;
 765         }
 766     }
 767
 768   u = (struct url *)xmalloc (sizeof (struct url));
 769   memset (u, 0, sizeof (*u));
 770
 771   u->scheme = scheme;
 772   u->host   = strdupdelim (host_b, host_e);
 773   u->port   = port;
 774   u->user   = user;
 775   u->passwd = passwd;
 776
 777   u->path = strdupdelim (path_b, path_e);
 778   path_modified = path_simplify (u->path);
 779   parse_path (u->path, &u->dir, &u->file);
 780
 781   host_modified = lowercase_str (u->host);
 782
 783   if (params_b)
 784     u->params = strdupdelim (params_b, params_e);
 785   if (query_b)
 786     u->query = strdupdelim (query_b, query_e);
 787   if (fragment_b)
 788     u->fragment = strdupdelim (fragment_b, fragment_e);
 789
 790
 791   if (path_modified || u->fragment || host_modified)
 792     {
 793       /* If path_simplify modified the path, or if a fragment is
 794          present, or if the original host name had caps in it, make
 795          sure that u->url is equivalent to what would be printed by
 796          url_string.  */
 797       u->url = url_string (u, 0);
 798
 799       if (url_encoded != url)
 800         xfree ((char *) url_encoded);
 801     }
 802   else
 803     {
 804       if (url_encoded == url)
 805         u->url    = xstrdup (url);
 806       else
 807         u->url    = url_encoded;
 808     }
 809   url_encoded = NULL;
 810
 811   return u;
 812 }
 813
 814 const char *
 815 url_error (int error_code)
 816 {
 817   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 818   return parse_errors[error_code];
 819 }
 820
 821 static void
 822 parse_path (const char *quoted_path, char **dir, char **file)
 823 {
 824   char *path, *last_slash;
 825
 826   STRDUP_ALLOCA (path, quoted_path);
 827   decode_string (path);
 828
 829   last_slash = strrchr (path, '/');
 830   if (!last_slash)
 831     {
 832       *dir = xstrdup ("");
 833       *file = xstrdup (path);
 834     }
 835   else
 836     {
 837       *dir = strdupdelim (path, last_slash);
 838       *file = xstrdup (last_slash + 1);
 839     }
 840 }
 841
 842 /* Note: URL's "full path" is the path with the query string and
 843    params appended.  The "fragment" (#foo) is intentionally ignored,
 844    but that might be changed.  For example, if the original URL was
 845    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 846    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 847
 848 /* Return the length of the full path, without the terminating
 849    zero.  */
 850
 851 static int
 852 full_path_length (const struct url *url)
 853 {
 854   int len = 0;
 855
 856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 857
 858   FROB (path);
 859   FROB (params);
 860   FROB (query);
 861
 862 #undef FROB
 863
 864   return len;
 865 }
 866
 867 /* Write out the full path. */
 868
 869 static void
 870 full_path_write (const struct url *url, char *where)
 871 {
 872 #define FROB(el, chr) do {                      \
 873   char *f_el = url->el;                         \
 874   if (f_el) {                                   \
 875     int l = strlen (f_el);                      \
 876     *where++ = chr;                             \
 877     memcpy (where, f_el, l);                    \
 878     where += l;                                 \
 879   }                                             \
 880 } while (0)
 881
 882   FROB (path, '/');
 883   FROB (params, ';');
 884   FROB (query, '?');
 885
 886 #undef FROB
 887 }
 888
 889 /* Public function for getting the "full path".  E.g. if u->path is
 890    "foo/bar" and u->query is "param=value", full_path will be
 891    "/foo/bar?param=value". */
 892
 893 char *
 894 url_full_path (const struct url *url)
 895 {
 896   int length = full_path_length (url);
 897   char *full_path = (char *)xmalloc(length + 1);
 898
 899   full_path_write (url, full_path);
 900   full_path[length] = '\0';
 901
 902   return full_path;
 903 }
 904
 905 /* Sync u->path and u->url with u->dir and u->file. */
 906
 907 static void
 908 sync_path (struct url *url)
 909 {
 910   char *newpath;
 911
 912   xfree (url->path);
 913
 914   if (!*url->dir)
 915     {
 916       newpath = xstrdup (url->file);
 917       REENCODE (newpath);
 918     }
 919   else
 920     {
 921       int dirlen = strlen (url->dir);
 922       int filelen = strlen (url->file);
 923
 924       newpath = xmalloc (dirlen + 1 + filelen + 1);
 925       memcpy (newpath, url->dir, dirlen);
 926       newpath[dirlen] = '/';
 927       memcpy (newpath + dirlen + 1, url->file, filelen);
 928       newpath[dirlen + 1 + filelen] = '\0';
 929       REENCODE (newpath);
 930     }
 931
 932   url->path = newpath;
 933
 934   /* Synchronize u->url. */
 935   xfree (url->url);
 936   url->url = url_string (url, 0);
 937 }
 938
 939 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 940    This way we can sync u->path and u->url when they get changed.  */
 941
 942 void
 943 url_set_dir (struct url *url, const char *newdir)
 944 {
 945   xfree (url->dir);
 946   url->dir = xstrdup (newdir);
 947   sync_path (url);
 948 }
 949
 950 void
 951 url_set_file (struct url *url, const char *newfile)
 952 {
 953   xfree (url->file);
 954   url->file = xstrdup (newfile);
 955   sync_path (url);
 956 }
 957
 958 void
 959 url_free (struct url *url)
 960 {
 961   xfree (url->host);
 962   xfree (url->path);
 963   xfree (url->url);
 964
 965   FREE_MAYBE (url->params);
 966   FREE_MAYBE (url->query);
 967   FREE_MAYBE (url->fragment);
 968   FREE_MAYBE (url->user);
 969   FREE_MAYBE (url->passwd);
 970
 971   xfree (url->dir);
 972   xfree (url->file);
 973
 974   xfree (url);
 975 }
 976 \f
 977 struct urlpos *
 978 get_urls_file (const char *file)
 979 {
 980   struct file_memory *fm;
 981   struct urlpos *head, *tail;
 982   const char *text, *text_end;
 983
 984   /* Load the file.  */
 985   fm = read_file (file);
 986   if (!fm)
 987     {
 988       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 989       return NULL;
 990     }
 991   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 992   head = tail = NULL;
 993   text = fm->content;
 994   text_end = fm->content + fm->length;
 995   while (text < text_end)
 996     {
 997       const char *line_beg = text;
 998       const char *line_end = memchr (text, '\n', text_end - text);
 999       if (!line_end)
1000         line_end = text_end;
1001       else
1002         ++line_end;
1003       text = line_end;
1004       while (line_beg < line_end
1005              && ISSPACE (*line_beg))
1006         ++line_beg;
1007       while (line_end > line_beg + 1
1008              && ISSPACE (*(line_end - 1)))
1009         --line_end;
1010       if (line_end > line_beg)
1011         {
1012           int up_error_code;
1013           char *url_text;
1014           struct urlpos *entry;
1015           struct url *url;
1016
1017           /* We must copy the URL to a zero-terminated string.  *sigh*.  */
1018           url_text = strdupdelim (line_beg, line_end);
1019           url = url_parse (url_text, &up_error_code);
1020           if (!url)
1021             {
1022               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1023                          file, url_text, url_error (up_error_code));
1024               xfree (url_text);
1025               continue;
1026             }
1027           xfree (url_text);
1028
1029           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1030           memset (entry, 0, sizeof (*entry));
1031           entry->next = NULL;
1032           entry->url = url;
1033
1034           if (!head)
1035             head = entry;
1036           else
1037             tail->next = entry;
1038           tail = entry;
1039         }
1040     }
1041   read_file_free (fm);
1042   return head;
1043 }
1044 \f
1045 /* Free the linked list of urlpos.  */
1046 void
1047 free_urlpos (struct urlpos *l)
1048 {
1049   while (l)
1050     {
1051       struct urlpos *next = l->next;
1052       if (l->url)
1053         url_free (l->url);
1054       FREE_MAYBE (l->local_name);
1055       xfree (l);
1056       l = next;
1057     }
1058 }
1059
1060 /* Rotate FNAME opt.backups times */
1061 void
1062 rotate_backups(const char *fname)
1063 {
1064   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1065   char *from = (char *)alloca (maxlen);
1066   char *to = (char *)alloca (maxlen);
1067   struct stat sb;
1068   int i;
1069
1070   if (stat (fname, &sb) == 0)
1071     if (S_ISREG (sb.st_mode) == 0)
1072       return;
1073
1074   for (i = opt.backups; i > 1; i--)
1075     {
1076       sprintf (from, "%s.%d", fname, i - 1);
1077       sprintf (to, "%s.%d", fname, i);
1078       /* #### This will fail on machines without the rename() system
1079          call.  */
1080       rename (from, to);
1081     }
1082
1083   sprintf (to, "%s.%d", fname, 1);
1084   rename(fname, to);
1085 }
1086
1087 /* Create all the necessary directories for PATH (a file).  Calls
1088    mkdirhier() internally.  */
1089 int
1090 mkalldirs (const char *path)
1091 {
1092   const char *p;
1093   char *t;
1094   struct stat st;
1095   int res;
1096
1097   p = path + strlen (path);
1098   for (; *p != '/' && p != path; p--);
1099   /* Don't create if it's just a file.  */
1100   if ((p == path) && (*p != '/'))
1101     return 0;
1102   t = strdupdelim (path, p);
1103   /* Check whether the directory exists.  */
1104   if ((stat (t, &st) == 0))
1105     {
1106       if (S_ISDIR (st.st_mode))
1107         {
1108           xfree (t);
1109           return 0;
1110         }
1111       else
1112         {
1113           /* If the dir exists as a file name, remove it first.  This
1114              is *only* for Wget to work with buggy old CERN http
1115              servers.  Here is the scenario: When Wget tries to
1116              retrieve a directory without a slash, e.g.
1117              http://foo/bar (bar being a directory), CERN server will
1118              not redirect it too http://foo/bar/ -- it will generate a
1119              directory listing containing links to bar/file1,
1120              bar/file2, etc.  Wget will lose because it saves this
1121              HTML listing to a file `bar', so it cannot create the
1122              directory.  To work around this, if the file of the same
1123              name exists, we just remove it and create the directory
1124              anyway.  */
1125           DEBUGP (("Removing %s because of directory danger!\n", t));
1126           unlink (t);
1127         }
1128     }
1129   res = make_directory (t);
1130   if (res != 0)
1131     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1132   xfree (t);
1133   return res;
1134 }
1135
1136 static int
1137 count_slashes (const char *s)
1138 {
1139   int i = 0;
1140   while (*s)
1141     if (*s++ == '/')
1142       ++i;
1143   return i;
1144 }
1145
1146 /* Return the path name of the URL-equivalent file name, with a
1147    remote-like structure of directories.  */
1148 static char *
1149 mkstruct (const struct url *u)
1150 {
1151   char *dir, *dir_preencoding;
1152   char *file, *res, *dirpref;
1153   char *query = u->query && *u->query ? u->query : NULL;
1154   int l;
1155
1156   if (opt.cut_dirs)
1157     {
1158       char *ptr = u->dir + (*u->dir == '/');
1159       int slash_count = 1 + count_slashes (ptr);
1160       int cut = MINVAL (opt.cut_dirs, slash_count);
1161       for (; cut && *ptr; ptr++)
1162         if (*ptr == '/')
1163           --cut;
1164       STRDUP_ALLOCA (dir, ptr);
1165     }
1166   else
1167     dir = u->dir + (*u->dir == '/');
1168
1169   /* Check for the true name (or at least a consistent name for saving
1170      to directory) of HOST, reusing the hlist if possible.  */
1171   if (opt.add_hostdir)
1172     {
1173       /* Add dir_prefix and hostname (if required) to the beginning of
1174          dir.  */
1175       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1176                                 + strlen (u->host)
1177                                 + 1 + numdigit (u->port)
1178                                 + 1);
1179       if (!DOTP (opt.dir_prefix))
1180         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1181       else
1182         strcpy (dirpref, u->host);
1183
1184       if (u->port != scheme_default_port (u->scheme))
1185         {
1186           int len = strlen (dirpref);
1187           dirpref[len] = ':';
1188           long_to_string (dirpref + len + 1, u->port);
1189         }
1190     }
1191   else                          /* not add_hostdir */
1192     {
1193       if (!DOTP (opt.dir_prefix))
1194         dirpref = opt.dir_prefix;
1195       else
1196         dirpref = "";
1197     }
1198
1199   /* If there is a prefix, prepend it.  */
1200   if (*dirpref)
1201     {
1202       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1203       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1204       dir = newdir;
1205     }
1206
1207   dir_preencoding = dir;
1208   dir = reencode_string (dir_preencoding);
1209
1210   l = strlen (dir);
1211   if (l && dir[l - 1] == '/')
1212     dir[l - 1] = '\0';
1213
1214   if (!*u->file)
1215     file = "index.html";
1216   else
1217     file = u->file;
1218
1219   /* Finally, construct the full name.  */
1220   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1221                          + (query ? (1 + strlen (query)) : 0)
1222                          + 1);
1223   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1224   if (query)
1225     {
1226       strcat (res, "?");
1227       strcat (res, query);
1228     }
1229   if (dir != dir_preencoding)
1230     xfree (dir);
1231   return res;
1232 }
1233
1234 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1235    an escaped query string.  The trick is to make sure that unsafe
1236    characters in BASE are escaped, and that slashes in QUERY are also
1237    escaped.  */
1238
1239 static char *
1240 compose_file_name (char *base, char *query)
1241 {
1242   char result[256];
1243   char *from;
1244   char *to = result;
1245
1246   /* Copy BASE to RESULT and encode all unsafe characters.  */
1247   from = base;
1248   while (*from && to - result < sizeof (result))
1249     {
1250       if (UNSAFE_CHAR (*from))
1251         {
1252           unsigned char c = *from++;
1253           *to++ = '%';
1254           *to++ = XDIGIT_TO_XCHAR (c >> 4);
1255           *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1256         }
1257       else
1258         *to++ = *from++;
1259     }
1260
1261   if (query && to - result < sizeof (result))
1262     {
1263       *to++ = '?';
1264
1265       /* Copy QUERY to RESULT and encode all '/' characters. */
1266       from = query;
1267       while (*from && to - result < sizeof (result))
1268         {
1269           if (*from == '/')
1270             {
1271               *to++ = '%';
1272               *to++ = '2';
1273               *to++ = 'F';
1274               ++from;
1275             }
1276           else
1277             *to++ = *from++;
1278         }
1279     }
1280
1281   if (to - result < sizeof (result))
1282     *to = '\0';
1283   else
1284     /* Truncate input which is too long, presumably due to a huge
1285        query string.  */
1286     result[sizeof (result) - 1] = '\0';
1287
1288   return xstrdup (result);
1289 }
1290
1291 /* Create a unique filename, corresponding to a given URL.  Calls
1292    mkstruct if necessary.  Does *not* actually create any directories.  */
1293 char *
1294 url_filename (const struct url *u)
1295 {
1296   char *file, *name;
1297   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1298
1299   if (opt.dirstruct)
1300     {
1301       file = mkstruct (u);
1302       have_prefix = 1;
1303     }
1304   else
1305     {
1306       char *base = *u->file ? u->file : "index.html";
1307       char *query = u->query && *u->query ? u->query : NULL;
1308       file = compose_file_name (base, query);
1309     }
1310
1311   if (!have_prefix)
1312     {
1313       /* Check whether the prefix directory is something other than "."
1314          before prepending it.  */
1315       if (!DOTP (opt.dir_prefix))
1316         {
1317           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1318                                          + 1 + strlen (file) + 1);
1319           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1320           xfree (file);
1321           file = nfile;
1322         }
1323     }
1324   /* DOS-ish file systems don't like `%' signs in them; we change it
1325      to `@'.  */
1326 #ifdef WINDOWS
1327   {
1328     char *p = file;
1329     for (p = file; *p; p++)
1330       if (*p == '%')
1331         *p = '@';
1332   }
1333 #endif /* WINDOWS */
1334
1335   /* Check the cases in which the unique extensions are not used:
1336      1) Clobbering is turned off (-nc).
1337      2) Retrieval with regetting.
1338      3) Timestamping is used.
1339      4) Hierarchy is built.
1340
1341      The exception is the case when file does exist and is a
1342      directory (actually support for bad httpd-s).  */
1343   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1344       && !(file_exists_p (file) && !file_non_directory_p (file)))
1345     return file;
1346
1347   /* Find a unique name.  */
1348   name = unique_name (file);
1349   xfree (file);
1350   return name;
1351 }
1352
1353 /* Like strlen(), but allow the URL to be ended with '?'.  */
1354 static int
1355 urlpath_length (const char *url)
1356 {
1357   const char *q = strpbrk_or_eos (url, "?;#");
1358   return q - url;
1359 }
1360
1361 /* Find the last occurrence of character C in the range [b, e), or
1362    NULL, if none are present.  This is almost completely equivalent to
1363    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1364    the contents of the string.  */
1365 static const char *
1366 find_last_char (const char *b, const char *e, char c)
1367 {
1368   for (; e > b; e--)
1369     if (*e == c)
1370       return e;
1371   return NULL;
1372 }
1373
1374 /* Resolve the result of "linking" a base URI (BASE) to a
1375    link-specified URI (LINK).
1376
1377    Either of the URIs may be absolute or relative, complete with the
1378    host name, or path only.  This tries to behave "reasonably" in all
1379    foreseeable cases.  It employs little specific knowledge about
1380    schemes or URL-specific stuff -- it just works on strings.
1381
1382    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1383    See uri_merge for a gentler interface to this functionality.
1384
1385    #### This function should handle `./' and `../' so that the evil
1386    path_simplify can go.  */
1387 static char *
1388 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1389 {
1390   char *constr;
1391
1392   if (no_scheme)
1393     {
1394       const char *end = base + urlpath_length (base);
1395
1396       if (!*link)
1397         {
1398           /* Empty LINK points back to BASE, query string and all. */
1399           constr = xstrdup (base);
1400         }
1401       else if (*link == '?')
1402         {
1403           /* LINK points to the same location, but changes the query
1404              string.  Examples: */
1405           /* uri_merge("path",         "?new") -> "path?new"     */
1406           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1407           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1408           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1409           int baselength = end - base;
1410           constr = xmalloc (baselength + linklength + 1);
1411           memcpy (constr, base, baselength);
1412           memcpy (constr + baselength, link, linklength);
1413           constr[baselength + linklength] = '\0';
1414         }
1415       else if (*link == '#')
1416         {
1417           /* uri_merge("path",         "#new") -> "path#new"     */
1418           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1419           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1420           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1421           int baselength;
1422           const char *end1 = strchr (base, '#');
1423           if (!end1)
1424             end1 = base + strlen (base);
1425           baselength = end1 - base;
1426           constr = xmalloc (baselength + linklength + 1);
1427           memcpy (constr, base, baselength);
1428           memcpy (constr + baselength, link, linklength);
1429           constr[baselength + linklength] = '\0';
1430         }
1431       else if (*link == '/')
1432         {
1433           /* LINK is an absolute path: we need to replace everything
1434              after (and including) the FIRST slash with LINK.
1435
1436              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1437              "/qux/xyzzy", our result should be
1438              "http://host/qux/xyzzy".  */
1439           int span;
1440           const char *slash;
1441           const char *start_insert = NULL; /* for gcc to shut up. */
1442           const char *pos = base;
1443           int seen_slash_slash = 0;
1444           /* We're looking for the first slash, but want to ignore
1445              double slash. */
1446         again:
1447           slash = memchr (pos, '/', end - pos);
1448           if (slash && !seen_slash_slash)
1449             if (*(slash + 1) == '/')
1450               {
1451                 pos = slash + 2;
1452                 seen_slash_slash = 1;
1453                 goto again;
1454               }
1455
1456           /* At this point, SLASH is the location of the first / after
1457              "//", or the first slash altogether.  START_INSERT is the
1458              pointer to the location where LINK will be inserted.  When
1459              examining the last two examples, keep in mind that LINK
1460              begins with '/'. */
1461
1462           if (!slash && !seen_slash_slash)
1463             /* example: "foo" */
1464             /*           ^    */
1465             start_insert = base;
1466           else if (!slash && seen_slash_slash)
1467             /* example: "http://foo" */
1468             /*                     ^ */
1469             start_insert = end;
1470           else if (slash && !seen_slash_slash)
1471             /* example: "foo/bar" */
1472             /*           ^        */
1473             start_insert = base;
1474           else if (slash && seen_slash_slash)
1475             /* example: "http://something/" */
1476             /*                           ^  */
1477             start_insert = slash;
1478
1479           span = start_insert - base;
1480           constr = (char *)xmalloc (span + linklength + 1);
1481           if (span)
1482             memcpy (constr, base, span);
1483           if (linklength)
1484             memcpy (constr + span, link, linklength);
1485           constr[span + linklength] = '\0';
1486         }
1487       else
1488         {
1489           /* LINK is a relative URL: we need to replace everything
1490              after last slash (possibly empty) with LINK.
1491
1492              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1493              our result should be "whatever/foo/qux/xyzzy".  */
1494           int need_explicit_slash = 0;
1495           int span;
1496           const char *start_insert;
1497           const char *last_slash = find_last_char (base, end, '/');
1498           if (!last_slash)
1499             {
1500               /* No slash found at all.  Append LINK to what we have,
1501                  but we'll need a slash as a separator.
1502
1503                  Example: if base == "foo" and link == "qux/xyzzy", then
1504                  we cannot just append link to base, because we'd get
1505                  "fooqux/xyzzy", whereas what we want is
1506                  "foo/qux/xyzzy".
1507
1508                  To make sure the / gets inserted, we set
1509                  need_explicit_slash to 1.  We also set start_insert
1510                  to end + 1, so that the length calculations work out
1511                  correctly for one more (slash) character.  Accessing
1512                  that character is fine, since it will be the
1513                  delimiter, '\0' or '?'.  */
1514               /* example: "foo?..." */
1515               /*               ^    ('?' gets changed to '/') */
1516               start_insert = end + 1;
1517               need_explicit_slash = 1;
1518             }
1519           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1520             {
1521               /* example: http://host"  */
1522               /*                      ^ */
1523               start_insert = end + 1;
1524               need_explicit_slash = 1;
1525             }
1526           else
1527             {
1528               /* example: "whatever/foo/bar" */
1529               /*                        ^    */
1530               start_insert = last_slash + 1;
1531             }
1532
1533           span = start_insert - base;
1534           constr = (char *)xmalloc (span + linklength + 1);
1535           if (span)
1536             memcpy (constr, base, span);
1537           if (need_explicit_slash)
1538             constr[span - 1] = '/';
1539           if (linklength)
1540             memcpy (constr + span, link, linklength);
1541           constr[span + linklength] = '\0';
1542         }
1543     }
1544   else /* !no_scheme */
1545     {
1546       constr = strdupdelim (link, link + linklength);
1547     }
1548   return constr;
1549 }
1550
1551 /* Merge BASE with LINK and return the resulting URI.  This is an
1552    interface to uri_merge_1 that assumes that LINK is a
1553    zero-terminated string.  */
1554 char *
1555 uri_merge (const char *base, const char *link)
1556 {
1557   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1558 }
1559 \f
1560 #define APPEND(p, s) do {                       \
1561   int len = strlen (s);                         \
1562   memcpy (p, s, len);                           \
1563   p += len;                                     \
1564 } while (0)
1565
1566 /* Use this instead of password when the actual password is supposed
1567    to be hidden.  We intentionally use a generic string without giving
1568    away the number of characters in the password, like previous
1569    versions did.  */
1570 #define HIDDEN_PASSWORD "*password*"
1571
1572 /* Recreate the URL string from the data in URL.
1573
1574    If HIDE is non-zero (as it is when we're calling this on a URL we
1575    plan to print, but not when calling it to canonicalize a URL for
1576    use within the program), password will be hidden.  Unsafe
1577    characters in the URL will be quoted.  */
1578
1579 char *
1580 url_string (const struct url *url, int hide_password)
1581 {
1582   int size;
1583   char *result, *p;
1584   char *quoted_user = NULL, *quoted_passwd = NULL;
1585
1586   int scheme_port  = supported_schemes[url->scheme].default_port;
1587   char *scheme_str = supported_schemes[url->scheme].leading_string;
1588   int fplen = full_path_length (url);
1589
1590   assert (scheme_str != NULL);
1591
1592   /* Make sure the user name and password are quoted. */
1593   if (url->user)
1594     {
1595       quoted_user = encode_string_maybe (url->user);
1596       if (url->passwd)
1597         {
1598           if (hide_password)
1599             quoted_passwd = HIDDEN_PASSWORD;
1600           else
1601             quoted_passwd = encode_string_maybe (url->passwd);
1602         }
1603     }
1604
1605   size = (strlen (scheme_str)
1606           + strlen (url->host)
1607           + fplen
1608           + 1);
1609   if (url->port != scheme_port)
1610     size += 1 + numdigit (url->port);
1611   if (quoted_user)
1612     {
1613       size += 1 + strlen (quoted_user);
1614       if (quoted_passwd)
1615         size += 1 + strlen (quoted_passwd);
1616     }
1617
1618   p = result = xmalloc (size);
1619
1620   APPEND (p, scheme_str);
1621   if (quoted_user)
1622     {
1623       APPEND (p, quoted_user);
1624       if (quoted_passwd)
1625         {
1626           *p++ = ':';
1627           APPEND (p, quoted_passwd);
1628         }
1629       *p++ = '@';
1630     }
1631
1632   APPEND (p, url->host);
1633   if (url->port != scheme_port)
1634     {
1635       *p++ = ':';
1636       long_to_string (p, url->port);
1637       p += strlen (p);
1638     }
1639
1640   full_path_write (url, p);
1641   p += fplen;
1642   *p++ = '\0';
1643
1644   assert (p - result == size);
1645
1646   if (quoted_user && quoted_user != url->user)
1647     xfree (quoted_user);
1648   if (quoted_passwd && !hide_password
1649       && quoted_passwd != url->passwd)
1650     xfree (quoted_passwd);
1651
1652   return result;
1653 }
1654 \f
1655 /* Returns proxy host address, in accordance with SCHEME.  */
1656 char *
1657 getproxy (enum url_scheme scheme)
1658 {
1659   char *proxy = NULL;
1660   char *rewritten_url;
1661   static char rewritten_storage[1024];
1662
1663   switch (scheme)
1664     {
1665     case SCHEME_HTTP:
1666       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1667       break;
1668 #ifdef HAVE_SSL
1669     case SCHEME_HTTPS:
1670       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1671       break;
1672 #endif
1673     case SCHEME_FTP:
1674       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1675       break;
1676     case SCHEME_INVALID:
1677       break;
1678     }
1679   if (!proxy || !*proxy)
1680     return NULL;
1681
1682   /* Handle shorthands. */
1683   rewritten_url = rewrite_shorthand_url (proxy);
1684   if (rewritten_url)
1685     {
1686       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1687       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1688       proxy = rewritten_storage;
1689     }
1690
1691   return proxy;
1692 }
1693
1694 /* Should a host be accessed through proxy, concerning no_proxy?  */
1695 int
1696 no_proxy_match (const char *host, const char **no_proxy)
1697 {
1698   if (!no_proxy)
1699     return 1;
1700   else
1701     return !sufmatch (no_proxy, host);
1702 }
1703 \f
1704 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1705 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1706                                          const char *));
1707 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1708                                                       const char *, int));
1709 static char *local_quote_string PARAMS ((const char *));
1710
1711 /* Change the links in one HTML file.  LINKS is a list of links in the
1712    document, along with their positions and the desired direction of
1713    the conversion.  */
1714 void
1715 convert_links (const char *file, struct urlpos *links)
1716 {
1717   struct file_memory *fm;
1718   FILE *fp;
1719   const char *p;
1720   downloaded_file_t downloaded_file_return;
1721
1722   struct urlpos *link;
1723   int to_url_count = 0, to_file_count = 0;
1724
1725   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1726
1727   {
1728     /* First we do a "dry run": go through the list L and see whether
1729        any URL needs to be converted in the first place.  If not, just
1730        leave the file alone.  */
1731     int dry_count = 0;
1732     struct urlpos *dry = links;
1733     for (dry = links; dry; dry = dry->next)
1734       if (dry->convert != CO_NOCONVERT)
1735         ++dry_count;
1736     if (!dry_count)
1737       {
1738         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1739         return;
1740       }
1741   }
1742
1743   fm = read_file (file);
1744   if (!fm)
1745     {
1746       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1747                  file, strerror (errno));
1748       return;
1749     }
1750
1751   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1752   if (opt.backup_converted && downloaded_file_return)
1753     write_backup_file (file, downloaded_file_return);
1754
1755   /* Before opening the file for writing, unlink the file.  This is
1756      important if the data in FM is mmaped.  In such case, nulling the
1757      file, which is what fopen() below does, would make us read all
1758      zeroes from the mmaped region.  */
1759   if (unlink (file) < 0 && errno != ENOENT)
1760     {
1761       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1762                  file, strerror (errno));
1763       read_file_free (fm);
1764       return;
1765     }
1766   /* Now open the file for writing.  */
1767   fp = fopen (file, "wb");
1768   if (!fp)
1769     {
1770       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1771                  file, strerror (errno));
1772       read_file_free (fm);
1773       return;
1774     }
1775
1776   /* Here we loop through all the URLs in file, replacing those of
1777      them that are downloaded with relative references.  */
1778   p = fm->content;
1779   for (link = links; link; link = link->next)
1780     {
1781       char *url_start = fm->content + link->pos;
1782
1783       if (link->pos >= fm->length)
1784         {
1785           DEBUGP (("Something strange is going on.  Please investigate."));
1786           break;
1787         }
1788       /* If the URL is not to be converted, skip it.  */
1789       if (link->convert == CO_NOCONVERT)
1790         {
1791           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1792           continue;
1793         }
1794
1795       /* Echo the file contents, up to the offending URL's opening
1796          quote, to the outfile.  */
1797       fwrite (p, 1, url_start - p, fp);
1798       p = url_start;
1799
1800       switch (link->convert)
1801         {
1802         case CO_CONVERT_TO_RELATIVE:
1803           /* Convert absolute URL to relative. */
1804           {
1805             char *newname = construct_relative (file, link->local_name);
1806             char *quoted_newname = local_quote_string (newname);
1807
1808             if (!link->link_refresh_p)
1809               p = replace_attr (p, link->size, fp, quoted_newname);
1810             else
1811               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1812                                              link->refresh_timeout);
1813
1814             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1815                      link->url->url, newname, link->pos, file));
1816             xfree (newname);
1817             xfree (quoted_newname);
1818             ++to_file_count;
1819             break;
1820           }
1821         case CO_CONVERT_TO_COMPLETE:
1822           /* Convert the link to absolute URL. */
1823           {
1824             char *newlink = link->url->url;
1825             char *quoted_newlink = html_quote_string (newlink);
1826
1827             if (!link->link_refresh_p)
1828               p = replace_attr (p, link->size, fp, quoted_newlink);
1829             else
1830               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1831                                              link->refresh_timeout);
1832
1833             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1834                      newlink, link->pos, file));
1835             xfree (quoted_newlink);
1836             ++to_url_count;
1837             break;
1838           }
1839         case CO_NULLIFY_BASE:
1840           /* Change the base href to "". */
1841           p = replace_attr (p, link->size, fp, "");
1842           break;
1843         case CO_NOCONVERT:
1844           abort ();
1845           break;
1846         }
1847     }
1848
1849   /* Output the rest of the file. */
1850   if (p - fm->content < fm->length)
1851     fwrite (p, 1, fm->length - (p - fm->content), fp);
1852   fclose (fp);
1853   read_file_free (fm);
1854
1855   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1856 }
1857
1858 /* Construct and return a malloced copy of the relative link from two
1859    pieces of information: local name S1 of the referring file and
1860    local name S2 of the referred file.
1861
1862    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1863    "jagor.srce.hr/images/news.gif", the function will return
1864    "images/news.gif".
1865
1866    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1867    "fly.cc.fer.hr/images/fly.gif", the function will return
1868    "../images/fly.gif".
1869
1870    Caveats: S1 should not begin with `/', unless S2 also begins with
1871    '/'.  S1 should not contain things like ".." and such --
1872    construct_relative ("fly/ioccc/../index.html",
1873    "fly/images/fly.gif") will fail.  (A workaround is to call
1874    something like path_simplify() on S1).  */
1875 static char *
1876 construct_relative (const char *s1, const char *s2)
1877 {
1878   int i, cnt, sepdirs1;
1879   char *res;
1880
1881   if (*s2 == '/')
1882     return xstrdup (s2);
1883   /* S1 should *not* be absolute, if S2 wasn't.  */
1884   assert (*s1 != '/');
1885   i = cnt = 0;
1886   /* Skip the directories common to both strings.  */
1887   while (1)
1888     {
1889       while (s1[i] && s2[i]
1890              && (s1[i] == s2[i])
1891              && (s1[i] != '/')
1892              && (s2[i] != '/'))
1893         ++i;
1894       if (s1[i] == '/' && s2[i] == '/')
1895         cnt = ++i;
1896       else
1897         break;
1898     }
1899   for (sepdirs1 = 0; s1[i]; i++)
1900     if (s1[i] == '/')
1901       ++sepdirs1;
1902   /* Now, construct the file as of:
1903      - ../ repeated sepdirs1 time
1904      - all the non-mutual directories of S2.  */
1905   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1906   for (i = 0; i < sepdirs1; i++)
1907     memcpy (res + 3 * i, "../", 3);
1908   strcpy (res + 3 * i, s2 + cnt);
1909   return res;
1910 }
1911 \f
1912 static void
1913 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1914 {
1915   /* Rather than just writing over the original .html file with the
1916      converted version, save the former to *.orig.  Note we only do
1917      this for files we've _successfully_ downloaded, so we don't
1918      clobber .orig files sitting around from previous invocations. */
1919
1920   /* Construct the backup filename as the original name plus ".orig". */
1921   size_t         filename_len = strlen(file);
1922   char*          filename_plus_orig_suffix;
1923   boolean        already_wrote_backup_file = FALSE;
1924   slist*         converted_file_ptr;
1925   static slist*  converted_files = NULL;
1926
1927   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1928     {
1929       /* Just write "orig" over "html".  We need to do it this way
1930          because when we're checking to see if we've downloaded the
1931          file before (to see if we can skip downloading it), we don't
1932          know if it's a text/html file.  Therefore we don't know yet
1933          at that stage that -E is going to cause us to tack on
1934          ".html", so we need to compare vs. the original URL plus
1935          ".orig", not the original URL plus ".html.orig". */
1936       filename_plus_orig_suffix = alloca (filename_len + 1);
1937       strcpy(filename_plus_orig_suffix, file);
1938       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1939     }
1940   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1941     {
1942       /* Append ".orig" to the name. */
1943       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1944       strcpy(filename_plus_orig_suffix, file);
1945       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1946     }
1947
1948   /* We can get called twice on the same URL thanks to the
1949      convert_all_links() call in main().  If we write the .orig file
1950      each time in such a case, it'll end up containing the first-pass
1951      conversion, not the original file.  So, see if we've already been
1952      called on this file. */
1953   converted_file_ptr = converted_files;
1954   while (converted_file_ptr != NULL)
1955     if (strcmp(converted_file_ptr->string, file) == 0)
1956       {
1957         already_wrote_backup_file = TRUE;
1958         break;
1959       }
1960     else
1961       converted_file_ptr = converted_file_ptr->next;
1962
1963   if (!already_wrote_backup_file)
1964     {
1965       /* Rename <file> to <file>.orig before former gets written over. */
1966       if (rename(file, filename_plus_orig_suffix) != 0)
1967         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1968                    file, filename_plus_orig_suffix, strerror (errno));
1969
1970       /* Remember that we've already written a .orig backup for this file.
1971          Note that we never free this memory since we need it till the
1972          convert_all_links() call, which is one of the last things the
1973          program does before terminating.  BTW, I'm not sure if it would be
1974          safe to just set 'converted_file_ptr->string' to 'file' below,
1975          rather than making a copy of the string...  Another note is that I
1976          thought I could just add a field to the urlpos structure saying
1977          that we'd written a .orig file for this URL, but that didn't work,
1978          so I had to make this separate list.
1979          -- Dan Harkless <wget@harkless.org>
1980
1981          This [adding a field to the urlpos structure] didn't work
1982          because convert_file() is called from convert_all_links at
1983          the end of the retrieval with a freshly built new urlpos
1984          list.
1985          -- Hrvoje Niksic <hniksic@arsdigita.com>
1986       */
1987       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1988       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1989       converted_file_ptr->next = converted_files;
1990       converted_files = converted_file_ptr;
1991     }
1992 }
1993
1994 static int find_fragment PARAMS ((const char *, int, const char **,
1995                                   const char **));
1996
1997 /* Replace an attribute's original text with NEW_TEXT. */
1998
1999 static const char *
2000 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2001 {
2002   int quote_flag = 0;
2003   char quote_char = '\"';       /* use "..." for quoting, unless the
2004                                    original value is quoted, in which
2005                                    case reuse its quoting char. */
2006   const char *frag_beg, *frag_end;
2007
2008   /* Structure of our string is:
2009        "...old-contents..."
2010        <---    size    --->  (with quotes)
2011      OR:
2012        ...old-contents...
2013        <---    size   -->    (no quotes)   */
2014
2015   if (*p == '\"' || *p == '\'')
2016     {
2017       quote_char = *p;
2018       quote_flag = 1;
2019       ++p;
2020       size -= 2;                /* disregard opening and closing quote */
2021     }
2022   putc (quote_char, fp);
2023   fputs (new_text, fp);
2024
2025   /* Look for fragment identifier, if any. */
2026   if (find_fragment (p, size, &frag_beg, &frag_end))
2027     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2028   p += size;
2029   if (quote_flag)
2030     ++p;
2031   putc (quote_char, fp);
2032
2033   return p;
2034 }
2035
2036 /* The same as REPLACE_ATTR, but used when replacing
2037    <meta http-equiv=refresh content="new_text"> because we need to
2038    append "timeout_value; URL=" before the next_text.  */
2039
2040 static const char *
2041 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2042                            const char *new_text, int timeout)
2043 {
2044   /* "0; URL=..." */
2045   char *new_with_timeout = (char *)alloca (numdigit (timeout)
2046                                            + 6 /* "; URL=" */
2047                                            + strlen (new_text)
2048                                            + 1);
2049   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2050
2051   return replace_attr (p, size, fp, new_with_timeout);
2052 }
2053
2054 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2055    preceded by '&'.  If the character is not found, return zero.  If
2056    the character is found, return 1 and set BP and EP to point to the
2057    beginning and end of the region.
2058
2059    This is used for finding the fragment indentifiers in URLs.  */
2060
2061 static int
2062 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2063 {
2064   const char *end = beg + size;
2065   int saw_amp = 0;
2066   for (; beg < end; beg++)
2067     {
2068       switch (*beg)
2069         {
2070         case '&':
2071           saw_amp = 1;
2072           break;
2073         case '#':
2074           if (!saw_amp)
2075             {
2076               *bp = beg;
2077               *ep = end;
2078               return 1;
2079             }
2080           /* fallthrough */
2081         default:
2082           saw_amp = 0;
2083         }
2084     }
2085   return 0;
2086 }
2087
2088 /* The idea here was to quote ? as %3F to avoid passing part of the
2089    file name as the parameter when browsing the converted file through
2090    HTTP.  However, actually doing that breaks local browsing because
2091    "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2092    Perhaps this should be controlled by an option, but for now I'm
2093    leaving the question marks.
2094
2095    This is the original docstring of this function:
2096
2097    FILE should be a relative link to a local file.  It should be
2098    quoted as HTML because it will be used in HTML context.  However,
2099    we need to quote ? as %3F to avoid passing part of the file name as
2100    the parameter.  (This is not a problem when viewing locally, but is
2101    if the downloaded and converted tree is served by an HTTP
2102    server.)  */
2103
2104 /* Quote string as HTML. */
2105
2106 static char *
2107 local_quote_string (const char *file)
2108 {
2109   return html_quote_string (file);
2110
2111 #if 0
2112   const char *file_sans_qmark;
2113   int qm = count_char (file, '?');
2114
2115   if (qm)
2116     {
2117       const char *from = file;
2118       char *to, *newname;
2119
2120       /* qm * 2 because we replace each question mark with "%3F",
2121          i.e. replace one char with three, hence two more.  */
2122       int fsqlen = strlen (file) + qm * 2;
2123
2124       to = newname = (char *)alloca (fsqlen + 1);
2125       for (; *from; from++)
2126         {
2127           if (*from != '?')
2128             *to++ = *from;
2129           else
2130             {
2131               *to++ = '%';
2132               *to++ = '3';
2133               *to++ = 'F';
2134             }
2135         }
2136       assert (to - newname == fsqlen);
2137       *to = '\0';
2138
2139       file_sans_qmark = newname;
2140     }
2141   else
2142     file_sans_qmark = file;
2143
2144   return html_quote_string (file_sans_qmark);
2145 #endif
2146 }
2147
2148 /* We're storing "modes" of type downloaded_file_t in the hash table.
2149    However, our hash tables only accept pointers for keys and values.
2150    So when we need a pointer, we use the address of a
2151    downloaded_file_t variable of static storage.  */
2152
2153 static downloaded_file_t *
2154 downloaded_mode_to_ptr (downloaded_file_t mode)
2155 {
2156   static downloaded_file_t
2157     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2158     v2 = FILE_DOWNLOADED_NORMALLY,
2159     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2160     v4 = CHECK_FOR_FILE;
2161
2162   switch (mode)
2163     {
2164     case FILE_NOT_ALREADY_DOWNLOADED:
2165       return &v1;
2166     case FILE_DOWNLOADED_NORMALLY:
2167       return &v2;
2168     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2169       return &v3;
2170     case CHECK_FOR_FILE:
2171       return &v4;
2172     }
2173   return NULL;
2174 }
2175
2176 /* This should really be merged with dl_file_url_map and
2177    downloaded_html_files in recur.c.  This was originally a list, but
2178    I changed it to a hash table beause it was actually taking a lot of
2179    time to find things in it.  */
2180
2181 static struct hash_table *downloaded_files_hash;
2182
2183 /* Remembers which files have been downloaded.  In the standard case, should be
2184    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2185    download successfully (i.e. not for ones we have failures on or that we skip
2186    due to -N).
2187
2188    When we've downloaded a file and tacked on a ".html" extension due to -E,
2189    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2190    FILE_DOWNLOADED_NORMALLY.
2191
2192    If you just want to check if a file has been previously added without adding
2193    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2194    with local filenames, not remote URLs. */
2195 downloaded_file_t
2196 downloaded_file (downloaded_file_t mode, const char *file)
2197 {
2198   downloaded_file_t *ptr;
2199
2200   if (mode == CHECK_FOR_FILE)
2201     {
2202       if (!downloaded_files_hash)
2203         return FILE_NOT_ALREADY_DOWNLOADED;
2204       ptr = hash_table_get (downloaded_files_hash, file);
2205       if (!ptr)
2206         return FILE_NOT_ALREADY_DOWNLOADED;
2207       return *ptr;
2208     }
2209
2210   if (!downloaded_files_hash)
2211     downloaded_files_hash = make_string_hash_table (0);
2212
2213   ptr = hash_table_get (downloaded_files_hash, file);
2214   if (ptr)
2215     return *ptr;
2216
2217   ptr = downloaded_mode_to_ptr (mode);
2218   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2219
2220   return FILE_NOT_ALREADY_DOWNLOADED;
2221 }
2222
2223 static int
2224 df_free_mapper (void *key, void *value, void *ignored)
2225 {
2226   xfree (key);
2227   return 0;
2228 }
2229
2230 void
2231 downloaded_files_free (void)
2232 {
2233   if (downloaded_files_hash)
2234     {
2235       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2236       hash_table_destroy (downloaded_files_hash);
2237       downloaded_files_hash = NULL;
2238     }
2239 }