sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  53 struct scheme_data
  54 {
  55   char *leading_string;
  56   int default_port;
  57 };
  58
  59 /* Supported schemes: */
  60 static struct scheme_data supported_schemes[] =
  61 {
  62   { "http://",  DEFAULT_HTTP_PORT },
  63 #ifdef HAVE_SSL
  64   { "https://", DEFAULT_HTTPS_PORT },
  65 #endif
  66   { "ftp://",   DEFAULT_FTP_PORT },
  67
  68   /* SCHEME_INVALID */
  69   { NULL,       -1 }
  70 };
  71
  72 static char *construct_relative PARAMS ((const char *, const char *));
  73
  74 \f
  75 /* Support for encoding and decoding of URL strings.  We determine
  76    whether a character is unsafe through static table lookup.  This
  77    code assumes ASCII character set and 8-bit chars.  */
  78
  79 enum {
  80   urlchr_reserved = 1,
  81   urlchr_unsafe   = 2
  82 };
  83
  84 #define R  urlchr_reserved
  85 #define U  urlchr_unsafe
  86 #define RU R|U
  87
  88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  89
  90 /* rfc1738 reserved chars, preserved from encoding.  */
  91
  92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93
  94 /* rfc1738 unsafe chars, plus some more.  */
  95
  96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  97
  98 const static unsigned char urlchr_table[256] =
  99 {
 100   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 104   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 105   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 106   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 107   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 108  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 109   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 111   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 112   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 113   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 115   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 116
 117   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126 };
 127
 128 /* Decodes the forms %xy in a URL to the character the hexadecimal
 129    code of which is xy.  xy are hexadecimal digits from
 130    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 131    hex-digits or `%' precedes `\0', the sequence is inserted
 132    literally.  */
 133
 134 static void
 135 decode_string (char *s)
 136 {
 137   char *t = s;                  /* t - tortoise */
 138   char *h = s;                  /* h - hare     */
 139
 140   for (; *h; h++, t++)
 141     {
 142       if (*h != '%')
 143         {
 144         copychar:
 145           *t = *h;
 146         }
 147       else
 148         {
 149           /* Do nothing if '%' is not followed by two hex digits. */
 150           if (!*(h + 1) || !*(h + 2)
 151               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 152             goto copychar;
 153           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 154           h += 2;
 155         }
 156     }
 157   *t = '\0';
 158 }
 159
 160 /* Like encode_string, but return S if there are no unsafe chars.  */
 161
 162 static char *
 163 encode_string_maybe (const char *s)
 164 {
 165   const char *p1;
 166   char *p2, *newstr;
 167   int newlen;
 168   int addition = 0;
 169
 170   for (p1 = s; *p1; p1++)
 171     if (UNSAFE_CHAR (*p1))
 172       addition += 2;            /* Two more characters (hex digits) */
 173
 174   if (!addition)
 175     return (char *)s;
 176
 177   newlen = (p1 - s) + addition;
 178   newstr = (char *)xmalloc (newlen + 1);
 179
 180   p1 = s;
 181   p2 = newstr;
 182   while (*p1)
 183     {
 184       if (UNSAFE_CHAR (*p1))
 185         {
 186           unsigned char c = *p1++;
 187           *p2++ = '%';
 188           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 189           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 190         }
 191       else
 192         *p2++ = *p1++;
 193     }
 194   *p2 = '\0';
 195   assert (p2 - newstr == newlen);
 196
 197   return newstr;
 198 }
 199
 200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 201    given string, returning a malloc-ed %XX encoded string.  */
 202
 203 char *
 204 encode_string (const char *s)
 205 {
 206   char *encoded = encode_string_maybe (s);
 207   if (encoded != s)
 208     return encoded;
 209   else
 210     return xstrdup (s);
 211 }
 212
 213 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 214    the old value of PTR is freed and PTR is made to point to the newly
 215    allocated storage.  */
 216
 217 #define ENCODE(ptr) do {                        \
 218   char *e_new = encode_string_maybe (ptr);      \
 219   if (e_new != ptr)                             \
 220     {                                           \
 221       xfree (ptr);                              \
 222       ptr = e_new;                              \
 223     }                                           \
 224 } while (0)
 225 \f
 226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 227
 228 /* Decide whether to encode, decode, or pass through the char at P.
 229    This used to be a macro, but it got a little too convoluted.  */
 230 static inline enum copy_method
 231 decide_copy_method (const char *p)
 232 {
 233   if (*p == '%')
 234     {
 235       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 236         {
 237           /* %xx sequence: decode it, unless it would decode to an
 238              unsafe or a reserved char; in that case, leave it as
 239              is. */
 240           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 241             XCHAR_TO_XDIGIT (*(p + 2));
 242
 243           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 244             return CM_PASSTHROUGH;
 245           else
 246             return CM_DECODE;
 247         }
 248       else
 249         /* Garbled %.. sequence: encode `%'. */
 250         return CM_ENCODE;
 251     }
 252   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 253     return CM_ENCODE;
 254   else
 255     return CM_PASSTHROUGH;
 256 }
 257
 258 /* Translate a %-quoting (but possibly non-conformant) input string S
 259    into a %-quoting (and conformant) output string.  If no characters
 260    are encoded or decoded, return the same string S; otherwise, return
 261    a freshly allocated string with the new contents.
 262
 263    After a URL has been run through this function, the protocols that
 264    use `%' as the quote character can use the resulting string as-is,
 265    while those that don't call decode_string() to get to the intended
 266    data.  This function is also stable: after an input string is
 267    transformed the first time, all further transformations of the
 268    result yield the same result string.
 269
 270    Let's discuss why this function is needed.
 271
 272    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 273    space character would mess up the HTTP request, it needs to be
 274    quoted, like this:
 275
 276        GET /abc%20def HTTP/1.0
 277
 278    So it appears that the unsafe chars need to be quoted, as with
 279    encode_string.  But what if we're requested to download
 280    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 281    the user meant was a literal space, and he was kind enough to quote
 282    it.  In that case, Wget should obviously leave the `%20' as is, and
 283    send the same request as above.  So in this case we may not call
 284    encode_string.
 285
 286    But what if the requested URI is `abc%20 def'?  If we call
 287    encode_string, we end up with `/abc%2520%20def', which is almost
 288    certainly not intended.  If we don't call encode_string, we are
 289    left with the embedded space and cannot send the request.  What the
 290    user meant was for Wget to request `/abc%20%20def', and this is
 291    where reencode_string kicks in.
 292
 293    Wget used to solve this by first decoding %-quotes, and then
 294    encoding all the "unsafe" characters found in the resulting string.
 295    This was wrong because it didn't preserve certain URL special
 296    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 297    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 298    whether we considered `+' reserved (it is).  One of these results
 299    is inevitable because by the second step we would lose information
 300    on whether the `+' was originally encoded or not.  Both results
 301    were wrong because in CGI parameters + means space, while %2B means
 302    literal plus.  reencode_string correctly translates the above to
 303    "a%2B+b", i.e. returns the original string.
 304
 305    This function uses an algorithm proposed by Anon Sricharoenchai:
 306
 307    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 308       hexdigits.
 309
 310    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 311       "+".
 312
 313    ...except that this code conflates the two steps, and decides
 314    whether to encode, decode, or pass through each character in turn.
 315    The function still uses two passes, but their logic is the same --
 316    the first pass exists merely for the sake of allocation.  Another
 317    small difference is that we include `+' to URL_RESERVED.
 318
 319    Anon's test case:
 320
 321    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 322    ->
 323    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 324
 325    Simpler test cases:
 326
 327    "foo bar"         -> "foo%20bar"
 328    "foo%20bar"       -> "foo%20bar"
 329    "foo %20bar"      -> "foo%20%20bar"
 330    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 331    "foo%25%20bar"    -> "foo%25%20bar"
 332    "foo%2%20bar"     -> "foo%252%20bar"
 333    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 334    "foo%2b+bar"      -> "foo%2b+bar"  */
 335
 336 char *
 337 reencode_string (const char *s)
 338 {
 339   const char *p1;
 340   char *newstr, *p2;
 341   int oldlen, newlen;
 342
 343   int encode_count = 0;
 344   int decode_count = 0;
 345
 346   /* First, pass through the string to see if there's anything to do,
 347      and to calculate the new length.  */
 348   for (p1 = s; *p1; p1++)
 349     {
 350       switch (decide_copy_method (p1))
 351         {
 352         case CM_ENCODE:
 353           ++encode_count;
 354           break;
 355         case CM_DECODE:
 356           ++decode_count;
 357           break;
 358         case CM_PASSTHROUGH:
 359           break;
 360         }
 361     }
 362
 363   if (!encode_count && !decode_count)
 364     /* The string is good as it is. */
 365     return (char *)s;           /* C const model sucks. */
 366
 367   oldlen = p1 - s;
 368   /* Each encoding adds two characters (hex digits), while each
 369      decoding removes two characters.  */
 370   newlen = oldlen + 2 * (encode_count - decode_count);
 371   newstr = xmalloc (newlen + 1);
 372
 373   p1 = s;
 374   p2 = newstr;
 375
 376   while (*p1)
 377     {
 378       switch (decide_copy_method (p1))
 379         {
 380         case CM_ENCODE:
 381           {
 382             unsigned char c = *p1++;
 383             *p2++ = '%';
 384             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 385             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 386           }
 387           break;
 388         case CM_DECODE:
 389           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 390                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 391           p1 += 3;              /* skip %xx */
 392           break;
 393         case CM_PASSTHROUGH:
 394           *p2++ = *p1++;
 395         }
 396     }
 397   *p2 = '\0';
 398   assert (p2 - newstr == newlen);
 399   return newstr;
 400 }
 401
 402 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 403    free PTR_VAR and make it point to the new storage.  Obviously,
 404    PTR_VAR needs to be an lvalue.  */
 405
 406 #define REENCODE(ptr_var) do {                  \
 407   char *rf_new = reencode_string (ptr_var);     \
 408   if (rf_new != ptr_var)                        \
 409     {                                           \
 410       xfree (ptr_var);                          \
 411       ptr_var = rf_new;                         \
 412     }                                           \
 413 } while (0)
 414 \f
 415 /* Returns the scheme type if the scheme is supported, or
 416    SCHEME_INVALID if not.  */
 417 enum url_scheme
 418 url_scheme (const char *url)
 419 {
 420   int i;
 421
 422   for (i = 0; supported_schemes[i].leading_string; i++)
 423     if (!strncasecmp (url, supported_schemes[i].leading_string,
 424                       strlen (supported_schemes[i].leading_string)))
 425       return (enum url_scheme)i;
 426   return SCHEME_INVALID;
 427 }
 428
 429 /* Return the number of characters needed to skip the scheme part of
 430    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 431 int
 432 url_skip_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 437      etc. */
 438   while (ISALNUM (*p) || *p == '-' || *p == '+')
 439     ++p;
 440   if (*p != ':')
 441     return 0;
 442   /* Skip ':'. */
 443   ++p;
 444
 445   /* Skip "//" if found. */
 446   if (*p == '/' && *(p + 1) == '/')
 447     p += 2;
 448
 449   return p - url;
 450 }
 451
 452 /* Returns 1 if the URL begins with a scheme (supported or
 453    unsupported), 0 otherwise.  */
 454 int
 455 url_has_scheme (const char *url)
 456 {
 457   const char *p = url;
 458   while (ISALNUM (*p) || *p == '-' || *p == '+')
 459     ++p;
 460   return *p == ':';
 461 }
 462
 463 int
 464 scheme_default_port (enum url_scheme scheme)
 465 {
 466   return supported_schemes[scheme].default_port;
 467 }
 468
 469 /* Skip the username and password, if present here.  The function
 470    should be called *not* with the complete URL, but with the part
 471    right after the scheme.
 472
 473    If no username and password are found, return 0.  */
 474 int
 475 url_skip_uname (const char *url)
 476 {
 477   const char *p;
 478
 479   /* Look for '@' that comes before '/' or '?'. */
 480   p = (const char *)strpbrk (url, "/?@");
 481   if (!p || *p != '@')
 482     return 0;
 483
 484   return p - url + 1;
 485 }
 486
 487 static int
 488 parse_uname (const char *str, int len, char **user, char **passwd)
 489 {
 490   char *colon;
 491
 492   if (len == 0)
 493     /* Empty user name not allowed. */
 494     return 0;
 495
 496   colon = memchr (str, ':', len);
 497   if (colon == str)
 498     /* Empty user name again. */
 499     return 0;
 500
 501   if (colon)
 502     {
 503       int pwlen = len - (colon + 1 - str);
 504       *passwd = xmalloc (pwlen + 1);
 505       memcpy (*passwd, colon + 1, pwlen);
 506       (*passwd)[pwlen] = '\0';
 507       len -= pwlen + 1;
 508     }
 509   else
 510     *passwd = NULL;
 511
 512   *user = xmalloc (len + 1);
 513   memcpy (*user, str, len);
 514   (*user)[len] = '\0';
 515
 516   return 1;
 517 }
 518
 519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 520    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 521
 522    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 523    www.foo.com[:port]            -> http://www.foo.com[:port]
 524
 525    FTP shorthands look like this:
 526
 527    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 528    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 529
 530    If the URL needs not or cannot be rewritten, return NULL.  */
 531 char *
 532 rewrite_shorthand_url (const char *url)
 533 {
 534   const char *p;
 535
 536   if (url_has_scheme (url))
 537     return NULL;
 538
 539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 540      latter Netscape.  */
 541   for (p = url; *p && *p != ':' && *p != '/'; p++)
 542     ;
 543
 544   if (p == url)
 545     return NULL;
 546
 547   if (*p == ':')
 548     {
 549       const char *pp, *path;
 550       char *res;
 551       /* If the characters after the colon and before the next slash
 552          or end of string are all digits, it's HTTP.  */
 553       int digits = 0;
 554       for (pp = p + 1; ISDIGIT (*pp); pp++)
 555         ++digits;
 556       if (digits > 0
 557           && (*pp == '/' || *pp == '\0'))
 558         goto http;
 559
 560       /* Prepend "ftp://" to the entire URL... */
 561       path = p + 1;
 562       res = xmalloc (6 + strlen (url) + 1);
 563       sprintf (res, "ftp://%s", url);
 564       /* ...and replace ':' with '/'. */
 565       res[6 + (p - url)] = '/';
 566       return res;
 567     }
 568   else
 569     {
 570       char *res;
 571     http:
 572       /* Just prepend "http://" to what we have. */
 573       res = xmalloc (7 + strlen (url) + 1);
 574       sprintf (res, "http://%s", url);
 575       return res;
 576     }
 577 }
 578 \f
 579 static void parse_path PARAMS ((const char *, char **, char **));
 580
 581 static char *
 582 strpbrk_or_eos (const char *s, const char *accept)
 583 {
 584   char *p = strpbrk (s, accept);
 585   if (!p)
 586     p = (char *)s + strlen (s);
 587   return p;
 588 }
 589
 590 /* Turn STR into lowercase; return non-zero if a character was
 591    actually changed. */
 592
 593 static int
 594 lowercase_str (char *str)
 595 {
 596   int change = 0;
 597   for (; *str; str++)
 598     if (!ISLOWER (*str))
 599       {
 600         change = 1;
 601         *str = TOLOWER (*str);
 602       }
 603   return change;
 604 }
 605
 606 static char *parse_errors[] = {
 607 #define PE_NO_ERROR            0
 608   "No error",
 609 #define PE_UNRECOGNIZED_SCHEME 1
 610   "Unrecognized scheme",
 611 #define PE_EMPTY_HOST          2
 612   "Empty host",
 613 #define PE_BAD_PORT_NUMBER     3
 614   "Bad port number",
 615 #define PE_INVALID_USER_NAME   4
 616   "Invalid user name"
 617 };
 618
 619 #define SETERR(p, v) do {                       \
 620   if (p)                                        \
 621     *(p) = (v);                                 \
 622 } while (0)
 623
 624 /* Parse a URL.
 625
 626    Return a new struct url if successful, NULL on error.  In case of
 627    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 628    error code. */
 629 struct url *
 630 url_parse (const char *url, int *error)
 631 {
 632   struct url *u;
 633   const char *p;
 634   int path_modified, host_modified;
 635
 636   enum url_scheme scheme;
 637
 638   const char *uname_b,     *uname_e;
 639   const char *host_b,      *host_e;
 640   const char *path_b,      *path_e;
 641   const char *params_b,    *params_e;
 642   const char *query_b,     *query_e;
 643   const char *fragment_b,  *fragment_e;
 644
 645   int port;
 646   char *user = NULL, *passwd = NULL;
 647
 648   char *url_encoded;
 649
 650   scheme = url_scheme (url);
 651   if (scheme == SCHEME_INVALID)
 652     {
 653       SETERR (error, PE_UNRECOGNIZED_SCHEME);
 654       return NULL;
 655     }
 656
 657   url_encoded = reencode_string (url);
 658   p = url_encoded;
 659
 660   p += strlen (supported_schemes[scheme].leading_string);
 661   uname_b = p;
 662   p += url_skip_uname (p);
 663   uname_e = p;
 664
 665   /* scheme://user:pass@host[:port]... */
 666   /*                    ^              */
 667
 668   /* We attempt to break down the URL into the components path,
 669      params, query, and fragment.  They are ordered like this:
 670
 671        scheme://host[:port][/path][;params][?query][#fragment]  */
 672
 673   params_b   = params_e   = NULL;
 674   query_b    = query_e    = NULL;
 675   fragment_b = fragment_e = NULL;
 676
 677   host_b = p;
 678   p = strpbrk_or_eos (p, ":/;?#");
 679   host_e = p;
 680
 681   if (host_b == host_e)
 682     {
 683       SETERR (error, PE_EMPTY_HOST);
 684       return NULL;
 685     }
 686
 687   port = scheme_default_port (scheme);
 688   if (*p == ':')
 689     {
 690       const char *port_b, *port_e, *pp;
 691
 692       /* scheme://host:port/tralala */
 693       /*              ^             */
 694       ++p;
 695       port_b = p;
 696       p = strpbrk_or_eos (p, "/;?#");
 697       port_e = p;
 698
 699       if (port_b == port_e)
 700         {
 701           /* http://host:/whatever */
 702           /*             ^         */
 703           SETERR (error, PE_BAD_PORT_NUMBER);
 704           return NULL;
 705         }
 706
 707       for (port = 0, pp = port_b; pp < port_e; pp++)
 708         {
 709           if (!ISDIGIT (*pp))
 710             {
 711               /* http://host:12randomgarbage/blah */
 712               /*               ^                  */
 713               SETERR (error, PE_BAD_PORT_NUMBER);
 714               return NULL;
 715             }
 716           port = 10 * port + (*pp - '0');
 717         }
 718     }
 719
 720   if (*p == '/')
 721     {
 722       ++p;
 723       path_b = p;
 724       p = strpbrk_or_eos (p, ";?#");
 725       path_e = p;
 726     }
 727   else
 728     {
 729       /* Path is not allowed not to exist. */
 730       path_b = path_e = p;
 731     }
 732
 733   if (*p == ';')
 734     {
 735       ++p;
 736       params_b = p;
 737       p = strpbrk_or_eos (p, "?#");
 738       params_e = p;
 739     }
 740   if (*p == '?')
 741     {
 742       ++p;
 743       query_b = p;
 744       p = strpbrk_or_eos (p, "#");
 745       query_e = p;
 746     }
 747   if (*p == '#')
 748     {
 749       ++p;
 750       fragment_b = p;
 751       p += strlen (p);
 752       fragment_e = p;
 753     }
 754   assert (*p == 0);
 755
 756   if (uname_b != uname_e)
 757     {
 758       /* http://user:pass@host */
 759       /*        ^         ^    */
 760       /*     uname_b   uname_e */
 761       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 762         {
 763           SETERR (error, PE_INVALID_USER_NAME);
 764           return NULL;
 765         }
 766     }
 767
 768   u = (struct url *)xmalloc (sizeof (struct url));
 769   memset (u, 0, sizeof (*u));
 770
 771   u->scheme = scheme;
 772   u->host   = strdupdelim (host_b, host_e);
 773   u->port   = port;
 774   u->user   = user;
 775   u->passwd = passwd;
 776
 777   u->path = strdupdelim (path_b, path_e);
 778   path_modified = path_simplify (u->path);
 779   parse_path (u->path, &u->dir, &u->file);
 780
 781   host_modified = lowercase_str (u->host);
 782
 783   if (params_b)
 784     u->params = strdupdelim (params_b, params_e);
 785   if (query_b)
 786     u->query = strdupdelim (query_b, query_e);
 787   if (fragment_b)
 788     u->fragment = strdupdelim (fragment_b, fragment_e);
 789
 790
 791   if (path_modified || u->fragment || host_modified)
 792     {
 793       /* If path_simplify modified the path, or if a fragment is
 794          present, or if the original host name had caps in it, make
 795          sure that u->url is equivalent to what would be printed by
 796          url_string.  */
 797       u->url = url_string (u, 0);
 798
 799       if (url_encoded != url)
 800         xfree ((char *) url_encoded);
 801     }
 802   else
 803     {
 804       if (url_encoded == url)
 805         u->url    = xstrdup (url);
 806       else
 807         u->url    = url_encoded;
 808     }
 809   url_encoded = NULL;
 810
 811   return u;
 812 }
 813
 814 const char *
 815 url_error (int error_code)
 816 {
 817   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 818   return parse_errors[error_code];
 819 }
 820
 821 static void
 822 parse_path (const char *quoted_path, char **dir, char **file)
 823 {
 824   char *path, *last_slash;
 825
 826   STRDUP_ALLOCA (path, quoted_path);
 827   decode_string (path);
 828
 829   last_slash = strrchr (path, '/');
 830   if (!last_slash)
 831     {
 832       *dir = xstrdup ("");
 833       *file = xstrdup (path);
 834     }
 835   else
 836     {
 837       *dir = strdupdelim (path, last_slash);
 838       *file = xstrdup (last_slash + 1);
 839     }
 840 }
 841
 842 /* Note: URL's "full path" is the path with the query string and
 843    params appended.  The "fragment" (#foo) is intentionally ignored,
 844    but that might be changed.  For example, if the original URL was
 845    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 846    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 847
 848 /* Return the length of the full path, without the terminating
 849    zero.  */
 850
 851 static int
 852 full_path_length (const struct url *url)
 853 {
 854   int len = 0;
 855
 856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 857
 858   FROB (path);
 859   FROB (params);
 860   FROB (query);
 861
 862 #undef FROB
 863
 864   return len;
 865 }
 866
 867 /* Write out the full path. */
 868
 869 static void
 870 full_path_write (const struct url *url, char *where)
 871 {
 872 #define FROB(el, chr) do {                      \
 873   char *f_el = url->el;                         \
 874   if (f_el) {                                   \
 875     int l = strlen (f_el);                      \
 876     *where++ = chr;                             \
 877     memcpy (where, f_el, l);                    \
 878     where += l;                                 \
 879   }                                             \
 880 } while (0)
 881
 882   FROB (path, '/');
 883   FROB (params, ';');
 884   FROB (query, '?');
 885
 886 #undef FROB
 887 }
 888
 889 /* Public function for getting the "full path". */
 890 char *
 891 url_full_path (const struct url *url)
 892 {
 893   int length = full_path_length (url);
 894   char *full_path = (char *)xmalloc(length + 1);
 895
 896   full_path_write (url, full_path);
 897   full_path[length] = '\0';
 898
 899   return full_path;
 900 }
 901
 902 /* Sync u->path and u->url with u->dir and u->file. */
 903 static void
 904 sync_path (struct url *url)
 905 {
 906   char *newpath;
 907
 908   xfree (url->path);
 909
 910   if (!*url->dir)
 911     {
 912       newpath = xstrdup (url->file);
 913       REENCODE (newpath);
 914     }
 915   else
 916     {
 917       int dirlen = strlen (url->dir);
 918       int filelen = strlen (url->file);
 919
 920       newpath = xmalloc (dirlen + 1 + filelen + 1);
 921       memcpy (newpath, url->dir, dirlen);
 922       newpath[dirlen] = '/';
 923       memcpy (newpath + dirlen + 1, url->file, filelen);
 924       newpath[dirlen + 1 + filelen] = '\0';
 925       REENCODE (newpath);
 926     }
 927
 928   url->path = newpath;
 929
 930   /* Synchronize u->url. */
 931   xfree (url->url);
 932   url->url = url_string (url, 0);
 933 }
 934
 935 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 936    This way we can sync u->path and u->url when they get changed.  */
 937
 938 void
 939 url_set_dir (struct url *url, const char *newdir)
 940 {
 941   xfree (url->dir);
 942   url->dir = xstrdup (newdir);
 943   sync_path (url);
 944 }
 945
 946 void
 947 url_set_file (struct url *url, const char *newfile)
 948 {
 949   xfree (url->file);
 950   url->file = xstrdup (newfile);
 951   sync_path (url);
 952 }
 953
 954 void
 955 url_free (struct url *url)
 956 {
 957   xfree (url->host);
 958   xfree (url->path);
 959   xfree (url->url);
 960
 961   FREE_MAYBE (url->params);
 962   FREE_MAYBE (url->query);
 963   FREE_MAYBE (url->fragment);
 964   FREE_MAYBE (url->user);
 965   FREE_MAYBE (url->passwd);
 966
 967   xfree (url->dir);
 968   xfree (url->file);
 969
 970   xfree (url);
 971 }
 972 \f
 973 struct urlpos *
 974 get_urls_file (const char *file)
 975 {
 976   struct file_memory *fm;
 977   struct urlpos *head, *tail;
 978   const char *text, *text_end;
 979
 980   /* Load the file.  */
 981   fm = read_file (file);
 982   if (!fm)
 983     {
 984       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 985       return NULL;
 986     }
 987   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 988   head = tail = NULL;
 989   text = fm->content;
 990   text_end = fm->content + fm->length;
 991   while (text < text_end)
 992     {
 993       const char *line_beg = text;
 994       const char *line_end = memchr (text, '\n', text_end - text);
 995       if (!line_end)
 996         line_end = text_end;
 997       else
 998         ++line_end;
 999       text = line_end;
1000       while (line_beg < line_end
1001              && ISSPACE (*line_beg))
1002         ++line_beg;
1003       while (line_end > line_beg + 1
1004              && ISSPACE (*(line_end - 1)))
1005         --line_end;
1006       if (line_end > line_beg)
1007         {
1008           int up_error_code;
1009           char *url_text;
1010           struct urlpos *entry;
1011           struct url *url;
1012
1013           /* We must copy the URL to a zero-terminated string.  *sigh*.  */
1014           url_text = strdupdelim (line_beg, line_end);
1015           url = url_parse (url_text, &up_error_code);
1016           if (!url)
1017             {
1018               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1019                          file, url_text, url_error (up_error_code));
1020               xfree (url_text);
1021               continue;
1022             }
1023           xfree (url_text);
1024
1025           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1026           memset (entry, 0, sizeof (*entry));
1027           entry->next = NULL;
1028           entry->url = url;
1029
1030           if (!head)
1031             head = entry;
1032           else
1033             tail->next = entry;
1034           tail = entry;
1035         }
1036     }
1037   read_file_free (fm);
1038   return head;
1039 }
1040 \f
1041 /* Free the linked list of urlpos.  */
1042 void
1043 free_urlpos (struct urlpos *l)
1044 {
1045   while (l)
1046     {
1047       struct urlpos *next = l->next;
1048       if (l->url)
1049         url_free (l->url);
1050       FREE_MAYBE (l->local_name);
1051       xfree (l);
1052       l = next;
1053     }
1054 }
1055
1056 /* Rotate FNAME opt.backups times */
1057 void
1058 rotate_backups(const char *fname)
1059 {
1060   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1061   char *from = (char *)alloca (maxlen);
1062   char *to = (char *)alloca (maxlen);
1063   struct stat sb;
1064   int i;
1065
1066   if (stat (fname, &sb) == 0)
1067     if (S_ISREG (sb.st_mode) == 0)
1068       return;
1069
1070   for (i = opt.backups; i > 1; i--)
1071     {
1072       sprintf (from, "%s.%d", fname, i - 1);
1073       sprintf (to, "%s.%d", fname, i);
1074       /* #### This will fail on machines without the rename() system
1075          call.  */
1076       rename (from, to);
1077     }
1078
1079   sprintf (to, "%s.%d", fname, 1);
1080   rename(fname, to);
1081 }
1082
1083 /* Create all the necessary directories for PATH (a file).  Calls
1084    mkdirhier() internally.  */
1085 int
1086 mkalldirs (const char *path)
1087 {
1088   const char *p;
1089   char *t;
1090   struct stat st;
1091   int res;
1092
1093   p = path + strlen (path);
1094   for (; *p != '/' && p != path; p--);
1095   /* Don't create if it's just a file.  */
1096   if ((p == path) && (*p != '/'))
1097     return 0;
1098   t = strdupdelim (path, p);
1099   /* Check whether the directory exists.  */
1100   if ((stat (t, &st) == 0))
1101     {
1102       if (S_ISDIR (st.st_mode))
1103         {
1104           xfree (t);
1105           return 0;
1106         }
1107       else
1108         {
1109           /* If the dir exists as a file name, remove it first.  This
1110              is *only* for Wget to work with buggy old CERN http
1111              servers.  Here is the scenario: When Wget tries to
1112              retrieve a directory without a slash, e.g.
1113              http://foo/bar (bar being a directory), CERN server will
1114              not redirect it too http://foo/bar/ -- it will generate a
1115              directory listing containing links to bar/file1,
1116              bar/file2, etc.  Wget will lose because it saves this
1117              HTML listing to a file `bar', so it cannot create the
1118              directory.  To work around this, if the file of the same
1119              name exists, we just remove it and create the directory
1120              anyway.  */
1121           DEBUGP (("Removing %s because of directory danger!\n", t));
1122           unlink (t);
1123         }
1124     }
1125   res = make_directory (t);
1126   if (res != 0)
1127     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1128   xfree (t);
1129   return res;
1130 }
1131
1132 static int
1133 count_slashes (const char *s)
1134 {
1135   int i = 0;
1136   while (*s)
1137     if (*s++ == '/')
1138       ++i;
1139   return i;
1140 }
1141
1142 /* Return the path name of the URL-equivalent file name, with a
1143    remote-like structure of directories.  */
1144 static char *
1145 mkstruct (const struct url *u)
1146 {
1147   char *dir, *dir_preencoding;
1148   char *file, *res, *dirpref;
1149   char *query = u->query && *u->query ? u->query : NULL;
1150   int l;
1151
1152   if (opt.cut_dirs)
1153     {
1154       char *ptr = u->dir + (*u->dir == '/');
1155       int slash_count = 1 + count_slashes (ptr);
1156       int cut = MINVAL (opt.cut_dirs, slash_count);
1157       for (; cut && *ptr; ptr++)
1158         if (*ptr == '/')
1159           --cut;
1160       STRDUP_ALLOCA (dir, ptr);
1161     }
1162   else
1163     dir = u->dir + (*u->dir == '/');
1164
1165   /* Check for the true name (or at least a consistent name for saving
1166      to directory) of HOST, reusing the hlist if possible.  */
1167   if (opt.add_hostdir)
1168     {
1169       /* Add dir_prefix and hostname (if required) to the beginning of
1170          dir.  */
1171       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1172                                 + strlen (u->host)
1173                                 + 1 + numdigit (u->port)
1174                                 + 1);
1175       if (!DOTP (opt.dir_prefix))
1176         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1177       else
1178         strcpy (dirpref, u->host);
1179
1180       if (u->port != scheme_default_port (u->scheme))
1181         {
1182           int len = strlen (dirpref);
1183           dirpref[len] = ':';
1184           long_to_string (dirpref + len + 1, u->port);
1185         }
1186     }
1187   else                          /* not add_hostdir */
1188     {
1189       if (!DOTP (opt.dir_prefix))
1190         dirpref = opt.dir_prefix;
1191       else
1192         dirpref = "";
1193     }
1194
1195   /* If there is a prefix, prepend it.  */
1196   if (*dirpref)
1197     {
1198       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1199       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1200       dir = newdir;
1201     }
1202
1203   dir_preencoding = dir;
1204   dir = reencode_string (dir_preencoding);
1205
1206   l = strlen (dir);
1207   if (l && dir[l - 1] == '/')
1208     dir[l - 1] = '\0';
1209
1210   if (!*u->file)
1211     file = "index.html";
1212   else
1213     file = u->file;
1214
1215   /* Finally, construct the full name.  */
1216   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1217                          + (query ? (1 + strlen (query)) : 0)
1218                          + 1);
1219   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1220   if (query)
1221     {
1222       strcat (res, "?");
1223       strcat (res, query);
1224     }
1225   if (dir != dir_preencoding)
1226     xfree (dir);
1227   return res;
1228 }
1229
1230 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1231    an escaped query string.  The trick is to make sure that unsafe
1232    characters in BASE are escaped, and that slashes in QUERY are also
1233    escaped.  */
1234
1235 static char *
1236 compose_file_name (char *base, char *query)
1237 {
1238   char result[256];
1239   char *from;
1240   char *to = result;
1241
1242   /* Copy BASE to RESULT and encode all unsafe characters.  */
1243   from = base;
1244   while (*from && to - result < sizeof (result))
1245     {
1246       if (UNSAFE_CHAR (*from))
1247         {
1248           unsigned char c = *from++;
1249           *to++ = '%';
1250           *to++ = XDIGIT_TO_XCHAR (c >> 4);
1251           *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1252         }
1253       else
1254         *to++ = *from++;
1255     }
1256
1257   if (query && to - result < sizeof (result))
1258     {
1259       *to++ = '?';
1260
1261       /* Copy QUERY to RESULT and encode all '/' characters. */
1262       from = query;
1263       while (*from && to - result < sizeof (result))
1264         {
1265           if (*from == '/')
1266             {
1267               *to++ = '%';
1268               *to++ = '2';
1269               *to++ = 'F';
1270               ++from;
1271             }
1272           else
1273             *to++ = *from++;
1274         }
1275     }
1276
1277   if (to - result < sizeof (result))
1278     *to = '\0';
1279   else
1280     /* Truncate input which is too long, presumably due to a huge
1281        query string.  */
1282     result[sizeof (result) - 1] = '\0';
1283
1284   return xstrdup (result);
1285 }
1286
1287 /* Create a unique filename, corresponding to a given URL.  Calls
1288    mkstruct if necessary.  Does *not* actually create any directories.  */
1289 char *
1290 url_filename (const struct url *u)
1291 {
1292   char *file, *name;
1293   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1294
1295   if (opt.dirstruct)
1296     {
1297       file = mkstruct (u);
1298       have_prefix = 1;
1299     }
1300   else
1301     {
1302       char *base = *u->file ? u->file : "index.html";
1303       char *query = u->query && *u->query ? u->query : NULL;
1304       file = compose_file_name (base, query);
1305     }
1306
1307   if (!have_prefix)
1308     {
1309       /* Check whether the prefix directory is something other than "."
1310          before prepending it.  */
1311       if (!DOTP (opt.dir_prefix))
1312         {
1313           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1314                                          + 1 + strlen (file) + 1);
1315           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1316           xfree (file);
1317           file = nfile;
1318         }
1319     }
1320   /* DOS-ish file systems don't like `%' signs in them; we change it
1321      to `@'.  */
1322 #ifdef WINDOWS
1323   {
1324     char *p = file;
1325     for (p = file; *p; p++)
1326       if (*p == '%')
1327         *p = '@';
1328   }
1329 #endif /* WINDOWS */
1330
1331   /* Check the cases in which the unique extensions are not used:
1332      1) Clobbering is turned off (-nc).
1333      2) Retrieval with regetting.
1334      3) Timestamping is used.
1335      4) Hierarchy is built.
1336
1337      The exception is the case when file does exist and is a
1338      directory (actually support for bad httpd-s).  */
1339   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1340       && !(file_exists_p (file) && !file_non_directory_p (file)))
1341     return file;
1342
1343   /* Find a unique name.  */
1344   name = unique_name (file);
1345   xfree (file);
1346   return name;
1347 }
1348
1349 /* Like strlen(), but allow the URL to be ended with '?'.  */
1350 static int
1351 urlpath_length (const char *url)
1352 {
1353   const char *q = strpbrk_or_eos (url, "?;#");
1354   return q - url;
1355 }
1356
1357 /* Find the last occurrence of character C in the range [b, e), or
1358    NULL, if none are present.  This is almost completely equivalent to
1359    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1360    the contents of the string.  */
1361 static const char *
1362 find_last_char (const char *b, const char *e, char c)
1363 {
1364   for (; e > b; e--)
1365     if (*e == c)
1366       return e;
1367   return NULL;
1368 }
1369
1370 /* Resolve the result of "linking" a base URI (BASE) to a
1371    link-specified URI (LINK).
1372
1373    Either of the URIs may be absolute or relative, complete with the
1374    host name, or path only.  This tries to behave "reasonably" in all
1375    foreseeable cases.  It employs little specific knowledge about
1376    schemes or URL-specific stuff -- it just works on strings.
1377
1378    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1379    See uri_merge for a gentler interface to this functionality.
1380
1381    #### This function should handle `./' and `../' so that the evil
1382    path_simplify can go.  */
1383 static char *
1384 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1385 {
1386   char *constr;
1387
1388   if (no_scheme)
1389     {
1390       const char *end = base + urlpath_length (base);
1391
1392       if (!*link)
1393         {
1394           /* Empty LINK points back to BASE, query string and all. */
1395           constr = xstrdup (base);
1396         }
1397       else if (*link == '?')
1398         {
1399           /* LINK points to the same location, but changes the query
1400              string.  Examples: */
1401           /* uri_merge("path",         "?new") -> "path?new"     */
1402           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1403           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1404           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1405           int baselength = end - base;
1406           constr = xmalloc (baselength + linklength + 1);
1407           memcpy (constr, base, baselength);
1408           memcpy (constr + baselength, link, linklength);
1409           constr[baselength + linklength] = '\0';
1410         }
1411       else if (*link == '#')
1412         {
1413           /* uri_merge("path",         "#new") -> "path#new"     */
1414           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1415           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1416           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1417           int baselength;
1418           const char *end1 = strchr (base, '#');
1419           if (!end1)
1420             end1 = base + strlen (base);
1421           baselength = end1 - base;
1422           constr = xmalloc (baselength + linklength + 1);
1423           memcpy (constr, base, baselength);
1424           memcpy (constr + baselength, link, linklength);
1425           constr[baselength + linklength] = '\0';
1426         }
1427       else if (*link == '/')
1428         {
1429           /* LINK is an absolute path: we need to replace everything
1430              after (and including) the FIRST slash with LINK.
1431
1432              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1433              "/qux/xyzzy", our result should be
1434              "http://host/qux/xyzzy".  */
1435           int span;
1436           const char *slash;
1437           const char *start_insert = NULL; /* for gcc to shut up. */
1438           const char *pos = base;
1439           int seen_slash_slash = 0;
1440           /* We're looking for the first slash, but want to ignore
1441              double slash. */
1442         again:
1443           slash = memchr (pos, '/', end - pos);
1444           if (slash && !seen_slash_slash)
1445             if (*(slash + 1) == '/')
1446               {
1447                 pos = slash + 2;
1448                 seen_slash_slash = 1;
1449                 goto again;
1450               }
1451
1452           /* At this point, SLASH is the location of the first / after
1453              "//", or the first slash altogether.  START_INSERT is the
1454              pointer to the location where LINK will be inserted.  When
1455              examining the last two examples, keep in mind that LINK
1456              begins with '/'. */
1457
1458           if (!slash && !seen_slash_slash)
1459             /* example: "foo" */
1460             /*           ^    */
1461             start_insert = base;
1462           else if (!slash && seen_slash_slash)
1463             /* example: "http://foo" */
1464             /*                     ^ */
1465             start_insert = end;
1466           else if (slash && !seen_slash_slash)
1467             /* example: "foo/bar" */
1468             /*           ^        */
1469             start_insert = base;
1470           else if (slash && seen_slash_slash)
1471             /* example: "http://something/" */
1472             /*                           ^  */
1473             start_insert = slash;
1474
1475           span = start_insert - base;
1476           constr = (char *)xmalloc (span + linklength + 1);
1477           if (span)
1478             memcpy (constr, base, span);
1479           if (linklength)
1480             memcpy (constr + span, link, linklength);
1481           constr[span + linklength] = '\0';
1482         }
1483       else
1484         {
1485           /* LINK is a relative URL: we need to replace everything
1486              after last slash (possibly empty) with LINK.
1487
1488              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1489              our result should be "whatever/foo/qux/xyzzy".  */
1490           int need_explicit_slash = 0;
1491           int span;
1492           const char *start_insert;
1493           const char *last_slash = find_last_char (base, end, '/');
1494           if (!last_slash)
1495             {
1496               /* No slash found at all.  Append LINK to what we have,
1497                  but we'll need a slash as a separator.
1498
1499                  Example: if base == "foo" and link == "qux/xyzzy", then
1500                  we cannot just append link to base, because we'd get
1501                  "fooqux/xyzzy", whereas what we want is
1502                  "foo/qux/xyzzy".
1503
1504                  To make sure the / gets inserted, we set
1505                  need_explicit_slash to 1.  We also set start_insert
1506                  to end + 1, so that the length calculations work out
1507                  correctly for one more (slash) character.  Accessing
1508                  that character is fine, since it will be the
1509                  delimiter, '\0' or '?'.  */
1510               /* example: "foo?..." */
1511               /*               ^    ('?' gets changed to '/') */
1512               start_insert = end + 1;
1513               need_explicit_slash = 1;
1514             }
1515           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1516             {
1517               /* example: http://host"  */
1518               /*                      ^ */
1519               start_insert = end + 1;
1520               need_explicit_slash = 1;
1521             }
1522           else
1523             {
1524               /* example: "whatever/foo/bar" */
1525               /*                        ^    */
1526               start_insert = last_slash + 1;
1527             }
1528
1529           span = start_insert - base;
1530           constr = (char *)xmalloc (span + linklength + 1);
1531           if (span)
1532             memcpy (constr, base, span);
1533           if (need_explicit_slash)
1534             constr[span - 1] = '/';
1535           if (linklength)
1536             memcpy (constr + span, link, linklength);
1537           constr[span + linklength] = '\0';
1538         }
1539     }
1540   else /* !no_scheme */
1541     {
1542       constr = strdupdelim (link, link + linklength);
1543     }
1544   return constr;
1545 }
1546
1547 /* Merge BASE with LINK and return the resulting URI.  This is an
1548    interface to uri_merge_1 that assumes that LINK is a
1549    zero-terminated string.  */
1550 char *
1551 uri_merge (const char *base, const char *link)
1552 {
1553   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1554 }
1555 \f
1556 #define APPEND(p, s) do {                       \
1557   int len = strlen (s);                         \
1558   memcpy (p, s, len);                           \
1559   p += len;                                     \
1560 } while (0)
1561
1562 /* Use this instead of password when the actual password is supposed
1563    to be hidden.  We intentionally use a generic string without giving
1564    away the number of characters in the password, like previous
1565    versions did.  */
1566 #define HIDDEN_PASSWORD "*password*"
1567
1568 /* Recreate the URL string from the data in URL.
1569
1570    If HIDE is non-zero (as it is when we're calling this on a URL we
1571    plan to print, but not when calling it to canonicalize a URL for
1572    use within the program), password will be hidden.  Unsafe
1573    characters in the URL will be quoted.  */
1574
1575 char *
1576 url_string (const struct url *url, int hide_password)
1577 {
1578   int size;
1579   char *result, *p;
1580   char *quoted_user = NULL, *quoted_passwd = NULL;
1581
1582   int scheme_port  = supported_schemes[url->scheme].default_port;
1583   char *scheme_str = supported_schemes[url->scheme].leading_string;
1584   int fplen = full_path_length (url);
1585
1586   assert (scheme_str != NULL);
1587
1588   /* Make sure the user name and password are quoted. */
1589   if (url->user)
1590     {
1591       quoted_user = encode_string_maybe (url->user);
1592       if (url->passwd)
1593         {
1594           if (hide_password)
1595             quoted_passwd = HIDDEN_PASSWORD;
1596           else
1597             quoted_passwd = encode_string_maybe (url->passwd);
1598         }
1599     }
1600
1601   size = (strlen (scheme_str)
1602           + strlen (url->host)
1603           + fplen
1604           + 1);
1605   if (url->port != scheme_port)
1606     size += 1 + numdigit (url->port);
1607   if (quoted_user)
1608     {
1609       size += 1 + strlen (quoted_user);
1610       if (quoted_passwd)
1611         size += 1 + strlen (quoted_passwd);
1612     }
1613
1614   p = result = xmalloc (size);
1615
1616   APPEND (p, scheme_str);
1617   if (quoted_user)
1618     {
1619       APPEND (p, quoted_user);
1620       if (quoted_passwd)
1621         {
1622           *p++ = ':';
1623           APPEND (p, quoted_passwd);
1624         }
1625       *p++ = '@';
1626     }
1627
1628   APPEND (p, url->host);
1629   if (url->port != scheme_port)
1630     {
1631       *p++ = ':';
1632       long_to_string (p, url->port);
1633       p += strlen (p);
1634     }
1635
1636   full_path_write (url, p);
1637   p += fplen;
1638   *p++ = '\0';
1639
1640   assert (p - result == size);
1641
1642   if (quoted_user && quoted_user != url->user)
1643     xfree (quoted_user);
1644   if (quoted_passwd && !hide_password
1645       && quoted_passwd != url->passwd)
1646     xfree (quoted_passwd);
1647
1648   return result;
1649 }
1650 \f
1651 /* Returns proxy host address, in accordance with SCHEME.  */
1652 char *
1653 getproxy (enum url_scheme scheme)
1654 {
1655   char *proxy = NULL;
1656   char *rewritten_url;
1657   static char rewritten_storage[1024];
1658
1659   switch (scheme)
1660     {
1661     case SCHEME_HTTP:
1662       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1663       break;
1664 #ifdef HAVE_SSL
1665     case SCHEME_HTTPS:
1666       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1667       break;
1668 #endif
1669     case SCHEME_FTP:
1670       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1671       break;
1672     case SCHEME_INVALID:
1673       break;
1674     }
1675   if (!proxy || !*proxy)
1676     return NULL;
1677
1678   /* Handle shorthands. */
1679   rewritten_url = rewrite_shorthand_url (proxy);
1680   if (rewritten_url)
1681     {
1682       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1683       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1684       proxy = rewritten_storage;
1685     }
1686
1687   return proxy;
1688 }
1689
1690 /* Should a host be accessed through proxy, concerning no_proxy?  */
1691 int
1692 no_proxy_match (const char *host, const char **no_proxy)
1693 {
1694   if (!no_proxy)
1695     return 1;
1696   else
1697     return !sufmatch (no_proxy, host);
1698 }
1699 \f
1700 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1701 static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
1702 static char *local_quote_string PARAMS ((const char *));
1703
1704 /* Change the links in one HTML file.  LINKS is a list of links in the
1705    document, along with their positions and the desired direction of
1706    the conversion.  */
1707 void
1708 convert_links (const char *file, struct urlpos *links)
1709 {
1710   struct file_memory *fm;
1711   FILE *fp;
1712   const char *p;
1713   downloaded_file_t downloaded_file_return;
1714
1715   struct urlpos *link;
1716   int to_url_count = 0, to_file_count = 0;
1717
1718   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1719
1720   {
1721     /* First we do a "dry run": go through the list L and see whether
1722        any URL needs to be converted in the first place.  If not, just
1723        leave the file alone.  */
1724     int dry_count = 0;
1725     struct urlpos *dry = links;
1726     for (dry = links; dry; dry = dry->next)
1727       if (dry->convert != CO_NOCONVERT)
1728         ++dry_count;
1729     if (!dry_count)
1730       {
1731         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1732         return;
1733       }
1734   }
1735
1736   fm = read_file (file);
1737   if (!fm)
1738     {
1739       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1740                  file, strerror (errno));
1741       return;
1742     }
1743
1744   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1745   if (opt.backup_converted && downloaded_file_return)
1746     write_backup_file (file, downloaded_file_return);
1747
1748   /* Before opening the file for writing, unlink the file.  This is
1749      important if the data in FM is mmaped.  In such case, nulling the
1750      file, which is what fopen() below does, would make us read all
1751      zeroes from the mmaped region.  */
1752   if (unlink (file) < 0 && errno != ENOENT)
1753     {
1754       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1755                  file, strerror (errno));
1756       read_file_free (fm);
1757       return;
1758     }
1759   /* Now open the file for writing.  */
1760   fp = fopen (file, "wb");
1761   if (!fp)
1762     {
1763       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1764                  file, strerror (errno));
1765       read_file_free (fm);
1766       return;
1767     }
1768
1769   /* Here we loop through all the URLs in file, replacing those of
1770      them that are downloaded with relative references.  */
1771   p = fm->content;
1772   for (link = links; link; link = link->next)
1773     {
1774       char *url_start = fm->content + link->pos;
1775
1776       if (link->pos >= fm->length)
1777         {
1778           DEBUGP (("Something strange is going on.  Please investigate."));
1779           break;
1780         }
1781       /* If the URL is not to be converted, skip it.  */
1782       if (link->convert == CO_NOCONVERT)
1783         {
1784           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1785           continue;
1786         }
1787
1788       /* Echo the file contents, up to the offending URL's opening
1789          quote, to the outfile.  */
1790       fwrite (p, 1, url_start - p, fp);
1791       p = url_start;
1792
1793       switch (link->convert)
1794         {
1795         case CO_CONVERT_TO_RELATIVE:
1796           /* Convert absolute URL to relative. */
1797           {
1798             char *newname = construct_relative (file, link->local_name);
1799             char *quoted_newname = local_quote_string (newname);
1800             p = replace_attr (p, link->size, fp, quoted_newname);
1801             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1802                      link->url->url, newname, link->pos, file));
1803             xfree (newname);
1804             xfree (quoted_newname);
1805             ++to_file_count;
1806             break;
1807           }
1808         case CO_CONVERT_TO_COMPLETE:
1809           /* Convert the link to absolute URL. */
1810           {
1811             char *newlink = link->url->url;
1812             char *quoted_newlink = html_quote_string (newlink);
1813             p = replace_attr (p, link->size, fp, quoted_newlink);
1814             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1815                      newlink, link->pos, file));
1816             xfree (quoted_newlink);
1817             ++to_url_count;
1818             break;
1819           }
1820         case CO_NULLIFY_BASE:
1821           /* Change the base href to "". */
1822           p = replace_attr (p, link->size, fp, "");
1823           break;
1824         case CO_NOCONVERT:
1825           abort ();
1826           break;
1827         }
1828     }
1829
1830   /* Output the rest of the file. */
1831   if (p - fm->content < fm->length)
1832     fwrite (p, 1, fm->length - (p - fm->content), fp);
1833   fclose (fp);
1834   read_file_free (fm);
1835
1836   logprintf (LOG_VERBOSE,
1837              _("%d-%d\n"), to_file_count, to_url_count);
1838 }
1839
1840 /* Construct and return a malloced copy of the relative link from two
1841    pieces of information: local name S1 of the referring file and
1842    local name S2 of the referred file.
1843
1844    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1845    "jagor.srce.hr/images/news.gif", the function will return
1846    "images/news.gif".
1847
1848    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1849    "fly.cc.fer.hr/images/fly.gif", the function will return
1850    "../images/fly.gif".
1851
1852    Caveats: S1 should not begin with `/', unless S2 also begins with
1853    '/'.  S1 should not contain things like ".." and such --
1854    construct_relative ("fly/ioccc/../index.html",
1855    "fly/images/fly.gif") will fail.  (A workaround is to call
1856    something like path_simplify() on S1).  */
1857 static char *
1858 construct_relative (const char *s1, const char *s2)
1859 {
1860   int i, cnt, sepdirs1;
1861   char *res;
1862
1863   if (*s2 == '/')
1864     return xstrdup (s2);
1865   /* S1 should *not* be absolute, if S2 wasn't.  */
1866   assert (*s1 != '/');
1867   i = cnt = 0;
1868   /* Skip the directories common to both strings.  */
1869   while (1)
1870     {
1871       while (s1[i] && s2[i]
1872              && (s1[i] == s2[i])
1873              && (s1[i] != '/')
1874              && (s2[i] != '/'))
1875         ++i;
1876       if (s1[i] == '/' && s2[i] == '/')
1877         cnt = ++i;
1878       else
1879         break;
1880     }
1881   for (sepdirs1 = 0; s1[i]; i++)
1882     if (s1[i] == '/')
1883       ++sepdirs1;
1884   /* Now, construct the file as of:
1885      - ../ repeated sepdirs1 time
1886      - all the non-mutual directories of S2.  */
1887   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1888   for (i = 0; i < sepdirs1; i++)
1889     memcpy (res + 3 * i, "../", 3);
1890   strcpy (res + 3 * i, s2 + cnt);
1891   return res;
1892 }
1893 \f
1894 static void
1895 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1896 {
1897   /* Rather than just writing over the original .html file with the
1898      converted version, save the former to *.orig.  Note we only do
1899      this for files we've _successfully_ downloaded, so we don't
1900      clobber .orig files sitting around from previous invocations. */
1901
1902   /* Construct the backup filename as the original name plus ".orig". */
1903   size_t         filename_len = strlen(file);
1904   char*          filename_plus_orig_suffix;
1905   boolean        already_wrote_backup_file = FALSE;
1906   slist*         converted_file_ptr;
1907   static slist*  converted_files = NULL;
1908
1909   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1910     {
1911       /* Just write "orig" over "html".  We need to do it this way
1912          because when we're checking to see if we've downloaded the
1913          file before (to see if we can skip downloading it), we don't
1914          know if it's a text/html file.  Therefore we don't know yet
1915          at that stage that -E is going to cause us to tack on
1916          ".html", so we need to compare vs. the original URL plus
1917          ".orig", not the original URL plus ".html.orig". */
1918       filename_plus_orig_suffix = alloca (filename_len + 1);
1919       strcpy(filename_plus_orig_suffix, file);
1920       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1921     }
1922   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1923     {
1924       /* Append ".orig" to the name. */
1925       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1926       strcpy(filename_plus_orig_suffix, file);
1927       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1928     }
1929
1930   /* We can get called twice on the same URL thanks to the
1931      convert_all_links() call in main().  If we write the .orig file
1932      each time in such a case, it'll end up containing the first-pass
1933      conversion, not the original file.  So, see if we've already been
1934      called on this file. */
1935   converted_file_ptr = converted_files;
1936   while (converted_file_ptr != NULL)
1937     if (strcmp(converted_file_ptr->string, file) == 0)
1938       {
1939         already_wrote_backup_file = TRUE;
1940         break;
1941       }
1942     else
1943       converted_file_ptr = converted_file_ptr->next;
1944
1945   if (!already_wrote_backup_file)
1946     {
1947       /* Rename <file> to <file>.orig before former gets written over. */
1948       if (rename(file, filename_plus_orig_suffix) != 0)
1949         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1950                    file, filename_plus_orig_suffix, strerror (errno));
1951
1952       /* Remember that we've already written a .orig backup for this file.
1953          Note that we never free this memory since we need it till the
1954          convert_all_links() call, which is one of the last things the
1955          program does before terminating.  BTW, I'm not sure if it would be
1956          safe to just set 'converted_file_ptr->string' to 'file' below,
1957          rather than making a copy of the string...  Another note is that I
1958          thought I could just add a field to the urlpos structure saying
1959          that we'd written a .orig file for this URL, but that didn't work,
1960          so I had to make this separate list.
1961          -- Dan Harkless <wget@harkless.org>
1962
1963          This [adding a field to the urlpos structure] didn't work
1964          because convert_file() is called from convert_all_links at
1965          the end of the retrieval with a freshly built new urlpos
1966          list.
1967          -- Hrvoje Niksic <hniksic@arsdigita.com>
1968       */
1969       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1970       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1971       converted_file_ptr->next = converted_files;
1972       converted_files = converted_file_ptr;
1973     }
1974 }
1975
1976 static int find_fragment PARAMS ((const char *, int, const char **,
1977                                   const char **));
1978
1979 /* Replace an attribute's original text with NEW_TEXT. */
1980
1981 static const char *
1982 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
1983 {
1984   int quote_flag = 0;
1985   char quote_char = '\"';       /* use "..." for quoting, unless the
1986                                    original value is quoted, in which
1987                                    case reuse its quoting char. */
1988   const char *frag_beg, *frag_end;
1989
1990   /* Structure of our string is:
1991        "...old-contents..."
1992        <---    size    --->  (with quotes)
1993      OR:
1994        ...old-contents...
1995        <---    size   -->    (no quotes)   */
1996
1997   if (*p == '\"' || *p == '\'')
1998     {
1999       quote_char = *p;
2000       quote_flag = 1;
2001       ++p;
2002       size -= 2;                /* disregard opening and closing quote */
2003     }
2004   putc (quote_char, fp);
2005   fputs (new_text, fp);
2006
2007   /* Look for fragment identifier, if any. */
2008   if (find_fragment (p, size, &frag_beg, &frag_end))
2009     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2010   p += size;
2011   if (quote_flag)
2012     ++p;
2013   putc (quote_char, fp);
2014
2015   return p;
2016 }
2017
2018 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2019    preceded by '&'.  If the character is not found, return zero.  If
2020    the character is found, return 1 and set BP and EP to point to the
2021    beginning and end of the region.
2022
2023    This is used for finding the fragment indentifiers in URLs.  */
2024
2025 static int
2026 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2027 {
2028   const char *end = beg + size;
2029   int saw_amp = 0;
2030   for (; beg < end; beg++)
2031     {
2032       switch (*beg)
2033         {
2034         case '&':
2035           saw_amp = 1;
2036           break;
2037         case '#':
2038           if (!saw_amp)
2039             {
2040               *bp = beg;
2041               *ep = end;
2042               return 1;
2043             }
2044           /* fallthrough */
2045         default:
2046           saw_amp = 0;
2047         }
2048     }
2049   return 0;
2050 }
2051
2052 /* The idea here was to quote ? as %3F to avoid passing part of the
2053    file name as the parameter when browsing the converted file through
2054    HTTP.  However, actually doing that breaks local browsing because
2055    "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2056    Perhaps this should be controlled by an option, but for now I'm
2057    leaving the question marks.
2058
2059    This is the original docstring of this function:
2060
2061    FILE should be a relative link to a local file.  It should be
2062    quoted as HTML because it will be used in HTML context.  However,
2063    we need to quote ? as %3F to avoid passing part of the file name as
2064    the parameter.  (This is not a problem when viewing locally, but is
2065    if the downloaded and converted tree is served by an HTTP
2066    server.)  */
2067
2068 /* Quote string as HTML. */
2069
2070 static char *
2071 local_quote_string (const char *file)
2072 {
2073   return html_quote_string (file);
2074
2075 #if 0
2076   const char *file_sans_qmark;
2077   int qm = count_char (file, '?');
2078
2079   if (qm)
2080     {
2081       const char *from = file;
2082       char *to, *newname;
2083
2084       /* qm * 2 because we replace each question mark with "%3F",
2085          i.e. replace one char with three, hence two more.  */
2086       int fsqlen = strlen (file) + qm * 2;
2087
2088       to = newname = (char *)alloca (fsqlen + 1);
2089       for (; *from; from++)
2090         {
2091           if (*from != '?')
2092             *to++ = *from;
2093           else
2094             {
2095               *to++ = '%';
2096               *to++ = '3';
2097               *to++ = 'F';
2098             }
2099         }
2100       assert (to - newname == fsqlen);
2101       *to = '\0';
2102
2103       file_sans_qmark = newname;
2104     }
2105   else
2106     file_sans_qmark = file;
2107
2108   return html_quote_string (file_sans_qmark);
2109 #endif
2110 }
2111
2112 /* We're storing "modes" of type downloaded_file_t in the hash table.
2113    However, our hash tables only accept pointers for keys and values.
2114    So when we need a pointer, we use the address of a
2115    downloaded_file_t variable of static storage.  */
2116
2117 static downloaded_file_t *
2118 downloaded_mode_to_ptr (downloaded_file_t mode)
2119 {
2120   static downloaded_file_t
2121     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2122     v2 = FILE_DOWNLOADED_NORMALLY,
2123     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2124     v4 = CHECK_FOR_FILE;
2125
2126   switch (mode)
2127     {
2128     case FILE_NOT_ALREADY_DOWNLOADED:
2129       return &v1;
2130     case FILE_DOWNLOADED_NORMALLY:
2131       return &v2;
2132     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2133       return &v3;
2134     case CHECK_FOR_FILE:
2135       return &v4;
2136     }
2137   return NULL;
2138 }
2139
2140 /* This should really be merged with dl_file_url_map and
2141    downloaded_html_files in recur.c.  This was originally a list, but
2142    I changed it to a hash table beause it was actually taking a lot of
2143    time to find things in it.  */
2144
2145 static struct hash_table *downloaded_files_hash;
2146
2147 /* Remembers which files have been downloaded.  In the standard case, should be
2148    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2149    download successfully (i.e. not for ones we have failures on or that we skip
2150    due to -N).
2151
2152    When we've downloaded a file and tacked on a ".html" extension due to -E,
2153    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2154    FILE_DOWNLOADED_NORMALLY.
2155
2156    If you just want to check if a file has been previously added without adding
2157    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2158    with local filenames, not remote URLs. */
2159 downloaded_file_t
2160 downloaded_file (downloaded_file_t mode, const char *file)
2161 {
2162   downloaded_file_t *ptr;
2163
2164   if (mode == CHECK_FOR_FILE)
2165     {
2166       if (!downloaded_files_hash)
2167         return FILE_NOT_ALREADY_DOWNLOADED;
2168       ptr = hash_table_get (downloaded_files_hash, file);
2169       if (!ptr)
2170         return FILE_NOT_ALREADY_DOWNLOADED;
2171       return *ptr;
2172     }
2173
2174   if (!downloaded_files_hash)
2175     downloaded_files_hash = make_string_hash_table (0);
2176
2177   ptr = hash_table_get (downloaded_files_hash, file);
2178   if (ptr)
2179     return *ptr;
2180
2181   ptr = downloaded_mode_to_ptr (mode);
2182   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2183
2184   return FILE_NOT_ALREADY_DOWNLOADED;
2185 }
2186
2187 static int
2188 df_free_mapper (void *key, void *value, void *ignored)
2189 {
2190   xfree (key);
2191   return 0;
2192 }
2193
2194 void
2195 downloaded_files_free (void)
2196 {
2197   if (downloaded_files_hash)
2198     {
2199       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2200       hash_table_destroy (downloaded_files_hash);
2201       downloaded_files_hash = NULL;
2202     }
2203 }