sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  53 struct scheme_data
  54 {
  55   char *leading_string;
  56   int default_port;
  57 };
  58
  59 /* Supported schemes: */
  60 static struct scheme_data supported_schemes[] =
  61 {
  62   { "http://",  DEFAULT_HTTP_PORT },
  63 #ifdef HAVE_SSL
  64   { "https://", DEFAULT_HTTPS_PORT },
  65 #endif
  66   { "ftp://",   DEFAULT_FTP_PORT },
  67
  68   /* SCHEME_INVALID */
  69   { NULL,       -1 }
  70 };
  71
  72 static char *construct_relative PARAMS ((const char *, const char *));
  73
  74 \f
  75 /* Support for encoding and decoding of URL strings.  We determine
  76    whether a character is unsafe through static table lookup.  This
  77    code assumes ASCII character set and 8-bit chars.  */
  78
  79 enum {
  80   urlchr_reserved = 1,
  81   urlchr_unsafe   = 2
  82 };
  83
  84 #define R  urlchr_reserved
  85 #define U  urlchr_unsafe
  86 #define RU R|U
  87
  88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  89
  90 /* rfc1738 reserved chars, preserved from encoding.  */
  91
  92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93
  94 /* rfc1738 unsafe chars, plus some more.  */
  95
  96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  97
  98 const static unsigned char urlchr_table[256] =
  99 {
 100   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 104   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 105   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 106   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 107   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 108  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 109   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 111   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 112   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 113   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 115   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 116
 117   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126 };
 127
 128 /* Decodes the forms %xy in a URL to the character the hexadecimal
 129    code of which is xy.  xy are hexadecimal digits from
 130    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 131    hex-digits or `%' precedes `\0', the sequence is inserted
 132    literally.  */
 133
 134 static void
 135 decode_string (char *s)
 136 {
 137   char *t = s;                  /* t - tortoise */
 138   char *h = s;                  /* h - hare     */
 139
 140   for (; *h; h++, t++)
 141     {
 142       if (*h != '%')
 143         {
 144         copychar:
 145           *t = *h;
 146         }
 147       else
 148         {
 149           /* Do nothing if '%' is not followed by two hex digits. */
 150           if (!*(h + 1) || !*(h + 2)
 151               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 152             goto copychar;
 153           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 154           h += 2;
 155         }
 156     }
 157   *t = '\0';
 158 }
 159
 160 /* Like encode_string, but return S if there are no unsafe chars.  */
 161
 162 static char *
 163 encode_string_maybe (const char *s)
 164 {
 165   const char *p1;
 166   char *p2, *newstr;
 167   int newlen;
 168   int addition = 0;
 169
 170   for (p1 = s; *p1; p1++)
 171     if (UNSAFE_CHAR (*p1))
 172       addition += 2;            /* Two more characters (hex digits) */
 173
 174   if (!addition)
 175     return (char *)s;
 176
 177   newlen = (p1 - s) + addition;
 178   newstr = (char *)xmalloc (newlen + 1);
 179
 180   p1 = s;
 181   p2 = newstr;
 182   while (*p1)
 183     {
 184       if (UNSAFE_CHAR (*p1))
 185         {
 186           unsigned char c = *p1++;
 187           *p2++ = '%';
 188           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 189           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 190         }
 191       else
 192         *p2++ = *p1++;
 193     }
 194   *p2 = '\0';
 195   assert (p2 - newstr == newlen);
 196
 197   return newstr;
 198 }
 199
 200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 201    given string, returning a malloc-ed %XX encoded string.  */
 202
 203 char *
 204 encode_string (const char *s)
 205 {
 206   char *encoded = encode_string_maybe (s);
 207   if (encoded != s)
 208     return encoded;
 209   else
 210     return xstrdup (s);
 211 }
 212
 213 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 214    the old value of PTR is freed and PTR is made to point to the newly
 215    allocated storage.  */
 216
 217 #define ENCODE(ptr) do {                        \
 218   char *e_new = encode_string_maybe (ptr);      \
 219   if (e_new != ptr)                             \
 220     {                                           \
 221       xfree (ptr);                              \
 222       ptr = e_new;                              \
 223     }                                           \
 224 } while (0)
 225 \f
 226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 227
 228 /* Decide whether to encode, decode, or pass through the char at P.
 229    This used to be a macro, but it got a little too convoluted.  */
 230 static inline enum copy_method
 231 decide_copy_method (const char *p)
 232 {
 233   if (*p == '%')
 234     {
 235       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 236         {
 237           /* %xx sequence: decode it, unless it would decode to an
 238              unsafe or a reserved char; in that case, leave it as
 239              is. */
 240           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 241             XCHAR_TO_XDIGIT (*(p + 2));
 242
 243           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 244             return CM_PASSTHROUGH;
 245           else
 246             return CM_DECODE;
 247         }
 248       else
 249         /* Garbled %.. sequence: encode `%'. */
 250         return CM_ENCODE;
 251     }
 252   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 253     return CM_ENCODE;
 254   else
 255     return CM_PASSTHROUGH;
 256 }
 257
 258 /* Translate a %-quoting (but possibly non-conformant) input string S
 259    into a %-quoting (and conformant) output string.  If no characters
 260    are encoded or decoded, return the same string S; otherwise, return
 261    a freshly allocated string with the new contents.
 262
 263    After a URL has been run through this function, the protocols that
 264    use `%' as the quote character can use the resulting string as-is,
 265    while those that don't call decode_string() to get to the intended
 266    data.  This function is also stable: after an input string is
 267    transformed the first time, all further transformations of the
 268    result yield the same result string.
 269
 270    Let's discuss why this function is needed.
 271
 272    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 273    space character would mess up the HTTP request, it needs to be
 274    quoted, like this:
 275
 276        GET /abc%20def HTTP/1.0
 277
 278    So it appears that the unsafe chars need to be quoted, as with
 279    encode_string.  But what if we're requested to download
 280    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 281    the user meant was a literal space, and he was kind enough to quote
 282    it.  In that case, Wget should obviously leave the `%20' as is, and
 283    send the same request as above.  So in this case we may not call
 284    encode_string.
 285
 286    But what if the requested URI is `abc%20 def'?  If we call
 287    encode_string, we end up with `/abc%2520%20def', which is almost
 288    certainly not intended.  If we don't call encode_string, we are
 289    left with the embedded space and cannot send the request.  What the
 290    user meant was for Wget to request `/abc%20%20def', and this is
 291    where reencode_string kicks in.
 292
 293    Wget used to solve this by first decoding %-quotes, and then
 294    encoding all the "unsafe" characters found in the resulting string.
 295    This was wrong because it didn't preserve certain URL special
 296    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 297    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 298    whether we considered `+' reserved (it is).  One of these results
 299    is inevitable because by the second step we would lose information
 300    on whether the `+' was originally encoded or not.  Both results
 301    were wrong because in CGI parameters + means space, while %2B means
 302    literal plus.  reencode_string correctly translates the above to
 303    "a%2B+b", i.e. returns the original string.
 304
 305    This function uses an algorithm proposed by Anon Sricharoenchai:
 306
 307    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 308       hexdigits.
 309
 310    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 311       "+".
 312
 313    ...except that this code conflates the two steps, and decides
 314    whether to encode, decode, or pass through each character in turn.
 315    The function still uses two passes, but their logic is the same --
 316    the first pass exists merely for the sake of allocation.  Another
 317    small difference is that we include `+' to URL_RESERVED.
 318
 319    Anon's test case:
 320
 321    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 322    ->
 323    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 324
 325    Simpler test cases:
 326
 327    "foo bar"         -> "foo%20bar"
 328    "foo%20bar"       -> "foo%20bar"
 329    "foo %20bar"      -> "foo%20%20bar"
 330    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 331    "foo%25%20bar"    -> "foo%25%20bar"
 332    "foo%2%20bar"     -> "foo%252%20bar"
 333    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 334    "foo%2b+bar"      -> "foo%2b+bar"  */
 335
 336 char *
 337 reencode_string (const char *s)
 338 {
 339   const char *p1;
 340   char *newstr, *p2;
 341   int oldlen, newlen;
 342
 343   int encode_count = 0;
 344   int decode_count = 0;
 345
 346   /* First, pass through the string to see if there's anything to do,
 347      and to calculate the new length.  */
 348   for (p1 = s; *p1; p1++)
 349     {
 350       switch (decide_copy_method (p1))
 351         {
 352         case CM_ENCODE:
 353           ++encode_count;
 354           break;
 355         case CM_DECODE:
 356           ++decode_count;
 357           break;
 358         case CM_PASSTHROUGH:
 359           break;
 360         }
 361     }
 362
 363   if (!encode_count && !decode_count)
 364     /* The string is good as it is. */
 365     return (char *)s;           /* C const model sucks. */
 366
 367   oldlen = p1 - s;
 368   /* Each encoding adds two characters (hex digits), while each
 369      decoding removes two characters.  */
 370   newlen = oldlen + 2 * (encode_count - decode_count);
 371   newstr = xmalloc (newlen + 1);
 372
 373   p1 = s;
 374   p2 = newstr;
 375
 376   while (*p1)
 377     {
 378       switch (decide_copy_method (p1))
 379         {
 380         case CM_ENCODE:
 381           {
 382             unsigned char c = *p1++;
 383             *p2++ = '%';
 384             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 385             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 386           }
 387           break;
 388         case CM_DECODE:
 389           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 390                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 391           p1 += 3;              /* skip %xx */
 392           break;
 393         case CM_PASSTHROUGH:
 394           *p2++ = *p1++;
 395         }
 396     }
 397   *p2 = '\0';
 398   assert (p2 - newstr == newlen);
 399   return newstr;
 400 }
 401
 402 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 403    free PTR_VAR and make it point to the new storage.  Obviously,
 404    PTR_VAR needs to be an lvalue.  */
 405
 406 #define REENCODE(ptr_var) do {                  \
 407   char *rf_new = reencode_string (ptr_var);     \
 408   if (rf_new != ptr_var)                        \
 409     {                                           \
 410       xfree (ptr_var);                          \
 411       ptr_var = rf_new;                         \
 412     }                                           \
 413 } while (0)
 414 \f
 415 /* Returns the scheme type if the scheme is supported, or
 416    SCHEME_INVALID if not.  */
 417 enum url_scheme
 418 url_scheme (const char *url)
 419 {
 420   int i;
 421
 422   for (i = 0; supported_schemes[i].leading_string; i++)
 423     if (!strncasecmp (url, supported_schemes[i].leading_string,
 424                       strlen (supported_schemes[i].leading_string)))
 425       return (enum url_scheme)i;
 426   return SCHEME_INVALID;
 427 }
 428
 429 /* Return the number of characters needed to skip the scheme part of
 430    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 431 int
 432 url_skip_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 437      etc. */
 438   while (ISALNUM (*p) || *p == '-' || *p == '+')
 439     ++p;
 440   if (*p != ':')
 441     return 0;
 442   /* Skip ':'. */
 443   ++p;
 444
 445   /* Skip "//" if found. */
 446   if (*p == '/' && *(p + 1) == '/')
 447     p += 2;
 448
 449   return p - url;
 450 }
 451
 452 /* Returns 1 if the URL begins with a scheme (supported or
 453    unsupported), 0 otherwise.  */
 454 int
 455 url_has_scheme (const char *url)
 456 {
 457   const char *p = url;
 458   while (ISALNUM (*p) || *p == '-' || *p == '+')
 459     ++p;
 460   return *p == ':';
 461 }
 462
 463 int
 464 scheme_default_port (enum url_scheme scheme)
 465 {
 466   return supported_schemes[scheme].default_port;
 467 }
 468
 469 /* Skip the username and password, if present here.  The function
 470    should be called *not* with the complete URL, but with the part
 471    right after the scheme.
 472
 473    If no username and password are found, return 0.  */
 474 int
 475 url_skip_uname (const char *url)
 476 {
 477   const char *p;
 478
 479   /* Look for '@' that comes before '/' or '?'. */
 480   p = (const char *)strpbrk (url, "/?@");
 481   if (!p || *p != '@')
 482     return 0;
 483
 484   return p - url + 1;
 485 }
 486
 487 static int
 488 parse_uname (const char *str, int len, char **user, char **passwd)
 489 {
 490   char *colon;
 491
 492   if (len == 0)
 493     /* Empty user name not allowed. */
 494     return 0;
 495
 496   colon = memchr (str, ':', len);
 497   if (colon == str)
 498     /* Empty user name again. */
 499     return 0;
 500
 501   if (colon)
 502     {
 503       int pwlen = len - (colon + 1 - str);
 504       *passwd = xmalloc (pwlen + 1);
 505       memcpy (*passwd, colon + 1, pwlen);
 506       (*passwd)[pwlen] = '\0';
 507       len -= pwlen + 1;
 508     }
 509   else
 510     *passwd = NULL;
 511
 512   *user = xmalloc (len + 1);
 513   memcpy (*user, str, len);
 514   (*user)[len] = '\0';
 515
 516   return 1;
 517 }
 518
 519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 520    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 521
 522    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 523    www.foo.com[:port]            -> http://www.foo.com[:port]
 524
 525    FTP shorthands look like this:
 526
 527    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 528    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 529
 530    If the URL needs not or cannot be rewritten, return NULL.  */
 531 char *
 532 rewrite_shorthand_url (const char *url)
 533 {
 534   const char *p;
 535
 536   if (url_has_scheme (url))
 537     return NULL;
 538
 539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 540      latter Netscape.  */
 541   for (p = url; *p && *p != ':' && *p != '/'; p++)
 542     ;
 543
 544   if (p == url)
 545     return NULL;
 546
 547   if (*p == ':')
 548     {
 549       const char *pp, *path;
 550       char *res;
 551       /* If the characters after the colon and before the next slash
 552          or end of string are all digits, it's HTTP.  */
 553       int digits = 0;
 554       for (pp = p + 1; ISDIGIT (*pp); pp++)
 555         ++digits;
 556       if (digits > 0
 557           && (*pp == '/' || *pp == '\0'))
 558         goto http;
 559
 560       /* Prepend "ftp://" to the entire URL... */
 561       path = p + 1;
 562       res = xmalloc (6 + strlen (url) + 1);
 563       sprintf (res, "ftp://%s", url);
 564       /* ...and replace ':' with '/'. */
 565       res[6 + (p - url)] = '/';
 566       return res;
 567     }
 568   else
 569     {
 570       char *res;
 571     http:
 572       /* Just prepend "http://" to what we have. */
 573       res = xmalloc (7 + strlen (url) + 1);
 574       sprintf (res, "http://%s", url);
 575       return res;
 576     }
 577 }
 578 \f
 579 static void parse_path PARAMS ((const char *, char **, char **));
 580
 581 static char *
 582 strpbrk_or_eos (const char *s, const char *accept)
 583 {
 584   char *p = strpbrk (s, accept);
 585   if (!p)
 586     p = (char *)s + strlen (s);
 587   return p;
 588 }
 589
 590 /* Turn STR into lowercase; return non-zero if a character was
 591    actually changed. */
 592
 593 static int
 594 lowercase_str (char *str)
 595 {
 596   int change = 0;
 597   for (; *str; str++)
 598     if (!ISLOWER (*str))
 599       {
 600         change = 1;
 601         *str = TOLOWER (*str);
 602       }
 603   return change;
 604 }
 605
 606 static char *parse_errors[] = {
 607 #define PE_NO_ERROR            0
 608   "No error",
 609 #define PE_UNRECOGNIZED_SCHEME 1
 610   "Unrecognized scheme",
 611 #define PE_EMPTY_HOST          2
 612   "Empty host",
 613 #define PE_BAD_PORT_NUMBER     3
 614   "Bad port number",
 615 #define PE_INVALID_USER_NAME   4
 616   "Invalid user name"
 617 };
 618
 619 #define SETERR(p, v) do {                       \
 620   if (p)                                        \
 621     *(p) = (v);                                 \
 622 } while (0)
 623
 624 /* Parse a URL.
 625
 626    Return a new struct url if successful, NULL on error.  In case of
 627    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 628    error code. */
 629 struct url *
 630 url_parse (const char *url, int *error)
 631 {
 632   struct url *u;
 633   const char *p;
 634   int path_modified, host_modified;
 635
 636   enum url_scheme scheme;
 637
 638   const char *uname_b,     *uname_e;
 639   const char *host_b,      *host_e;
 640   const char *path_b,      *path_e;
 641   const char *params_b,    *params_e;
 642   const char *query_b,     *query_e;
 643   const char *fragment_b,  *fragment_e;
 644
 645   int port;
 646   char *user = NULL, *passwd = NULL;
 647
 648   char *url_encoded;
 649
 650   scheme = url_scheme (url);
 651   if (scheme == SCHEME_INVALID)
 652     {
 653       SETERR (error, PE_UNRECOGNIZED_SCHEME);
 654       return NULL;
 655     }
 656
 657   url_encoded = reencode_string (url);
 658   p = url_encoded;
 659
 660   p += strlen (supported_schemes[scheme].leading_string);
 661   uname_b = p;
 662   p += url_skip_uname (p);
 663   uname_e = p;
 664
 665   /* scheme://user:pass@host[:port]... */
 666   /*                    ^              */
 667
 668   /* We attempt to break down the URL into the components path,
 669      params, query, and fragment.  They are ordered like this:
 670
 671        scheme://host[:port][/path][;params][?query][#fragment]  */
 672
 673   params_b   = params_e   = NULL;
 674   query_b    = query_e    = NULL;
 675   fragment_b = fragment_e = NULL;
 676
 677   host_b = p;
 678   p = strpbrk_or_eos (p, ":/;?#");
 679   host_e = p;
 680
 681   if (host_b == host_e)
 682     {
 683       SETERR (error, PE_EMPTY_HOST);
 684       return NULL;
 685     }
 686
 687   port = scheme_default_port (scheme);
 688   if (*p == ':')
 689     {
 690       const char *port_b, *port_e, *pp;
 691
 692       /* scheme://host:port/tralala */
 693       /*              ^             */
 694       ++p;
 695       port_b = p;
 696       p = strpbrk_or_eos (p, "/;?#");
 697       port_e = p;
 698
 699       if (port_b == port_e)
 700         {
 701           /* http://host:/whatever */
 702           /*             ^         */
 703           SETERR (error, PE_BAD_PORT_NUMBER);
 704           return NULL;
 705         }
 706
 707       for (port = 0, pp = port_b; pp < port_e; pp++)
 708         {
 709           if (!ISDIGIT (*pp))
 710             {
 711               /* http://host:12randomgarbage/blah */
 712               /*               ^                  */
 713               SETERR (error, PE_BAD_PORT_NUMBER);
 714               return NULL;
 715             }
 716           port = 10 * port + (*pp - '0');
 717         }
 718     }
 719
 720   if (*p == '/')
 721     {
 722       ++p;
 723       path_b = p;
 724       p = strpbrk_or_eos (p, ";?#");
 725       path_e = p;
 726     }
 727   else
 728     {
 729       /* Path is not allowed not to exist. */
 730       path_b = path_e = p;
 731     }
 732
 733   if (*p == ';')
 734     {
 735       ++p;
 736       params_b = p;
 737       p = strpbrk_or_eos (p, "?#");
 738       params_e = p;
 739     }
 740   if (*p == '?')
 741     {
 742       ++p;
 743       query_b = p;
 744       p = strpbrk_or_eos (p, "#");
 745       query_e = p;
 746     }
 747   if (*p == '#')
 748     {
 749       ++p;
 750       fragment_b = p;
 751       p += strlen (p);
 752       fragment_e = p;
 753     }
 754   assert (*p == 0);
 755
 756   if (uname_b != uname_e)
 757     {
 758       /* http://user:pass@host */
 759       /*        ^         ^    */
 760       /*     uname_b   uname_e */
 761       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 762         {
 763           SETERR (error, PE_INVALID_USER_NAME);
 764           return NULL;
 765         }
 766     }
 767
 768   u = (struct url *)xmalloc (sizeof (struct url));
 769   memset (u, 0, sizeof (*u));
 770
 771   u->scheme = scheme;
 772   u->host   = strdupdelim (host_b, host_e);
 773   u->port   = port;
 774   u->user   = user;
 775   u->passwd = passwd;
 776
 777   u->path = strdupdelim (path_b, path_e);
 778   path_modified = path_simplify (u->path);
 779   parse_path (u->path, &u->dir, &u->file);
 780
 781   host_modified = lowercase_str (u->host);
 782
 783   if (params_b)
 784     u->params = strdupdelim (params_b, params_e);
 785   if (query_b)
 786     u->query = strdupdelim (query_b, query_e);
 787   if (fragment_b)
 788     u->fragment = strdupdelim (fragment_b, fragment_e);
 789
 790
 791   if (path_modified || u->fragment || host_modified)
 792     {
 793       /* If path_simplify modified the path, or if a fragment is
 794          present, or if the original host name had caps in it, make
 795          sure that u->url is equivalent to what would be printed by
 796          url_string.  */
 797       u->url = url_string (u, 0);
 798
 799       if (url_encoded != url)
 800         xfree ((char *) url_encoded);
 801     }
 802   else
 803     {
 804       if (url_encoded == url)
 805         u->url    = xstrdup (url);
 806       else
 807         u->url    = url_encoded;
 808     }
 809   url_encoded = NULL;
 810
 811   return u;
 812 }
 813
 814 const char *
 815 url_error (int error_code)
 816 {
 817   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 818   return parse_errors[error_code];
 819 }
 820
 821 static void
 822 parse_path (const char *quoted_path, char **dir, char **file)
 823 {
 824   char *path, *last_slash;
 825
 826   STRDUP_ALLOCA (path, quoted_path);
 827   decode_string (path);
 828
 829   last_slash = strrchr (path, '/');
 830   if (!last_slash)
 831     {
 832       *dir = xstrdup ("");
 833       *file = xstrdup (path);
 834     }
 835   else
 836     {
 837       *dir = strdupdelim (path, last_slash);
 838       *file = xstrdup (last_slash + 1);
 839     }
 840 }
 841
 842 /* Note: URL's "full path" is the path with the query string and
 843    params appended.  The "fragment" (#foo) is intentionally ignored,
 844    but that might be changed.  For example, if the original URL was
 845    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 846    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 847
 848 /* Return the length of the full path, without the terminating
 849    zero.  */
 850
 851 static int
 852 full_path_length (const struct url *url)
 853 {
 854   int len = 0;
 855
 856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 857
 858   FROB (path);
 859   FROB (params);
 860   FROB (query);
 861
 862 #undef FROB
 863
 864   return len;
 865 }
 866
 867 /* Write out the full path. */
 868
 869 static void
 870 full_path_write (const struct url *url, char *where)
 871 {
 872 #define FROB(el, chr) do {                      \
 873   char *f_el = url->el;                         \
 874   if (f_el) {                                   \
 875     int l = strlen (f_el);                      \
 876     *where++ = chr;                             \
 877     memcpy (where, f_el, l);                    \
 878     where += l;                                 \
 879   }                                             \
 880 } while (0)
 881
 882   FROB (path, '/');
 883   FROB (params, ';');
 884   FROB (query, '?');
 885
 886 #undef FROB
 887 }
 888
 889 /* Public function for getting the "full path".  E.g. if u->path is
 890    "foo/bar" and u->query is "param=value", full_path will be
 891    "/foo/bar?param=value". */
 892
 893 char *
 894 url_full_path (const struct url *url)
 895 {
 896   int length = full_path_length (url);
 897   char *full_path = (char *)xmalloc(length + 1);
 898
 899   full_path_write (url, full_path);
 900   full_path[length] = '\0';
 901
 902   return full_path;
 903 }
 904
 905 /* Sync u->path and u->url with u->dir and u->file. */
 906
 907 static void
 908 sync_path (struct url *url)
 909 {
 910   char *newpath;
 911
 912   xfree (url->path);
 913
 914   if (!*url->dir)
 915     {
 916       newpath = xstrdup (url->file);
 917       REENCODE (newpath);
 918     }
 919   else
 920     {
 921       int dirlen = strlen (url->dir);
 922       int filelen = strlen (url->file);
 923
 924       newpath = xmalloc (dirlen + 1 + filelen + 1);
 925       memcpy (newpath, url->dir, dirlen);
 926       newpath[dirlen] = '/';
 927       memcpy (newpath + dirlen + 1, url->file, filelen);
 928       newpath[dirlen + 1 + filelen] = '\0';
 929       REENCODE (newpath);
 930     }
 931
 932   url->path = newpath;
 933
 934   /* Synchronize u->url. */
 935   xfree (url->url);
 936   url->url = url_string (url, 0);
 937 }
 938
 939 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 940    This way we can sync u->path and u->url when they get changed.  */
 941
 942 void
 943 url_set_dir (struct url *url, const char *newdir)
 944 {
 945   xfree (url->dir);
 946   url->dir = xstrdup (newdir);
 947   sync_path (url);
 948 }
 949
 950 void
 951 url_set_file (struct url *url, const char *newfile)
 952 {
 953   xfree (url->file);
 954   url->file = xstrdup (newfile);
 955   sync_path (url);
 956 }
 957
 958 void
 959 url_free (struct url *url)
 960 {
 961   xfree (url->host);
 962   xfree (url->path);
 963   xfree (url->url);
 964
 965   FREE_MAYBE (url->params);
 966   FREE_MAYBE (url->query);
 967   FREE_MAYBE (url->fragment);
 968   FREE_MAYBE (url->user);
 969   FREE_MAYBE (url->passwd);
 970
 971   xfree (url->dir);
 972   xfree (url->file);
 973
 974   xfree (url);
 975 }
 976 \f
 977 struct urlpos *
 978 get_urls_file (const char *file)
 979 {
 980   struct file_memory *fm;
 981   struct urlpos *head, *tail;
 982   const char *text, *text_end;
 983
 984   /* Load the file.  */
 985   fm = read_file (file);
 986   if (!fm)
 987     {
 988       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 989       return NULL;
 990     }
 991   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 992   head = tail = NULL;
 993   text = fm->content;
 994   text_end = fm->content + fm->length;
 995   while (text < text_end)
 996     {
 997       const char *line_beg = text;
 998       const char *line_end = memchr (text, '\n', text_end - text);
 999       if (!line_end)
1000         line_end = text_end;
1001       else
1002         ++line_end;
1003       text = line_end;
1004       while (line_beg < line_end
1005              && ISSPACE (*line_beg))
1006         ++line_beg;
1007       while (line_end > line_beg + 1
1008              && ISSPACE (*(line_end - 1)))
1009         --line_end;
1010       if (line_end > line_beg)
1011         {
1012           /* URL is in the [line_beg, line_end) region. */
1013
1014           int up_error_code;
1015           char *url_text;
1016           struct urlpos *entry;
1017           struct url *url;
1018
1019           /* We must copy the URL to a zero-terminated string, and we
1020              can't use alloca because we're in a loop.  *sigh*.  */
1021           url_text = strdupdelim (line_beg, line_end);
1022
1023           if (opt.base_href)
1024             {
1025               /* Merge opt.base_href with URL. */
1026               char *merged = uri_merge (opt.base_href, url_text);
1027               xfree (url_text);
1028               url_text = merged;
1029             }
1030
1031           url = url_parse (url_text, &up_error_code);
1032           if (!url)
1033             {
1034               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1035                          file, url_text, url_error (up_error_code));
1036               xfree (url_text);
1037               continue;
1038             }
1039           xfree (url_text);
1040
1041           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1042           memset (entry, 0, sizeof (*entry));
1043           entry->next = NULL;
1044           entry->url = url;
1045
1046           if (!head)
1047             head = entry;
1048           else
1049             tail->next = entry;
1050           tail = entry;
1051         }
1052     }
1053   read_file_free (fm);
1054   return head;
1055 }
1056 \f
1057 /* Free the linked list of urlpos.  */
1058 void
1059 free_urlpos (struct urlpos *l)
1060 {
1061   while (l)
1062     {
1063       struct urlpos *next = l->next;
1064       if (l->url)
1065         url_free (l->url);
1066       FREE_MAYBE (l->local_name);
1067       xfree (l);
1068       l = next;
1069     }
1070 }
1071
1072 /* Rotate FNAME opt.backups times */
1073 void
1074 rotate_backups(const char *fname)
1075 {
1076   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1077   char *from = (char *)alloca (maxlen);
1078   char *to = (char *)alloca (maxlen);
1079   struct stat sb;
1080   int i;
1081
1082   if (stat (fname, &sb) == 0)
1083     if (S_ISREG (sb.st_mode) == 0)
1084       return;
1085
1086   for (i = opt.backups; i > 1; i--)
1087     {
1088       sprintf (from, "%s.%d", fname, i - 1);
1089       sprintf (to, "%s.%d", fname, i);
1090       /* #### This will fail on machines without the rename() system
1091          call.  */
1092       rename (from, to);
1093     }
1094
1095   sprintf (to, "%s.%d", fname, 1);
1096   rename(fname, to);
1097 }
1098
1099 /* Create all the necessary directories for PATH (a file).  Calls
1100    mkdirhier() internally.  */
1101 int
1102 mkalldirs (const char *path)
1103 {
1104   const char *p;
1105   char *t;
1106   struct stat st;
1107   int res;
1108
1109   p = path + strlen (path);
1110   for (; *p != '/' && p != path; p--);
1111   /* Don't create if it's just a file.  */
1112   if ((p == path) && (*p != '/'))
1113     return 0;
1114   t = strdupdelim (path, p);
1115   /* Check whether the directory exists.  */
1116   if ((stat (t, &st) == 0))
1117     {
1118       if (S_ISDIR (st.st_mode))
1119         {
1120           xfree (t);
1121           return 0;
1122         }
1123       else
1124         {
1125           /* If the dir exists as a file name, remove it first.  This
1126              is *only* for Wget to work with buggy old CERN http
1127              servers.  Here is the scenario: When Wget tries to
1128              retrieve a directory without a slash, e.g.
1129              http://foo/bar (bar being a directory), CERN server will
1130              not redirect it too http://foo/bar/ -- it will generate a
1131              directory listing containing links to bar/file1,
1132              bar/file2, etc.  Wget will lose because it saves this
1133              HTML listing to a file `bar', so it cannot create the
1134              directory.  To work around this, if the file of the same
1135              name exists, we just remove it and create the directory
1136              anyway.  */
1137           DEBUGP (("Removing %s because of directory danger!\n", t));
1138           unlink (t);
1139         }
1140     }
1141   res = make_directory (t);
1142   if (res != 0)
1143     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1144   xfree (t);
1145   return res;
1146 }
1147
1148 static int
1149 count_slashes (const char *s)
1150 {
1151   int i = 0;
1152   while (*s)
1153     if (*s++ == '/')
1154       ++i;
1155   return i;
1156 }
1157
1158 /* Return the path name of the URL-equivalent file name, with a
1159    remote-like structure of directories.  */
1160 static char *
1161 mkstruct (const struct url *u)
1162 {
1163   char *dir, *dir_preencoding;
1164   char *file, *res, *dirpref;
1165   char *query = u->query && *u->query ? u->query : NULL;
1166   int l;
1167
1168   if (opt.cut_dirs)
1169     {
1170       char *ptr = u->dir + (*u->dir == '/');
1171       int slash_count = 1 + count_slashes (ptr);
1172       int cut = MINVAL (opt.cut_dirs, slash_count);
1173       for (; cut && *ptr; ptr++)
1174         if (*ptr == '/')
1175           --cut;
1176       STRDUP_ALLOCA (dir, ptr);
1177     }
1178   else
1179     dir = u->dir + (*u->dir == '/');
1180
1181   /* Check for the true name (or at least a consistent name for saving
1182      to directory) of HOST, reusing the hlist if possible.  */
1183   if (opt.add_hostdir)
1184     {
1185       /* Add dir_prefix and hostname (if required) to the beginning of
1186          dir.  */
1187       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1188                                 + strlen (u->host)
1189                                 + 1 + numdigit (u->port)
1190                                 + 1);
1191       if (!DOTP (opt.dir_prefix))
1192         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1193       else
1194         strcpy (dirpref, u->host);
1195
1196       if (u->port != scheme_default_port (u->scheme))
1197         {
1198           int len = strlen (dirpref);
1199           dirpref[len] = ':';
1200           long_to_string (dirpref + len + 1, u->port);
1201         }
1202     }
1203   else                          /* not add_hostdir */
1204     {
1205       if (!DOTP (opt.dir_prefix))
1206         dirpref = opt.dir_prefix;
1207       else
1208         dirpref = "";
1209     }
1210
1211   /* If there is a prefix, prepend it.  */
1212   if (*dirpref)
1213     {
1214       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1215       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1216       dir = newdir;
1217     }
1218
1219   dir_preencoding = dir;
1220   dir = reencode_string (dir_preencoding);
1221
1222   l = strlen (dir);
1223   if (l && dir[l - 1] == '/')
1224     dir[l - 1] = '\0';
1225
1226   if (!*u->file)
1227     file = "index.html";
1228   else
1229     file = u->file;
1230
1231   /* Finally, construct the full name.  */
1232   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1233                          + (query ? (1 + strlen (query)) : 0)
1234                          + 1);
1235   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1236   if (query)
1237     {
1238       strcat (res, "?");
1239       strcat (res, query);
1240     }
1241   if (dir != dir_preencoding)
1242     xfree (dir);
1243   return res;
1244 }
1245
1246 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1247    an escaped query string.  The trick is to make sure that unsafe
1248    characters in BASE are escaped, and that slashes in QUERY are also
1249    escaped.  */
1250
1251 static char *
1252 compose_file_name (char *base, char *query)
1253 {
1254   char result[256];
1255   char *from;
1256   char *to = result;
1257
1258   /* Copy BASE to RESULT and encode all unsafe characters.  */
1259   from = base;
1260   while (*from && to - result < sizeof (result))
1261     {
1262       if (UNSAFE_CHAR (*from))
1263         {
1264           unsigned char c = *from++;
1265           *to++ = '%';
1266           *to++ = XDIGIT_TO_XCHAR (c >> 4);
1267           *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1268         }
1269       else
1270         *to++ = *from++;
1271     }
1272
1273   if (query && to - result < sizeof (result))
1274     {
1275       *to++ = '?';
1276
1277       /* Copy QUERY to RESULT and encode all '/' characters. */
1278       from = query;
1279       while (*from && to - result < sizeof (result))
1280         {
1281           if (*from == '/')
1282             {
1283               *to++ = '%';
1284               *to++ = '2';
1285               *to++ = 'F';
1286               ++from;
1287             }
1288           else
1289             *to++ = *from++;
1290         }
1291     }
1292
1293   if (to - result < sizeof (result))
1294     *to = '\0';
1295   else
1296     /* Truncate input which is too long, presumably due to a huge
1297        query string.  */
1298     result[sizeof (result) - 1] = '\0';
1299
1300   return xstrdup (result);
1301 }
1302
1303 /* Create a unique filename, corresponding to a given URL.  Calls
1304    mkstruct if necessary.  Does *not* actually create any directories.  */
1305 char *
1306 url_filename (const struct url *u)
1307 {
1308   char *file, *name;
1309   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1310
1311   if (opt.dirstruct)
1312     {
1313       file = mkstruct (u);
1314       have_prefix = 1;
1315     }
1316   else
1317     {
1318       char *base = *u->file ? u->file : "index.html";
1319       char *query = u->query && *u->query ? u->query : NULL;
1320       file = compose_file_name (base, query);
1321     }
1322
1323   if (!have_prefix)
1324     {
1325       /* Check whether the prefix directory is something other than "."
1326          before prepending it.  */
1327       if (!DOTP (opt.dir_prefix))
1328         {
1329           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1330                                          + 1 + strlen (file) + 1);
1331           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1332           xfree (file);
1333           file = nfile;
1334         }
1335     }
1336   /* DOS-ish file systems don't like `%' signs in them; we change it
1337      to `@'.  */
1338 #ifdef WINDOWS
1339   {
1340     char *p = file;
1341     for (p = file; *p; p++)
1342       if (*p == '%')
1343         *p = '@';
1344   }
1345 #endif /* WINDOWS */
1346
1347   /* Check the cases in which the unique extensions are not used:
1348      1) Clobbering is turned off (-nc).
1349      2) Retrieval with regetting.
1350      3) Timestamping is used.
1351      4) Hierarchy is built.
1352
1353      The exception is the case when file does exist and is a
1354      directory (actually support for bad httpd-s).  */
1355   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1356       && !(file_exists_p (file) && !file_non_directory_p (file)))
1357     return file;
1358
1359   /* Find a unique name.  */
1360   name = unique_name (file);
1361   xfree (file);
1362   return name;
1363 }
1364
1365 /* Like strlen(), but allow the URL to be ended with '?'.  */
1366 static int
1367 urlpath_length (const char *url)
1368 {
1369   const char *q = strpbrk_or_eos (url, "?;#");
1370   return q - url;
1371 }
1372
1373 /* Find the last occurrence of character C in the range [b, e), or
1374    NULL, if none are present.  This is almost completely equivalent to
1375    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1376    the contents of the string.  */
1377 static const char *
1378 find_last_char (const char *b, const char *e, char c)
1379 {
1380   for (; e > b; e--)
1381     if (*e == c)
1382       return e;
1383   return NULL;
1384 }
1385
1386 /* Resolve the result of "linking" a base URI (BASE) to a
1387    link-specified URI (LINK).
1388
1389    Either of the URIs may be absolute or relative, complete with the
1390    host name, or path only.  This tries to behave "reasonably" in all
1391    foreseeable cases.  It employs little specific knowledge about
1392    schemes or URL-specific stuff -- it just works on strings.
1393
1394    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1395    See uri_merge for a gentler interface to this functionality.
1396
1397    Perhaps this function should handle `./' and `../' so that the evil
1398    path_simplify can go.  */
1399 static char *
1400 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1401 {
1402   char *constr;
1403
1404   if (no_scheme)
1405     {
1406       const char *end = base + urlpath_length (base);
1407
1408       if (!*link)
1409         {
1410           /* Empty LINK points back to BASE, query string and all. */
1411           constr = xstrdup (base);
1412         }
1413       else if (*link == '?')
1414         {
1415           /* LINK points to the same location, but changes the query
1416              string.  Examples: */
1417           /* uri_merge("path",         "?new") -> "path?new"     */
1418           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1419           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1420           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1421           int baselength = end - base;
1422           constr = xmalloc (baselength + linklength + 1);
1423           memcpy (constr, base, baselength);
1424           memcpy (constr + baselength, link, linklength);
1425           constr[baselength + linklength] = '\0';
1426         }
1427       else if (*link == '#')
1428         {
1429           /* uri_merge("path",         "#new") -> "path#new"     */
1430           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1431           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1432           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1433           int baselength;
1434           const char *end1 = strchr (base, '#');
1435           if (!end1)
1436             end1 = base + strlen (base);
1437           baselength = end1 - base;
1438           constr = xmalloc (baselength + linklength + 1);
1439           memcpy (constr, base, baselength);
1440           memcpy (constr + baselength, link, linklength);
1441           constr[baselength + linklength] = '\0';
1442         }
1443       else if (*link == '/')
1444         {
1445           /* LINK is an absolute path: we need to replace everything
1446              after (and including) the FIRST slash with LINK.
1447
1448              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1449              "/qux/xyzzy", our result should be
1450              "http://host/qux/xyzzy".  */
1451           int span;
1452           const char *slash;
1453           const char *start_insert = NULL; /* for gcc to shut up. */
1454           const char *pos = base;
1455           int seen_slash_slash = 0;
1456           /* We're looking for the first slash, but want to ignore
1457              double slash. */
1458         again:
1459           slash = memchr (pos, '/', end - pos);
1460           if (slash && !seen_slash_slash)
1461             if (*(slash + 1) == '/')
1462               {
1463                 pos = slash + 2;
1464                 seen_slash_slash = 1;
1465                 goto again;
1466               }
1467
1468           /* At this point, SLASH is the location of the first / after
1469              "//", or the first slash altogether.  START_INSERT is the
1470              pointer to the location where LINK will be inserted.  When
1471              examining the last two examples, keep in mind that LINK
1472              begins with '/'. */
1473
1474           if (!slash && !seen_slash_slash)
1475             /* example: "foo" */
1476             /*           ^    */
1477             start_insert = base;
1478           else if (!slash && seen_slash_slash)
1479             /* example: "http://foo" */
1480             /*                     ^ */
1481             start_insert = end;
1482           else if (slash && !seen_slash_slash)
1483             /* example: "foo/bar" */
1484             /*           ^        */
1485             start_insert = base;
1486           else if (slash && seen_slash_slash)
1487             /* example: "http://something/" */
1488             /*                           ^  */
1489             start_insert = slash;
1490
1491           span = start_insert - base;
1492           constr = (char *)xmalloc (span + linklength + 1);
1493           if (span)
1494             memcpy (constr, base, span);
1495           if (linklength)
1496             memcpy (constr + span, link, linklength);
1497           constr[span + linklength] = '\0';
1498         }
1499       else
1500         {
1501           /* LINK is a relative URL: we need to replace everything
1502              after last slash (possibly empty) with LINK.
1503
1504              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1505              our result should be "whatever/foo/qux/xyzzy".  */
1506           int need_explicit_slash = 0;
1507           int span;
1508           const char *start_insert;
1509           const char *last_slash = find_last_char (base, end, '/');
1510           if (!last_slash)
1511             {
1512               /* No slash found at all.  Append LINK to what we have,
1513                  but we'll need a slash as a separator.
1514
1515                  Example: if base == "foo" and link == "qux/xyzzy", then
1516                  we cannot just append link to base, because we'd get
1517                  "fooqux/xyzzy", whereas what we want is
1518                  "foo/qux/xyzzy".
1519
1520                  To make sure the / gets inserted, we set
1521                  need_explicit_slash to 1.  We also set start_insert
1522                  to end + 1, so that the length calculations work out
1523                  correctly for one more (slash) character.  Accessing
1524                  that character is fine, since it will be the
1525                  delimiter, '\0' or '?'.  */
1526               /* example: "foo?..." */
1527               /*               ^    ('?' gets changed to '/') */
1528               start_insert = end + 1;
1529               need_explicit_slash = 1;
1530             }
1531           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1532             {
1533               /* example: http://host"  */
1534               /*                      ^ */
1535               start_insert = end + 1;
1536               need_explicit_slash = 1;
1537             }
1538           else
1539             {
1540               /* example: "whatever/foo/bar" */
1541               /*                        ^    */
1542               start_insert = last_slash + 1;
1543             }
1544
1545           span = start_insert - base;
1546           constr = (char *)xmalloc (span + linklength + 1);
1547           if (span)
1548             memcpy (constr, base, span);
1549           if (need_explicit_slash)
1550             constr[span - 1] = '/';
1551           if (linklength)
1552             memcpy (constr + span, link, linklength);
1553           constr[span + linklength] = '\0';
1554         }
1555     }
1556   else /* !no_scheme */
1557     {
1558       constr = strdupdelim (link, link + linklength);
1559     }
1560   return constr;
1561 }
1562
1563 /* Merge BASE with LINK and return the resulting URI.  This is an
1564    interface to uri_merge_1 that assumes that LINK is a
1565    zero-terminated string.  */
1566 char *
1567 uri_merge (const char *base, const char *link)
1568 {
1569   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1570 }
1571 \f
1572 #define APPEND(p, s) do {                       \
1573   int len = strlen (s);                         \
1574   memcpy (p, s, len);                           \
1575   p += len;                                     \
1576 } while (0)
1577
1578 /* Use this instead of password when the actual password is supposed
1579    to be hidden.  We intentionally use a generic string without giving
1580    away the number of characters in the password, like previous
1581    versions did.  */
1582 #define HIDDEN_PASSWORD "*password*"
1583
1584 /* Recreate the URL string from the data in URL.
1585
1586    If HIDE is non-zero (as it is when we're calling this on a URL we
1587    plan to print, but not when calling it to canonicalize a URL for
1588    use within the program), password will be hidden.  Unsafe
1589    characters in the URL will be quoted.  */
1590
1591 char *
1592 url_string (const struct url *url, int hide_password)
1593 {
1594   int size;
1595   char *result, *p;
1596   char *quoted_user = NULL, *quoted_passwd = NULL;
1597
1598   int scheme_port  = supported_schemes[url->scheme].default_port;
1599   char *scheme_str = supported_schemes[url->scheme].leading_string;
1600   int fplen = full_path_length (url);
1601
1602   assert (scheme_str != NULL);
1603
1604   /* Make sure the user name and password are quoted. */
1605   if (url->user)
1606     {
1607       quoted_user = encode_string_maybe (url->user);
1608       if (url->passwd)
1609         {
1610           if (hide_password)
1611             quoted_passwd = HIDDEN_PASSWORD;
1612           else
1613             quoted_passwd = encode_string_maybe (url->passwd);
1614         }
1615     }
1616
1617   size = (strlen (scheme_str)
1618           + strlen (url->host)
1619           + fplen
1620           + 1);
1621   if (url->port != scheme_port)
1622     size += 1 + numdigit (url->port);
1623   if (quoted_user)
1624     {
1625       size += 1 + strlen (quoted_user);
1626       if (quoted_passwd)
1627         size += 1 + strlen (quoted_passwd);
1628     }
1629
1630   p = result = xmalloc (size);
1631
1632   APPEND (p, scheme_str);
1633   if (quoted_user)
1634     {
1635       APPEND (p, quoted_user);
1636       if (quoted_passwd)
1637         {
1638           *p++ = ':';
1639           APPEND (p, quoted_passwd);
1640         }
1641       *p++ = '@';
1642     }
1643
1644   APPEND (p, url->host);
1645   if (url->port != scheme_port)
1646     {
1647       *p++ = ':';
1648       long_to_string (p, url->port);
1649       p += strlen (p);
1650     }
1651
1652   full_path_write (url, p);
1653   p += fplen;
1654   *p++ = '\0';
1655
1656   assert (p - result == size);
1657
1658   if (quoted_user && quoted_user != url->user)
1659     xfree (quoted_user);
1660   if (quoted_passwd && !hide_password
1661       && quoted_passwd != url->passwd)
1662     xfree (quoted_passwd);
1663
1664   return result;
1665 }
1666 \f
1667 /* Returns proxy host address, in accordance with SCHEME.  */
1668 char *
1669 getproxy (enum url_scheme scheme)
1670 {
1671   char *proxy = NULL;
1672   char *rewritten_url;
1673   static char rewritten_storage[1024];
1674
1675   switch (scheme)
1676     {
1677     case SCHEME_HTTP:
1678       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1679       break;
1680 #ifdef HAVE_SSL
1681     case SCHEME_HTTPS:
1682       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1683       break;
1684 #endif
1685     case SCHEME_FTP:
1686       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1687       break;
1688     case SCHEME_INVALID:
1689       break;
1690     }
1691   if (!proxy || !*proxy)
1692     return NULL;
1693
1694   /* Handle shorthands. */
1695   rewritten_url = rewrite_shorthand_url (proxy);
1696   if (rewritten_url)
1697     {
1698       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1699       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1700       proxy = rewritten_storage;
1701     }
1702
1703   return proxy;
1704 }
1705
1706 /* Should a host be accessed through proxy, concerning no_proxy?  */
1707 int
1708 no_proxy_match (const char *host, const char **no_proxy)
1709 {
1710   if (!no_proxy)
1711     return 1;
1712   else
1713     return !sufmatch (no_proxy, host);
1714 }
1715 \f
1716 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1717 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1718                                          const char *));
1719 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1720                                                       const char *, int));
1721 static char *local_quote_string PARAMS ((const char *));
1722
1723 /* Change the links in one HTML file.  LINKS is a list of links in the
1724    document, along with their positions and the desired direction of
1725    the conversion.  */
1726 void
1727 convert_links (const char *file, struct urlpos *links)
1728 {
1729   struct file_memory *fm;
1730   FILE *fp;
1731   const char *p;
1732   downloaded_file_t downloaded_file_return;
1733
1734   struct urlpos *link;
1735   int to_url_count = 0, to_file_count = 0;
1736
1737   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1738
1739   {
1740     /* First we do a "dry run": go through the list L and see whether
1741        any URL needs to be converted in the first place.  If not, just
1742        leave the file alone.  */
1743     int dry_count = 0;
1744     struct urlpos *dry = links;
1745     for (dry = links; dry; dry = dry->next)
1746       if (dry->convert != CO_NOCONVERT)
1747         ++dry_count;
1748     if (!dry_count)
1749       {
1750         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1751         return;
1752       }
1753   }
1754
1755   fm = read_file (file);
1756   if (!fm)
1757     {
1758       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1759                  file, strerror (errno));
1760       return;
1761     }
1762
1763   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1764   if (opt.backup_converted && downloaded_file_return)
1765     write_backup_file (file, downloaded_file_return);
1766
1767   /* Before opening the file for writing, unlink the file.  This is
1768      important if the data in FM is mmaped.  In such case, nulling the
1769      file, which is what fopen() below does, would make us read all
1770      zeroes from the mmaped region.  */
1771   if (unlink (file) < 0 && errno != ENOENT)
1772     {
1773       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1774                  file, strerror (errno));
1775       read_file_free (fm);
1776       return;
1777     }
1778   /* Now open the file for writing.  */
1779   fp = fopen (file, "wb");
1780   if (!fp)
1781     {
1782       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1783                  file, strerror (errno));
1784       read_file_free (fm);
1785       return;
1786     }
1787
1788   /* Here we loop through all the URLs in file, replacing those of
1789      them that are downloaded with relative references.  */
1790   p = fm->content;
1791   for (link = links; link; link = link->next)
1792     {
1793       char *url_start = fm->content + link->pos;
1794
1795       if (link->pos >= fm->length)
1796         {
1797           DEBUGP (("Something strange is going on.  Please investigate."));
1798           break;
1799         }
1800       /* If the URL is not to be converted, skip it.  */
1801       if (link->convert == CO_NOCONVERT)
1802         {
1803           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1804           continue;
1805         }
1806
1807       /* Echo the file contents, up to the offending URL's opening
1808          quote, to the outfile.  */
1809       fwrite (p, 1, url_start - p, fp);
1810       p = url_start;
1811
1812       switch (link->convert)
1813         {
1814         case CO_CONVERT_TO_RELATIVE:
1815           /* Convert absolute URL to relative. */
1816           {
1817             char *newname = construct_relative (file, link->local_name);
1818             char *quoted_newname = local_quote_string (newname);
1819
1820             if (!link->link_refresh_p)
1821               p = replace_attr (p, link->size, fp, quoted_newname);
1822             else
1823               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1824                                              link->refresh_timeout);
1825
1826             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1827                      link->url->url, newname, link->pos, file));
1828             xfree (newname);
1829             xfree (quoted_newname);
1830             ++to_file_count;
1831             break;
1832           }
1833         case CO_CONVERT_TO_COMPLETE:
1834           /* Convert the link to absolute URL. */
1835           {
1836             char *newlink = link->url->url;
1837             char *quoted_newlink = html_quote_string (newlink);
1838
1839             if (!link->link_refresh_p)
1840               p = replace_attr (p, link->size, fp, quoted_newlink);
1841             else
1842               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1843                                              link->refresh_timeout);
1844
1845             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1846                      newlink, link->pos, file));
1847             xfree (quoted_newlink);
1848             ++to_url_count;
1849             break;
1850           }
1851         case CO_NULLIFY_BASE:
1852           /* Change the base href to "". */
1853           p = replace_attr (p, link->size, fp, "");
1854           break;
1855         case CO_NOCONVERT:
1856           abort ();
1857           break;
1858         }
1859     }
1860
1861   /* Output the rest of the file. */
1862   if (p - fm->content < fm->length)
1863     fwrite (p, 1, fm->length - (p - fm->content), fp);
1864   fclose (fp);
1865   read_file_free (fm);
1866
1867   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1868 }
1869
1870 /* Construct and return a malloced copy of the relative link from two
1871    pieces of information: local name S1 of the referring file and
1872    local name S2 of the referred file.
1873
1874    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1875    "jagor.srce.hr/images/news.gif", the function will return
1876    "images/news.gif".
1877
1878    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1879    "fly.cc.fer.hr/images/fly.gif", the function will return
1880    "../images/fly.gif".
1881
1882    Caveats: S1 should not begin with `/', unless S2 also begins with
1883    '/'.  S1 should not contain things like ".." and such --
1884    construct_relative ("fly/ioccc/../index.html",
1885    "fly/images/fly.gif") will fail.  (A workaround is to call
1886    something like path_simplify() on S1).  */
1887 static char *
1888 construct_relative (const char *s1, const char *s2)
1889 {
1890   int i, cnt, sepdirs1;
1891   char *res;
1892
1893   if (*s2 == '/')
1894     return xstrdup (s2);
1895   /* S1 should *not* be absolute, if S2 wasn't.  */
1896   assert (*s1 != '/');
1897   i = cnt = 0;
1898   /* Skip the directories common to both strings.  */
1899   while (1)
1900     {
1901       while (s1[i] && s2[i]
1902              && (s1[i] == s2[i])
1903              && (s1[i] != '/')
1904              && (s2[i] != '/'))
1905         ++i;
1906       if (s1[i] == '/' && s2[i] == '/')
1907         cnt = ++i;
1908       else
1909         break;
1910     }
1911   for (sepdirs1 = 0; s1[i]; i++)
1912     if (s1[i] == '/')
1913       ++sepdirs1;
1914   /* Now, construct the file as of:
1915      - ../ repeated sepdirs1 time
1916      - all the non-mutual directories of S2.  */
1917   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1918   for (i = 0; i < sepdirs1; i++)
1919     memcpy (res + 3 * i, "../", 3);
1920   strcpy (res + 3 * i, s2 + cnt);
1921   return res;
1922 }
1923 \f
1924 static void
1925 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1926 {
1927   /* Rather than just writing over the original .html file with the
1928      converted version, save the former to *.orig.  Note we only do
1929      this for files we've _successfully_ downloaded, so we don't
1930      clobber .orig files sitting around from previous invocations. */
1931
1932   /* Construct the backup filename as the original name plus ".orig". */
1933   size_t         filename_len = strlen(file);
1934   char*          filename_plus_orig_suffix;
1935   boolean        already_wrote_backup_file = FALSE;
1936   slist*         converted_file_ptr;
1937   static slist*  converted_files = NULL;
1938
1939   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1940     {
1941       /* Just write "orig" over "html".  We need to do it this way
1942          because when we're checking to see if we've downloaded the
1943          file before (to see if we can skip downloading it), we don't
1944          know if it's a text/html file.  Therefore we don't know yet
1945          at that stage that -E is going to cause us to tack on
1946          ".html", so we need to compare vs. the original URL plus
1947          ".orig", not the original URL plus ".html.orig". */
1948       filename_plus_orig_suffix = alloca (filename_len + 1);
1949       strcpy(filename_plus_orig_suffix, file);
1950       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1951     }
1952   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1953     {
1954       /* Append ".orig" to the name. */
1955       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1956       strcpy(filename_plus_orig_suffix, file);
1957       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1958     }
1959
1960   /* We can get called twice on the same URL thanks to the
1961      convert_all_links() call in main().  If we write the .orig file
1962      each time in such a case, it'll end up containing the first-pass
1963      conversion, not the original file.  So, see if we've already been
1964      called on this file. */
1965   converted_file_ptr = converted_files;
1966   while (converted_file_ptr != NULL)
1967     if (strcmp(converted_file_ptr->string, file) == 0)
1968       {
1969         already_wrote_backup_file = TRUE;
1970         break;
1971       }
1972     else
1973       converted_file_ptr = converted_file_ptr->next;
1974
1975   if (!already_wrote_backup_file)
1976     {
1977       /* Rename <file> to <file>.orig before former gets written over. */
1978       if (rename(file, filename_plus_orig_suffix) != 0)
1979         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1980                    file, filename_plus_orig_suffix, strerror (errno));
1981
1982       /* Remember that we've already written a .orig backup for this file.
1983          Note that we never free this memory since we need it till the
1984          convert_all_links() call, which is one of the last things the
1985          program does before terminating.  BTW, I'm not sure if it would be
1986          safe to just set 'converted_file_ptr->string' to 'file' below,
1987          rather than making a copy of the string...  Another note is that I
1988          thought I could just add a field to the urlpos structure saying
1989          that we'd written a .orig file for this URL, but that didn't work,
1990          so I had to make this separate list.
1991          -- Dan Harkless <wget@harkless.org>
1992
1993          This [adding a field to the urlpos structure] didn't work
1994          because convert_file() is called from convert_all_links at
1995          the end of the retrieval with a freshly built new urlpos
1996          list.
1997          -- Hrvoje Niksic <hniksic@arsdigita.com>
1998       */
1999       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2000       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2001       converted_file_ptr->next = converted_files;
2002       converted_files = converted_file_ptr;
2003     }
2004 }
2005
2006 static int find_fragment PARAMS ((const char *, int, const char **,
2007                                   const char **));
2008
2009 /* Replace an attribute's original text with NEW_TEXT. */
2010
2011 static const char *
2012 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2013 {
2014   int quote_flag = 0;
2015   char quote_char = '\"';       /* use "..." for quoting, unless the
2016                                    original value is quoted, in which
2017                                    case reuse its quoting char. */
2018   const char *frag_beg, *frag_end;
2019
2020   /* Structure of our string is:
2021        "...old-contents..."
2022        <---    size    --->  (with quotes)
2023      OR:
2024        ...old-contents...
2025        <---    size   -->    (no quotes)   */
2026
2027   if (*p == '\"' || *p == '\'')
2028     {
2029       quote_char = *p;
2030       quote_flag = 1;
2031       ++p;
2032       size -= 2;                /* disregard opening and closing quote */
2033     }
2034   putc (quote_char, fp);
2035   fputs (new_text, fp);
2036
2037   /* Look for fragment identifier, if any. */
2038   if (find_fragment (p, size, &frag_beg, &frag_end))
2039     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2040   p += size;
2041   if (quote_flag)
2042     ++p;
2043   putc (quote_char, fp);
2044
2045   return p;
2046 }
2047
2048 /* The same as REPLACE_ATTR, but used when replacing
2049    <meta http-equiv=refresh content="new_text"> because we need to
2050    append "timeout_value; URL=" before the next_text.  */
2051
2052 static const char *
2053 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2054                            const char *new_text, int timeout)
2055 {
2056   /* "0; URL=..." */
2057   char *new_with_timeout = (char *)alloca (numdigit (timeout)
2058                                            + 6 /* "; URL=" */
2059                                            + strlen (new_text)
2060                                            + 1);
2061   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2062
2063   return replace_attr (p, size, fp, new_with_timeout);
2064 }
2065
2066 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2067    preceded by '&'.  If the character is not found, return zero.  If
2068    the character is found, return 1 and set BP and EP to point to the
2069    beginning and end of the region.
2070
2071    This is used for finding the fragment indentifiers in URLs.  */
2072
2073 static int
2074 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2075 {
2076   const char *end = beg + size;
2077   int saw_amp = 0;
2078   for (; beg < end; beg++)
2079     {
2080       switch (*beg)
2081         {
2082         case '&':
2083           saw_amp = 1;
2084           break;
2085         case '#':
2086           if (!saw_amp)
2087             {
2088               *bp = beg;
2089               *ep = end;
2090               return 1;
2091             }
2092           /* fallthrough */
2093         default:
2094           saw_amp = 0;
2095         }
2096     }
2097   return 0;
2098 }
2099
2100 /* The idea here was to quote ? as %3F to avoid passing part of the
2101    file name as the parameter when browsing the converted file through
2102    HTTP.  However, actually doing that breaks local browsing because
2103    "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2104    Perhaps this should be controlled by an option, but for now I'm
2105    leaving the question marks.
2106
2107    This is the original docstring of this function:
2108
2109    FILE should be a relative link to a local file.  It should be
2110    quoted as HTML because it will be used in HTML context.  However,
2111    we need to quote ? as %3F to avoid passing part of the file name as
2112    the parameter.  (This is not a problem when viewing locally, but is
2113    if the downloaded and converted tree is served by an HTTP
2114    server.)  */
2115
2116 /* Quote string as HTML. */
2117
2118 static char *
2119 local_quote_string (const char *file)
2120 {
2121   return html_quote_string (file);
2122
2123 #if 0
2124   const char *file_sans_qmark;
2125   int qm = count_char (file, '?');
2126
2127   if (qm)
2128     {
2129       const char *from = file;
2130       char *to, *newname;
2131
2132       /* qm * 2 because we replace each question mark with "%3F",
2133          i.e. replace one char with three, hence two more.  */
2134       int fsqlen = strlen (file) + qm * 2;
2135
2136       to = newname = (char *)alloca (fsqlen + 1);
2137       for (; *from; from++)
2138         {
2139           if (*from != '?')
2140             *to++ = *from;
2141           else
2142             {
2143               *to++ = '%';
2144               *to++ = '3';
2145               *to++ = 'F';
2146             }
2147         }
2148       assert (to - newname == fsqlen);
2149       *to = '\0';
2150
2151       file_sans_qmark = newname;
2152     }
2153   else
2154     file_sans_qmark = file;
2155
2156   return html_quote_string (file_sans_qmark);
2157 #endif
2158 }
2159
2160 /* We're storing "modes" of type downloaded_file_t in the hash table.
2161    However, our hash tables only accept pointers for keys and values.
2162    So when we need a pointer, we use the address of a
2163    downloaded_file_t variable of static storage.  */
2164
2165 static downloaded_file_t *
2166 downloaded_mode_to_ptr (downloaded_file_t mode)
2167 {
2168   static downloaded_file_t
2169     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2170     v2 = FILE_DOWNLOADED_NORMALLY,
2171     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2172     v4 = CHECK_FOR_FILE;
2173
2174   switch (mode)
2175     {
2176     case FILE_NOT_ALREADY_DOWNLOADED:
2177       return &v1;
2178     case FILE_DOWNLOADED_NORMALLY:
2179       return &v2;
2180     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2181       return &v3;
2182     case CHECK_FOR_FILE:
2183       return &v4;
2184     }
2185   return NULL;
2186 }
2187
2188 /* This should really be merged with dl_file_url_map and
2189    downloaded_html_files in recur.c.  This was originally a list, but
2190    I changed it to a hash table beause it was actually taking a lot of
2191    time to find things in it.  */
2192
2193 static struct hash_table *downloaded_files_hash;
2194
2195 /* Remembers which files have been downloaded.  In the standard case, should be
2196    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2197    download successfully (i.e. not for ones we have failures on or that we skip
2198    due to -N).
2199
2200    When we've downloaded a file and tacked on a ".html" extension due to -E,
2201    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2202    FILE_DOWNLOADED_NORMALLY.
2203
2204    If you just want to check if a file has been previously added without adding
2205    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2206    with local filenames, not remote URLs. */
2207 downloaded_file_t
2208 downloaded_file (downloaded_file_t mode, const char *file)
2209 {
2210   downloaded_file_t *ptr;
2211
2212   if (mode == CHECK_FOR_FILE)
2213     {
2214       if (!downloaded_files_hash)
2215         return FILE_NOT_ALREADY_DOWNLOADED;
2216       ptr = hash_table_get (downloaded_files_hash, file);
2217       if (!ptr)
2218         return FILE_NOT_ALREADY_DOWNLOADED;
2219       return *ptr;
2220     }
2221
2222   if (!downloaded_files_hash)
2223     downloaded_files_hash = make_string_hash_table (0);
2224
2225   ptr = hash_table_get (downloaded_files_hash, file);
2226   if (ptr)
2227     return *ptr;
2228
2229   ptr = downloaded_mode_to_ptr (mode);
2230   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2231
2232   return FILE_NOT_ALREADY_DOWNLOADED;
2233 }
2234
2235 static int
2236 df_free_mapper (void *key, void *value, void *ignored)
2237 {
2238   xfree (key);
2239   return 0;
2240 }
2241
2242 void
2243 downloaded_files_free (void)
2244 {
2245   if (downloaded_files_hash)
2246     {
2247       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2248       hash_table_destroy (downloaded_files_hash);
2249       downloaded_files_hash = NULL;
2250     }
2251 }