2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Path-component predicates: DOTP(x) is true iff X is exactly ".",
   DDOTP(x) iff X is exactly ".." (NUL-terminated at that point).  */
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declaration; PARAMS is the K&R/ANSI prototype compatibility
   macro used throughout this code base.  */
51 static int urlpath_length PARAMS ((const char *));
18 /* Supported schemes: */
/* NOTE(review): entry order must mirror enum url_scheme -- url_scheme()
   below returns the matching array index cast to that enum.  The array's
   terminating sentinel entry is outside this excerpt -- confirm.  */
61 static struct scheme_data supported_schemes[] =
63 { "http://", DEFAULT_HTTP_PORT, 1 },
65 { "https://", DEFAULT_HTTPS_PORT, 1 },
67 { "ftp://", DEFAULT_FTP_PORT, 1 },
/* Forward declaration for the relative-link constructor.  */
73 static char *construct_relative PARAMS ((const char *, const char *));
24 /* Support for encoding and decoding of URL strings. We determine
77 whether a character is unsafe through static table lookup. This
78 code assumes ASCII character set and 8-bit chars. */
/* Short aliases used to keep the 256-entry table below readable.  */
85 #define R urlchr_reserved
86 #define U urlchr_unsafe
/* Test character C against bit MASK in the lookup table; the cast to
   unsigned char guards against negative plain-char indexing.  */
89 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
91 /* rfc1738 reserved chars, preserved from encoding. */
93 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
95 /* rfc1738 unsafe chars, plus some more. */
97 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification table, indexed by unsigned char value.
   R = reserved, U = unsafe, RU = both (RU is defined outside this
   excerpt -- presumably R|U; confirm).
   NOTE(review): `const static' uses the obsolescent specifier order;
   `static const' is preferred (C99 6.11.5).  Left unchanged here.  */
34 const static unsigned char urlchr_table[256] =
101 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
102 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
103 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
104 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
105 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
106 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
107 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
108 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
109 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
110 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
111 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
112 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
113 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
114 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
115 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
116 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* All non-ASCII octets (128-255) are unsafe and get %-encoded.  */
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
121 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
59 /* Decodes the forms %xy in a URL to the character the hexadecimal
130 code of which is xy. xy are hexadecimal digits from
131 [0123456789ABCDEF] (case-insensitive). If x or y are not
132 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place decode: T (write cursor) trails H (read cursor), so the
   string only ever shrinks.  (Interior lines elided in this excerpt.)  */
136 decode_string (char *s)
138 char *t = s; /* t - tortoise */
139 char *h = s; /* h - hare */
66 /* Do nothing if '%' is not followed by two hex digits. */
151 if (!*(h + 1) || !*(h + 2)
152 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %xy: store the decoded byte at the write cursor.  */
154 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
70 /* Like encode_string, but return S if there are no unsafe chars. */
/* Two passes: first count unsafe chars to size the result, then copy,
   expanding each unsafe char to %XY.  (Interior lines elided.)  */
164 encode_string_maybe (const char *s)
171 for (p1 = s; *p1; p1++)
172 if (UNSAFE_CHAR (*p1))
173 addition += 2; /* Two more characters (hex digits) */
/* After the loop p1 points at the NUL, so p1 - s is strlen (s).  */
178 newlen = (p1 - s) + addition;
179 newstr = (char *)xmalloc (newlen + 1);
77 if (UNSAFE_CHAR (*p1))
187 unsigned char c = *p1++;
189 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
190 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: the second pass wrote exactly the predicted length.  */
196 assert (p2 - newstr == newlen);
82 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
202 given string, returning a malloc-ed %XX encoded string. */
/* Wrapper over encode_string_maybe that always returns fresh storage
   the caller owns (tail of the function elided in this excerpt).  */
205 encode_string (const char *s)
207 char *encoded = encode_string_maybe (s);
86 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
215 the old value of PTR is freed and PTR is made to point to the newly
216 allocated storage. */
/* Multi-statement macro in the conventional do { } while (0) form;
   PTR must be an lvalue holding heap storage.  */
218 #define ENCODE(ptr) do { \
219 char *e_new = encode_string_maybe (ptr); \
/* Per-character decision used by reencode_string's two passes.  */
91 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
92 /* Decide whether to encode, decode, or pass through the char at P.
230 This used to be a macro, but it got a little too convoluted. */
94 static inline enum copy_method
232 decide_copy_method (const char *p)
/* P points at '%' here (the preceding check is elided in this excerpt);
   look ahead at the next two characters.  */
96 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
97 /* %xx sequence: decode it, unless it would decode to an
239 unsafe or a reserved char; in that case, leave it as
/* Compute the byte the %xx sequence would decode to.  */
241 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
242 XCHAR_TO_XDIGIT (*(p + 2));
101 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
245 return CM_PASSTHROUGH;
103 /* Garbled %.. sequence: encode `%'. */
/* Ordinary character: encode only if unsafe and not reserved.  */
253 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
256 return CM_PASSTHROUGH;
259 /* Translate a %-quoting (but possibly non-conformant) input string S
260 into a %-quoting (and conformant) output string. If no characters
261 are encoded or decoded, return the same string S; otherwise, return
262 a freshly allocated string with the new contents.
264 After a URL has been run through this function, the protocols that
265 use `%' as the quote character can use the resulting string as-is,
266 while those that don't call decode_string() to get to the intended
267 data. This function is also stable: after an input string is
268 transformed the first time, all further transformations of the
269 result yield the same result string.
271 Let's discuss why this function is needed.
273 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
274 space character would mess up the HTTP request, it needs to be
277 GET /abc%20def HTTP/1.0
279 So it appears that the unsafe chars need to be quoted, as with
280 encode_string. But what if we're requested to download
281 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
282 the user meant was a literal space, and he was kind enough to quote
283 it. In that case, Wget should obviously leave the `%20' as is, and
284 send the same request as above. So in this case we may not call
287 But what if the requested URI is `abc%20 def'? If we call
288 encode_string, we end up with `/abc%2520%20def', which is almost
289 certainly not intended. If we don't call encode_string, we are
290 left with the embedded space and cannot send the request. What the
291 user meant was for Wget to request `/abc%20%20def', and this is
292 where reencode_string kicks in.
294 Wget used to solve this by first decoding %-quotes, and then
295 encoding all the "unsafe" characters found in the resulting string.
296 This was wrong because it didn't preserve certain URL special
297 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
298 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
299 whether we considered `+' reserved (it is). One of these results
300 is inevitable because by the second step we would lose information
301 on whether the `+' was originally encoded or not. Both results
302 were wrong because in CGI parameters + means space, while %2B means
303 literal plus. reencode_string correctly translates the above to
304 "a%2B+b", i.e. returns the original string.
306 This function uses an algorithm proposed by Anon Sricharoenchai:
308 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
311 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
314 ...except that this code conflates the two steps, and decides
315 whether to encode, decode, or pass through each character in turn.
316 The function still uses two passes, but their logic is the same --
317 the first pass exists merely for the sake of allocation. Another
318 small difference is that we include `+' in URL_RESERVED.
322 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
324 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
328 "foo bar" -> "foo%20bar"
329 "foo%20bar" -> "foo%20bar"
330 "foo %20bar" -> "foo%20%20bar"
331 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
332 "foo%25%20bar" -> "foo%25%20bar"
333 "foo%2%20bar" -> "foo%252%20bar"
334 "foo+bar" -> "foo+bar" (plus is reserved!)
335 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize %-quoting in S; returns S itself when nothing changes,
   otherwise a freshly malloc-ed string (see the long comment above).  */
161 reencode_string (const char *s)
344 int encode_count = 0;
345 int decode_count = 0;
164 /* First, pass through the string to see if there's anything to do,
348 and to calculate the new length. */
166 for (p1 = s; *p1; p1++)
351 switch (decide_copy_method (p1))
168 if (!encode_count && !decode_count)
365 /* The string is good as it is. */
366 return (char *)s; /* C const model sucks. */
171 /* Each encoding adds two characters (hex digits), while each
370 decoding removes two characters. */
173 newlen = oldlen + 2 * (encode_count - decode_count);
372 newstr = xmalloc (newlen + 1);
/* Second pass: same decisions as the first, this time copying.  */
175 switch (decide_copy_method (p1))
/* CM_ENCODE: expand the byte to %XY.  */
383 unsigned char c = *p1++;
385 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
386 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse %xx to the byte it denotes.  */
390 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
391 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
392 p1 += 3; /* skip %xx */
/* Both passes must agree on the output length.  */
399 assert (p2 - newstr == newlen);
183 /* Run PTR_VAR through reencode_string. If a new string is consed,
404 free PTR_VAR and make it point to the new storage. Obviously,
405 PTR_VAR needs to be an lvalue. */
/* Pointer identity (rf_new != ptr_var) is how reencode_string signals
   that a new string was allocated.  */
186 #define REENCODE(ptr_var) do { \
408 char *rf_new = reencode_string (ptr_var); \
409 if (rf_new != ptr_var) \
189 /* Returns the scheme type if the scheme is supported, or
417 SCHEME_INVALID if not. */
191 url_scheme (const char *url)
/* Walk the scheme table; loop termination relies on a NULL
   leading_string sentinel entry (not visible in this excerpt).  */
192 for (i = 0; supported_schemes[i].leading_string; i++)
424 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
425 strlen (supported_schemes[i].leading_string)))
/* A matching prefix counts only if the scheme is enabled; the
   array index doubles as the enum url_scheme value.  */
195 if (supported_schemes[i].enabled)
428 return (enum url_scheme) i;
430 return SCHEME_INVALID;
/* No table entry matched the URL's prefix.  */
433 return SCHEME_INVALID;
199 /* Return the number of characters needed to skip the scheme part of
437 the URL, e.g. `http://'. If no scheme is found, returns 0. */
201 url_skip_scheme (const char *url)
202 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
445 while (ISALNUM (*p) || *p == '-' || *p == '+')
204 /* Skip "//" if found. */
205 if (*p == '/' && *(p + 1) == '/')
206 /* Returns 1 if the URL begins with a scheme (supported or
460 unsupported), 0 otherwise. */
208 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme above; the check for
   the trailing ':' is elided in this excerpt.  */
209 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, per the scheme table.  */
210 scheme_default_port (enum url_scheme scheme)
473 return supported_schemes[scheme].default_port;
/* Disable SCHEME so url_scheme() stops recognizing it.  */
212 scheme_disable (enum url_scheme scheme)
479 supported_schemes[scheme].enabled = 0;
214 /* Skip the username and password, if present here. The function
483 should be called *not* with the complete URL, but with the part
484 right after the scheme.
217 If no username and password are found, return 0. */
218 url_skip_uname (const char *url)
219 /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds whichever of '/', '?' or '@' comes first; only a
   leading '@' hit means a userinfo part is present.  */
220 p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte userinfo STR ("user" or "user:password") into
   freshly allocated *USER and *PASSWD.  Returns zero on failure
   (return statements elided in this excerpt).  */
221 parse_uname (const char *str, int len, char **user, char **passwd)
222 /* Empty user name not allowed. */
/* Optional ':' separates user from password.  */
223 colon = memchr (str, ':', len);
224 /* Empty user name again. */
/* Copy out the password portion after the colon, NUL-terminated.  */
225 int pwlen = len - (colon + 1 - str);
517 *passwd = xmalloc (pwlen + 1);
518 memcpy (*passwd, colon + 1, pwlen);
519 (*passwd)[pwlen] = '\0';
/* Copy out the user name (LEN has been trimmed to the user part by
   code elided from this excerpt -- confirm against full source).  */
229 *user = xmalloc (len + 1);
526 memcpy (*user, str, len);
231 /* Used by main.c: detect URLs written using the "shorthand" URL forms
533 popularized by Netscape and NcFTP. HTTP shorthands look like this:
233 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
536 www.foo.com[:port] -> http://www.foo.com[:port]
235 FTP shorthands look like this:
236 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
541 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
238 If the URL needs not or cannot be rewritten, return NULL. */
239 rewrite_shorthand_url (const char *url)
/* A URL that already has a scheme needs no rewriting.  */
240 if (url_has_scheme (url))
241 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
242 for (p = url; *p && *p != ':' && *p != '/'; p++)
243 /* If the characters after the colon and before the next slash
565 or end of string are all digits, it's HTTP. */
245 for (pp = p + 1; ISDIGIT (*pp); pp++)
246 if (digits > 0 && (*pp == '/' || *pp == '\0'))
247 /* Prepend "ftp://" to the entire URL... */
/* 6 == strlen ("ftp://"); +1 for the terminating NUL.  */
248 res = xmalloc (6 + strlen (url) + 1);
574 sprintf (res, "ftp://%s", url);
250 /* ...and replace ':' with '/'. */
251 res[6 + (p - url)] = '/';
252 /* Just prepend "http://" to what we have. */
/* 7 == strlen ("http://").  */
253 res = xmalloc (7 + strlen (url) + 1);
585 sprintf (res, "http://%s", url);
/* Forward declaration used by url_parse below.  */
255 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but never returns NULL: when none of the ACCEPT
   characters occur in S, return a pointer to S's terminating '\0'
   instead.  The cast discards const, mirroring strpbrk's contract.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *hit = strpbrk (s, accept);
  if (hit == NULL)
    hit = (char *) s + strlen (s);
  return hit;
}
259 /* Turn STR into lowercase; return non-zero if a character was
/* In-place mutation; the return value lets callers know the host name
   changed (used by url_parse to decide whether to rebuild u->url).  */
260 lowercase_str (char *str)
612 *str = TOLOWER (*str);
/* Error messages indexed by the PE_* codes defined alongside each
   entry; url_error() maps a code back to its string.  */
262 static char *parse_errors[] = {
618 #define PE_NO_ERROR 0
264 #define PE_UNSUPPORTED_SCHEME 1
621 "Unsupported scheme",
266 #define PE_EMPTY_HOST 2
267 #define PE_BAD_PORT_NUMBER 3
268 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P iff P is non-NULL (macro body
   continues on lines elided from this excerpt).  */
269 #define SETERR(p, v) do { \
270 Return a new struct url if successful, NULL on error. In case of
638 error, and if ERROR is not NULL, also set *ERROR to the appropriate
272 url_parse (const char *url, int *error)
273 int path_modified, host_modified;
274 enum url_scheme scheme;
/* Begin/end pointer pairs delimiting each URL component inside the
   (possibly reencoded) input string; no copying until the end.  */
275 const char *uname_b, *uname_e;
650 const char *host_b, *host_e;
651 const char *path_b, *path_e;
652 const char *params_b, *params_e;
653 const char *query_b, *query_e;
654 const char *fragment_b, *fragment_e;
281 char *user = NULL, *passwd = NULL;
/* Reject URLs whose scheme we do not support.  */
282 scheme = url_scheme (url);
662 if (scheme == SCHEME_INVALID)
664 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-quoting first; url_encoded == url when unchanged.  */
285 url_encoded = reencode_string (url);
671 p += strlen (supported_schemes[scheme].leading_string);
673 p += url_skip_uname (p);
288 /* scheme://user:pass@host[:port]... */
289 /* We attempt to break down the URL into the components path,
680 params, query, and fragment. They are ordered like this:
291 scheme://host[:port][/path][;params][?query][#fragment] */
292 params_b = params_e = NULL;
685 query_b = query_e = NULL;
686 fragment_b = fragment_e = NULL;
/* Host ends at the first of ':', '/', ';', '?', '#' or NUL.  */
295 p = strpbrk_or_eos (p, ":/;?#");
296 if (host_b == host_e)
694 SETERR (error, PE_EMPTY_HOST);
/* No explicit port: fall back to the scheme's default.  */
298 port = scheme_default_port (scheme);
299 const char *port_b, *port_e, *pp;
300 /* scheme://host:port/tralala */
301 p = strpbrk_or_eos (p, "/;?#");
302 if (port_b == port_e)
303 /* http://host:/whatever */
714 SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port digits manually; any non-digit is an error.
   NOTE(review): no overflow guard is visible in this excerpt.  */
305 for (port = 0, pp = port_b; pp < port_e; pp++)
306 /* http://host:12randomgarbage/blah */
724 SETERR (error, PE_BAD_PORT_NUMBER);
727 port = 10 * port + (*pp - '0');
/* Successive delimiter scans peel off path, params, query.  */
309 p = strpbrk_or_eos (p, ";?#");
310 /* Path is not allowed not to exist. */
311 p = strpbrk_or_eos (p, "?#");
312 p = strpbrk_or_eos (p, "#");
/* Userinfo was present before the host: split user/password.  */
313 if (uname_b != uname_e)
314 /* http://user:pass@host */
315 /* uname_b uname_e */
316 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
774 SETERR (error, PE_INVALID_USER_NAME);
/* All components located; build the result structure.  */
318 u = (struct url *)xmalloc (sizeof (struct url));
780 memset (u, 0, sizeof (*u));
320 u->host = strdupdelim (host_b, host_e);
321 u->path = strdupdelim (path_b, path_e);
789 path_modified = path_simplify (u->path);
790 parse_path (u->path, &u->dir, &u->file);
/* Host names are case-insensitive; canonicalize to lowercase.  */
324 host_modified = lowercase_str (u->host);
325 u->params = strdupdelim (params_b, params_e);
326 u->query = strdupdelim (query_b, query_e);
327 u->fragment = strdupdelim (fragment_b, fragment_e);
328 if (path_modified || u->fragment || host_modified || path_b == path_e)
329 /* If we suspect that a transformation has rendered what
804 url_string might return different from URL_ENCODED, rebuild
805 u->url using url_string. */
332 u->url = url_string (u, 0);
/* url_encoded is a separate allocation only when reencode_string
   actually changed something; free it in that case.  */
333 if (url_encoded != url)
809 xfree ((char *) url_encoded);
/* Otherwise adopt url_encoded (or a copy of URL) as u->url.  */
335 if (url_encoded == url)
814 u->url = xstrdup (url);
337 u->url = url_encoded;
/* Map a PE_* error code from url_parse to its message string.  */
338 url_error (int error_code)
826 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
827 return parse_errors[error_code];
/* Split QUOTED_PATH into *DIR and *FILE at the last '/', after
   %-decoding a temporary copy; both outputs are freshly allocated.  */
341 parse_path (const char *quoted_path, char **dir, char **file)
342 char *path, *last_slash;
/* Work on a stack copy so the caller's string is untouched.  */
343 STRDUP_ALLOCA (path, quoted_path);
836 decode_string (path);
345 last_slash = strrchr (path, '/');
/* No slash: the whole path is the file name (dir handling elided).  */
346 *file = xstrdup (path);
/* Slash found: split around it.  */
347 *dir = strdupdelim (path, last_slash);
847 *file = xstrdup (last_slash + 1);
349 /* Note: URL's "full path" is the path with the query string and
852 params appended. The "fragment" (#foo) is intentionally ignored,
853 but that might be changed. For example, if the original URL was
854 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
855 the full path will be "/foo/bar/baz;bullshit?querystring". */
354 /* Return the length of the full path, without the terminating
355 full_path_length (const struct url *url)
/* Each present component costs its length plus one delimiter char.  */
356 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
357 /* Write out the full path. */
/* Writes into WHERE, which full_path_length() has sized; does not
   NUL-terminate (callers do that -- see url_full_path).  */
358 full_path_write (const struct url *url, char *where)
359 #define FROB(el, chr) do { \
882 char *f_el = url->el; \
361 int l = strlen (f_el); \
886 memcpy (where, f_el, l); \
363 /* Public function for getting the "full path". E.g. if u->path is
899 "foo/bar" and u->query is "param=value", full_path will be
900 "/foo/bar?param=value". */
366 url_full_path (const struct url *url)
/* Size, fill, then terminate -- full_path_write does not write NUL.  */
367 int length = full_path_length (url);
906 char *full_path = (char *)xmalloc(length + 1);
369 full_path_write (url, full_path);
909 full_path[length] = '\0';
371 /* Sync u->path and u->url with u->dir and u->file. */
372 sync_path (struct url *url)
/* Empty dir: path is just the file name.  */
373 newpath = xstrdup (url->file);
/* Otherwise join dir and file with a single '/'.  */
374 int dirlen = strlen (url->dir);
931 int filelen = strlen (url->file);
376 newpath = xmalloc (dirlen + 1 + filelen + 1);
934 memcpy (newpath, url->dir, dirlen);
935 newpath[dirlen] = '/';
936 memcpy (newpath + dirlen + 1, url->file, filelen);
937 newpath[dirlen + 1 + filelen] = '\0';
381 /* Synchronize u->url. */
382 url->url = url_string (url, 0);
383 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
949 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir; freeing of the old value and the sync_path call are
   elided from this excerpt.  */
385 url_set_dir (struct url *url, const char *newdir)
955 url->dir = xstrdup (newdir);
/* Replace u->file; same pattern as url_set_dir.  */
387 url_set_file (struct url *url, const char *newfile)
963 url->file = xstrdup (newfile);
/* Release a struct url and its owned strings; FREE_MAYBE handles the
   members that may be NULL.  */
389 url_free (struct url *url)
974 FREE_MAYBE (url->params);
975 FREE_MAYBE (url->query);
976 FREE_MAYBE (url->fragment);
977 FREE_MAYBE (url->user);
978 FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line; each line is optionally merged with opt.base_href
   and then parsed.  */
395 get_urls_file (const char *file)
989 struct file_memory *fm;
990 struct urlpos *head, *tail;
991 const char *text, *text_end;
/* Slurp the whole file into memory; bail out with a logged errno
   message on failure.  */
399 fm = read_file (file);
997 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
401 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
402 text_end = fm->content + fm->length;
403 while (text < text_end)
404 const char *line_beg = text;
/* Lines are '\n'-delimited; a missing final newline is tolerated.  */
405 const char *line_end = memchr (text, '\n', text_end - text);
1010 line_end = text_end;
407 /* Strip whitespace from the beginning and end of line. */
408 while (line_beg < line_end && ISSPACE (*line_beg))
1018 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that were all whitespace.  */
410 if (line_end > line_beg)
411 /* URL is in the [line_beg, line_end) region. */
412 struct urlpos *entry;
413 /* We must copy the URL to a zero-terminated string, and we
1031 can't use alloca because we're in a loop. *sigh*. */
415 url_text = strdupdelim (line_beg, line_end);
416 /* Merge opt.base_href with URL. */
417 char *merged = uri_merge (opt.base_href, url_text);
418 url = url_parse (url_text, &up_error_code);
/* Report unparsable URLs but keep processing the rest of the file.  */
419 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1046 file, url_text, url_error (up_error_code));
421 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1053 memset (entry, 0, sizeof (*entry));
423 read_file_free (fm);
424 /* Free the linked list of urlpos. */
425 free_urlpos (struct urlpos *l)
/* Save the next pointer before the node is freed.  */
426 struct urlpos *next = l->next;
427 FREE_MAYBE (l->local_name);
428 /* Rotate FNAME opt.backups times */
429 rotate_backups(const char *fname)
/* Buffer big enough for "FNAME.<backups>" plus NUL.  */
430 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1088 char *from = (char *)alloca (maxlen);
1089 char *to = (char *)alloca (maxlen);
/* Only rotate regular files.  */
433 if (stat (fname, &sb) == 0)
1094 if (S_ISREG (sb.st_mode) == 0)
/* Shift FNAME.(i-1) -> FNAME.i from the oldest down.  */
435 for (i = opt.backups; i > 1; i--)
1099 sprintf (from, "%s.%d", fname, i - 1);
1100 sprintf (to, "%s.%d", fname, i);
438 /* #### This will fail on machines without the rename() system
/* Finally FNAME itself becomes FNAME.1.  */
439 sprintf (to, "%s.%d", fname, 1);
440 /* Create all the necessary directories for PATH (a file). Calls
1111 mkdirhier() internally. */
442 mkalldirs (const char *path)
/* Scan backwards from the end to find the last '/'.  */
443 p = path + strlen (path);
1121 for (; *p != '/' && p != path; p--);
445 /* Don't create if it's just a file. */
446 if ((p == path) && (*p != '/'))
447 t = strdupdelim (path, p);
448 /* Check whether the directory exists. */
449 if ((stat (t, &st) == 0))
450 if (S_ISDIR (st.st_mode))
451 /* If the dir exists as a file name, remove it first. This
1137 is *only* for Wget to work with buggy old CERN http
1138 servers. Here is the scenario: When Wget tries to
1139 retrieve a directory without a slash, e.g.
1140 http://foo/bar (bar being a directory), CERN server will
1141 not redirect it to http://foo/bar/ -- it will generate a
1142 directory listing containing links to bar/file1,
1143 bar/file2, etc. Wget will lose because it saves this
1144 HTML listing to a file `bar', so it cannot create the
1145 directory. To work around this, if the file of the same
1146 name exists, we just remove it and create the directory
462 DEBUGP (("Removing %s because of directory danger!\n", t));
463 res = make_directory (t);
/* Directory creation failed: log it and fall through.  */
464 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1160 count_slashes (const char *s)
466 /* Return the path name of the URL-equivalent file name, with a
1170 remote-like structure of directories. */
468 mkstruct (const struct url *u)
469 char *dir, *dir_preencoding;
470 char *file, *res, *dirpref;
471 char *query = u->query && *u->query ? u->query : NULL;
/* --cut-dirs handling: skip a leading '/' and then drop up to
   opt.cut_dirs leading path components.  */
472 char *ptr = u->dir + (*u->dir == '/');
1182 int slash_count = 1 + count_slashes (ptr);
1183 int cut = MINVAL (opt.cut_dirs, slash_count);
475 for (; cut && *ptr; ptr++)
476 STRDUP_ALLOCA (dir, ptr);
/* No cutting: use u->dir directly, minus any leading '/'.  */
477 dir = u->dir + (*u->dir == '/');
478 /* Check for the true name (or at least a consistent name for saving
1193 to directory) of HOST, reusing the hlist if possible. */
480 if (opt.add_hostdir)
481 /* Add dir_prefix and hostname (if required) to the beginning of
/* Room for prefix, '/', host, ':', port digits (remainder of the
   expression elided in this excerpt).  */
482 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1200 + 1 + numdigit (u->port)
484 if (!DOTP (opt.dir_prefix))
1203 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
486 strcpy (dirpref, u->host);
/* Append ":port" only for non-default ports.  */
487 if (u->port != scheme_default_port (u->scheme))
1209 int len = strlen (dirpref);
1211 number_to_string (dirpref + len + 1, u->port);
490 else /* not add_hostdir */
491 if (!DOTP (opt.dir_prefix))
1217 dirpref = opt.dir_prefix;
493 /* If there is a prefix, prepend it. */
494 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1226 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-quote the directory; reencode_string returns DIR itself when
   nothing changed, hence the pointer comparison at the end.  */
496 dir_preencoding = dir;
1231 dir = reencode_string (dir_preencoding);
/* Strip a trailing '/' from the directory.  */
498 if (l && dir[l - 1] == '/')
/* Empty file component defaults to index.html.  */
499 file = "index.html";
500 /* Finally, construct the full name. */
501 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1244 + (query ? (1 + strlen (query)) : 0)
503 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
504 strcat (res, query);
/* Free the reencoded dir only if it is a fresh allocation.  */
505 if (dir != dir_preencoding)
506 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1258 an escaped query string. The trick is to make sure that unsafe
1259 characters in BASE are escaped, and that slashes in QUERY are also
509 compose_file_name (char *base, char *query)
510 /* Copy BASE to RESULT and encode all unsafe characters. */
/* RESULT is a fixed-size local buffer; every write is bounds-checked
   against sizeof (result).  NOTE(review): `to - result' is ptrdiff_t
   compared with a size_t -- benign here since TO never precedes
   RESULT, but worth confirming in the full source.  */
511 while (*from && to - result < sizeof (result))
512 if (UNSAFE_CHAR (*from))
/* Expand an unsafe byte to %XY.  */
513 unsigned char c = *from++;
1277 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1278 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Append '?' and the query if one is present and space remains.  */
516 if (query && to - result < sizeof (result))
517 /* Copy QUERY to RESULT and encode all '/' characters. */
518 while (*from && to - result < sizeof (result))
519 if (to - result < sizeof (result))
520 /* Truncate input which is too long, presumably due to a huge
/* Guarantee NUL-termination even on truncation.  */
521 result[sizeof (result) - 1] = '\0';
/* Caller owns the returned heap copy.  */
522 return xstrdup (result);
523 /* Create a unique filename, corresponding to a given URL. Calls
1315 mkstruct if necessary. Does *not* actually create any directories. */
525 url_filename (const struct url *u)
526 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* Directory structure requested: delegate to mkstruct.  */
527 file = mkstruct (u);
/* Flat layout: compose from the file component (or index.html when
   the URL path is empty) plus any query string.  */
528 char *base = *u->file ? u->file : "index.html";
1330 char *query = u->query && *u->query ? u->query : NULL;
530 file = compose_file_name (base, query);
531 /* Check whether the prefix directory is something other than "."
1337 before prepending it. */
533 if (!DOTP (opt.dir_prefix))
534 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1341 + 1 + strlen (file) + 1);
536 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
537 /* DOS-ish file systems don't like `%' signs in them; we change it
538 for (p = file; *p; p++)
539 #endif /* WINDOWS */
540 /* Check the cases in which the unique extensions are not used:
1359 1) Clobbering is turned off (-nc).
1360 2) Retrieval with regetting.
1361 3) Timestamping is used.
1362 4) Hierarchy is built.
545 The exception is the case when file does exist and is a
1365 directory (actually support for bad httpd-s). */
547 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1367 && !(file_exists_p (file) && !file_non_directory_p (file)))
549 /* Find a unique name. */
550 name = unique_name (file);
551 /* Like strlen(), but stop at the first '?', ';' or '#' -- i.e. return
   the length of the path part of URL, excluding query, params and
   fragment. */
552 urlpath_length (const char *url)
553 const char *q = strpbrk_or_eos (url, "?;#");
554 /* Find the last occurrence of character C in the range [b, e), or
1385 NULL, if none are present. This is almost completely equivalent to
1386 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1387 the contents of the string. */
558 find_last_char (const char *b, const char *e, char c)
559 /* Resolve the result of "linking" a base URI (BASE) to a
1398 link-specified URI (LINK).
561 Either of the URIs may be absolute or relative, complete with the
1401 host name, or path only. This tries to behave "reasonably" in all
1402 foreseeable cases. It employs little specific knowledge about
1403 schemes or URL-specific stuff -- it just works on strings.
565 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1406 See uri_merge for a gentler interface to this functionality.
567 Perhaps this function should handle `./' and `../' so that the evil
1409 path_simplify can go. */
569 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks where BASE's path stops (before '?', ';' or '#').  */
570 const char *end = base + urlpath_length (base);
571 /* Empty LINK points back to BASE, query string and all. */
572 constr = xstrdup (base);
573 else if (*link == '?')
574 /* LINK points to the same location, but changes the query
1427 string. Examples: */
576 /* uri_merge("path", "?new") -> "path?new" */
577 /* uri_merge("path?foo", "?new") -> "path?new" */
578 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
579 /* uri_merge("path#foo", "?new") -> "path?new" */
580 int baselength = end - base;
1433 constr = xmalloc (baselength + linklength + 1);
1434 memcpy (constr, base, baselength);
1435 memcpy (constr + baselength, link, linklength);
1436 constr[baselength + linklength] = '\0';
585 else if (*link == '#')
586 /* uri_merge("path", "#new") -> "path#new" */
587 /* uri_merge("path#foo", "#new") -> "path#new" */
588 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
589 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to any existing fragment, then append LINK.  */
590 const char *end1 = strchr (base, '#');
1447 end1 = base + strlen (base);
1448 baselength = end1 - base;
1449 constr = xmalloc (baselength + linklength + 1);
1450 memcpy (constr, base, baselength);
1451 memcpy (constr + baselength, link, linklength);
1452 constr[baselength + linklength] = '\0';
597 else if (*link == '/')
598 /* LINK is an absolute path: we need to replace everything
1457 after (and including) the FIRST slash with LINK.
600 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1460 "/qux/xyzzy", our result should be
1461 "http://host/qux/xyzzy". */
603 const char *start_insert = NULL; /* for gcc to shut up. */
604 const char *pos = base;
605 int seen_slash_slash = 0;
606 /* We're looking for the first slash, but want to ignore
607 slash = memchr (pos, '/', end - pos);
608 if (slash && !seen_slash_slash)
609 if (*(slash + 1) == '/')
610 seen_slash_slash = 1;
611 /* At this point, SLASH is the location of the first / after
1480 "//", or the first slash altogether. START_INSERT is the
1481 pointer to the location where LINK will be inserted. When
1482 examining the last two examples, keep in mind that LINK
615 if (!slash && !seen_slash_slash)
616 /* example: "foo" */
617 start_insert = base;
618 else if (!slash && seen_slash_slash)
619 /* example: "http://foo" */
620 else if (slash && !seen_slash_slash)
621 /* example: "foo/bar" */
622 start_insert = base;
623 else if (slash && seen_slash_slash)
624 /* example: "http://something/" */
625 start_insert = slash;
/* Keep [base, start_insert), then append LINK.  */
626 span = start_insert - base;
1503 constr = (char *)xmalloc (span + linklength + 1);
628 memcpy (constr, base, span);
629 memcpy (constr + span, link, linklength);
1508 constr[span + linklength] = '\0';
631 /* LINK is a relative URL: we need to replace everything
1513 after last slash (possibly empty) with LINK.
633 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1516 our result should be "whatever/foo/qux/xyzzy". */
635 int need_explicit_slash = 0;
636 const char *start_insert;
637 const char *last_slash = find_last_char (base, end, '/');
638 /* No slash found at all. Append LINK to what we have,
1524 but we'll need a slash as a separator.
640 Example: if base == "foo" and link == "qux/xyzzy", then
1527 we cannot just append link to base, because we'd get
1528 "fooqux/xyzzy", whereas what we want is
643 To make sure the / gets inserted, we set
1532 need_explicit_slash to 1. We also set start_insert
1533 to end + 1, so that the length calculations work out
1534 correctly for one more (slash) character. Accessing
1535 that character is fine, since it will be the
1536 delimiter, '\0' or '?'. */
649 /* example: "foo?..." */
650 /* ^ ('?' gets changed to '/') */
651 start_insert = end + 1;
1540 need_explicit_slash = 1;
653 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
654 /* example: http://host" */
655 start_insert = end + 1;
1547 need_explicit_slash = 1;
657 /* example: "whatever/foo/bar" */
658 start_insert = last_slash + 1;
659 span = start_insert - base;
1557 constr = (char *)xmalloc (span + linklength + 1);
661 memcpy (constr, base, span);
/* Overwrite the delimiter slot with the separating '/'.  */
662 if (need_explicit_slash)
1561 constr[span - 1] = '/';
664 memcpy (constr + span, link, linklength);
1564 constr[span + linklength] = '\0';
/* LINK already carries a scheme: it stands on its own.  */
666 else /* !no_scheme */
1569 constr = strdupdelim (link, link + linklength);
668 /* Merge BASE with LINK and return the resulting URI. This is an
1575 interface to uri_merge_1 that assumes that LINK is a
1576 zero-terminated string. */
671 uri_merge (const char *base, const char *link)
672 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at buffer position P; the (elided) continuation of
   this macro advances P past the copied bytes.  strlen is cached so S
   is measured only once.  NOTE(review): the macro's tail is not
   visible in this excerpt.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.

   If HIDE_PASSWORD is non-zero (as it is when we're calling this on a
   URL we plan to print, but not when calling it to canonicalize a URL
   for use within the program), the password is replaced by the
   HIDDEN_PASSWORD placeholder.  Unsafe characters in the URL will be
   quoted.  The result is freshly allocated; the caller frees it.  */

url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;
  /* Scheme's default port (to decide whether ":port" is needed) and
     its leading string ("http://", "ftp://", ...).  */
  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);

  assert (scheme_str != NULL);

  /* Make sure the user name and password are quoted.  The guarding
     conditionals (user/password present, HIDE_PASSWORD) are elided in
     this excerpt.  */
      quoted_user = encode_string_maybe (url->user);
	quoted_passwd = HIDDEN_PASSWORD;
	quoted_passwd = encode_string_maybe (url->passwd);

  /* Compute the exact result size up front so one allocation
     suffices; the assert further down re-checks this accounting.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
  if (url->port != scheme_port)
    /* ':' plus the decimal digits of the port.  */
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);

  p = result = xmalloc (size);

  /* Assemble: scheme, [user[:password]@], host, [:port], path.  */
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  APPEND (p, url->host);
  if (url->port != scheme_port)
      p = number_to_string (p, url->port);
  full_path_write (url, p);

  assert (p - result == size);

  /* Free the quoted copies only when encode_string_maybe actually
     allocated one (the pointer comparisons below imply it may return
     its argument), and never the static HIDDEN_PASSWORD literal
     (excluded by !hide_password).  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Returns proxy host address, in accordance with SCHEME.  Consults
   the command-line options first and falls back to the conventional
   environment variables (http_proxy, https_proxy, ftp_proxy).  */

getproxy (enum url_scheme scheme)
  char *rewritten_url;
  /* Static so the rewritten proxy string survives this call.  */
  static char rewritten_storage[1024];

  /* Per-scheme selection; the switch header and remaining case
     labels are elided in this excerpt.  */
      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:

  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_shorthand_url (proxy);
      /* strncpy does not NUL-terminate on truncation, hence the
	 explicit terminator on the following line.  */
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST should go through the proxy, i.e. when
   it does not suffix-match any entry of the NO_PROXY list.  */

no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
/* Forward declarations for the link-conversion helpers defined
   below.  NOTE(review): the continuation of the replace_attr
   prototype is elided in this excerpt.  */
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static const char *replace_attr PARAMS ((const char *, int, FILE *,
static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
						      const char *, int));
static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  Rewrites FILE in place (after optionally backing
   it up via write_backup_file).  */
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  struct urlpos *link;
  /* Counters reported at the end ("to-file - to-url"); the increments
     are elided in this excerpt.  */
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
	logputs (LOG_VERBOSE, _("nothing to do.\n"));

  /* Read the entire file into memory (possibly mmaped).  */
  fm = read_file (file);
  /* read_file failure path; the guard is elided in this excerpt.  */
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    /* -K: preserve the pristine copy before we overwrite FILE.  */
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  /* fopen failure path; the guard is elided in this excerpt.  */
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;

      /* A position past the buffer means the LINKS list disagrees
	 with the file contents.  */
      if (link->pos >= fm->length)
	  DEBUGP (("Something strange is going on. Please investigate."));

      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
	  DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));

      /* Echo the file contents, up to the offending URL's opening
	 quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);

      switch (link->convert)
	  case CO_CONVERT_TO_RELATIVE:
	    /* Convert absolute URL to relative.  */
	      char *newname = construct_relative (file, link->local_name);
	      char *quoted_newname = local_quote_string (newname);

	      if (!link->link_refresh_p)
		p = replace_attr (p, link->size, fp, quoted_newname);
	      /* <meta http-equiv=refresh> needs its timeout kept.  */
		p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
					       link->refresh_timeout);

	      DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		       link->url->url, newname, link->pos, file));

	      xfree (quoted_newname);

	  case CO_CONVERT_TO_COMPLETE:
	    /* Convert the link to absolute URL.  */
	      char *newlink = link->url->url;
	      char *quoted_newlink = html_quote_string (newlink);

	      if (!link->link_refresh_p)
		p = replace_attr (p, link->size, fp, quoted_newlink);
		p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
					       link->refresh_timeout);

	      DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		       newlink, link->pos, file));

	      xfree (quoted_newlink);

	  case CO_NULLIFY_BASE:
	    /* Change the base href to "".  */
	    p = replace_attr (p, link->size, fp, "");

  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);

  read_file_free (fm);
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;

  /* An absolute S2 is returned as an unchanged copy; the guarding
     test is elided in this excerpt.  */
    return xstrdup (s2);

  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
  /* CNT ends up just past the last slash of the common prefix.  */
    if (s1[i] == '/' && s2[i] == '/')

  /* Count the remaining directory separators in S1; each costs one
     "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)

  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Back up FILE to FILE.orig (or, when -E added ".html", with "html"
   replaced by "orig") before convert_links overwrites it.  Done at
   most once per file per run.  */
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  /* List of files already backed up this run; deliberately never
     freed (see the long note below).  */
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
	 because when we're checking to see if we've downloaded the
	 file before (to see if we can skip downloading it), we don't
	 know if it's a text/html file.  Therefore we don't know yet
	 at that stage that -E is going to cause us to tack on
	 ".html", so we need to compare vs. the original URL plus
	 ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      /* Overwrite the trailing "html" (4 chars) in place.  */
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
	already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename(file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>

	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called from convert_all_links at
	 the end of the retrieval with a freshly built new urlpos
	 list.
	 -- Hrvoje Niksic <hniksic@arsdigita.com>  */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
/* Forward declaration; defined below.  NOTE(review): the prototype's
   continuation line is elided in this excerpt.  */
static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT, echoing the
   result (with quoting and any original fragment preserved) to FP.
   Returns a pointer past the replaced span; the return statement is
   elided in this excerpt.  */

replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---   size   --->      (with quotes)
     or:
       ...old-contents...
       <---  size  -->         (no quotes)  */

  if (*p == '\"' || *p == '\'')
      size -= 2;		/* disregard opening and closing quote */
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    /* Carry the original "#fragment" over to the new value.  */
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to NEW_TEXT.  */

replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
  /* Stack buffer sized for the decimal TIMEOUT plus the "; URL="
     separator and NEW_TEXT; the rest of the size expression is
     elided in this excerpt.  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

find_fragment (const char *beg, int size, const char **bp, const char **ep)
  const char *end = beg + size;
  /* Byte-wise scan; the loop body (the '#' test and BP/EP updates)
     is elided in this excerpt.  */
  for (; beg < end; beg++)
/* Quote FILE for use as local reference to an HTML file.

   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.  */

local_quote_string (const char *file)
  const char *file_sans_qmark;

  /* Without -E the rewrite is unsafe (see above): fall back to
     ordinary HTML quoting.  */
  if (!opt.html_extension)
    return html_quote_string (file);

  qm = count_char (file, '?');

      const char *from = file;

      /* qm * 2 because we replace each question mark with "%3F",
	 i.e. replace one char with three, hence two more. */
      int fsqlen = strlen (file) + qm * 2;

      to = newname = (char *)alloca (fsqlen + 1);
      /* The copy loop's body ('?' -> "%3F") is elided in this
	 excerpt.  */
      for (; *from; from++)
      assert (to - newname == fsqlen);

      file_sans_qmark = newname;
    file_sans_qmark = file;

  return html_quote_string (file_sans_qmark);
/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */

static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
  /* One static cell per mode; each address is a stable pointer that
     also encodes which mode it was.  */
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
    v4 = CHECK_FOR_FILE;

  /* Map MODE to the matching cell; the switch header and the return
     statements are elided in this excerpt.  */
    case FILE_NOT_ALREADY_DOWNLOADED:
    case FILE_DOWNLOADED_NORMALLY:
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
    case CHECK_FOR_FILE:
/* This should really be merged with dl_file_url_map and
   downloaded_html_files in recur.c.  This was originally a list, but
   I changed it to a hash table because it was actually taking a lot
   of time to find things in it.  */

/* Maps local filename -> pointer to a static downloaded_file_t cell;
   created lazily by downloaded_file() below.  */
static struct hash_table *downloaded_files_hash;
2199 /* Remembers which files have been downloaded. In the standard case, should be
2200 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2201 download successfully (i.e. not for ones we have failures on or that we skip
2204 When we've downloaded a file and tacked on a ".html" extension due to -E,
2205 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2206 FILE_DOWNLOADED_NORMALLY.
2208 If you just want to check if a file has been previously added without adding
2209 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2210 with local filenames, not remote URLs. */
2212 downloaded_file (downloaded_file_t mode, const char *file)
2214 downloaded_file_t *ptr;
2216 if (mode == CHECK_FOR_FILE)
2218 if (!downloaded_files_hash)
2219 return FILE_NOT_ALREADY_DOWNLOADED;
2220 ptr = hash_table_get (downloaded_files_hash, file);
2222 return FILE_NOT_ALREADY_DOWNLOADED;
2226 if (!downloaded_files_hash)
2227 downloaded_files_hash = make_string_hash_table (0);
2229 ptr = hash_table_get (downloaded_files_hash, file);
2233 ptr = downloaded_mode_to_ptr (mode);
2234 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2236 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback used by downloaded_files_free to dispose of
   one table entry.  NOTE(review): the body is elided in this excerpt;
   presumably it frees the strdup'ed key -- confirm against the full
   source.  */
df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files table at shutdown: run df_free_mapper
   over every entry, destroy the table, and clear the global so a
   later call is a no-op.  */
downloaded_files_free (void)
  if (downloaded_files_hash)
      hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;