sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
/* Describes one URL scheme known to this file.  */
struct scheme_data
{
  char *leading_string;         /* scheme prefix such as "http://"; NULL
                                   in the terminating table entry */
  int default_port;             /* value returned by scheme_default_port */
  int enabled;                  /* when zero, url_scheme reports the
                                   scheme as SCHEME_INVALID; cleared by
                                   scheme_disable */
};
  59
/* Supported schemes.  url_scheme returns an index into this table cast
   to enum url_scheme, so the order of the entries is significant.  The
   entry with a NULL leading_string terminates the scan in url_scheme.  */
static struct scheme_data supported_schemes[] =
{
  { "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       -1,                 0 }
};
  72
  73 static char *construct_relative PARAMS ((const char *, const char *));
  74
  75 \f
  76 /* Support for encoding and decoding of URL strings.  We determine
  77    whether a character is unsafe through static table lookup.  This
  78    code assumes ASCII character set and 8-bit chars.  */
  79
  80 enum {
  81   urlchr_reserved = 1,
  82   urlchr_unsafe   = 2
  83 };
  84
  85 #define R  urlchr_reserved
  86 #define U  urlchr_unsafe
  87 #define RU R|U
  88
  89 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  90
  91 /* rfc1738 reserved chars, preserved from encoding.  */
  92
  93 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  94
  95 /* rfc1738 unsafe chars, plus some more.  */
  96
  97 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  98
  99 const static unsigned char urlchr_table[256] =
 100 {
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 104   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 105   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 106   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 107   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 108   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 109  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 111   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 112   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 113   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 115   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 116   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 117
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 122
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 127 };
 128
 129 /* Decodes the forms %xy in a URL to the character the hexadecimal
 130    code of which is xy.  xy are hexadecimal digits from
 131    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 132    hex-digits or `%' precedes `\0', the sequence is inserted
 133    literally.  */
 134
 135 static void
 136 decode_string (char *s)
 137 {
 138   char *t = s;                  /* t - tortoise */
 139   char *h = s;                  /* h - hare     */
 140
 141   for (; *h; h++, t++)
 142     {
 143       if (*h != '%')
 144         {
 145         copychar:
 146           *t = *h;
 147         }
 148       else
 149         {
 150           /* Do nothing if '%' is not followed by two hex digits. */
 151           if (!*(h + 1) || !*(h + 2)
 152               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 153             goto copychar;
 154           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 155           h += 2;
 156         }
 157     }
 158   *t = '\0';
 159 }
 160
 161 /* Like encode_string, but return S if there are no unsafe chars.  */
 162
 163 static char *
 164 encode_string_maybe (const char *s)
 165 {
 166   const char *p1;
 167   char *p2, *newstr;
 168   int newlen;
 169   int addition = 0;
 170
 171   for (p1 = s; *p1; p1++)
 172     if (UNSAFE_CHAR (*p1))
 173       addition += 2;            /* Two more characters (hex digits) */
 174
 175   if (!addition)
 176     return (char *)s;
 177
 178   newlen = (p1 - s) + addition;
 179   newstr = (char *)xmalloc (newlen + 1);
 180
 181   p1 = s;
 182   p2 = newstr;
 183   while (*p1)
 184     {
 185       if (UNSAFE_CHAR (*p1))
 186         {
 187           unsigned char c = *p1++;
 188           *p2++ = '%';
 189           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 190           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 191         }
 192       else
 193         *p2++ = *p1++;
 194     }
 195   *p2 = '\0';
 196   assert (p2 - newstr == newlen);
 197
 198   return newstr;
 199 }
 200
 201 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 202    given string, returning a malloc-ed %XX encoded string.  */
 203
 204 char *
 205 encode_string (const char *s)
 206 {
 207   char *encoded = encode_string_maybe (s);
 208   if (encoded != s)
 209     return encoded;
 210   else
 211     return xstrdup (s);
 212 }
 213
/* Encode unsafe characters in PTR to %xx, using encode_string_maybe.
   If that returns new storage, the old value of PTR is released with
   xfree and PTR is made to point to the new string; otherwise PTR is
   left alone.  PTR is evaluated more than once and is assigned to, so
   it must be a side-effect-free lvalue.  */

#define ENCODE(ptr) do {                        \
  char *e_new = encode_string_maybe (ptr);      \
  if (e_new != ptr)                             \
    {                                           \
      xfree (ptr);                              \
      ptr = e_new;                              \
    }                                           \
} while (0)
 226 \f
 227 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 228
 229 /* Decide whether to encode, decode, or pass through the char at P.
 230    This used to be a macro, but it got a little too convoluted.  */
 231 static inline enum copy_method
 232 decide_copy_method (const char *p)
 233 {
 234   if (*p == '%')
 235     {
 236       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 237         {
 238           /* %xx sequence: decode it, unless it would decode to an
 239              unsafe or a reserved char; in that case, leave it as
 240              is. */
 241           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 242             XCHAR_TO_XDIGIT (*(p + 2));
 243
 244           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 245             return CM_PASSTHROUGH;
 246           else
 247             return CM_DECODE;
 248         }
 249       else
 250         /* Garbled %.. sequence: encode `%'. */
 251         return CM_ENCODE;
 252     }
 253   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 254     return CM_ENCODE;
 255   else
 256     return CM_PASSTHROUGH;
 257 }
 258
 259 /* Translate a %-quoting (but possibly non-conformant) input string S
 260    into a %-quoting (and conformant) output string.  If no characters
 261    are encoded or decoded, return the same string S; otherwise, return
 262    a freshly allocated string with the new contents.
 263
 264    After a URL has been run through this function, the protocols that
 265    use `%' as the quote character can use the resulting string as-is,
 266    while those that don't call decode_string() to get to the intended
 267    data.  This function is also stable: after an input string is
 268    transformed the first time, all further transformations of the
 269    result yield the same result string.
 270
 271    Let's discuss why this function is needed.
 272
 273    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 274    space character would mess up the HTTP request, it needs to be
 275    quoted, like this:
 276
 277        GET /abc%20def HTTP/1.0
 278
 279    So it appears that the unsafe chars need to be quoted, as with
 280    encode_string.  But what if we're requested to download
 281    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 282    the user meant was a literal space, and he was kind enough to quote
 283    it.  In that case, Wget should obviously leave the `%20' as is, and
 284    send the same request as above.  So in this case we may not call
 285    encode_string.
 286
 287    But what if the requested URI is `abc%20 def'?  If we call
 288    encode_string, we end up with `/abc%2520%20def', which is almost
 289    certainly not intended.  If we don't call encode_string, we are
 290    left with the embedded space and cannot send the request.  What the
 291    user meant was for Wget to request `/abc%20%20def', and this is
 292    where reencode_string kicks in.
 293
 294    Wget used to solve this by first decoding %-quotes, and then
 295    encoding all the "unsafe" characters found in the resulting string.
 296    This was wrong because it didn't preserve certain URL special
 297    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 298    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 299    whether we considered `+' reserved (it is).  One of these results
 300    is inevitable because by the second step we would lose information
 301    on whether the `+' was originally encoded or not.  Both results
 302    were wrong because in CGI parameters + means space, while %2B means
 303    literal plus.  reencode_string correctly translates the above to
 304    "a%2B+b", i.e. returns the original string.
 305
 306    This function uses an algorithm proposed by Anon Sricharoenchai:
 307
 308    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 309       hexdigits.
 310
 311    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 312       "+".
 313
 314    ...except that this code conflates the two steps, and decides
 315    whether to encode, decode, or pass through each character in turn.
 316    The function still uses two passes, but their logic is the same --
 317    the first pass exists merely for the sake of allocation.  Another
 318    small difference is that we include `+' to URL_RESERVED.
 319
 320    Anon's test case:
 321
 322    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 323    ->
 324    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 325
 326    Simpler test cases:
 327
 328    "foo bar"         -> "foo%20bar"
 329    "foo%20bar"       -> "foo%20bar"
 330    "foo %20bar"      -> "foo%20%20bar"
 331    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 332    "foo%25%20bar"    -> "foo%25%20bar"
 333    "foo%2%20bar"     -> "foo%252%20bar"
 334    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 335    "foo%2b+bar"      -> "foo%2b+bar"  */
 336
 337 char *
 338 reencode_string (const char *s)
 339 {
 340   const char *p1;
 341   char *newstr, *p2;
 342   int oldlen, newlen;
 343
 344   int encode_count = 0;
 345   int decode_count = 0;
 346
 347   /* First, pass through the string to see if there's anything to do,
 348      and to calculate the new length.  */
 349   for (p1 = s; *p1; p1++)
 350     {
 351       switch (decide_copy_method (p1))
 352         {
 353         case CM_ENCODE:
 354           ++encode_count;
 355           break;
 356         case CM_DECODE:
 357           ++decode_count;
 358           break;
 359         case CM_PASSTHROUGH:
 360           break;
 361         }
 362     }
 363
 364   if (!encode_count && !decode_count)
 365     /* The string is good as it is. */
 366     return (char *)s;           /* C const model sucks. */
 367
 368   oldlen = p1 - s;
 369   /* Each encoding adds two characters (hex digits), while each
 370      decoding removes two characters.  */
 371   newlen = oldlen + 2 * (encode_count - decode_count);
 372   newstr = xmalloc (newlen + 1);
 373
 374   p1 = s;
 375   p2 = newstr;
 376
 377   while (*p1)
 378     {
 379       switch (decide_copy_method (p1))
 380         {
 381         case CM_ENCODE:
 382           {
 383             unsigned char c = *p1++;
 384             *p2++ = '%';
 385             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 386             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 387           }
 388           break;
 389         case CM_DECODE:
 390           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 391                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 392           p1 += 3;              /* skip %xx */
 393           break;
 394         case CM_PASSTHROUGH:
 395           *p2++ = *p1++;
 396         }
 397     }
 398   *p2 = '\0';
 399   assert (p2 - newstr == newlen);
 400   return newstr;
 401 }
 402
/* Run PTR_VAR through reencode_string.  If reencode_string returns a
   new string, the old value of PTR_VAR is released with xfree and
   PTR_VAR is made to point to the new storage; if it returns PTR_VAR
   itself, nothing changes.  PTR_VAR is evaluated more than once and is
   assigned to, so it needs to be a side-effect-free lvalue.  */

#define REENCODE(ptr_var) do {                  \
  char *rf_new = reencode_string (ptr_var);     \
  if (rf_new != ptr_var)                        \
    {                                           \
      xfree (ptr_var);                          \
      ptr_var = rf_new;                         \
    }                                           \
} while (0)
 415 \f
 416 /* Returns the scheme type if the scheme is supported, or
 417    SCHEME_INVALID if not.  */
 418 enum url_scheme
 419 url_scheme (const char *url)
 420 {
 421   int i;
 422
 423   for (i = 0; supported_schemes[i].leading_string; i++)
 424     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 425                           strlen (supported_schemes[i].leading_string)))
 426       {
 427         if (supported_schemes[i].enabled)
 428           return (enum url_scheme) i;
 429         else
 430           return SCHEME_INVALID;
 431       }
 432
 433   return SCHEME_INVALID;
 434 }
 435
 436 /* Return the number of characters needed to skip the scheme part of
 437    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 438 int
 439 url_skip_scheme (const char *url)
 440 {
 441   const char *p = url;
 442
 443   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 444      etc. */
 445   while (ISALNUM (*p) || *p == '-' || *p == '+')
 446     ++p;
 447   if (*p != ':')
 448     return 0;
 449   /* Skip ':'. */
 450   ++p;
 451
 452   /* Skip "//" if found. */
 453   if (*p == '/' && *(p + 1) == '/')
 454     p += 2;
 455
 456   return p - url;
 457 }
 458
 459 /* Returns 1 if the URL begins with a scheme (supported or
 460    unsupported), 0 otherwise.  */
 461 int
 462 url_has_scheme (const char *url)
 463 {
 464   const char *p = url;
 465   while (ISALNUM (*p) || *p == '-' || *p == '+')
 466     ++p;
 467   return *p == ':';
 468 }
 469
 470 int
 471 scheme_default_port (enum url_scheme scheme)
 472 {
 473   return supported_schemes[scheme].default_port;
 474 }
 475
 476 void
 477 scheme_disable (enum url_scheme scheme)
 478 {
 479   supported_schemes[scheme].enabled = 0;
 480 }
 481
 482 /* Skip the username and password, if present here.  The function
 483    should be called *not* with the complete URL, but with the part
 484    right after the scheme.
 485
 486    If no username and password are found, return 0.  */
 487 int
 488 url_skip_uname (const char *url)
 489 {
 490   const char *p;
 491
 492   /* Look for '@' that comes before '/' or '?'. */
 493   p = (const char *)strpbrk (url, "/?@");
 494   if (!p || *p != '@')
 495     return 0;
 496
 497   return p - url + 1;
 498 }
 499
 500 static int
 501 parse_uname (const char *str, int len, char **user, char **passwd)
 502 {
 503   char *colon;
 504
 505   if (len == 0)
 506     /* Empty user name not allowed. */
 507     return 0;
 508
 509   colon = memchr (str, ':', len);
 510   if (colon == str)
 511     /* Empty user name again. */
 512     return 0;
 513
 514   if (colon)
 515     {
 516       int pwlen = len - (colon + 1 - str);
 517       *passwd = xmalloc (pwlen + 1);
 518       memcpy (*passwd, colon + 1, pwlen);
 519       (*passwd)[pwlen] = '\0';
 520       len -= pwlen + 1;
 521     }
 522   else
 523     *passwd = NULL;
 524
 525   *user = xmalloc (len + 1);
 526   memcpy (*user, str, len);
 527   (*user)[len] = '\0';
 528
 529   return 1;
 530 }
 531
 532 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 533    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 534
 535    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 536    www.foo.com[:port]            -> http://www.foo.com[:port]
 537
 538    FTP shorthands look like this:
 539
 540    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 541    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 542
 543    If the URL needs not or cannot be rewritten, return NULL.  */
 544 char *
 545 rewrite_shorthand_url (const char *url)
 546 {
 547   const char *p;
 548
 549   if (url_has_scheme (url))
 550     return NULL;
 551
 552   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 553      latter Netscape.  */
 554   for (p = url; *p && *p != ':' && *p != '/'; p++)
 555     ;
 556
 557   if (p == url)
 558     return NULL;
 559
 560   if (*p == ':')
 561     {
 562       const char *pp, *path;
 563       char *res;
 564       /* If the characters after the colon and before the next slash
 565          or end of string are all digits, it's HTTP.  */
 566       int digits = 0;
 567       for (pp = p + 1; ISDIGIT (*pp); pp++)
 568         ++digits;
 569       if (digits > 0
 570           && (*pp == '/' || *pp == '\0'))
 571         goto http;
 572
 573       /* Prepend "ftp://" to the entire URL... */
 574       path = p + 1;
 575       res = xmalloc (6 + strlen (url) + 1);
 576       sprintf (res, "ftp://%s", url);
 577       /* ...and replace ':' with '/'. */
 578       res[6 + (p - url)] = '/';
 579       return res;
 580     }
 581   else
 582     {
 583       char *res;
 584     http:
 585       /* Just prepend "http://" to what we have. */
 586       res = xmalloc (7 + strlen (url) + 1);
 587       sprintf (res, "http://%s", url);
 588       return res;
 589     }
 590 }
 591 \f
 592 static void parse_path PARAMS ((const char *, char **, char **));
 593
 594 static char *
 595 strpbrk_or_eos (const char *s, const char *accept)
 596 {
 597   char *p = strpbrk (s, accept);
 598   if (!p)
 599     p = (char *)s + strlen (s);
 600   return p;
 601 }
 602
 603 /* Turn STR into lowercase; return non-zero if a character was
 604    actually changed. */
 605
 606 static int
 607 lowercase_str (char *str)
 608 {
 609   int change = 0;
 610   for (; *str; str++)
 611     if (ISUPPER (*str))
 612       {
 613         change = 1;
 614         *str = TOLOWER (*str);
 615       }
 616   return change;
 617 }
 618
 619 static char *parse_errors[] = {
 620 #define PE_NO_ERROR            0
 621   "No error",
 622 #define PE_UNSUPPORTED_SCHEME 1
 623   "Unsupported scheme",
 624 #define PE_EMPTY_HOST          2
 625   "Empty host",
 626 #define PE_BAD_PORT_NUMBER     3
 627   "Bad port number",
 628 #define PE_INVALID_USER_NAME   4
 629   "Invalid user name"
 630 };
 631
/* Store the error code V through the pointer P, unless P is null.
   This lets callers that are not interested in the error code pass a
   null pointer.  P is evaluated twice.  */
#define SETERR(p, v) do {                       \
  if (p)                                        \
    *(p) = (v);                                 \
} while (0)
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error)
 644 {
 645   struct url *u;
 646   const char *p;
 647   int path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650
 651   const char *uname_b,     *uname_e;
 652   const char *host_b,      *host_e;
 653   const char *path_b,      *path_e;
 654   const char *params_b,    *params_e;
 655   const char *query_b,     *query_e;
 656   const char *fragment_b,  *fragment_e;
 657
 658   int port;
 659   char *user = NULL, *passwd = NULL;
 660
 661   char *url_encoded;
 662
 663   scheme = url_scheme (url);
 664   if (scheme == SCHEME_INVALID)
 665     {
 666       SETERR (error, PE_UNSUPPORTED_SCHEME);
 667       return NULL;
 668     }
 669
 670   url_encoded = reencode_string (url);
 671   p = url_encoded;
 672
 673   p += strlen (supported_schemes[scheme].leading_string);
 674   uname_b = p;
 675   p += url_skip_uname (p);
 676   uname_e = p;
 677
 678   /* scheme://user:pass@host[:port]... */
 679   /*                    ^              */
 680
 681   /* We attempt to break down the URL into the components path,
 682      params, query, and fragment.  They are ordered like this:
 683
 684        scheme://host[:port][/path][;params][?query][#fragment]  */
 685
 686   params_b   = params_e   = NULL;
 687   query_b    = query_e    = NULL;
 688   fragment_b = fragment_e = NULL;
 689
 690   host_b = p;
 691   p = strpbrk_or_eos (p, ":/;?#");
 692   host_e = p;
 693
 694   if (host_b == host_e)
 695     {
 696       SETERR (error, PE_EMPTY_HOST);
 697       return NULL;
 698     }
 699
 700   port = scheme_default_port (scheme);
 701   if (*p == ':')
 702     {
 703       const char *port_b, *port_e, *pp;
 704
 705       /* scheme://host:port/tralala */
 706       /*              ^             */
 707       ++p;
 708       port_b = p;
 709       p = strpbrk_or_eos (p, "/;?#");
 710       port_e = p;
 711
 712       if (port_b == port_e)
 713         {
 714           /* http://host:/whatever */
 715           /*             ^         */
 716           SETERR (error, PE_BAD_PORT_NUMBER);
 717           return NULL;
 718         }
 719
 720       for (port = 0, pp = port_b; pp < port_e; pp++)
 721         {
 722           if (!ISDIGIT (*pp))
 723             {
 724               /* http://host:12randomgarbage/blah */
 725               /*               ^                  */
 726               SETERR (error, PE_BAD_PORT_NUMBER);
 727               return NULL;
 728             }
 729           port = 10 * port + (*pp - '0');
 730         }
 731     }
 732
 733   if (*p == '/')
 734     {
 735       ++p;
 736       path_b = p;
 737       p = strpbrk_or_eos (p, ";?#");
 738       path_e = p;
 739     }
 740   else
 741     {
 742       /* Path is not allowed not to exist. */
 743       path_b = path_e = p;
 744     }
 745
 746   if (*p == ';')
 747     {
 748       ++p;
 749       params_b = p;
 750       p = strpbrk_or_eos (p, "?#");
 751       params_e = p;
 752     }
 753   if (*p == '?')
 754     {
 755       ++p;
 756       query_b = p;
 757       p = strpbrk_or_eos (p, "#");
 758       query_e = p;
 759     }
 760   if (*p == '#')
 761     {
 762       ++p;
 763       fragment_b = p;
 764       p += strlen (p);
 765       fragment_e = p;
 766     }
 767   assert (*p == 0);
 768
 769   if (uname_b != uname_e)
 770     {
 771       /* http://user:pass@host */
 772       /*        ^         ^    */
 773       /*     uname_b   uname_e */
 774       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 775         {
 776           SETERR (error, PE_INVALID_USER_NAME);
 777           return NULL;
 778         }
 779     }
 780
 781   u = (struct url *)xmalloc (sizeof (struct url));
 782   memset (u, 0, sizeof (*u));
 783
 784   u->scheme = scheme;
 785   u->host   = strdupdelim (host_b, host_e);
 786   u->port   = port;
 787   u->user   = user;
 788   u->passwd = passwd;
 789
 790   u->path = strdupdelim (path_b, path_e);
 791   path_modified = path_simplify (u->path);
 792   parse_path (u->path, &u->dir, &u->file);
 793
 794   host_modified = lowercase_str (u->host);
 795
 796   if (params_b)
 797     u->params = strdupdelim (params_b, params_e);
 798   if (query_b)
 799     u->query = strdupdelim (query_b, query_e);
 800   if (fragment_b)
 801     u->fragment = strdupdelim (fragment_b, fragment_e);
 802
 803   if (path_modified || u->fragment || host_modified || path_b == path_e)
 804     {
 805       /* If we suspect that a transformation has rendered what
 806          url_string might return different from URL_ENCODED, rebuild
 807          u->url using url_string.  */
 808       u->url = url_string (u, 0);
 809
 810       if (url_encoded != url)
 811         xfree ((char *) url_encoded);
 812     }
 813   else
 814     {
 815       if (url_encoded == url)
 816         u->url    = xstrdup (url);
 817       else
 818         u->url    = url_encoded;
 819     }
 820   url_encoded = NULL;
 821
 822   return u;
 823 }
 824
 825 const char *
 826 url_error (int error_code)
 827 {
 828   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 829   return parse_errors[error_code];
 830 }
 831
 832 static void
 833 parse_path (const char *quoted_path, char **dir, char **file)
 834 {
 835   char *path, *last_slash;
 836
 837   STRDUP_ALLOCA (path, quoted_path);
 838   decode_string (path);
 839
 840   last_slash = strrchr (path, '/');
 841   if (!last_slash)
 842     {
 843       *dir = xstrdup ("");
 844       *file = xstrdup (path);
 845     }
 846   else
 847     {
 848       *dir = strdupdelim (path, last_slash);
 849       *file = xstrdup (last_slash + 1);
 850     }
 851 }
 852
 853 /* Note: URL's "full path" is the path with the query string and
 854    params appended.  The "fragment" (#foo) is intentionally ignored,
 855    but that might be changed.  For example, if the original URL was
 856    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 857    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 858
 859 /* Return the length of the full path, without the terminating
 860    zero.  */
 861
 862 static int
 863 full_path_length (const struct url *url)
 864 {
 865   int len = 0;
 866
 867 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 868
 869   FROB (path);
 870   FROB (params);
 871   FROB (query);
 872
 873 #undef FROB
 874
 875   return len;
 876 }
 877
 878 /* Write out the full path. */
 879
 880 static void
 881 full_path_write (const struct url *url, char *where)
 882 {
 883 #define FROB(el, chr) do {                      \
 884   char *f_el = url->el;                         \
 885   if (f_el) {                                   \
 886     int l = strlen (f_el);                      \
 887     *where++ = chr;                             \
 888     memcpy (where, f_el, l);                    \
 889     where += l;                                 \
 890   }                                             \
 891 } while (0)
 892
 893   FROB (path, '/');
 894   FROB (params, ';');
 895   FROB (query, '?');
 896
 897 #undef FROB
 898 }
 899
 900 /* Public function for getting the "full path".  E.g. if u->path is
 901    "foo/bar" and u->query is "param=value", full_path will be
 902    "/foo/bar?param=value". */
 903
 904 char *
 905 url_full_path (const struct url *url)
 906 {
 907   int length = full_path_length (url);
 908   char *full_path = (char *)xmalloc(length + 1);
 909
 910   full_path_write (url, full_path);
 911   full_path[length] = '\0';
 912
 913   return full_path;
 914 }
 915
 916 /* Sync u->path and u->url with u->dir and u->file. */
 917
 918 static void
 919 sync_path (struct url *url)
 920 {
 921   char *newpath;
 922
 923   xfree (url->path);
 924
 925   if (!*url->dir)
 926     {
 927       newpath = xstrdup (url->file);
 928       REENCODE (newpath);
 929     }
 930   else
 931     {
 932       int dirlen = strlen (url->dir);
 933       int filelen = strlen (url->file);
 934
 935       newpath = xmalloc (dirlen + 1 + filelen + 1);
 936       memcpy (newpath, url->dir, dirlen);
 937       newpath[dirlen] = '/';
 938       memcpy (newpath + dirlen + 1, url->file, filelen);
 939       newpath[dirlen + 1 + filelen] = '\0';
 940       REENCODE (newpath);
 941     }
 942
 943   url->path = newpath;
 944
 945   /* Synchronize u->url. */
 946   xfree (url->url);
 947   url->url = url_string (url, 0);
 948 }
 949
 950 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 951    This way we can sync u->path and u->url when they get changed.  */
 952
 953 void
 954 url_set_dir (struct url *url, const char *newdir)
 955 {
 956   xfree (url->dir);
 957   url->dir = xstrdup (newdir);
 958   sync_path (url);
 959 }
 960
 961 void
 962 url_set_file (struct url *url, const char *newfile)
 963 {
 964   xfree (url->file);
 965   url->file = xstrdup (newfile);
 966   sync_path (url);
 967 }
 968
 969 void
 970 url_free (struct url *url)
 971 {
 972   xfree (url->host);
 973   xfree (url->path);
 974   xfree (url->url);
 975
 976   FREE_MAYBE (url->params);
 977   FREE_MAYBE (url->query);
 978   FREE_MAYBE (url->fragment);
 979   FREE_MAYBE (url->user);
 980   FREE_MAYBE (url->passwd);
 981
 982   xfree (url->dir);
 983   xfree (url->file);
 984
 985   xfree (url);
 986 }
 987 \f
 988 struct urlpos *
 989 get_urls_file (const char *file)
 990 {
 991   struct file_memory *fm;
 992   struct urlpos *head, *tail;
 993   const char *text, *text_end;
 994
 995   /* Load the file.  */
 996   fm = read_file (file);
 997   if (!fm)
 998     {
 999       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1000       return NULL;
1001     }
1002   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1003   head = tail = NULL;
1004   text = fm->content;
1005   text_end = fm->content + fm->length;
1006   while (text < text_end)
1007     {
1008       const char *line_beg = text;
1009       const char *line_end = memchr (text, '\n', text_end - text);
1010       if (!line_end)
1011         line_end = text_end;
1012       else
1013         ++line_end;
1014       text = line_end;
1015       while (line_beg < line_end
1016              && ISSPACE (*line_beg))
1017         ++line_beg;
1018       while (line_end > line_beg + 1
1019              && ISSPACE (*(line_end - 1)))
1020         --line_end;
1021       if (line_end > line_beg)
1022         {
1023           /* URL is in the [line_beg, line_end) region. */
1024
1025           int up_error_code;
1026           char *url_text;
1027           struct urlpos *entry;
1028           struct url *url;
1029
1030           /* We must copy the URL to a zero-terminated string, and we
1031              can't use alloca because we're in a loop.  *sigh*.  */
1032           url_text = strdupdelim (line_beg, line_end);
1033
1034           if (opt.base_href)
1035             {
1036               /* Merge opt.base_href with URL. */
1037               char *merged = uri_merge (opt.base_href, url_text);
1038               xfree (url_text);
1039               url_text = merged;
1040             }
1041
1042           url = url_parse (url_text, &up_error_code);
1043           if (!url)
1044             {
1045               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1046                          file, url_text, url_error (up_error_code));
1047               xfree (url_text);
1048               continue;
1049             }
1050           xfree (url_text);
1051
1052           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1053           memset (entry, 0, sizeof (*entry));
1054           entry->next = NULL;
1055           entry->url = url;
1056
1057           if (!head)
1058             head = entry;
1059           else
1060             tail->next = entry;
1061           tail = entry;
1062         }
1063     }
1064   read_file_free (fm);
1065   return head;
1066 }
1067 \f
1068 /* Free the linked list of urlpos.  */
1069 void
1070 free_urlpos (struct urlpos *l)
1071 {
1072   while (l)
1073     {
1074       struct urlpos *next = l->next;
1075       if (l->url)
1076         url_free (l->url);
1077       FREE_MAYBE (l->local_name);
1078       xfree (l);
1079       l = next;
1080     }
1081 }
1082
1083 /* Rotate FNAME opt.backups times */
1084 void
1085 rotate_backups(const char *fname)
1086 {
1087   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1088   char *from = (char *)alloca (maxlen);
1089   char *to = (char *)alloca (maxlen);
1090   struct stat sb;
1091   int i;
1092
1093   if (stat (fname, &sb) == 0)
1094     if (S_ISREG (sb.st_mode) == 0)
1095       return;
1096
1097   for (i = opt.backups; i > 1; i--)
1098     {
1099       sprintf (from, "%s.%d", fname, i - 1);
1100       sprintf (to, "%s.%d", fname, i);
1101       /* #### This will fail on machines without the rename() system
1102          call.  */
1103       rename (from, to);
1104     }
1105
1106   sprintf (to, "%s.%d", fname, 1);
1107   rename(fname, to);
1108 }
1109
1110 /* Create all the necessary directories for PATH (a file).  Calls
1111    mkdirhier() internally.  */
1112 int
1113 mkalldirs (const char *path)
1114 {
1115   const char *p;
1116   char *t;
1117   struct stat st;
1118   int res;
1119
1120   p = path + strlen (path);
1121   for (; *p != '/' && p != path; p--);
1122   /* Don't create if it's just a file.  */
1123   if ((p == path) && (*p != '/'))
1124     return 0;
1125   t = strdupdelim (path, p);
1126   /* Check whether the directory exists.  */
1127   if ((stat (t, &st) == 0))
1128     {
1129       if (S_ISDIR (st.st_mode))
1130         {
1131           xfree (t);
1132           return 0;
1133         }
1134       else
1135         {
1136           /* If the dir exists as a file name, remove it first.  This
1137              is *only* for Wget to work with buggy old CERN http
1138              servers.  Here is the scenario: When Wget tries to
1139              retrieve a directory without a slash, e.g.
1140              http://foo/bar (bar being a directory), CERN server will
1141              not redirect it too http://foo/bar/ -- it will generate a
1142              directory listing containing links to bar/file1,
1143              bar/file2, etc.  Wget will lose because it saves this
1144              HTML listing to a file `bar', so it cannot create the
1145              directory.  To work around this, if the file of the same
1146              name exists, we just remove it and create the directory
1147              anyway.  */
1148           DEBUGP (("Removing %s because of directory danger!\n", t));
1149           unlink (t);
1150         }
1151     }
1152   res = make_directory (t);
1153   if (res != 0)
1154     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1155   xfree (t);
1156   return res;
1157 }
1158
/* Return the number of '/' characters in the zero-terminated
   string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *p;

  for (p = s; *p != '\0'; p++)
    total += (*p == '/');
  return total;
}
1168
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.

   The name is assembled from:
     - an optional prefix: opt.dir_prefix (unless it is "."), and with
       opt.add_hostdir also the host name plus ":PORT" when the port
       differs from the scheme's default;
     - the URL's directory, with up to opt.cut_dirs leading
       components removed;
     - the URL's file name, or "index.html" when that is empty;
     - "?" and the query string, when the query is non-empty.

   The result is allocated with xmalloc.  The intermediate strings are
   built in alloca storage and vanish when the function returns.  */
static char *
mkstruct (const struct url *u)
{
  char *dir, *dir_preencoding;
  char *file, *res, *dirpref;
  /* An empty query string is treated the same as no query at all.  */
  char *query = u->query && *u->query ? u->query : NULL;
  int l;

  if (opt.cut_dirs)
    {
      /* (*u->dir == '/') is 0 or 1, so this skips one leading
         slash.  */
      char *ptr = u->dir + (*u->dir == '/');
      /* N slashes separate N + 1 components.  */
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      /* Advance PTR past CUT slashes, or to the end of the string,
         whichever comes first.  */
      for (; cut && *ptr; ptr++)
        if (*ptr == '/')
          --cut;
      STRDUP_ALLOCA (dir, ptr);
    }
  else
    dir = u->dir + (*u->dir == '/');

  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir)
    {
      /* Add dir_prefix and hostname (if required) to the beginning of
         dir.  */
      /* Sized for "PREFIX/HOST:PORT" plus the terminating '\0'.  */
      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                + strlen (u->host)
                                + 1 + numdigit (u->port)
                                + 1);
      if (!DOTP (opt.dir_prefix))
        sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
      else
        strcpy (dirpref, u->host);

      if (u->port != scheme_default_port (u->scheme))
        {
          /* Overwrite the terminator with ':' and let long_to_string
             write the port after it; long_to_string is relied upon
             to terminate the string again.  */
          int len = strlen (dirpref);
          dirpref[len] = ':';
          long_to_string (dirpref + len + 1, u->port);
        }
    }
  else                          /* not add_hostdir */
    {
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;
      else
        dirpref = "";
    }

  /* If there is a prefix, prepend it.  */
  if (*dirpref)
    {
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      /* Insert a separating slash unless DIR already starts with
         one.  */
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
      dir = newdir;
    }

  /* reencode_string may hand back its argument unchanged; the
     pointer comparison before returning tells the two cases apart so
     that only a separately allocated result is freed.  */
  dir_preencoding = dir;
  dir = reencode_string (dir_preencoding);

  /* Drop a single trailing slash; the separator is added back when
     the full name is composed.
     NOTE(review): with no prefix and opt.cut_dirs unset, DIR still
     points into u->dir, so if reencode_string returned its argument
     this store modifies the (const) URL -- confirm this is
     intended.  */
  l = strlen (dir);
  if (l && dir[l - 1] == '/')
    dir[l - 1] = '\0';

  if (!*u->file)
    file = "index.html";
  else
    file = u->file;

  /* Finally, construct the full name.  */
  /* Sized for DIR, '/', FILE, optionally '?' and QUERY, and the
     terminating '\0'.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
                         + (query ? (1 + strlen (query)) : 0)
                         + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
  if (query)
    {
      /* The query is appended verbatim.  */
      strcat (res, "?");
      strcat (res, query);
    }
  if (dir != dir_preencoding)
    xfree (dir);
  return res;
}
1256
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped.

   QUERY may be NULL, in which case only BASE is used; otherwise the
   two parts are joined with '?'.  The name is limited to 255
   characters: input that does not fit is truncated, and an escape
   sequence is only emitted when all three of its characters fit.
   Returns a copy made with xstrdup.  */

static char *
compose_file_name (char *base, char *query)
{
  char result[256];
  /* Last slot of RESULT; it is kept free for the terminating '\0',
     so ordinary writes must stay strictly below it.  */
  char *const limit = result + sizeof (result) - 1;
  char *from;
  char *to = result;
  int truncated = 0;

  /* Copy BASE to RESULT and encode all unsafe characters.  */
  for (from = base; *from && !truncated; from++)
    {
      if (UNSAFE_CHAR (*from))
        {
          /* A %XX escape needs three free slots.  Checking for just
             one, as was done before, let the last escape run past
             the end of RESULT.  */
          if (limit - to >= 3)
            {
              unsigned char c = *from;
              *to++ = '%';
              *to++ = XDIGIT_TO_XCHAR (c >> 4);
              *to++ = XDIGIT_TO_XCHAR (c & 0xf);
            }
          else
            truncated = 1;
        }
      else if (to < limit)
        *to++ = *from;
      else
        truncated = 1;
    }

  if (query && !truncated)
    {
      if (to < limit)
        *to++ = '?';
      else
        truncated = 1;

      /* Copy QUERY to RESULT and encode all '/' characters. */
      for (from = query; *from && !truncated; from++)
        {
          if (*from == '/')
            {
              if (limit - to >= 3)
                {
                  *to++ = '%';
                  *to++ = '2';
                  *to++ = 'F';
                }
              else
                truncated = 1;
            }
          else if (to < limit)
            *to++ = *from;
          else
            truncated = 1;
        }
    }

  /* TO never passes LIMIT, so this store is always in bounds.  Input
     which is too long, presumably due to a huge query string, has
     simply been cut short above.  */
  *to = '\0';

  return xstrdup (result);
}
1313
1314 /* Create a unique filename, corresponding to a given URL.  Calls
1315    mkstruct if necessary.  Does *not* actually create any directories.  */
1316 char *
1317 url_filename (const struct url *u)
1318 {
1319   char *file, *name;
1320   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1321
1322   if (opt.dirstruct)
1323     {
1324       file = mkstruct (u);
1325       have_prefix = 1;
1326     }
1327   else
1328     {
1329       char *base = *u->file ? u->file : "index.html";
1330       char *query = u->query && *u->query ? u->query : NULL;
1331       file = compose_file_name (base, query);
1332     }
1333
1334   if (!have_prefix)
1335     {
1336       /* Check whether the prefix directory is something other than "."
1337          before prepending it.  */
1338       if (!DOTP (opt.dir_prefix))
1339         {
1340           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1341                                          + 1 + strlen (file) + 1);
1342           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1343           xfree (file);
1344           file = nfile;
1345         }
1346     }
1347   /* DOS-ish file systems don't like `%' signs in them; we change it
1348      to `@'.  */
1349 #ifdef WINDOWS
1350   {
1351     char *p = file;
1352     for (p = file; *p; p++)
1353       if (*p == '%')
1354         *p = '@';
1355   }
1356 #endif /* WINDOWS */
1357
1358   /* Check the cases in which the unique extensions are not used:
1359      1) Clobbering is turned off (-nc).
1360      2) Retrieval with regetting.
1361      3) Timestamping is used.
1362      4) Hierarchy is built.
1363
1364      The exception is the case when file does exist and is a
1365      directory (actually support for bad httpd-s).  */
1366   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1367       && !(file_exists_p (file) && !file_non_directory_p (file)))
1368     return file;
1369
1370   /* Find a unique name.  */
1371   name = unique_name (file);
1372   xfree (file);
1373   return name;
1374 }
1375
/* Like strlen(), but stop at the first '?', ';' or '#' as well as at
   the terminating '\0'.  In other words, return the length of the
   part of URL that precedes its parameters, query and fragment.  */
static int
urlpath_length (const char *url)
{
  return (int) strcspn (url, "?;#");
}
1383
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
   the contents of the string.

   The character at E is not examined and the character at B is, as
   the half-open range requires.  An empty range (e <= b) yields
   NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back first, then look: this visits e-1 down to b.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1396
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   NO_SCHEME is non-zero when LINK carries no scheme of its own and
   therefore has to be resolved against BASE.  When it is zero, the
   result is simply a copy of the first LINKLENGTH bytes of LINK.

   With NO_SCHEME set, the first character of LINK selects the rule:
     (empty)  copy of BASE;
     '?'      BASE up to its first '?', ';' or '#', followed by LINK;
     '#'      BASE up to its first '#', followed by LINK;
     '/'      the scheme://host part of BASE (if any), followed by
              LINK;
     other    BASE up to and including the last slash of its path,
              followed by LINK.

   The result is always a newly allocated string.

   Perhaps this function should handle `./' and `../' so that the evil
   path_simplify can go.  */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      /* END marks where the path part of BASE stops: at the first
         '?', ';' or '#', or at the terminating '\0'.  */
      const char *end = base + urlpath_length (base);

      /* NOTE(review): emptiness is detected by looking at the first
         byte of LINK rather than at LINKLENGTH, which assumes LINK
         has at least one readable byte -- confirm for callers that
         pass non-terminated strings.  */
      if (!*link)
        {
          /* Empty LINK points back to BASE, query string and all. */
          constr = xstrdup (base);
        }
      else if (*link == '?')
        {
          /* LINK points to the same location, but changes the query
             string.  Examples: */
          /* uri_merge("path",         "?new") -> "path?new"     */
          /* uri_merge("path?foo",     "?new") -> "path?new"     */
          /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
          /* uri_merge("path#foo",     "?new") -> "path?new"     */
          int baselength = end - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '#')
        {
          /* LINK only replaces the fragment; unlike the '?' case,
             everything in BASE before the first '#' is kept.  */
          /* uri_merge("path",         "#new") -> "path#new"     */
          /* uri_merge("path#foo",     "#new") -> "path#new"     */
          /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
          /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
          int baselength;
          const char *end1 = strchr (base, '#');
          if (!end1)
            end1 = base + strlen (base);
          baselength = end1 - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '/')
        {
          /* LINK is an absolute path: we need to replace everything
             after (and including) the FIRST slash with LINK.

             So, if BASE is "http://host/whatever/foo/bar", and LINK is
             "/qux/xyzzy", our result should be
             "http://host/qux/xyzzy".  */
          int span;
          const char *slash;
          const char *start_insert = NULL; /* for gcc to shut up. */
          const char *pos = base;
          int seen_slash_slash = 0;
          /* We're looking for the first slash, but want to ignore
             double slash. */
        again:
          slash = memchr (pos, '/', end - pos);
          /* Only the first "//" is skipped; after that the next
             slash found is taken as the start of the path.  */
          if (slash && !seen_slash_slash)
            if (*(slash + 1) == '/')
              {
                pos = slash + 2;
                seen_slash_slash = 1;
                goto again;
              }

          /* At this point, SLASH is the location of the first / after
             "//", or the first slash altogether.  START_INSERT is the
             pointer to the location where LINK will be inserted.  When
             examining the last two examples, keep in mind that LINK
             begins with '/'. */

          if (!slash && !seen_slash_slash)
            /* example: "foo" */
            /*           ^    */
            start_insert = base;
          else if (!slash && seen_slash_slash)
            /* example: "http://foo" */
            /*                     ^ */
            start_insert = end;
          else if (slash && !seen_slash_slash)
            /* example: "foo/bar" */
            /*           ^        */
            start_insert = base;
          else if (slash && seen_slash_slash)
            /* example: "http://something/" */
            /*                           ^  */
            start_insert = slash;

          /* The result is the first SPAN bytes of BASE followed by
             LINK.  */
          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
      else
        {
          /* LINK is a relative URL: we need to replace everything
             after last slash (possibly empty) with LINK.

             So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
             our result should be "whatever/foo/qux/xyzzy".  */
          int need_explicit_slash = 0;
          int span;
          const char *start_insert;
          const char *last_slash = find_last_char (base, end, '/');
          if (!last_slash)
            {
              /* No slash found at all.  Append LINK to what we have,
                 but we'll need a slash as a separator.

                 Example: if base == "foo" and link == "qux/xyzzy", then
                 we cannot just append link to base, because we'd get
                 "fooqux/xyzzy", whereas what we want is
                 "foo/qux/xyzzy".

                 To make sure the / gets inserted, we set
                 need_explicit_slash to 1.  We also set start_insert
                 to end + 1, so that the length calculations work out
                 correctly for one more (slash) character.  Accessing
                 that character is fine, since it will be the
                 delimiter, '\0' or '?'.  */
              /* example: "foo?..." */
              /*               ^    ('?' gets changed to '/') */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
            {
              /* The last slash is the second one of a "//", so BASE
                 has no path yet; a separator must be added just as
                 in the previous case.  */
              /* example: "http://host"  */
              /*                      ^ */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else
            {
              /* example: "whatever/foo/bar" */
              /*                        ^    */
              start_insert = last_slash + 1;
            }

          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          /* The byte copied from BASE's delimiter position is
             replaced by the separating slash.  */
          if (need_explicit_slash)
            constr[span - 1] = '/';
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
    }
  else /* !no_scheme */
    {
      /* LINK has a scheme of its own, so BASE plays no part.  */
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
1573
/* Convenience front end to uri_merge_1 for a zero-terminated LINK:
   the length is taken with strlen, and LINK is resolved against BASE
   only when url_has_scheme reports that it has no scheme of its own.
   Returns whatever uri_merge_1 returns.  */
char *
uri_merge (const char *base, const char *link)
{
  int linklength = strlen (link);
  int no_scheme = !url_has_scheme (link);

  return uri_merge_1 (base, link, linklength, no_scheme);
}
1582 \f
/* Copy the zero-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating '\0' is written.

   P must be a modifiable pointer lvalue; both P and S are evaluated
   more than once, so neither may have side effects.  The internal
   length variable has a name of its own so that an argument which
   happens to be called `len' is not captured by it.  */
#define APPEND(p, s) do {                       \
  int append_len_ = strlen (s);                 \
  memcpy ((p), (s), append_len_);               \
  (p) += append_len_;                           \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1594
1595 /* Recreate the URL string from the data in URL.
1596
1597    If HIDE is non-zero (as it is when we're calling this on a URL we
1598    plan to print, but not when calling it to canonicalize a URL for
1599    use within the program), password will be hidden.  Unsafe
1600    characters in the URL will be quoted.  */
1601
1602 char *
1603 url_string (const struct url *url, int hide_password)
1604 {
1605   int size;
1606   char *result, *p;
1607   char *quoted_user = NULL, *quoted_passwd = NULL;
1608
1609   int scheme_port  = supported_schemes[url->scheme].default_port;
1610   char *scheme_str = supported_schemes[url->scheme].leading_string;
1611   int fplen = full_path_length (url);
1612
1613   assert (scheme_str != NULL);
1614
1615   /* Make sure the user name and password are quoted. */
1616   if (url->user)
1617     {
1618       quoted_user = encode_string_maybe (url->user);
1619       if (url->passwd)
1620         {
1621           if (hide_password)
1622             quoted_passwd = HIDDEN_PASSWORD;
1623           else
1624             quoted_passwd = encode_string_maybe (url->passwd);
1625         }
1626     }
1627
1628   size = (strlen (scheme_str)
1629           + strlen (url->host)
1630           + fplen
1631           + 1);
1632   if (url->port != scheme_port)
1633     size += 1 + numdigit (url->port);
1634   if (quoted_user)
1635     {
1636       size += 1 + strlen (quoted_user);
1637       if (quoted_passwd)
1638         size += 1 + strlen (quoted_passwd);
1639     }
1640
1641   p = result = xmalloc (size);
1642
1643   APPEND (p, scheme_str);
1644   if (quoted_user)
1645     {
1646       APPEND (p, quoted_user);
1647       if (quoted_passwd)
1648         {
1649           *p++ = ':';
1650           APPEND (p, quoted_passwd);
1651         }
1652       *p++ = '@';
1653     }
1654
1655   APPEND (p, url->host);
1656   if (url->port != scheme_port)
1657     {
1658       *p++ = ':';
1659       long_to_string (p, url->port);
1660       p += strlen (p);
1661     }
1662
1663   full_path_write (url, p);
1664   p += fplen;
1665   *p++ = '\0';
1666
1667   assert (p - result == size);
1668
1669   if (quoted_user && quoted_user != url->user)
1670     xfree (quoted_user);
1671   if (quoted_passwd && !hide_password
1672       && quoted_passwd != url->passwd)
1673     xfree (quoted_passwd);
1674
1675   return result;
1676 }
1677 \f
1678 /* Returns proxy host address, in accordance with SCHEME.  */
1679 char *
1680 getproxy (enum url_scheme scheme)
1681 {
1682   char *proxy = NULL;
1683   char *rewritten_url;
1684   static char rewritten_storage[1024];
1685
1686   switch (scheme)
1687     {
1688     case SCHEME_HTTP:
1689       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1690       break;
1691 #ifdef HAVE_SSL
1692     case SCHEME_HTTPS:
1693       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1694       break;
1695 #endif
1696     case SCHEME_FTP:
1697       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1698       break;
1699     case SCHEME_INVALID:
1700       break;
1701     }
1702   if (!proxy || !*proxy)
1703     return NULL;
1704
1705   /* Handle shorthands. */
1706   rewritten_url = rewrite_shorthand_url (proxy);
1707   if (rewritten_url)
1708     {
1709       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1710       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1711       proxy = rewritten_storage;
1712     }
1713
1714   return proxy;
1715 }
1716
/* Decide whether HOST should be accessed through the proxy, given
   the NO_PROXY list.  Returns 1 when there is no list at all or when
   sufmatch finds no match for HOST in it, and 0 otherwise.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1726 \f
1727 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1728 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1729                                          const char *));
1730 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1731                                                       const char *, int));
1732 static char *local_quote_string PARAMS ((const char *));
1733
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.

   The file is read into memory, removed, and then written anew: the
   original bytes are copied through unchanged except at the link
   positions, where the attribute is rewritten according to the
   entry's `convert' field.  If no entry asks for a conversion the
   file is left untouched.  When opt.backup_converted is set and the
   file is known to downloaded_file, write_backup_file is called
   before the file is rewritten.

   Failures are logged and make the function return early; nothing
   is reported to the caller.

   Presumably LINKS is ordered by ascending position: the amount of
   text copied before each link is computed as the distance from the
   current read position to the link -- TODO confirm with the code
   that builds the list.  */
void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;
  FILE *fp;
  /* Read position within fm->content: everything before P has
     already been written out or replaced.  */
  const char *p;
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  /* Counters for the summary logged at the end.  */
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  {
    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
        ++dry_count;
    if (!dry_count)
      {
        logputs (LOG_VERBOSE, _("nothing to do.\n"));
        return;
      }
  }

  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }
  /* Now open the file for writing.  */
  /* NOTE(review): if this fopen fails, the file has already been
     unlinked above and its contents survive only in FM, which is
     then released.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      char *url_start = fm->content + link->pos;

      /* A position outside the file means the list does not match
         this file's contents; stop converting and just flush the
         remainder below.  */
      if (link->pos >= fm->length)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
        {
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
          continue;
        }

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      /* NOTE(review): the results of the fwrite calls in this
         function are not checked.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      /* Each case writes the replacement attribute to FP and
         continues reading from the pointer the replace function
         returns (presumably just past the original attribute --
         confirm in replace_attr).  */
      switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative. */
          {
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));
            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;
            break;
          }
        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL. */
          {
            /* NEWLINK belongs to the link's URL and is not freed
               here; only the quoted copy is.  */
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));
            xfree (quoted_newlink);
            ++to_url_count;
            break;
          }
        case CO_NULLIFY_BASE:
          /* Change the base href to "". */
          p = replace_attr (p, link->size, fp, "");
          break;
        case CO_NOCONVERT:
          /* Cannot happen: such entries were skipped above.  */
          abort ();
          break;
        }
    }

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  read_file_free (fm);

  /* Summary: links made relative, then links made absolute.  */
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
1880
/* Build, in freshly allocated memory, the relative link that leads
   from the file with local name S1 to the file with local name S2.

   Examples:
     S1 "jagor.srce.hr/index.html",
     S2 "jagor.srce.hr/images/news.gif"    ->  "images/news.gif"

     S1 "fly.cc.fer.hr/ioccc/index.html",
     S2 "fly.cc.fer.hr/images/fly.gif"     ->  "../images/fly.gif"

   If S2 begins with `/' it is returned as a plain copy.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos, common, ups, k;
  char *rel, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Walk both names in lockstep, one directory component at a time.
     COMMON ends up as the offset just past the last directory the
     two names share.  */
  pos = common = 0;
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every slash left in S1 is one directory level that has to be
     climbed with "../".  */
  ups = 0;
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++ups;

  /* The result is "../" repeated UPS times, followed by the part of
     S2 that is not shared with S1.  */
  rel = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  out = rel;
  for (k = 0; k < ups; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return rel;
}
1934 \f
1935 static void
1936 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1937 {
1938   /* Rather than just writing over the original .html file with the
1939      converted version, save the former to *.orig.  Note we only do
1940      this for files we've _successfully_ downloaded, so we don't
1941      clobber .orig files sitting around from previous invocations. */
1942
1943   /* Construct the backup filename as the original name plus ".orig". */
1944   size_t         filename_len = strlen(file);
1945   char*          filename_plus_orig_suffix;
1946   boolean        already_wrote_backup_file = FALSE;
1947   slist*         converted_file_ptr;
1948   static slist*  converted_files = NULL;
1949
1950   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1951     {
1952       /* Just write "orig" over "html".  We need to do it this way
1953          because when we're checking to see if we've downloaded the
1954          file before (to see if we can skip downloading it), we don't
1955          know if it's a text/html file.  Therefore we don't know yet
1956          at that stage that -E is going to cause us to tack on
1957          ".html", so we need to compare vs. the original URL plus
1958          ".orig", not the original URL plus ".html.orig". */
1959       filename_plus_orig_suffix = alloca (filename_len + 1);
1960       strcpy(filename_plus_orig_suffix, file);
1961       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1962     }
1963   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1964     {
1965       /* Append ".orig" to the name. */
1966       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1967       strcpy(filename_plus_orig_suffix, file);
1968       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1969     }
1970
1971   /* We can get called twice on the same URL thanks to the
1972      convert_all_links() call in main().  If we write the .orig file
1973      each time in such a case, it'll end up containing the first-pass
1974      conversion, not the original file.  So, see if we've already been
1975      called on this file. */
1976   converted_file_ptr = converted_files;
1977   while (converted_file_ptr != NULL)
1978     if (strcmp(converted_file_ptr->string, file) == 0)
1979       {
1980         already_wrote_backup_file = TRUE;
1981         break;
1982       }
1983     else
1984       converted_file_ptr = converted_file_ptr->next;
1985
1986   if (!already_wrote_backup_file)
1987     {
1988       /* Rename <file> to <file>.orig before former gets written over. */
1989       if (rename(file, filename_plus_orig_suffix) != 0)
1990         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1991                    file, filename_plus_orig_suffix, strerror (errno));
1992
1993       /* Remember that we've already written a .orig backup for this file.
1994          Note that we never free this memory since we need it till the
1995          convert_all_links() call, which is one of the last things the
1996          program does before terminating.  BTW, I'm not sure if it would be
1997          safe to just set 'converted_file_ptr->string' to 'file' below,
1998          rather than making a copy of the string...  Another note is that I
1999          thought I could just add a field to the urlpos structure saying
2000          that we'd written a .orig file for this URL, but that didn't work,
2001          so I had to make this separate list.
2002          -- Dan Harkless <wget@harkless.org>
2003
2004          This [adding a field to the urlpos structure] didn't work
2005          because convert_file() is called from convert_all_links at
2006          the end of the retrieval with a freshly built new urlpos
2007          list.
2008          -- Hrvoje Niksic <hniksic@arsdigita.com>
2009       */
2010       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2011       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2012       converted_file_ptr->next = converted_files;
2013       converted_files = converted_file_ptr;
2014     }
2015 }
2016
/* Forward declaration: find_fragment is defined further down in this
   file, but replace_attr below already calls it.  */
static int find_fragment PARAMS ((const char *, int, const char **,
                                  const char **));
2019
/* Write NEW_TEXT to FP as a quoted attribute value, in place of the
   original value that starts at P and is SIZE characters long.

   The original value has one of two layouts:
       "...old-contents..."     SIZE includes both quote characters
       ...old-contents...       no quotes at all

   If the original was quoted, its quote character (" or ') is reused;
   otherwise the new value is wrapped in double quotes.  A fragment
   identifier found in the old value is appended after NEW_TEXT.

   Returns the position just past the original value, closing quote
   included.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  const char *frag_beg, *frag_end;
  const int was_quoted = (*p == '\"' || *p == '\'');
  const char delim = was_quoted ? *p : '\"';

  if (was_quoted)
    {
      /* Narrow [P, P+SIZE) to the text between the two quotes.  */
      ++p;
      size -= 2;
    }

  putc (delim, fp);
  fputs (new_text, fp);

  /* Carry over the fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);

  putc (delim, fp);

  /* Step over the old contents and, if there was one, over the
     closing quote as well.  */
  return p + size + (was_quoted ? 1 : 0);
}
2058
/* Variant of replace_attr used for
   <meta http-equiv=refresh content="...">: the attribute value
   written out is "TIMEOUT; URL=NEW_TEXT" rather than NEW_TEXT alone.
   P, SIZE and FP are handed to replace_attr unchanged, and its return
   value is returned.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Room for the digits of TIMEOUT, the six characters of "; URL=",
     NEW_TEXT itself and the terminating null character.  */
  size_t needed = numdigit (timeout) + 6 + strlen (new_text) + 1;
  char *refresh_value = (char *)alloca (needed);

  /* E.g. "0; URL=..." */
  sprintf (refresh_value, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, refresh_value);
}
2076
/* Look through [BEG, BEG+SIZE) for the first '#' whose immediately
   preceding character is not '&'.  On success, return 1 with *BP
   pointing at that '#' and *EP at BEG+SIZE.  If there is no such
   character, return 0 and leave *BP and *EP alone.

   This is used for finding the fragment identifiers in URLs; a '#'
   directly after '&' (as in "&#38;") is not taken for one.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  char prev = '\0';             /* character before CUR; none at start */

  for (cur = beg; cur < limit; prev = *cur, cur++)
    if (*cur == '#' && prev != '&')
      {
        *bp = cur;
        *ep = limit;
        return 1;
      }

  return 0;
}
2110
2111 /* Quote FILE for use as local reference to an HTML file.
2112
2113    We quote ? as %3F to avoid passing part of the file name as the
2114    parameter when browsing the converted file through HTTP.  However,
2115    it is safe to do this only when `--html-extension' is turned on.
2116    This is because converting "index.html?foo=bar" to
2117    "index.html%3Ffoo=bar" would break local browsing, as the latter
2118    isn't even recognized as an HTML file!  However, converting
2119    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2120    safe for both local and HTTP-served browsing.  */
2121
2122 static char *
2123 local_quote_string (const char *file)
2124 {
2125   const char *file_sans_qmark;
2126   int qm;
2127
2128   if (!opt.html_extension)
2129     return html_quote_string (file);
2130
2131   qm = count_char (file, '?');
2132
2133   if (qm)
2134     {
2135       const char *from = file;
2136       char *to, *newname;
2137
2138       /* qm * 2 because we replace each question mark with "%3F",
2139          i.e. replace one char with three, hence two more.  */
2140       int fsqlen = strlen (file) + qm * 2;
2141
2142       to = newname = (char *)alloca (fsqlen + 1);
2143       for (; *from; from++)
2144         {
2145           if (*from != '?')
2146             *to++ = *from;
2147           else
2148             {
2149               *to++ = '%';
2150               *to++ = '3';
2151               *to++ = 'F';
2152             }
2153         }
2154       assert (to - newname == fsqlen);
2155       *to = '\0';
2156
2157       file_sans_qmark = newname;
2158     }
2159   else
2160     file_sans_qmark = file;
2161
2162   return html_quote_string (file_sans_qmark);
2163 }
2164
2165 /* We're storing "modes" of type downloaded_file_t in the hash table.
2166    However, our hash tables only accept pointers for keys and values.
2167    So when we need a pointer, we use the address of a
2168    downloaded_file_t variable of static storage.  */
2169
2170 static downloaded_file_t *
2171 downloaded_mode_to_ptr (downloaded_file_t mode)
2172 {
2173   static downloaded_file_t
2174     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2175     v2 = FILE_DOWNLOADED_NORMALLY,
2176     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2177     v4 = CHECK_FOR_FILE;
2178
2179   switch (mode)
2180     {
2181     case FILE_NOT_ALREADY_DOWNLOADED:
2182       return &v1;
2183     case FILE_DOWNLOADED_NORMALLY:
2184       return &v2;
2185     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2186       return &v3;
2187     case CHECK_FOR_FILE:
2188       return &v4;
2189     }
2190   return NULL;
2191 }
2192
/* This should really be merged with dl_file_url_map and
   downloaded_html_files in recur.c.  This was originally a list, but
   I changed it to a hash table because it was actually taking a lot of
   time to find things in it.

   The table is created on first use by downloaded_file() and torn
   down by downloaded_files_free().  Its keys are xstrdup'ed copies of
   local file names.  */

static struct hash_table *downloaded_files_hash;
2199
2200 /* Remembers which files have been downloaded.  In the standard case, should be
2201    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2202    download successfully (i.e. not for ones we have failures on or that we skip
2203    due to -N).
2204
2205    When we've downloaded a file and tacked on a ".html" extension due to -E,
2206    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2207    FILE_DOWNLOADED_NORMALLY.
2208
2209    If you just want to check if a file has been previously added without adding
2210    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2211    with local filenames, not remote URLs. */
2212 downloaded_file_t
2213 downloaded_file (downloaded_file_t mode, const char *file)
2214 {
2215   downloaded_file_t *ptr;
2216
2217   if (mode == CHECK_FOR_FILE)
2218     {
2219       if (!downloaded_files_hash)
2220         return FILE_NOT_ALREADY_DOWNLOADED;
2221       ptr = hash_table_get (downloaded_files_hash, file);
2222       if (!ptr)
2223         return FILE_NOT_ALREADY_DOWNLOADED;
2224       return *ptr;
2225     }
2226
2227   if (!downloaded_files_hash)
2228     downloaded_files_hash = make_string_hash_table (0);
2229
2230   ptr = hash_table_get (downloaded_files_hash, file);
2231   if (ptr)
2232     return *ptr;
2233
2234   ptr = downloaded_mode_to_ptr (mode);
2235   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2236
2237   return FILE_NOT_ALREADY_DOWNLOADED;
2238 }
2239
/* Callback handed to hash_table_map by downloaded_files_free: release
   the KEY string of one entry of downloaded_files_hash.  VALUE and
   IGNORED are not used.  Always returns 0 (presumably telling
   hash_table_map to keep going -- confirm against hash.c).  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  (void) value;
  (void) ignored;

  xfree (key);
  return 0;
}
2246
2247 void
2248 downloaded_files_free (void)
2249 {
2250   if (downloaded_files_hash)
2251     {
2252       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2253       hash_table_destroy (downloaded_files_hash);
2254       downloaded_files_hash = NULL;
2255     }
2256 }