sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  /* Is X exactly the string "."?  X may be any pointer expression; it
     is evaluated more than once, so it must not have side effects.  */
  #define DOTP(x) (((x)[0] == '.') && ((x)[1] == '\0'))
  /* Is X exactly the string ".."?  Same caveats as for DOTP.  */
  #define DDOTP(x) (((x)[0] == '.') && ((x)[1] == '.') && ((x)[2] == '\0'))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  /* Description of one URL scheme known to this file.  */
  struct scheme_data
  {
    /* Prefix that identifies the scheme, e.g. "http://".  It always
       points to a string literal, hence const.  NULL marks the
       terminating table entry.  */
    const char *leading_string;
    /* Port used when the URL does not name one explicitly.  */
    int default_port;
    /* Non-zero while the scheme is accepted by url_scheme.  */
    int enabled;
  };
  59
  60 /* Supported schemes: */
  61 static struct scheme_data supported_schemes[] =
  62 {
  63   { "http://",  DEFAULT_HTTP_PORT,  1 },
  64 #ifdef HAVE_SSL
  65   { "https://", DEFAULT_HTTPS_PORT, 1 },
  66 #endif
  67   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  68
  69   /* SCHEME_INVALID */
  70   { NULL,       -1,                 0 }
  71 };
  72
  73 static char *construct_relative PARAMS ((const char *, const char *));
  74
  75 \f
  /* Support for encoding and decoding of URL strings.  Whether a
     character is unsafe or reserved is determined through a static
     table lookup.  This code assumes the ASCII character set and
     8-bit chars.  */

  /* Bit flags stored in urlchr_table.  */
  enum {
    urlchr_reserved = 1,
    urlchr_unsafe   = 2
  };

  /* Short aliases that keep the table below readable.  RU is
     parenthesized so that it expands safely inside any expression.  */
  #define R  urlchr_reserved
  #define U  urlchr_unsafe
  #define RU (R|U)

  /* Non-zero if character C has any of the MASK bits set in
     urlchr_table.  C is converted to unsigned char first, so plain
     (possibly signed) chars index the table correctly.  */
  #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

  /* rfc1738 reserved chars, preserved from encoding.  */

  #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

  /* rfc1738 unsafe chars, plus some more.  */

  #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

  /* One entry per 8-bit character value.  Every value above 127 is
     marked unsafe.  */
  static const unsigned char urlchr_table[256] =
  {
    U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
    U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
    U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
    U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
    U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
    0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
    0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
   RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
    0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
    U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
    0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
    0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
    U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  };
 128
 /* Decode, in place, every %xy sequence in S into the character whose
    hexadecimal code is xy (hex digits are matched case-insensitively).
    A `%' that is not followed by two hex digits, including one too
    close to the end of the string, is copied literally.  The result
    is never longer than the input.  */

 static void
 decode_string (char *s)
 {
   char *src = s;                /* read position  */
   char *dst = s;                /* write position */

   while (*src)
     {
       if (*src == '%'
           && *(src + 1) && *(src + 2)
           && ISXDIGIT (*(src + 1)) && ISXDIGIT (*(src + 2)))
         {
           /* Well-formed escape: emit one char, consume three.  */
           *dst++ = (XCHAR_TO_XDIGIT (*(src + 1)) << 4)
                    + XCHAR_TO_XDIGIT (*(src + 2));
           src += 3;
         }
       else
         *dst++ = *src++;
     }
   *dst = '\0';
 }
 160
 /* %XX-encode every character of S for which UNSAFE_CHAR is true.
    When S contains no such character, S itself is returned (with
    const cast away) and nothing is allocated; otherwise the result
    is a fresh xmalloc-ed string.  Callers tell the two cases apart by
    comparing the returned pointer with S.  */

 static char *
 encode_string_maybe (const char *s)
 {
   const char *in;
   char *out, *result;
   int unsafe_count = 0;
   int outlen;

   /* Count the characters that will need quoting.  */
   for (in = s; *in; in++)
     if (UNSAFE_CHAR (*in))
       ++unsafe_count;

   if (unsafe_count == 0)
     return (char *)s;

   /* IN now points at the terminator, so IN - S is the input length.
      Each quoted character grows by two hex digits.  */
   outlen = (in - s) + 2 * unsafe_count;
   result = (char *)xmalloc (outlen + 1);

   out = result;
   for (in = s; *in; in++)
     {
       if (UNSAFE_CHAR (*in))
         {
           unsigned char c = *in;
           *out++ = '%';
           *out++ = XDIGIT_TO_XCHAR (c >> 4);
           *out++ = XDIGIT_TO_XCHAR (c & 0xf);
         }
       else
         *out++ = *in;
     }
   *out = '\0';
   assert (out - result == outlen);

   return result;
 }
 200
 /* Return a freshly allocated copy of S in which every unsafe
    character (as determined by UNSAFE_CHAR) is %XX-encoded.  Unlike
    encode_string_maybe, the result is always new storage, even when
    nothing had to be encoded.  */

 char *
 encode_string (const char *s)
 {
   char *result = encode_string_maybe (s);

   /* encode_string_maybe hands S back when there is nothing to do;
      duplicate it in that case.  */
   return result != s ? result : xstrdup (s);
 }
 213
 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
    the old value of PTR is freed and PTR is made to point to the newly
    allocated storage.  PTR must be a modifiable lvalue; it is
    evaluated more than once, so it must not have side effects.  */

 #define ENCODE(ptr) do {                        \
   char *e_new = encode_string_maybe (ptr);      \
   if (e_new != (ptr))                           \
     {                                           \
       xfree (ptr);                              \
       (ptr) = e_new;                            \
     }                                           \
 } while (0)
 226 \f
 /* What to do with the character (or %xx sequence) at the current
    position while re-encoding a string.  */
 enum copy_method
 {
   CM_DECODE,                    /* replace %xx with the char it denotes */
   CM_ENCODE,                    /* replace the char with its %XX form */
   CM_PASSTHROUGH                /* copy the char unchanged */
 };
 228
 229 /* Decide whether to encode, decode, or pass through the char at P.
 230    This used to be a macro, but it got a little too convoluted.  */
 231 static inline enum copy_method
 232 decide_copy_method (const char *p)
 233 {
 234   if (*p == '%')
 235     {
 236       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 237         {
 238           /* %xx sequence: decode it, unless it would decode to an
 239              unsafe or a reserved char; in that case, leave it as
 240              is. */
 241           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 242             XCHAR_TO_XDIGIT (*(p + 2));
 243
 244           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 245             return CM_PASSTHROUGH;
 246           else
 247             return CM_DECODE;
 248         }
 249       else
 250         /* Garbled %.. sequence: encode `%'. */
 251         return CM_ENCODE;
 252     }
 253   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 254     return CM_ENCODE;
 255   else
 256     return CM_PASSTHROUGH;
 257 }
 258
 259 /* Translate a %-quoting (but possibly non-conformant) input string S
 260    into a %-quoting (and conformant) output string.  If no characters
 261    are encoded or decoded, return the same string S; otherwise, return
 262    a freshly allocated string with the new contents.
 263
 264    After a URL has been run through this function, the protocols that
 265    use `%' as the quote character can use the resulting string as-is,
 266    while those that don't call decode_string() to get to the intended
 267    data.  This function is also stable: after an input string is
 268    transformed the first time, all further transformations of the
 269    result yield the same result string.
 270
 271    Let's discuss why this function is needed.
 272
 273    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 274    space character would mess up the HTTP request, it needs to be
 275    quoted, like this:
 276
 277        GET /abc%20def HTTP/1.0
 278
 279    So it appears that the unsafe chars need to be quoted, as with
 280    encode_string.  But what if we're requested to download
 281    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 282    the user meant was a literal space, and he was kind enough to quote
 283    it.  In that case, Wget should obviously leave the `%20' as is, and
 284    send the same request as above.  So in this case we may not call
 285    encode_string.
 286
 287    But what if the requested URI is `abc%20 def'?  If we call
 288    encode_string, we end up with `/abc%2520%20def', which is almost
 289    certainly not intended.  If we don't call encode_string, we are
 290    left with the embedded space and cannot send the request.  What the
 291    user meant was for Wget to request `/abc%20%20def', and this is
 292    where reencode_string kicks in.
 293
 294    Wget used to solve this by first decoding %-quotes, and then
 295    encoding all the "unsafe" characters found in the resulting string.
 296    This was wrong because it didn't preserve certain URL special
 297    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 298    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 299    whether we considered `+' reserved (it is).  One of these results
 300    is inevitable because by the second step we would lose information
 301    on whether the `+' was originally encoded or not.  Both results
 302    were wrong because in CGI parameters + means space, while %2B means
 303    literal plus.  reencode_string correctly translates the above to
 304    "a%2B+b", i.e. returns the original string.
 305
 306    This function uses an algorithm proposed by Anon Sricharoenchai:
 307
 308    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 309       hexdigits.
 310
 311    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 312       "+".
 313
 314    ...except that this code conflates the two steps, and decides
 315    whether to encode, decode, or pass through each character in turn.
 316    The function still uses two passes, but their logic is the same --
 317    the first pass exists merely for the sake of allocation.  Another
 318    small difference is that we include `+' to URL_RESERVED.
 319
 320    Anon's test case:
 321
 322    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 323    ->
 324    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 325
 326    Simpler test cases:
 327
 328    "foo bar"         -> "foo%20bar"
 329    "foo%20bar"       -> "foo%20bar"
 330    "foo %20bar"      -> "foo%20%20bar"
 331    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 332    "foo%25%20bar"    -> "foo%25%20bar"
 333    "foo%2%20bar"     -> "foo%252%20bar"
 334    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 335    "foo%2b+bar"      -> "foo%2b+bar"  */
 336
 337 static char *
 338 reencode_string (const char *s)
 339 {
 340   const char *p1;
 341   char *newstr, *p2;
 342   int oldlen, newlen;
 343
 344   int encode_count = 0;
 345   int decode_count = 0;
 346
 347   /* First, pass through the string to see if there's anything to do,
 348      and to calculate the new length.  */
 349   for (p1 = s; *p1; p1++)
 350     {
 351       switch (decide_copy_method (p1))
 352         {
 353         case CM_ENCODE:
 354           ++encode_count;
 355           break;
 356         case CM_DECODE:
 357           ++decode_count;
 358           break;
 359         case CM_PASSTHROUGH:
 360           break;
 361         }
 362     }
 363
 364   if (!encode_count && !decode_count)
 365     /* The string is good as it is. */
 366     return (char *)s;           /* C const model sucks. */
 367
 368   oldlen = p1 - s;
 369   /* Each encoding adds two characters (hex digits), while each
 370      decoding removes two characters.  */
 371   newlen = oldlen + 2 * (encode_count - decode_count);
 372   newstr = xmalloc (newlen + 1);
 373
 374   p1 = s;
 375   p2 = newstr;
 376
 377   while (*p1)
 378     {
 379       switch (decide_copy_method (p1))
 380         {
 381         case CM_ENCODE:
 382           {
 383             unsigned char c = *p1++;
 384             *p2++ = '%';
 385             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 386             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 387           }
 388           break;
 389         case CM_DECODE:
 390           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 391                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 392           p1 += 3;              /* skip %xx */
 393           break;
 394         case CM_PASSTHROUGH:
 395           *p2++ = *p1++;
 396         }
 397     }
 398   *p2 = '\0';
 399   assert (p2 - newstr == newlen);
 400   return newstr;
 401 }
 402
 /* Run PTR_VAR through reencode_string.  If a new string is consed,
    free PTR_VAR and make it point to the new storage.  PTR_VAR needs
    to be a modifiable lvalue; it is evaluated more than once, so it
    must not have side effects.  */

 #define REENCODE(ptr_var) do {                  \
   char *rf_new = reencode_string (ptr_var);     \
   if (rf_new != (ptr_var))                      \
     {                                           \
       xfree (ptr_var);                          \
       (ptr_var) = rf_new;                       \
     }                                           \
 } while (0)
 415 \f
 416 /* Returns the scheme type if the scheme is supported, or
 417    SCHEME_INVALID if not.  */
 418 enum url_scheme
 419 url_scheme (const char *url)
 420 {
 421   int i;
 422
 423   for (i = 0; supported_schemes[i].leading_string; i++)
 424     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 425                           strlen (supported_schemes[i].leading_string)))
 426       {
 427         if (supported_schemes[i].enabled)
 428           return (enum url_scheme) i;
 429         else
 430           return SCHEME_INVALID;
 431       }
 432
 433   return SCHEME_INVALID;
 434 }
 435
 /* Return the number of characters that make up the scheme part of
    URL, e.g. 7 for a URL starting with `http://'.  The "//" after the
    colon is optional and counted only when present.  Returns 0 when
    URL does not begin with a scheme.  */
 int
 url_skip_scheme (const char *url)
 {
   const char *p;

   /* Scheme names consist of alphanumerics plus `-' and `+' (needed
      for names such as `whois++').  */
   for (p = url; ISALNUM (*p) || *p == '-' || *p == '+'; p++)
     ;

   if (*p != ':')
     return 0;
   ++p;                          /* step over ':' */

   if (p[0] == '/' && p[1] == '/')
     p += 2;                     /* step over "//" */

   return p - url;
 }
 458
 /* Return 1 if URL begins with a scheme name followed by `:' (whether
    or not the scheme is a supported one), 0 otherwise.  A scheme name
    is a run of alphanumerics, `-' and `+'.  */
 int
 url_has_scheme (const char *url)
 {
   const char *p;

   for (p = url; ISALNUM (*p) || *p == '-' || *p == '+'; p++)
     ;

   return *p == ':';
 }
 469
 470 int
 471 scheme_default_port (enum url_scheme scheme)
 472 {
 473   return supported_schemes[scheme].default_port;
 474 }
 475
 476 void
 477 scheme_disable (enum url_scheme scheme)
 478 {
 479   supported_schemes[scheme].enabled = 0;
 480 }
 481
 /* Return the length of the "user[:password]@" prefix of URL,
    including the `@', or 0 if there is none.  URL must be the part
    right after the scheme, *not* the complete URL.

    An `@' counts only when it comes before the first `/' or `?'.  */
 int
 url_skip_uname (const char *url)
 {
   /* Length of the leading run free of all three delimiters.  */
   size_t prefix_len = strcspn (url, "/?@");

   if (url[prefix_len] != '@')
     /* Hit `/', `?' or the end of the string first.  */
     return 0;

   return prefix_len + 1;
 }
 499
 /* Split the LEN characters at STR, of the form "user" or
    "user:password", into freshly allocated strings stored in *USER
    and *PASSWD.  *PASSWD is set to NULL when there is no colon.
    Returns 1 on success.  Returns 0, storing nothing, when the user
    name is empty.  */
 static int
 parse_uname (const char *str, int len, char **user, char **passwd)
 {
   const char *sep;
   int userlen;

   if (len == 0)
     /* Empty user name not allowed. */
     return 0;

   sep = memchr (str, ':', len);
   if (sep == str)
     /* ":password" -- the user name is empty again. */
     return 0;

   if (sep == NULL)
     {
       userlen = len;
       *passwd = NULL;
     }
   else
     {
       /* Everything after the colon is the password.  */
       int pwlen = len - (sep + 1 - str);

       userlen = sep - str;
       *passwd = xmalloc (pwlen + 1);
       memcpy (*passwd, sep + 1, pwlen);
       (*passwd)[pwlen] = '\0';
     }

   *user = xmalloc (userlen + 1);
   memcpy (*user, str, userlen);
   (*user)[userlen] = '\0';

   return 1;
 }
 531
 /* Used by main.c: detect URLs written using the "shorthand" URL forms
    popularized by Netscape and NcFTP.  HTTP shorthands look like this:

    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
    www.foo.com[:port]            -> http://www.foo.com[:port]

    FTP shorthands look like this:

    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

    The rewritten URL is returned in freshly allocated storage.  If
    the URL does not need to be rewritten (it already has a scheme) or
    cannot be rewritten (it begins with `:' or `/', or is empty),
    return NULL.  */
 char *
 rewrite_shorthand_url (const char *url)
 {
   const char *sep;
   char *res;
   int is_ftp = 0;

   if (url_has_scheme (url))
     return NULL;

   /* Find the first ':' or '/'.  The former signifies NcFTP syntax,
      the latter Netscape.  */
   sep = url;
   while (*sep && *sep != ':' && *sep != '/')
     ++sep;

   if (sep == url)
     return NULL;

   if (*sep == ':')
     {
       /* A colon followed by one or more digits and then a slash or
          the end of the string is a port number, hence HTTP.  Any
          other colon is the NcFTP host:path separator.  */
       const char *after_digits = sep + 1;

       while (ISDIGIT (*after_digits))
         ++after_digits;

       is_ftp = !(after_digits > sep + 1
                  && (*after_digits == '/' || *after_digits == '\0'));
     }

   if (is_ftp)
     {
       /* Prepend "ftp://" to the entire URL... */
       res = xmalloc (6 + strlen (url) + 1);
       sprintf (res, "ftp://%s", url);
       /* ...and replace ':' with '/'. */
       res[6 + (sep - url)] = '/';
     }
   else
     {
       /* Just prepend "http://" to what we have. */
       res = xmalloc (7 + strlen (url) + 1);
       sprintf (res, "http://%s", url);
     }

   return res;
 }
 589 \f
 590 static void parse_path PARAMS ((const char *, char **, char **));
 591
 /* Like strpbrk, but when S contains none of the characters in
    ACCEPT, return a pointer to the terminating NUL of S instead of
    NULL.  */
 static char *
 strpbrk_or_eos (const char *s, const char *accept)
 {
   /* strcspn yields the length of the leading run without any ACCEPT
      char, which is the whole string when no such char exists.  */
   return (char *)s + strcspn (s, accept);
 }
 600
 /* Convert STR to lowercase in place.  Return 1 if at least one
    character was changed, 0 if STR had no uppercase characters.  */

 static int
 lowercase_str (char *str)
 {
   int changed = 0;
   char *p = str;

   while (*p)
     {
       if (ISUPPER (*p))
         {
           *p = TOLOWER (*p);
           changed = 1;
         }
       ++p;
     }

   return changed;
 }
 616
 /* Error codes stored by url_parse.  Each code is the index of its
    message in parse_errors, so the two lists must stay in sync.  */
 #define PE_NO_ERROR            0
 #define PE_UNSUPPORTED_SCHEME  1
 #define PE_EMPTY_HOST          2
 #define PE_BAD_PORT_NUMBER     3
 #define PE_INVALID_USER_NAME   4

 /* Messages for the codes above.  Both the table and the strings are
    read-only, hence the double const.  */
 static const char *const parse_errors[] = {
   "No error",                   /* PE_NO_ERROR */
   "Unsupported scheme",         /* PE_UNSUPPORTED_SCHEME */
   "Empty host",                 /* PE_EMPTY_HOST */
   "Bad port number",            /* PE_BAD_PORT_NUMBER */
   "Invalid user name"           /* PE_INVALID_USER_NAME */
 };
 629
 /* Store error code V through pointer P, unless P is NULL (meaning
    the caller is not interested in the code).  P is evaluated more
    than once.  */
 #define SETERR(p, v) do {                       \
   if ((p) != NULL)                              \
     {                                           \
       *(p) = (v);                               \
     }                                           \
 } while (0)
 634
 635 /* Parse a URL.
 636
 637    Return a new struct url if successful, NULL on error.  In case of
 638    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 639    error code. */
 640 struct url *
 641 url_parse (const char *url, int *error)
 642 {
 643   struct url *u;
 644   const char *p;
 645   int path_modified, host_modified;
 646
 647   enum url_scheme scheme;
 648
 649   const char *uname_b,     *uname_e;
 650   const char *host_b,      *host_e;
 651   const char *path_b,      *path_e;
 652   const char *params_b,    *params_e;
 653   const char *query_b,     *query_e;
 654   const char *fragment_b,  *fragment_e;
 655
 656   int port;
 657   char *user = NULL, *passwd = NULL;
 658
 659   char *url_encoded;
 660
 661   scheme = url_scheme (url);
 662   if (scheme == SCHEME_INVALID)
 663     {
 664       SETERR (error, PE_UNSUPPORTED_SCHEME);
 665       return NULL;
 666     }
 667
 668   url_encoded = reencode_string (url);
 669   p = url_encoded;
 670
 671   p += strlen (supported_schemes[scheme].leading_string);
 672   uname_b = p;
 673   p += url_skip_uname (p);
 674   uname_e = p;
 675
 676   /* scheme://user:pass@host[:port]... */
 677   /*                    ^              */
 678
 679   /* We attempt to break down the URL into the components path,
 680      params, query, and fragment.  They are ordered like this:
 681
 682        scheme://host[:port][/path][;params][?query][#fragment]  */
 683
 684   params_b   = params_e   = NULL;
 685   query_b    = query_e    = NULL;
 686   fragment_b = fragment_e = NULL;
 687
 688   host_b = p;
 689   p = strpbrk_or_eos (p, ":/;?#");
 690   host_e = p;
 691
 692   if (host_b == host_e)
 693     {
 694       SETERR (error, PE_EMPTY_HOST);
 695       return NULL;
 696     }
 697
 698   port = scheme_default_port (scheme);
 699   if (*p == ':')
 700     {
 701       const char *port_b, *port_e, *pp;
 702
 703       /* scheme://host:port/tralala */
 704       /*              ^             */
 705       ++p;
 706       port_b = p;
 707       p = strpbrk_or_eos (p, "/;?#");
 708       port_e = p;
 709
 710       if (port_b == port_e)
 711         {
 712           /* http://host:/whatever */
 713           /*             ^         */
 714           SETERR (error, PE_BAD_PORT_NUMBER);
 715           return NULL;
 716         }
 717
 718       for (port = 0, pp = port_b; pp < port_e; pp++)
 719         {
 720           if (!ISDIGIT (*pp))
 721             {
 722               /* http://host:12randomgarbage/blah */
 723               /*               ^                  */
 724               SETERR (error, PE_BAD_PORT_NUMBER);
 725               return NULL;
 726             }
 727           port = 10 * port + (*pp - '0');
 728         }
 729     }
 730
 731   if (*p == '/')
 732     {
 733       ++p;
 734       path_b = p;
 735       p = strpbrk_or_eos (p, ";?#");
 736       path_e = p;
 737     }
 738   else
 739     {
 740       /* Path is not allowed not to exist. */
 741       path_b = path_e = p;
 742     }
 743
 744   if (*p == ';')
 745     {
 746       ++p;
 747       params_b = p;
 748       p = strpbrk_or_eos (p, "?#");
 749       params_e = p;
 750     }
 751   if (*p == '?')
 752     {
 753       ++p;
 754       query_b = p;
 755       p = strpbrk_or_eos (p, "#");
 756       query_e = p;
 757     }
 758   if (*p == '#')
 759     {
 760       ++p;
 761       fragment_b = p;
 762       p += strlen (p);
 763       fragment_e = p;
 764     }
 765   assert (*p == 0);
 766
 767   if (uname_b != uname_e)
 768     {
 769       /* http://user:pass@host */
 770       /*        ^         ^    */
 771       /*     uname_b   uname_e */
 772       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 773         {
 774           SETERR (error, PE_INVALID_USER_NAME);
 775           return NULL;
 776         }
 777     }
 778
 779   u = (struct url *)xmalloc (sizeof (struct url));
 780   memset (u, 0, sizeof (*u));
 781
 782   u->scheme = scheme;
 783   u->host   = strdupdelim (host_b, host_e);
 784   u->port   = port;
 785   u->user   = user;
 786   u->passwd = passwd;
 787
 788   u->path = strdupdelim (path_b, path_e);
 789   path_modified = path_simplify (u->path);
 790   parse_path (u->path, &u->dir, &u->file);
 791
 792   host_modified = lowercase_str (u->host);
 793
 794   if (params_b)
 795     u->params = strdupdelim (params_b, params_e);
 796   if (query_b)
 797     u->query = strdupdelim (query_b, query_e);
 798   if (fragment_b)
 799     u->fragment = strdupdelim (fragment_b, fragment_e);
 800
 801   if (path_modified || u->fragment || host_modified || path_b == path_e)
 802     {
 803       /* If we suspect that a transformation has rendered what
 804          url_string might return different from URL_ENCODED, rebuild
 805          u->url using url_string.  */
 806       u->url = url_string (u, 0);
 807
 808       if (url_encoded != url)
 809         xfree ((char *) url_encoded);
 810     }
 811   else
 812     {
 813       if (url_encoded == url)
 814         u->url    = xstrdup (url);
 815       else
 816         u->url    = url_encoded;
 817     }
 818   url_encoded = NULL;
 819
 820   return u;
 821 }
 822
 823 const char *
 824 url_error (int error_code)
 825 {
 826   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 827   return parse_errors[error_code];
 828 }
 829
 /* Split QUOTED_PATH into a directory and a file part.  The path is
    %-decoded first, then cut at its last `/': what precedes the slash
    goes to *DIR and what follows it to *FILE.  Without a slash, *DIR
    becomes the empty string and *FILE the whole decoded path.  Both
    results are freshly allocated.  */
 static void
 parse_path (const char *quoted_path, char **dir, char **file)
 {
   char *decoded, *sep;

   /* Work on a scratch copy; QUOTED_PATH itself stays quoted.  */
   STRDUP_ALLOCA (decoded, quoted_path);
   decode_string (decoded);

   sep = strrchr (decoded, '/');
   if (sep)
     {
       *dir = strdupdelim (decoded, sep);
       *file = xstrdup (sep + 1);
       return;
     }

   *dir = xstrdup ("");
   *file = xstrdup (decoded);
 }
 850
 851 /* Note: URL's "full path" is the path with the query string and
 852    params appended.  The "fragment" (#foo) is intentionally ignored,
 853    but that might be changed.  For example, if the original URL was
 854    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 855    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 856
 857 /* Return the length of the full path, without the terminating
 858    zero.  */
 859
 860 static int
 861 full_path_length (const struct url *url)
 862 {
 863   int len = 0;
 864
 865 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 866
 867   FROB (path);
 868   FROB (params);
 869   FROB (query);
 870
 871 #undef FROB
 872
 873   return len;
 874 }
 875
 876 /* Write out the full path. */
 877
 878 static void
 879 full_path_write (const struct url *url, char *where)
 880 {
 881 #define FROB(el, chr) do {                      \
 882   char *f_el = url->el;                         \
 883   if (f_el) {                                   \
 884     int l = strlen (f_el);                      \
 885     *where++ = chr;                             \
 886     memcpy (where, f_el, l);                    \
 887     where += l;                                 \
 888   }                                             \
 889 } while (0)
 890
 891   FROB (path, '/');
 892   FROB (params, ';');
 893   FROB (query, '?');
 894
 895 #undef FROB
 896 }
 897
 /* Public function for getting the "full path" as a freshly allocated,
    zero-terminated string.  E.g. if u->path is "foo/bar" and u->query
    is "param=value", the result will be "/foo/bar?param=value".  */

 char *
 url_full_path (const struct url *url)
 {
   int len = full_path_length (url);
   char *buf = (char *)xmalloc (len + 1);

   /* full_path_write does not terminate its output, so do it here.  */
   buf[len] = '\0';
   full_path_write (url, buf);

   return buf;
 }
 913
 914 /* Sync u->path and u->url with u->dir and u->file. */
 915
 916 static void
 917 sync_path (struct url *url)
 918 {
 919   char *newpath;
 920
 921   xfree (url->path);
 922
 923   if (!*url->dir)
 924     {
 925       newpath = xstrdup (url->file);
 926       REENCODE (newpath);
 927     }
 928   else
 929     {
 930       int dirlen = strlen (url->dir);
 931       int filelen = strlen (url->file);
 932
 933       newpath = xmalloc (dirlen + 1 + filelen + 1);
 934       memcpy (newpath, url->dir, dirlen);
 935       newpath[dirlen] = '/';
 936       memcpy (newpath + dirlen + 1, url->file, filelen);
 937       newpath[dirlen + 1 + filelen] = '\0';
 938       REENCODE (newpath);
 939     }
 940
 941   url->path = newpath;
 942
 943   /* Synchronize u->url. */
 944   xfree (url->url);
 945   url->url = url_string (url, 0);
 946 }
 947
 948 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 949    This way we can sync u->path and u->url when they get changed.  */
 950
 951 void
 952 url_set_dir (struct url *url, const char *newdir)
 953 {
 954   xfree (url->dir);
 955   url->dir = xstrdup (newdir);
 956   sync_path (url);
 957 }
 958
 959 void
 960 url_set_file (struct url *url, const char *newfile)
 961 {
 962   xfree (url->file);
 963   url->file = xstrdup (newfile);
 964   sync_path (url);
 965 }
 966
 967 void
 968 url_free (struct url *url)
 969 {
 970   xfree (url->host);
 971   xfree (url->path);
 972   xfree (url->url);
 973
 974   FREE_MAYBE (url->params);
 975   FREE_MAYBE (url->query);
 976   FREE_MAYBE (url->fragment);
 977   FREE_MAYBE (url->user);
 978   FREE_MAYBE (url->passwd);
 979
 980   xfree (url->dir);
 981   xfree (url->file);
 982
 983   xfree (url);
 984 }
 985 \f
 986 struct urlpos *
 987 get_urls_file (const char *file)
 988 {
 989   struct file_memory *fm;
 990   struct urlpos *head, *tail;
 991   const char *text, *text_end;
 992
 993   /* Load the file.  */
 994   fm = read_file (file);
 995   if (!fm)
 996     {
 997       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 998       return NULL;
 999     }
1000   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1001   head = tail = NULL;
1002   text = fm->content;
1003   text_end = fm->content + fm->length;
1004   while (text < text_end)
1005     {
1006       const char *line_beg = text;
1007       const char *line_end = memchr (text, '\n', text_end - text);
1008       if (!line_end)
1009         line_end = text_end;
1010       else
1011         ++line_end;
1012       text = line_end;
1013       while (line_beg < line_end
1014              && ISSPACE (*line_beg))
1015         ++line_beg;
1016       while (line_end > line_beg + 1
1017              && ISSPACE (*(line_end - 1)))
1018         --line_end;
1019       if (line_end > line_beg)
1020         {
1021           /* URL is in the [line_beg, line_end) region. */
1022
1023           int up_error_code;
1024           char *url_text;
1025           struct urlpos *entry;
1026           struct url *url;
1027
1028           /* We must copy the URL to a zero-terminated string, and we
1029              can't use alloca because we're in a loop.  *sigh*.  */
1030           url_text = strdupdelim (line_beg, line_end);
1031
1032           if (opt.base_href)
1033             {
1034               /* Merge opt.base_href with URL. */
1035               char *merged = uri_merge (opt.base_href, url_text);
1036               xfree (url_text);
1037               url_text = merged;
1038             }
1039
1040           url = url_parse (url_text, &up_error_code);
1041           if (!url)
1042             {
1043               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1044                          file, url_text, url_error (up_error_code));
1045               xfree (url_text);
1046               continue;
1047             }
1048           xfree (url_text);
1049
1050           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1051           memset (entry, 0, sizeof (*entry));
1052           entry->next = NULL;
1053           entry->url = url;
1054
1055           if (!head)
1056             head = entry;
1057           else
1058             tail->next = entry;
1059           tail = entry;
1060         }
1061     }
1062   read_file_free (fm);
1063   return head;
1064 }
1065 \f
1066 /* Free the linked list of urlpos.  */
1067 void
1068 free_urlpos (struct urlpos *l)
1069 {
1070   while (l)
1071     {
1072       struct urlpos *next = l->next;
1073       if (l->url)
1074         url_free (l->url);
1075       FREE_MAYBE (l->local_name);
1076       xfree (l);
1077       l = next;
1078     }
1079 }
1080
1081 /* Rotate FNAME opt.backups times */
1082 void
1083 rotate_backups(const char *fname)
1084 {
1085   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1086   char *from = (char *)alloca (maxlen);
1087   char *to = (char *)alloca (maxlen);
1088   struct stat sb;
1089   int i;
1090
1091   if (stat (fname, &sb) == 0)
1092     if (S_ISREG (sb.st_mode) == 0)
1093       return;
1094
1095   for (i = opt.backups; i > 1; i--)
1096     {
1097       sprintf (from, "%s.%d", fname, i - 1);
1098       sprintf (to, "%s.%d", fname, i);
1099       /* #### This will fail on machines without the rename() system
1100          call.  */
1101       rename (from, to);
1102     }
1103
1104   sprintf (to, "%s.%d", fname, 1);
1105   rename(fname, to);
1106 }
1107
1108 /* Create all the necessary directories for PATH (a file).  Calls
1109    mkdirhier() internally.  */
1110 int
1111 mkalldirs (const char *path)
1112 {
1113   const char *p;
1114   char *t;
1115   struct stat st;
1116   int res;
1117
1118   p = path + strlen (path);
1119   for (; *p != '/' && p != path; p--);
1120   /* Don't create if it's just a file.  */
1121   if ((p == path) && (*p != '/'))
1122     return 0;
1123   t = strdupdelim (path, p);
1124   /* Check whether the directory exists.  */
1125   if ((stat (t, &st) == 0))
1126     {
1127       if (S_ISDIR (st.st_mode))
1128         {
1129           xfree (t);
1130           return 0;
1131         }
1132       else
1133         {
1134           /* If the dir exists as a file name, remove it first.  This
1135              is *only* for Wget to work with buggy old CERN http
1136              servers.  Here is the scenario: When Wget tries to
1137              retrieve a directory without a slash, e.g.
1138              http://foo/bar (bar being a directory), CERN server will
1139              not redirect it too http://foo/bar/ -- it will generate a
1140              directory listing containing links to bar/file1,
1141              bar/file2, etc.  Wget will lose because it saves this
1142              HTML listing to a file `bar', so it cannot create the
1143              directory.  To work around this, if the file of the same
1144              name exists, we just remove it and create the directory
1145              anyway.  */
1146           DEBUGP (("Removing %s because of directory danger!\n", t));
1147           unlink (t);
1148         }
1149     }
1150   res = make_directory (t);
1151   if (res != 0)
1152     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1153   xfree (t);
1154   return res;
1155 }
1156
/* Return the number of '/' characters in the zero-terminated
   string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *p;

  for (p = s; *p != '\0'; p++)
    total += (*p == '/');
  return total;
}
1166
1167 /* Return the path name of the URL-equivalent file name, with a
1168    remote-like structure of directories.  */
1169 static char *
1170 mkstruct (const struct url *u)
1171 {
1172   char *dir, *dir_preencoding;
1173   char *file, *res, *dirpref;
1174   char *query = u->query && *u->query ? u->query : NULL;
1175   int l;
1176
1177   if (opt.cut_dirs)
1178     {
1179       char *ptr = u->dir + (*u->dir == '/');
1180       int slash_count = 1 + count_slashes (ptr);
1181       int cut = MINVAL (opt.cut_dirs, slash_count);
1182       for (; cut && *ptr; ptr++)
1183         if (*ptr == '/')
1184           --cut;
1185       STRDUP_ALLOCA (dir, ptr);
1186     }
1187   else
1188     dir = u->dir + (*u->dir == '/');
1189
1190   /* Check for the true name (or at least a consistent name for saving
1191      to directory) of HOST, reusing the hlist if possible.  */
1192   if (opt.add_hostdir)
1193     {
1194       /* Add dir_prefix and hostname (if required) to the beginning of
1195          dir.  */
1196       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1197                                 + strlen (u->host)
1198                                 + 1 + numdigit (u->port)
1199                                 + 1);
1200       if (!DOTP (opt.dir_prefix))
1201         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1202       else
1203         strcpy (dirpref, u->host);
1204
1205       if (u->port != scheme_default_port (u->scheme))
1206         {
1207           int len = strlen (dirpref);
1208           dirpref[len] = ':';
1209           number_to_string (dirpref + len + 1, u->port);
1210         }
1211     }
1212   else                          /* not add_hostdir */
1213     {
1214       if (!DOTP (opt.dir_prefix))
1215         dirpref = opt.dir_prefix;
1216       else
1217         dirpref = "";
1218     }
1219
1220   /* If there is a prefix, prepend it.  */
1221   if (*dirpref)
1222     {
1223       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1224       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1225       dir = newdir;
1226     }
1227
1228   dir_preencoding = dir;
1229   dir = reencode_string (dir_preencoding);
1230
1231   l = strlen (dir);
1232   if (l && dir[l - 1] == '/')
1233     dir[l - 1] = '\0';
1234
1235   if (!*u->file)
1236     file = "index.html";
1237   else
1238     file = u->file;
1239
1240   /* Finally, construct the full name.  */
1241   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1242                          + (query ? (1 + strlen (query)) : 0)
1243                          + 1);
1244   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1245   if (query)
1246     {
1247       strcat (res, "?");
1248       strcat (res, query);
1249     }
1250   if (dir != dir_preencoding)
1251     xfree (dir);
1252   return res;
1253 }
1254
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string (may be NULL).  The trick is to make sure
   that unsafe characters in BASE are escaped, and that slashes in
   QUERY are also escaped.

   The name is built in a fixed 256-byte buffer and silently
   truncated when it does not fit, presumably due to a huge query
   string.  Truncation never splits a %XX escape and never writes
   outside the buffer.  Returns an xstrdup'ed copy of the result.  */

static char *
compose_file_name (char *base, char *query)
{
  char result[256];
  /* Last usable position; one byte is held back for the '\0'.  */
  char *const limit = result + sizeof (result) - 1;
  char *to = result;
  char *from;
  int truncated = 0;

  /* Copy BASE to RESULT and encode all unsafe characters.  */
  for (from = base; *from; from++)
    {
      if (UNSAFE_CHAR (*from))
        {
          unsigned char c = *from;
          /* A whole escape needs three bytes.  */
          if (limit - to < 3)
            {
              truncated = 1;
              break;
            }
          *to++ = '%';
          *to++ = XDIGIT_TO_XCHAR (c >> 4);
          *to++ = XDIGIT_TO_XCHAR (c & 0xf);
        }
      else
        {
          if (to == limit)
            {
              truncated = 1;
              break;
            }
          *to++ = *from;
        }
    }

  /* The query is only added when BASE fit completely and there is
     room for at least the '?' separator.  */
  if (query && !truncated && to < limit)
    {
      *to++ = '?';

      /* Copy QUERY to RESULT and encode all '/' characters. */
      for (from = query; *from; from++)
        {
          if (*from == '/')
            {
              if (limit - to < 3)
                break;
              *to++ = '%';
              *to++ = '2';
              *to++ = 'F';
            }
          else
            {
              if (to == limit)
                break;
              *to++ = *from;
            }
        }
    }

  /* TO never moves past LIMIT, so this store is always in bounds.  */
  *to = '\0';

  return xstrdup (result);
}
1311
1312 /* Create a unique filename, corresponding to a given URL.  Calls
1313    mkstruct if necessary.  Does *not* actually create any directories.  */
1314 char *
1315 url_filename (const struct url *u)
1316 {
1317   char *file, *name;
1318   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1319
1320   if (opt.dirstruct)
1321     {
1322       file = mkstruct (u);
1323       have_prefix = 1;
1324     }
1325   else
1326     {
1327       char *base = *u->file ? u->file : "index.html";
1328       char *query = u->query && *u->query ? u->query : NULL;
1329       file = compose_file_name (base, query);
1330     }
1331
1332   if (!have_prefix)
1333     {
1334       /* Check whether the prefix directory is something other than "."
1335          before prepending it.  */
1336       if (!DOTP (opt.dir_prefix))
1337         {
1338           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1339                                          + 1 + strlen (file) + 1);
1340           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1341           xfree (file);
1342           file = nfile;
1343         }
1344     }
1345   /* DOS-ish file systems don't like `%' signs in them; we change it
1346      to `@'.  */
1347 #ifdef WINDOWS
1348   {
1349     char *p = file;
1350     for (p = file; *p; p++)
1351       if (*p == '%')
1352         *p = '@';
1353   }
1354 #endif /* WINDOWS */
1355
1356   /* Check the cases in which the unique extensions are not used:
1357      1) Clobbering is turned off (-nc).
1358      2) Retrieval with regetting.
1359      3) Timestamping is used.
1360      4) Hierarchy is built.
1361
1362      The exception is the case when file does exist and is a
1363      directory (actually support for bad httpd-s).  */
1364   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1365       && !(file_exists_p (file) && !file_non_directory_p (file)))
1366     return file;
1367
1368   /* Find a unique name.  */
1369   name = unique_name (file);
1370   xfree (file);
1371   return name;
1372 }
1373
/* Like strlen(), but the count also stops at the first '?', ';' or
   '#' in URL, i.e. it is the length of the part that precedes any
   query, parameters or fragment.  */
static int
urlpath_length (const char *url)
{
  const char *stop = strpbrk_or_eos (url, "?;#");

  return stop - url;
}
1381
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string.

   The range is half-open: *B is examined and *E is not.  The pointer
   is stepped back before each comparison so that the scan covers
   exactly that range; in particular a match in the very first
   position is reported.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1394
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is the number of characters of LINK to
   use; it is useful if LINK is not zero-terminated.  NO_SCHEME is
   non-zero when LINK carries no scheme of its own and therefore has
   to be combined with BASE; when it is zero, the result is simply a
   copy of LINK.  See uri_merge for a gentler interface to this
   functionality.

   The result is a newly allocated string (obtained from xstrdup,
   xmalloc or strdupdelim).

   Perhaps this function should handle `./' and `../' so that the evil
   path_simplify can go.  */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      /* END marks where the path part of BASE stops: at the first
         '?', ';' or '#', or at the terminating '\0'.  */
      const char *end = base + urlpath_length (base);

      if (!*link)
        {
          /* Empty LINK points back to BASE, query string and all. */
          constr = xstrdup (base);
        }
      else if (*link == '?')
        {
          /* LINK points to the same location, but changes the query
             string.  Examples: */
          /* uri_merge("path",         "?new") -> "path?new"     */
          /* uri_merge("path?foo",     "?new") -> "path?new"     */
          /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
          /* uri_merge("path#foo",     "?new") -> "path?new"     */
          int baselength = end - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '#')
        {
          /* LINK only changes the fragment: keep everything in BASE
             up to its own '#' (query string included) and append
             LINK.  Examples: */
          /* uri_merge("path",         "#new") -> "path#new"     */
          /* uri_merge("path#foo",     "#new") -> "path#new"     */
          /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
          /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
          int baselength;
          const char *end1 = strchr (base, '#');
          if (!end1)
            end1 = base + strlen (base);
          baselength = end1 - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '/')
        {
          /* LINK is an absolute path: we need to replace everything
             after (and including) the FIRST slash with LINK.

             So, if BASE is "http://host/whatever/foo/bar", and LINK is
             "/qux/xyzzy", our result should be
             "http://host/qux/xyzzy".  */
          int span;
          const char *slash;
          const char *start_insert = NULL; /* for gcc to shut up. */
          const char *pos = base;
          int seen_slash_slash = 0;
          /* We're looking for the first slash, but want to ignore
             double slash.  Only the first "//" is skipped; after
             that the search is repeated once from just past it.  */
        again:
          slash = memchr (pos, '/', end - pos);
          if (slash && !seen_slash_slash)
            if (*(slash + 1) == '/')
              {
                pos = slash + 2;
                seen_slash_slash = 1;
                goto again;
              }

          /* At this point, SLASH is the location of the first / after
             "//", or the first slash altogether.  START_INSERT is the
             pointer to the location where LINK will be inserted.  When
             examining the last two examples, keep in mind that LINK
             begins with '/'. */

          if (!slash && !seen_slash_slash)
            /* example: "foo" */
            /*           ^    */
            start_insert = base;
          else if (!slash && seen_slash_slash)
            /* example: "http://foo" */
            /*                     ^ */
            start_insert = end;
          else if (slash && !seen_slash_slash)
            /* example: "foo/bar" */
            /*           ^        */
            start_insert = base;
          else if (slash && seen_slash_slash)
            /* example: "http://something/" */
            /*                           ^  */
            start_insert = slash;

          /* Result: the first SPAN characters of BASE followed by
             LINK.  */
          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
      else
        {
          /* LINK is a relative URL: we need to replace everything
             after last slash (possibly empty) with LINK.

             So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
             our result should be "whatever/foo/qux/xyzzy".  */
          int need_explicit_slash = 0;
          int span;
          const char *start_insert;
          const char *last_slash = find_last_char (base, end, '/');
          if (!last_slash)
            {
              /* No slash found at all.  Append LINK to what we have,
                 but we'll need a slash as a separator.

                 Example: if base == "foo" and link == "qux/xyzzy", then
                 we cannot just append link to base, because we'd get
                 "fooqux/xyzzy", whereas what we want is
                 "foo/qux/xyzzy".

                 To make sure the / gets inserted, we set
                 need_explicit_slash to 1.  We also set start_insert
                 to end + 1, so that the length calculations work out
                 correctly for one more (slash) character.  Accessing
                 that character is fine, since it will be the
                 delimiter, '\0' or '?'.  */
              /* example: "foo?..." */
              /*               ^    ('?' gets changed to '/') */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
            {
              /* The last slash is the second one of a "//", so BASE
                 has no path after the host.  Handled like the
                 no-slash case: keep all of BASE and add a '/'.  */
              /* example: "http://host"  */
              /*                      ^  */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else
            {
              /* example: "whatever/foo/bar" */
              /*                        ^    */
              start_insert = last_slash + 1;
            }

          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          /* The extra character copied from BASE (its delimiter) is
             overwritten with the separating slash.  */
          if (need_explicit_slash)
            constr[span - 1] = '/';
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
    }
  else /* !no_scheme */
    {
      /* LINK has a scheme of its own; BASE is irrelevant.  */
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
1571
/* Merge BASE with LINK and return the resulting URI.  Convenience
   front end to uri_merge_1 for a zero-terminated LINK: the length is
   taken with strlen, and LINK is treated as scheme-less when
   url_has_scheme reports that it has none.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  int schemeless = !url_has_scheme (link);

  return uri_merge_1 (base, link, link_len, schemeless);
}
1580 \f
/* Copy the zero-terminated string S to the location P points to and
   advance P past the copied characters.  No terminating '\0' is
   written.  P must be a modifiable pointer lvalue.

   S is evaluated exactly once and both arguments are parenthesized,
   so expressions with side effects or operators are safe to pass.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  size_t append_len_ = strlen (append_src_);    \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)
1586
/* Placeholder used instead of the password when the actual password
   is supposed to be hidden.  The string is intentionally generic:
   unlike what previous versions printed, it does not give away the
   number of characters in the password.  */
#define HIDDEN_PASSWORD "*password*"
1592
1593 /* Recreate the URL string from the data in URL.
1594
1595    If HIDE is non-zero (as it is when we're calling this on a URL we
1596    plan to print, but not when calling it to canonicalize a URL for
1597    use within the program), password will be hidden.  Unsafe
1598    characters in the URL will be quoted.  */
1599
1600 char *
1601 url_string (const struct url *url, int hide_password)
1602 {
1603   int size;
1604   char *result, *p;
1605   char *quoted_user = NULL, *quoted_passwd = NULL;
1606
1607   int scheme_port  = supported_schemes[url->scheme].default_port;
1608   char *scheme_str = supported_schemes[url->scheme].leading_string;
1609   int fplen = full_path_length (url);
1610
1611   assert (scheme_str != NULL);
1612
1613   /* Make sure the user name and password are quoted. */
1614   if (url->user)
1615     {
1616       quoted_user = encode_string_maybe (url->user);
1617       if (url->passwd)
1618         {
1619           if (hide_password)
1620             quoted_passwd = HIDDEN_PASSWORD;
1621           else
1622             quoted_passwd = encode_string_maybe (url->passwd);
1623         }
1624     }
1625
1626   size = (strlen (scheme_str)
1627           + strlen (url->host)
1628           + fplen
1629           + 1);
1630   if (url->port != scheme_port)
1631     size += 1 + numdigit (url->port);
1632   if (quoted_user)
1633     {
1634       size += 1 + strlen (quoted_user);
1635       if (quoted_passwd)
1636         size += 1 + strlen (quoted_passwd);
1637     }
1638
1639   p = result = xmalloc (size);
1640
1641   APPEND (p, scheme_str);
1642   if (quoted_user)
1643     {
1644       APPEND (p, quoted_user);
1645       if (quoted_passwd)
1646         {
1647           *p++ = ':';
1648           APPEND (p, quoted_passwd);
1649         }
1650       *p++ = '@';
1651     }
1652
1653   APPEND (p, url->host);
1654   if (url->port != scheme_port)
1655     {
1656       *p++ = ':';
1657       p = number_to_string (p, url->port);
1658     }
1659
1660   full_path_write (url, p);
1661   p += fplen;
1662   *p++ = '\0';
1663
1664   assert (p - result == size);
1665
1666   if (quoted_user && quoted_user != url->user)
1667     xfree (quoted_user);
1668   if (quoted_passwd && !hide_password
1669       && quoted_passwd != url->passwd)
1670     xfree (quoted_passwd);
1671
1672   return result;
1673 }
1674 \f
1675 /* Returns proxy host address, in accordance with SCHEME.  */
1676 char *
1677 getproxy (enum url_scheme scheme)
1678 {
1679   char *proxy = NULL;
1680   char *rewritten_url;
1681   static char rewritten_storage[1024];
1682
1683   switch (scheme)
1684     {
1685     case SCHEME_HTTP:
1686       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1687       break;
1688 #ifdef HAVE_SSL
1689     case SCHEME_HTTPS:
1690       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1691       break;
1692 #endif
1693     case SCHEME_FTP:
1694       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1695       break;
1696     case SCHEME_INVALID:
1697       break;
1698     }
1699   if (!proxy || !*proxy)
1700     return NULL;
1701
1702   /* Handle shorthands. */
1703   rewritten_url = rewrite_shorthand_url (proxy);
1704   if (rewritten_url)
1705     {
1706       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1707       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1708       proxy = rewritten_storage;
1709     }
1710
1711   return proxy;
1712 }
1713
/* Should HOST be accessed through the proxy, as far as the NO_PROXY
   list is concerned?  Returns 1 when NO_PROXY is NULL or sufmatch()
   finds no match for HOST in it, and 0 otherwise.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1723 \f
1724 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1725 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1726                                          const char *));
1727 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1728                                                       const char *, int));
1729 static char *local_quote_string PARAMS ((const char *));
1730
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.

   FILE is read into memory, removed, and then written anew: the
   original bytes are copied through unchanged, except that each link
   whose `convert' field is not CO_NOCONVERT is replaced as requested.
   If no link needs converting, FILE is left untouched.  When
   opt.backup_converted is set and downloaded_file() reports FILE as
   downloaded, write_backup_file() is called before FILE is removed.

   Progress and the final counts ("<to-relative>-<to-complete>") are
   logged at LOG_VERBOSE; failures to read, delete or reopen FILE are
   logged at LOG_NOTQUIET and abandon the conversion.

   NOTE(review): the copy loop presumably expects LINKS to be ordered
   by ascending `pos' -- confirm with the code that builds the list.  */
void
convert_links (const char *file, struct urlpos *links)
{
  struct file_memory *fm;
  FILE *fp;
  const char *p;                /* first input byte not yet written out */
  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  {
    /* First we do a "dry run": go through the list LINKS and see
       whether any URL needs to be converted in the first place.  If
       not, just leave the file alone.  */
    int dry_count = 0;
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
        ++dry_count;
    if (!dry_count)
      {
        logputs (LOG_VERBOSE, _("nothing to do.\n"));
        return;
      }
  }

  fm = read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      return;
    }

  /* Keep a copy of the unconverted file if the user asked for it.  */
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
    {
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  if (!fp)
    {
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
      return;
    }

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  p = fm->content;
  for (link = links; link; link = link->next)
    {
      char *url_start = fm->content + link->pos;

      /* A position outside the file ends the conversion; whatever
         has not been written yet is copied verbatim after the
         loop.  */
      if (link->pos >= fm->length)
        {
          DEBUGP (("Something strange is going on.  Please investigate."));
          break;
        }
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
        {
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
          continue;
        }

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      p = url_start;

      /* Each replace_* call writes the new attribute to FP and
         returns the input position to continue from.  */
      switch (link->convert)
        {
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative. */
          {
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));
            xfree (newname);
            xfree (quoted_newname);
            ++to_file_count;
            break;
          }
        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL. */
          {
            /* NEWLINK belongs to the link's URL and is not freed
               here; only the quoted copy is.  */
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));
            xfree (quoted_newlink);
            ++to_url_count;
            break;
          }
        case CO_NULLIFY_BASE:
          /* Change the base href to "". */
          p = replace_attr (p, link->size, fp, "");
          break;
        case CO_NOCONVERT:
          /* Cannot happen: such links were skipped above.  */
          abort ();
          break;
        }
    }

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  fclose (fp);
  read_file_free (fm);

  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
1877
/* Construct and return a malloced copy of the relative link that
   leads from the file with local name S1 (the referrer) to the file
   with local name S2 (the referred).

   Examples:

     S1 "jagor.srce.hr/index.html",
     S2 "jagor.srce.hr/images/news.gif"   ->  "images/news.gif"

     S1 "fly.cc.fer.hr/ioccc/index.html",
     S2 "fly.cc.fer.hr/images/fly.gif"    ->  "../images/fly.gif"

   If S2 begins with `/', a copy of S2 is returned as is.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scan position in both strings */
  int common = 0;               /* length of the shared directory prefix */
  int ups = 0;                  /* number of "../" steps required */
  int k;
  char *out, *w;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings, one path component
     at a time.  A component is only common when both strings reach
     a '/' at the same position.  */
  for (;;)
    {
      while (s1[pos] != '\0' && s1[pos] != '/' && s1[pos] == s2[pos])
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Each directory left in S1 costs one step up.  */
  for (; s1[pos] != '\0'; pos++)
    if (s1[pos] == '/')
      ++ups;

  /* The result is "../" repeated UPS times, followed by the part of
     S2 that is not shared with S1.  */
  out = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  w = out;
  for (k = 0; k < ups; k++)
    {
      memcpy (w, "../", 3);
      w += 3;
    }
  strcpy (w, s2 + common);
  return out;
}
1931 \f
1932 static void
1933 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1934 {
1935   /* Rather than just writing over the original .html file with the
1936      converted version, save the former to *.orig.  Note we only do
1937      this for files we've _successfully_ downloaded, so we don't
1938      clobber .orig files sitting around from previous invocations. */
1939
1940   /* Construct the backup filename as the original name plus ".orig". */
1941   size_t         filename_len = strlen(file);
1942   char*          filename_plus_orig_suffix;
1943   boolean        already_wrote_backup_file = FALSE;
1944   slist*         converted_file_ptr;
1945   static slist*  converted_files = NULL;
1946
1947   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1948     {
1949       /* Just write "orig" over "html".  We need to do it this way
1950          because when we're checking to see if we've downloaded the
1951          file before (to see if we can skip downloading it), we don't
1952          know if it's a text/html file.  Therefore we don't know yet
1953          at that stage that -E is going to cause us to tack on
1954          ".html", so we need to compare vs. the original URL plus
1955          ".orig", not the original URL plus ".html.orig". */
1956       filename_plus_orig_suffix = alloca (filename_len + 1);
1957       strcpy(filename_plus_orig_suffix, file);
1958       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1959     }
1960   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1961     {
1962       /* Append ".orig" to the name. */
1963       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1964       strcpy(filename_plus_orig_suffix, file);
1965       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1966     }
1967
1968   /* We can get called twice on the same URL thanks to the
1969      convert_all_links() call in main().  If we write the .orig file
1970      each time in such a case, it'll end up containing the first-pass
1971      conversion, not the original file.  So, see if we've already been
1972      called on this file. */
1973   converted_file_ptr = converted_files;
1974   while (converted_file_ptr != NULL)
1975     if (strcmp(converted_file_ptr->string, file) == 0)
1976       {
1977         already_wrote_backup_file = TRUE;
1978         break;
1979       }
1980     else
1981       converted_file_ptr = converted_file_ptr->next;
1982
1983   if (!already_wrote_backup_file)
1984     {
1985       /* Rename <file> to <file>.orig before former gets written over. */
1986       if (rename(file, filename_plus_orig_suffix) != 0)
1987         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1988                    file, filename_plus_orig_suffix, strerror (errno));
1989
1990       /* Remember that we've already written a .orig backup for this file.
1991          Note that we never free this memory since we need it till the
1992          convert_all_links() call, which is one of the last things the
1993          program does before terminating.  BTW, I'm not sure if it would be
1994          safe to just set 'converted_file_ptr->string' to 'file' below,
1995          rather than making a copy of the string...  Another note is that I
1996          thought I could just add a field to the urlpos structure saying
1997          that we'd written a .orig file for this URL, but that didn't work,
1998          so I had to make this separate list.
1999          -- Dan Harkless <wget@harkless.org>
2000
2001          This [adding a field to the urlpos structure] didn't work
2002          because convert_file() is called from convert_all_links at
2003          the end of the retrieval with a freshly built new urlpos
2004          list.
2005          -- Hrvoje Niksic <hniksic@arsdigita.com>
2006       */
2007       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2008       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2009       converted_file_ptr->next = converted_files;
2010       converted_files = converted_file_ptr;
2011     }
2012 }
2013
2014 static int find_fragment PARAMS ((const char *, int, const char **,
2015                                   const char **));
2016
/* Write NEW_TEXT to FP as the replacement for the attribute value
   that occupies SIZE bytes starting at P.

   The value at P is either quoted:
       "...old-contents..."
       <---    size    --->  (quotes included)
   or bare:
       ...old-contents...
       <---    size   -->    (no quotes)

   The output is always quoted: with the original quote character if
   the old value was quoted, with `"' otherwise.  If the old value
   carries a fragment identifier (as located by find_fragment), it is
   appended after NEW_TEXT.

   Returns the pointer just past the original attribute value.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  const char *frag_start, *frag_stop;
  int was_quoted = (*p == '\"' || *p == '\'');
  char delim = was_quoted ? *p : '\"';

  if (was_quoted)
    {
      /* Step inside the quotes; neither the opening nor the closing
         one counts toward the contents.  */
      ++p;
      size -= 2;
    }

  putc (delim, fp);
  fputs (new_text, fp);

  /* Carry over the old value's fragment identifier, if it has one. */
  if (find_fragment (p, size, &frag_start, &frag_stop))
    fwrite (frag_start, 1, frag_stop - frag_start, fp);

  putc (delim, fp);

  /* Skip the old contents, plus the closing quote when present.  */
  return p + size + (was_quoted ? 1 : 0);
}
2055
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to new_text.

   P, SIZE, FP and the return value are as for replace_attr.  TIMEOUT
   is the refresh delay written in front of the URL.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* "0; URL=..."

     One extra byte is reserved for a '-' sign when TIMEOUT is
     negative.  Whether numdigit() accounts for the sign cannot be
     seen from here; if it doesn't, sprintf below would otherwise
     write one byte past the end of the buffer.  */
  char *new_with_timeout = (char *)alloca ((timeout < 0 ? 1 : 0)
                                           + numdigit (timeout)
                                           + 6 /* "; URL=" */
                                           + strlen (new_text)
                                           + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
2073
/* Search [BEG, BEG+SIZE) for the first '#' that does not immediately
   follow a '&'.  When such a character exists, store its address in
   *BP, store BEG+SIZE in *EP, and return 1.  Otherwise return 0 and
   leave *BP and *EP untouched.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      char ch = *cur;

      if (ch == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      /* Only a '&' directly before a '#' shields it, so the flag
         reflects nothing but the character just examined.  */
      after_amp = (ch == '&');
    }
  return 0;
}
2107
2108 /* Quote FILE for use as local reference to an HTML file.
2109
2110    We quote ? as %3F to avoid passing part of the file name as the
2111    parameter when browsing the converted file through HTTP.  However,
2112    it is safe to do this only when `--html-extension' is turned on.
2113    This is because converting "index.html?foo=bar" to
2114    "index.html%3Ffoo=bar" would break local browsing, as the latter
2115    isn't even recognized as an HTML file!  However, converting
2116    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2117    safe for both local and HTTP-served browsing.  */
2118
2119 static char *
2120 local_quote_string (const char *file)
2121 {
2122   const char *file_sans_qmark;
2123   int qm;
2124
2125   if (!opt.html_extension)
2126     return html_quote_string (file);
2127
2128   qm = count_char (file, '?');
2129
2130   if (qm)
2131     {
2132       const char *from = file;
2133       char *to, *newname;
2134
2135       /* qm * 2 because we replace each question mark with "%3F",
2136          i.e. replace one char with three, hence two more.  */
2137       int fsqlen = strlen (file) + qm * 2;
2138
2139       to = newname = (char *)alloca (fsqlen + 1);
2140       for (; *from; from++)
2141         {
2142           if (*from != '?')
2143             *to++ = *from;
2144           else
2145             {
2146               *to++ = '%';
2147               *to++ = '3';
2148               *to++ = 'F';
2149             }
2150         }
2151       assert (to - newname == fsqlen);
2152       *to = '\0';
2153
2154       file_sans_qmark = newname;
2155     }
2156   else
2157     file_sans_qmark = file;
2158
2159   return html_quote_string (file_sans_qmark);
2160 }
2161
2162 /* We're storing "modes" of type downloaded_file_t in the hash table.
2163    However, our hash tables only accept pointers for keys and values.
2164    So when we need a pointer, we use the address of a
2165    downloaded_file_t variable of static storage.  */
2166
2167 static downloaded_file_t *
2168 downloaded_mode_to_ptr (downloaded_file_t mode)
2169 {
2170   static downloaded_file_t
2171     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2172     v2 = FILE_DOWNLOADED_NORMALLY,
2173     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2174     v4 = CHECK_FOR_FILE;
2175
2176   switch (mode)
2177     {
2178     case FILE_NOT_ALREADY_DOWNLOADED:
2179       return &v1;
2180     case FILE_DOWNLOADED_NORMALLY:
2181       return &v2;
2182     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2183       return &v3;
2184     case CHECK_FOR_FILE:
2185       return &v4;
2186     }
2187   return NULL;
2188 }
2189
2190 /* This should really be merged with dl_file_url_map and
2191    downloaded_html_files in recur.c.  This was originally a list, but
2192    I changed it to a hash table beause it was actually taking a lot of
2193    time to find things in it.  */
2194
2195 static struct hash_table *downloaded_files_hash;
2196
2197 /* Remembers which files have been downloaded.  In the standard case, should be
2198    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2199    download successfully (i.e. not for ones we have failures on or that we skip
2200    due to -N).
2201
2202    When we've downloaded a file and tacked on a ".html" extension due to -E,
2203    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2204    FILE_DOWNLOADED_NORMALLY.
2205
2206    If you just want to check if a file has been previously added without adding
2207    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2208    with local filenames, not remote URLs. */
2209 downloaded_file_t
2210 downloaded_file (downloaded_file_t mode, const char *file)
2211 {
2212   downloaded_file_t *ptr;
2213
2214   if (mode == CHECK_FOR_FILE)
2215     {
2216       if (!downloaded_files_hash)
2217         return FILE_NOT_ALREADY_DOWNLOADED;
2218       ptr = hash_table_get (downloaded_files_hash, file);
2219       if (!ptr)
2220         return FILE_NOT_ALREADY_DOWNLOADED;
2221       return *ptr;
2222     }
2223
2224   if (!downloaded_files_hash)
2225     downloaded_files_hash = make_string_hash_table (0);
2226
2227   ptr = hash_table_get (downloaded_files_hash, file);
2228   if (ptr)
2229     return *ptr;
2230
2231   ptr = downloaded_mode_to_ptr (mode);
2232   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2233
2234   return FILE_NOT_ALREADY_DOWNLOADED;
2235 }
2236
/* Callback handed to hash_table_map by downloaded_files_free: release
   the KEY of one downloaded_files_hash entry.  VALUE and IGNORED are
   not used; the values in that table are not heap-allocated, so only
   the key needs freeing.  Always returns 0 (presumably telling
   hash_table_map to keep going -- confirm against hash.c).  */

static int
df_free_mapper (void *key, void *value, void *ignored)
{
  (void) value;
  (void) ignored;

  xfree (key);
  return 0;
}
2243
2244 void
2245 downloaded_files_free (void)
2246 {
2247   if (downloaded_files_hash)
2248     {
2249       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2250       hash_table_destroy (downloaded_files_hash);
2251       downloaded_files_hash = NULL;
2252     }
2253 }