sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
  51 static int urlpath_length PARAMS ((const char *));
  52
  53 struct scheme_data  /* One row of the supported-scheme table below.  */
  54 {
  55   char *leading_string;  /* URL prefix that selects the scheme, e.g. "http://"; NULL in the terminating row.  */
  56   int default_port;  /* Port url_parse starts from; replaced when the URL carries an explicit ":port".  */
  57   int enabled;  /* Non-zero while the scheme is usable; cleared by scheme_disable, after which url_scheme reports SCHEME_INVALID for it.  */
  58 };
  59
  60 /* Supported schemes.  url_scheme returns the index of the matching row cast to enum url_scheme, so the row order presumably mirrors that enum's declaration (not visible here -- confirm).  */
  61 static struct scheme_data supported_schemes[] =
  62 {
  63   { "http://",  DEFAULT_HTTP_PORT,  1 },
  64 #ifdef HAVE_SSL
  65   { "https://", DEFAULT_HTTPS_PORT, 1 },
  66 #endif
  67   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  68
  69   /* SCHEME_INVALID: the NULL leading_string is what ends the search loop in url_scheme.  */
  70   { NULL,       -1,                 0 }
  71 };
  72
  73 static char *construct_relative PARAMS ((const char *, const char *));
  74
  75 \f
  76 /* Support for encoding and decoding of URL strings.  We determine
  77    whether a character is unsafe through static table lookup.  This
  78    code assumes ASCII character set and 8-bit chars.  */
  79
  80 enum {
  81   urlchr_reserved = 1,
  82   urlchr_unsafe   = 2
  83 };
  84
  85 #define R  urlchr_reserved
  86 #define U  urlchr_unsafe
  87 #define RU R|U
  88
  89 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  90
  91 /* rfc1738 reserved chars, preserved from encoding.  */
  92
  93 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  94
  95 /* rfc1738 unsafe chars, plus some more.  */
  96
  97 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  98
  99 const static unsigned char urlchr_table[256] =
 100 {
 101   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 104   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 105   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 106   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 107   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 108   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 109  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 110   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 111   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 112   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 113   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 114   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 115   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 116   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 117
 118   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 122
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 127 };
 128
 129 /* Decodes the forms %xy in a URL to the character the hexadecimal
 130    code of which is xy.  xy are hexadecimal digits from
 131    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 132    hex-digits or `%' precedes `\0', the sequence is inserted
 133    literally.  */
 134
 135 static void
 136 decode_string (char *s)
 137 {
 138   char *t = s;                  /* t - tortoise */
 139   char *h = s;                  /* h - hare     */
 140
 141   for (; *h; h++, t++)
 142     {
 143       if (*h != '%')
 144         {
 145         copychar:
 146           *t = *h;
 147         }
 148       else
 149         {
 150           /* Do nothing if '%' is not followed by two hex digits. */
 151           if (!*(h + 1) || !*(h + 2)
 152               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 153             goto copychar;
 154           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 155           h += 2;
 156         }
 157     }
 158   *t = '\0';
 159 }
 160
 161 /* Like encode_string, but return S if there are no unsafe chars.  */
 162
 163 static char *
 164 encode_string_maybe (const char *s)
 165 {
 166   const char *p1;
 167   char *p2, *newstr;
 168   int newlen;
 169   int addition = 0;
 170
 171   for (p1 = s; *p1; p1++)
 172     if (UNSAFE_CHAR (*p1))
 173       addition += 2;            /* Two more characters (hex digits) */
 174
 175   if (!addition)
 176     return (char *)s;
 177
 178   newlen = (p1 - s) + addition;
 179   newstr = (char *)xmalloc (newlen + 1);
 180
 181   p1 = s;
 182   p2 = newstr;
 183   while (*p1)
 184     {
 185       if (UNSAFE_CHAR (*p1))
 186         {
 187           unsigned char c = *p1++;
 188           *p2++ = '%';
 189           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 190           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 191         }
 192       else
 193         *p2++ = *p1++;
 194     }
 195   *p2 = '\0';
 196   assert (p2 - newstr == newlen);
 197
 198   return newstr;
 199 }
 200
 201 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 202    given string, returning a malloc-ed %XX encoded string.  */
 203
 204 char *
 205 encode_string (const char *s)
 206 {
 207   char *encoded = encode_string_maybe (s);
 208   if (encoded != s)
 209     return encoded;
 210   else
 211     return xstrdup (s);
 212 }
 213
 214 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 215    the old value of PTR is freed and PTR is made to point to the newly
 216    allocated storage.  */
 217
 218 #define ENCODE(ptr) do {                        \
 219   char *e_new = encode_string_maybe (ptr);      \
 220   if (e_new != ptr)                             \
 221     {                                           \
 222       xfree (ptr);                              \
 223       ptr = e_new;                              \
 224     }                                           \
 225 } while (0)
 226 \f
 227 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 228
 229 /* Decide whether to encode, decode, or pass through the char at P.
 230    This used to be a macro, but it got a little too convoluted.  */
 231 static inline enum copy_method
 232 decide_copy_method (const char *p)
 233 {
 234   if (*p == '%')
 235     {
 236       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 237         {
 238           /* %xx sequence: decode it, unless it would decode to an
 239              unsafe or a reserved char; in that case, leave it as
 240              is. */
 241           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 242             XCHAR_TO_XDIGIT (*(p + 2));
 243
 244           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 245             return CM_PASSTHROUGH;
 246           else
 247             return CM_DECODE;
 248         }
 249       else
 250         /* Garbled %.. sequence: encode `%'. */
 251         return CM_ENCODE;
 252     }
 253   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 254     return CM_ENCODE;
 255   else
 256     return CM_PASSTHROUGH;
 257 }
 258
 259 /* Translate a %-quoting (but possibly non-conformant) input string S
 260    into a %-quoting (and conformant) output string.  If no characters
 261    are encoded or decoded, return the same string S; otherwise, return
 262    a freshly allocated string with the new contents.
 263
 264    After a URL has been run through this function, the protocols that
 265    use `%' as the quote character can use the resulting string as-is,
 266    while those that don't call decode_string() to get to the intended
 267    data.  This function is also stable: after an input string is
 268    transformed the first time, all further transformations of the
 269    result yield the same result string.
 270
 271    Let's discuss why this function is needed.
 272
 273    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 274    space character would mess up the HTTP request, it needs to be
 275    quoted, like this:
 276
 277        GET /abc%20def HTTP/1.0
 278
 279    So it appears that the unsafe chars need to be quoted, as with
 280    encode_string.  But what if we're requested to download
 281    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 282    the user meant was a literal space, and he was kind enough to quote
 283    it.  In that case, Wget should obviously leave the `%20' as is, and
 284    send the same request as above.  So in this case we may not call
 285    encode_string.
 286
 287    But what if the requested URI is `abc%20 def'?  If we call
 288    encode_string, we end up with `/abc%2520%20def', which is almost
 289    certainly not intended.  If we don't call encode_string, we are
 290    left with the embedded space and cannot send the request.  What the
 291    user meant was for Wget to request `/abc%20%20def', and this is
 292    where reencode_string kicks in.
 293
 294    Wget used to solve this by first decoding %-quotes, and then
 295    encoding all the "unsafe" characters found in the resulting string.
 296    This was wrong because it didn't preserve certain URL special
 297    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 298    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 299    whether we considered `+' reserved (it is).  One of these results
 300    is inevitable because by the second step we would lose information
 301    on whether the `+' was originally encoded or not.  Both results
 302    were wrong because in CGI parameters + means space, while %2B means
 303    literal plus.  reencode_string correctly translates the above to
 304    "a%2B+b", i.e. returns the original string.
 305
 306    This function uses an algorithm proposed by Anon Sricharoenchai:
 307
 308    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 309       hexdigits.
 310
 311    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 312       "+".
 313
 314    ...except that this code conflates the two steps, and decides
 315    whether to encode, decode, or pass through each character in turn.
 316    The function still uses two passes, but their logic is the same --
 317    the first pass exists merely for the sake of allocation.  Another
 318    small difference is that we include `+' to URL_RESERVED.
 319
 320    Anon's test case:
 321
 322    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 323    ->
 324    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 325
 326    Simpler test cases:
 327
 328    "foo bar"         -> "foo%20bar"
 329    "foo%20bar"       -> "foo%20bar"
 330    "foo %20bar"      -> "foo%20%20bar"
 331    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 332    "foo%25%20bar"    -> "foo%25%20bar"
 333    "foo%2%20bar"     -> "foo%252%20bar"
 334    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 335    "foo%2b+bar"      -> "foo%2b+bar"  */
 336
 337 static char *
 338 reencode_string (const char *s)
 339 {
 340   const char *p1;
 341   char *newstr, *p2;
 342   int oldlen, newlen;
 343
 344   int encode_count = 0;
 345   int decode_count = 0;
 346
 347   /* First, pass through the string to see if there's anything to do,
 348      and to calculate the new length.  */
 349   for (p1 = s; *p1; p1++)
 350     {
 351       switch (decide_copy_method (p1))
 352         {
 353         case CM_ENCODE:
 354           ++encode_count;
 355           break;
 356         case CM_DECODE:
 357           ++decode_count;
 358           break;
 359         case CM_PASSTHROUGH:
 360           break;
 361         }
 362     }
 363
 364   if (!encode_count && !decode_count)
 365     /* The string is good as it is. */
 366     return (char *)s;           /* C const model sucks. */
 367
 368   oldlen = p1 - s;
 369   /* Each encoding adds two characters (hex digits), while each
 370      decoding removes two characters.  */
 371   newlen = oldlen + 2 * (encode_count - decode_count);
 372   newstr = xmalloc (newlen + 1);
 373
 374   p1 = s;
 375   p2 = newstr;
 376
 377   while (*p1)
 378     {
 379       switch (decide_copy_method (p1))
 380         {
 381         case CM_ENCODE:
 382           {
 383             unsigned char c = *p1++;
 384             *p2++ = '%';
 385             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 386             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 387           }
 388           break;
 389         case CM_DECODE:
 390           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 391                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 392           p1 += 3;              /* skip %xx */
 393           break;
 394         case CM_PASSTHROUGH:
 395           *p2++ = *p1++;
 396         }
 397     }
 398   *p2 = '\0';
 399   assert (p2 - newstr == newlen);
 400   return newstr;
 401 }
 402
 403 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 404    free PTR_VAR and make it point to the new storage.  Obviously,
 405    PTR_VAR needs to be an lvalue.  */
 406
 407 #define REENCODE(ptr_var) do {                  \
 408   char *rf_new = reencode_string (ptr_var);     \
 409   if (rf_new != ptr_var)                        \
 410     {                                           \
 411       xfree (ptr_var);                          \
 412       ptr_var = rf_new;                         \
 413     }                                           \
 414 } while (0)
 415 \f
 416 /* Returns the scheme type if the scheme is supported, or
 417    SCHEME_INVALID if not.  */
 418 enum url_scheme
 419 url_scheme (const char *url)
 420 {
 421   int i;
 422
 423   for (i = 0; supported_schemes[i].leading_string; i++)
 424     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 425                           strlen (supported_schemes[i].leading_string)))
 426       {
 427         if (supported_schemes[i].enabled)
 428           return (enum url_scheme) i;
 429         else
 430           return SCHEME_INVALID;
 431       }
 432
 433   return SCHEME_INVALID;
 434 }
 435
 436 /* Return the number of characters needed to skip the scheme part of
 437    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 438 int
 439 url_skip_scheme (const char *url)
 440 {
 441   const char *p = url;
 442
 443   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 444      etc. */
 445   while (ISALNUM (*p) || *p == '-' || *p == '+')
 446     ++p;
 447   if (*p != ':')
 448     return 0;
 449   /* Skip ':'. */
 450   ++p;
 451
 452   /* Skip "//" if found. */
 453   if (*p == '/' && *(p + 1) == '/')
 454     p += 2;
 455
 456   return p - url;
 457 }
 458
 459 /* Returns 1 if the URL begins with a scheme (supported or
 460    unsupported), 0 otherwise.  */
 461 int
 462 url_has_scheme (const char *url)
 463 {
 464   const char *p = url;
 465   while (ISALNUM (*p) || *p == '-' || *p == '+')
 466     ++p;
 467   return *p == ':';
 468 }
 469
 470 int
 471 scheme_default_port (enum url_scheme scheme)
 472 {
 473   return supported_schemes[scheme].default_port;
 474 }
 475
 476 void
 477 scheme_disable (enum url_scheme scheme)
 478 {
 479   supported_schemes[scheme].enabled = 0;
 480 }
 481
 482 /* Skip the username and password, if present here.  The function
 483    should be called *not* with the complete URL, but with the part
 484    right after the scheme.
 485
 486    If no username and password are found, return 0.  */
 487 int
 488 url_skip_uname (const char *url)
 489 {
 490   const char *p;
 491
 492   /* Look for '@' that comes before '/' or '?'. */
 493   p = (const char *)strpbrk (url, "/?@");
 494   if (!p || *p != '@')
 495     return 0;
 496
 497   return p - url + 1;
 498 }
 499
 500 static int
 501 parse_uname (const char *str, int len, char **user, char **passwd)
 502 {
 503   char *colon;
 504
 505   if (len == 0)
 506     /* Empty user name not allowed. */
 507     return 0;
 508
 509   colon = memchr (str, ':', len);
 510   if (colon == str)
 511     /* Empty user name again. */
 512     return 0;
 513
 514   if (colon)
 515     {
 516       int pwlen = len - (colon + 1 - str);
 517       *passwd = xmalloc (pwlen + 1);
 518       memcpy (*passwd, colon + 1, pwlen);
 519       (*passwd)[pwlen] = '\0';
 520       len -= pwlen + 1;
 521     }
 522   else
 523     *passwd = NULL;
 524
 525   *user = xmalloc (len + 1);
 526   memcpy (*user, str, len);
 527   (*user)[len] = '\0';
 528
 529   return 1;
 530 }
 531
 532 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 533    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 534
 535    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 536    www.foo.com[:port]            -> http://www.foo.com[:port]
 537
 538    FTP shorthands look like this:
 539
 540    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 541    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 542
 543    If the URL needs not or cannot be rewritten, return NULL.  */
 544 char *
 545 rewrite_shorthand_url (const char *url)
 546 {
 547   const char *p;
 548
 549   if (url_has_scheme (url))
 550     return NULL;
 551
 552   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 553      latter Netscape.  */
 554   for (p = url; *p && *p != ':' && *p != '/'; p++)
 555     ;
 556
 557   if (p == url)
 558     return NULL;
 559
 560   if (*p == ':')
 561     {
 562       const char *pp;
 563       char *res;
 564       /* If the characters after the colon and before the next slash
 565          or end of string are all digits, it's HTTP.  */
 566       int digits = 0;
 567       for (pp = p + 1; ISDIGIT (*pp); pp++)
 568         ++digits;
 569       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 570         goto http;
 571
 572       /* Prepend "ftp://" to the entire URL... */
 573       res = xmalloc (6 + strlen (url) + 1);
 574       sprintf (res, "ftp://%s", url);
 575       /* ...and replace ':' with '/'. */
 576       res[6 + (p - url)] = '/';
 577       return res;
 578     }
 579   else
 580     {
 581       char *res;
 582     http:
 583       /* Just prepend "http://" to what we have. */
 584       res = xmalloc (7 + strlen (url) + 1);
 585       sprintf (res, "http://%s", url);
 586       return res;
 587     }
 588 }
 589 \f
 590 static void parse_path PARAMS ((const char *, char **, char **));
 591
 592 static char *
 593 strpbrk_or_eos (const char *s, const char *accept)
 594 {
 595   char *p = strpbrk (s, accept);
 596   if (!p)
 597     p = (char *)s + strlen (s);
 598   return p;
 599 }
 600
 601 /* Turn STR into lowercase; return non-zero if a character was
 602    actually changed. */
 603
 604 static int
 605 lowercase_str (char *str)
 606 {
 607   int change = 0;
 608   for (; *str; str++)
 609     if (ISUPPER (*str))
 610       {
 611         change = 1;
 612         *str = TOLOWER (*str);
 613       }
 614   return change;
 615 }
 616
 617 static char *parse_errors[] = {
 618 #define PE_NO_ERROR            0
 619   "No error",
 620 #define PE_UNSUPPORTED_SCHEME 1
 621   "Unsupported scheme",
 622 #define PE_EMPTY_HOST          2
 623   "Empty host",
 624 #define PE_BAD_PORT_NUMBER     3
 625   "Bad port number",
 626 #define PE_INVALID_USER_NAME   4
 627   "Invalid user name"
 628 };
 629
 630 #define SETERR(p, v) do {                       \
 631   if (p)                                        \
 632     *(p) = (v);                                 \
 633 } while (0)
 634
 635 /* Parse a URL.
 636
 637    Return a new struct url if successful, NULL on error.  In case of
 638    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 639    error code. */
 640 struct url *
 641 url_parse (const char *url, int *error)
 642 {
 643   struct url *u;
 644   const char *p;
 645   int path_modified, host_modified;
 646
 647   enum url_scheme scheme;
 648
 649   const char *uname_b,     *uname_e;
 650   const char *host_b,      *host_e;
 651   const char *path_b,      *path_e;
 652   const char *params_b,    *params_e;
 653   const char *query_b,     *query_e;
 654   const char *fragment_b,  *fragment_e;
 655
 656   int port;
 657   char *user = NULL, *passwd = NULL;
 658
 659   char *url_encoded;
 660
 661   scheme = url_scheme (url);
 662   if (scheme == SCHEME_INVALID)
 663     {
 664       SETERR (error, PE_UNSUPPORTED_SCHEME);
 665       return NULL;
 666     }
 667
 668   url_encoded = reencode_string (url);
 669   p = url_encoded;
 670
 671   p += strlen (supported_schemes[scheme].leading_string);
 672   uname_b = p;
 673   p += url_skip_uname (p);
 674   uname_e = p;
 675
 676   /* scheme://user:pass@host[:port]... */
 677   /*                    ^              */
 678
 679   /* We attempt to break down the URL into the components path,
 680      params, query, and fragment.  They are ordered like this:
 681
 682        scheme://host[:port][/path][;params][?query][#fragment]  */
 683
 684   params_b   = params_e   = NULL;
 685   query_b    = query_e    = NULL;
 686   fragment_b = fragment_e = NULL;
 687
 688   host_b = p;
 689   p = strpbrk_or_eos (p, ":/;?#");
 690   host_e = p;
 691
 692   if (host_b == host_e)
 693     {
 694       SETERR (error, PE_EMPTY_HOST);
 695       return NULL;
 696     }
 697
 698   port = scheme_default_port (scheme);
 699   if (*p == ':')
 700     {
 701       const char *port_b, *port_e, *pp;
 702
 703       /* scheme://host:port/tralala */
 704       /*              ^             */
 705       ++p;
 706       port_b = p;
 707       p = strpbrk_or_eos (p, "/;?#");
 708       port_e = p;
 709
 710       if (port_b == port_e)
 711         {
 712           /* http://host:/whatever */
 713           /*             ^         */
 714           SETERR (error, PE_BAD_PORT_NUMBER);
 715           return NULL;
 716         }
 717
 718       for (port = 0, pp = port_b; pp < port_e; pp++)
 719         {
 720           if (!ISDIGIT (*pp))
 721             {
 722               /* http://host:12randomgarbage/blah */
 723               /*               ^                  */
 724               SETERR (error, PE_BAD_PORT_NUMBER);
 725               return NULL;
 726             }
 727           port = 10 * port + (*pp - '0');
 728         }
 729     }
 730
 731   if (*p == '/')
 732     {
 733       ++p;
 734       path_b = p;
 735       p = strpbrk_or_eos (p, ";?#");
 736       path_e = p;
 737     }
 738   else
 739     {
 740       /* Path is not allowed not to exist. */
 741       path_b = path_e = p;
 742     }
 743
 744   if (*p == ';')
 745     {
 746       ++p;
 747       params_b = p;
 748       p = strpbrk_or_eos (p, "?#");
 749       params_e = p;
 750     }
 751   if (*p == '?')
 752     {
 753       ++p;
 754       query_b = p;
 755       p = strpbrk_or_eos (p, "#");
 756       query_e = p;
 757     }
 758   if (*p == '#')
 759     {
 760       ++p;
 761       fragment_b = p;
 762       p += strlen (p);
 763       fragment_e = p;
 764     }
 765   assert (*p == 0);
 766
 767   if (uname_b != uname_e)
 768     {
 769       /* http://user:pass@host */
 770       /*        ^         ^    */
 771       /*     uname_b   uname_e */
 772       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 773         {
 774           SETERR (error, PE_INVALID_USER_NAME);
 775           return NULL;
 776         }
 777     }
 778
 779   u = (struct url *)xmalloc (sizeof (struct url));
 780   memset (u, 0, sizeof (*u));
 781
 782   u->scheme = scheme;
 783   u->host   = strdupdelim (host_b, host_e);
 784   u->port   = port;
 785   u->user   = user;
 786   u->passwd = passwd;
 787
 788   u->path = strdupdelim (path_b, path_e);
 789   path_modified = path_simplify (u->path);
 790   parse_path (u->path, &u->dir, &u->file);
 791
 792   host_modified = lowercase_str (u->host);
 793
 794   if (params_b)
 795     u->params = strdupdelim (params_b, params_e);
 796   if (query_b)
 797     u->query = strdupdelim (query_b, query_e);
 798   if (fragment_b)
 799     u->fragment = strdupdelim (fragment_b, fragment_e);
 800
 801   if (path_modified || u->fragment || host_modified || path_b == path_e)
 802     {
 803       /* If we suspect that a transformation has rendered what
 804          url_string might return different from URL_ENCODED, rebuild
 805          u->url using url_string.  */
 806       u->url = url_string (u, 0);
 807
 808       if (url_encoded != url)
 809         xfree ((char *) url_encoded);
 810     }
 811   else
 812     {
 813       if (url_encoded == url)
 814         u->url    = xstrdup (url);
 815       else
 816         u->url    = url_encoded;
 817     }
 818   url_encoded = NULL;
 819
 820   return u;
 821 }
 822
 823 const char *
 824 url_error (int error_code)
 825 {
 826   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 827   return parse_errors[error_code];
 828 }
 829
 830 static void
 831 parse_path (const char *quoted_path, char **dir, char **file)
 832 {
 833   char *path, *last_slash;
 834
 835   STRDUP_ALLOCA (path, quoted_path);
 836   decode_string (path);
 837
 838   last_slash = strrchr (path, '/');
 839   if (!last_slash)
 840     {
 841       *dir = xstrdup ("");
 842       *file = xstrdup (path);
 843     }
 844   else
 845     {
 846       *dir = strdupdelim (path, last_slash);
 847       *file = xstrdup (last_slash + 1);
 848     }
 849 }
 850
 851 /* Note: URL's "full path" is the path with the query string and
 852    params appended.  The "fragment" (#foo) is intentionally ignored,
 853    but that might be changed.  For example, if the original URL was
 854    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 855    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 856
 857 /* Return the length of the full path, without the terminating
 858    zero.  */
 859
 860 static int
 861 full_path_length (const struct url *url)
 862 {
 863   int len = 0;
 864
 865 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 866
 867   FROB (path);
 868   FROB (params);
 869   FROB (query);
 870
 871 #undef FROB
 872
 873   return len;
 874 }
 875
 876 /* Write out the full path. */
 877
 878 static void
 879 full_path_write (const struct url *url, char *where)
 880 {
 881 #define FROB(el, chr) do {                      \
 882   char *f_el = url->el;                         \
 883   if (f_el) {                                   \
 884     int l = strlen (f_el);                      \
 885     *where++ = chr;                             \
 886     memcpy (where, f_el, l);                    \
 887     where += l;                                 \
 888   }                                             \
 889 } while (0)
 890
 891   FROB (path, '/');
 892   FROB (params, ';');
 893   FROB (query, '?');
 894
 895 #undef FROB
 896 }
 897
 898 /* Public function for getting the "full path".  E.g. if u->path is
 899    "foo/bar" and u->query is "param=value", full_path will be
 900    "/foo/bar?param=value". */
 901
 902 char *
 903 url_full_path (const struct url *url)
 904 {
 905   int length = full_path_length (url);
 906   char *full_path = (char *)xmalloc(length + 1);
 907
 908   full_path_write (url, full_path);
 909   full_path[length] = '\0';
 910
 911   return full_path;
 912 }
 913
 914 /* Sync u->path and u->url with u->dir and u->file. */
 915
 916 static void
 917 sync_path (struct url *url)
 918 {
 919   char *newpath;
 920
 921   xfree (url->path);
 922
 923   if (!*url->dir)
 924     {
 925       newpath = xstrdup (url->file);
 926       REENCODE (newpath);
 927     }
 928   else
 929     {
 930       int dirlen = strlen (url->dir);
 931       int filelen = strlen (url->file);
 932
 933       newpath = xmalloc (dirlen + 1 + filelen + 1);
 934       memcpy (newpath, url->dir, dirlen);
 935       newpath[dirlen] = '/';
 936       memcpy (newpath + dirlen + 1, url->file, filelen);
 937       newpath[dirlen + 1 + filelen] = '\0';
 938       REENCODE (newpath);
 939     }
 940
 941   url->path = newpath;
 942
 943   /* Synchronize u->url. */
 944   xfree (url->url);
 945   url->url = url_string (url, 0);
 946 }
 947
 948 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 949    This way we can sync u->path and u->url when they get changed.  */
 950
 951 void
 952 url_set_dir (struct url *url, const char *newdir)
 953 {
 954   xfree (url->dir);
 955   url->dir = xstrdup (newdir);
 956   sync_path (url);
 957 }
 958
 959 void
 960 url_set_file (struct url *url, const char *newfile)
 961 {
 962   xfree (url->file);
 963   url->file = xstrdup (newfile);
 964   sync_path (url);
 965 }
 966
 967 void
 968 url_free (struct url *url)
 969 {
 970   xfree (url->host);
 971   xfree (url->path);
 972   xfree (url->url);
 973
 974   FREE_MAYBE (url->params);
 975   FREE_MAYBE (url->query);
 976   FREE_MAYBE (url->fragment);
 977   FREE_MAYBE (url->user);
 978   FREE_MAYBE (url->passwd);
 979
 980   xfree (url->dir);
 981   xfree (url->file);
 982
 983   xfree (url);
 984 }
 985 \f
 986 struct urlpos *
 987 get_urls_file (const char *file)
 988 {
 989   struct file_memory *fm;
 990   struct urlpos *head, *tail;
 991   const char *text, *text_end;
 992
 993   /* Load the file.  */
 994   fm = read_file (file);
 995   if (!fm)
 996     {
 997       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 998       return NULL;
 999     }
1000   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1001   head = tail = NULL;
1002   text = fm->content;
1003   text_end = fm->content + fm->length;
1004   while (text < text_end)
1005     {
1006       const char *line_beg = text;
1007       const char *line_end = memchr (text, '\n', text_end - text);
1008       if (!line_end)
1009         line_end = text_end;
1010       else
1011         ++line_end;
1012       text = line_end;
1013       while (line_beg < line_end
1014              && ISSPACE (*line_beg))
1015         ++line_beg;
1016       while (line_end > line_beg + 1
1017              && ISSPACE (*(line_end - 1)))
1018         --line_end;
1019       if (line_end > line_beg)
1020         {
1021           /* URL is in the [line_beg, line_end) region. */
1022
1023           int up_error_code;
1024           char *url_text;
1025           struct urlpos *entry;
1026           struct url *url;
1027
1028           /* We must copy the URL to a zero-terminated string, and we
1029              can't use alloca because we're in a loop.  *sigh*.  */
1030           url_text = strdupdelim (line_beg, line_end);
1031
1032           if (opt.base_href)
1033             {
1034               /* Merge opt.base_href with URL. */
1035               char *merged = uri_merge (opt.base_href, url_text);
1036               xfree (url_text);
1037               url_text = merged;
1038             }
1039
1040           url = url_parse (url_text, &up_error_code);
1041           if (!url)
1042             {
1043               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1044                          file, url_text, url_error (up_error_code));
1045               xfree (url_text);
1046               continue;
1047             }
1048           xfree (url_text);
1049
1050           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1051           memset (entry, 0, sizeof (*entry));
1052           entry->next = NULL;
1053           entry->url = url;
1054
1055           if (!head)
1056             head = entry;
1057           else
1058             tail->next = entry;
1059           tail = entry;
1060         }
1061     }
1062   read_file_free (fm);
1063   return head;
1064 }
1065 \f
1066 /* Free the linked list of urlpos.  */
1067 void
1068 free_urlpos (struct urlpos *l)
1069 {
1070   while (l)
1071     {
1072       struct urlpos *next = l->next;
1073       if (l->url)
1074         url_free (l->url);
1075       FREE_MAYBE (l->local_name);
1076       xfree (l);
1077       l = next;
1078     }
1079 }
1080
1081 /* Rotate FNAME opt.backups times */
1082 void
1083 rotate_backups(const char *fname)
1084 {
1085   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1086   char *from = (char *)alloca (maxlen);
1087   char *to = (char *)alloca (maxlen);
1088   struct stat sb;
1089   int i;
1090
1091   if (stat (fname, &sb) == 0)
1092     if (S_ISREG (sb.st_mode) == 0)
1093       return;
1094
1095   for (i = opt.backups; i > 1; i--)
1096     {
1097       sprintf (from, "%s.%d", fname, i - 1);
1098       sprintf (to, "%s.%d", fname, i);
1099       /* #### This will fail on machines without the rename() system
1100          call.  */
1101       rename (from, to);
1102     }
1103
1104   sprintf (to, "%s.%d", fname, 1);
1105   rename(fname, to);
1106 }
1107
1108 /* Create all the necessary directories for PATH (a file).  Calls
1109    mkdirhier() internally.  */
1110 int
1111 mkalldirs (const char *path)
1112 {
1113   const char *p;
1114   char *t;
1115   struct stat st;
1116   int res;
1117
1118   p = path + strlen (path);
1119   for (; *p != '/' && p != path; p--);
1120   /* Don't create if it's just a file.  */
1121   if ((p == path) && (*p != '/'))
1122     return 0;
1123   t = strdupdelim (path, p);
1124   /* Check whether the directory exists.  */
1125   if ((stat (t, &st) == 0))
1126     {
1127       if (S_ISDIR (st.st_mode))
1128         {
1129           xfree (t);
1130           return 0;
1131         }
1132       else
1133         {
1134           /* If the dir exists as a file name, remove it first.  This
1135              is *only* for Wget to work with buggy old CERN http
1136              servers.  Here is the scenario: When Wget tries to
1137              retrieve a directory without a slash, e.g.
1138              http://foo/bar (bar being a directory), CERN server will
1139              not redirect it too http://foo/bar/ -- it will generate a
1140              directory listing containing links to bar/file1,
1141              bar/file2, etc.  Wget will lose because it saves this
1142              HTML listing to a file `bar', so it cannot create the
1143              directory.  To work around this, if the file of the same
1144              name exists, we just remove it and create the directory
1145              anyway.  */
1146           DEBUGP (("Removing %s because of directory danger!\n", t));
1147           unlink (t);
1148         }
1149     }
1150   res = make_directory (t);
1151   if (res != 0)
1152     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1153   xfree (t);
1154   return res;
1155 }
1156
/* Return the number of '/' characters in the zero-terminated
   string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *p;

  for (p = s; *p != '\0'; p++)
    total += (*p == '/');
  return total;
}
1166
1167 /* Return the path name of the URL-equivalent file name, with a
1168    remote-like structure of directories.  */
1169 static char *
1170 mkstruct (const struct url *u)
1171 {
1172   char *dir, *dir_preencoding;
1173   char *file, *res, *dirpref;
1174   char *query = u->query && *u->query ? u->query : NULL;
1175   int l;
1176
1177   if (opt.cut_dirs)
1178     {
1179       char *ptr = u->dir + (*u->dir == '/');
1180       int slash_count = 1 + count_slashes (ptr);
1181       int cut = MINVAL (opt.cut_dirs, slash_count);
1182       for (; cut && *ptr; ptr++)
1183         if (*ptr == '/')
1184           --cut;
1185       STRDUP_ALLOCA (dir, ptr);
1186     }
1187   else
1188     dir = u->dir + (*u->dir == '/');
1189
1190   /* Check for the true name (or at least a consistent name for saving
1191      to directory) of HOST, reusing the hlist if possible.  */
1192   if (opt.add_hostdir)
1193     {
1194       /* Add dir_prefix and hostname (if required) to the beginning of
1195          dir.  */
1196       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1197                                 + strlen (u->host)
1198                                 + 1 + numdigit (u->port)
1199                                 + 1);
1200       if (!DOTP (opt.dir_prefix))
1201         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1202       else
1203         strcpy (dirpref, u->host);
1204
1205       if (u->port != scheme_default_port (u->scheme))
1206         {
1207           int len = strlen (dirpref);
1208           dirpref[len] = ':';
1209           long_to_string (dirpref + len + 1, u->port);
1210         }
1211     }
1212   else                          /* not add_hostdir */
1213     {
1214       if (!DOTP (opt.dir_prefix))
1215         dirpref = opt.dir_prefix;
1216       else
1217         dirpref = "";
1218     }
1219
1220   /* If there is a prefix, prepend it.  */
1221   if (*dirpref)
1222     {
1223       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1224       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1225       dir = newdir;
1226     }
1227
1228   dir_preencoding = dir;
1229   dir = reencode_string (dir_preencoding);
1230
1231   l = strlen (dir);
1232   if (l && dir[l - 1] == '/')
1233     dir[l - 1] = '\0';
1234
1235   if (!*u->file)
1236     file = "index.html";
1237   else
1238     file = u->file;
1239
1240   /* Finally, construct the full name.  */
1241   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1242                          + (query ? (1 + strlen (query)) : 0)
1243                          + 1);
1244   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1245   if (query)
1246     {
1247       strcat (res, "?");
1248       strcat (res, query);
1249     }
1250   if (dir != dir_preencoding)
1251     xfree (dir);
1252   return res;
1253 }
1254
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string (may be NULL).  Unsafe characters in BASE
   are %-escaped, and slashes in QUERY are written as "%2F"; BASE and
   QUERY are joined with '?'.

   The name is assembled in a fixed buffer and is therefore limited to
   sizeof (result) - 1 characters.  Longer input, presumably due to a
   huge query string, is truncated.  Every write is checked against
   the buffer, and a three-character escape is emitted only when it
   fits completely, so the result never ends in a partial escape.

   Returns a copy made with xstrdup, owned by the caller.  */

static char *
compose_file_name (char *base, char *query)
{
  char result[256];
  char *from;
  char *to = result;
  /* First position that must stay free for the terminating '\0'.  */
  char *const limit = result + sizeof (result) - 1;
  int truncated = 0;

  /* Copy BASE to RESULT and encode all unsafe characters.  */
  for (from = base; *from && !truncated; from++)
    {
      if (UNSAFE_CHAR (*from))
        {
          unsigned char c = *from;
          if (limit - to < 3)
            truncated = 1;
          else
            {
              *to++ = '%';
              *to++ = XDIGIT_TO_XCHAR (c >> 4);
              *to++ = XDIGIT_TO_XCHAR (c & 0xf);
            }
        }
      else if (to < limit)
        *to++ = *from;
      else
        truncated = 1;
    }

  /* Once something had to be dropped, nothing more is appended.  */
  if (query && !truncated && to < limit)
    {
      *to++ = '?';

      /* Copy QUERY to RESULT and encode all '/' characters. */
      for (from = query; *from && !truncated; from++)
        {
          if (*from == '/')
            {
              if (limit - to < 3)
                truncated = 1;
              else
                {
                  *to++ = '%';
                  *to++ = '2';
                  *to++ = 'F';
                }
            }
          else if (to < limit)
            *to++ = *from;
          else
            truncated = 1;
        }
    }

  /* TO never moves past LIMIT, so this store is always in bounds.  */
  *to = '\0';

  return xstrdup (result);
}
1311
1312 /* Create a unique filename, corresponding to a given URL.  Calls
1313    mkstruct if necessary.  Does *not* actually create any directories.  */
1314 char *
1315 url_filename (const struct url *u)
1316 {
1317   char *file, *name;
1318   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1319
1320   if (opt.dirstruct)
1321     {
1322       file = mkstruct (u);
1323       have_prefix = 1;
1324     }
1325   else
1326     {
1327       char *base = *u->file ? u->file : "index.html";
1328       char *query = u->query && *u->query ? u->query : NULL;
1329       file = compose_file_name (base, query);
1330     }
1331
1332   if (!have_prefix)
1333     {
1334       /* Check whether the prefix directory is something other than "."
1335          before prepending it.  */
1336       if (!DOTP (opt.dir_prefix))
1337         {
1338           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1339                                          + 1 + strlen (file) + 1);
1340           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1341           xfree (file);
1342           file = nfile;
1343         }
1344     }
1345   /* DOS-ish file systems don't like `%' signs in them; we change it
1346      to `@'.  */
1347 #ifdef WINDOWS
1348   {
1349     char *p = file;
1350     for (p = file; *p; p++)
1351       if (*p == '%')
1352         *p = '@';
1353   }
1354 #endif /* WINDOWS */
1355
1356   /* Check the cases in which the unique extensions are not used:
1357      1) Clobbering is turned off (-nc).
1358      2) Retrieval with regetting.
1359      3) Timestamping is used.
1360      4) Hierarchy is built.
1361
1362      The exception is the case when file does exist and is a
1363      directory (actually support for bad httpd-s).  */
1364   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1365       && !(file_exists_p (file) && !file_non_directory_p (file)))
1366     return file;
1367
1368   /* Find a unique name.  */
1369   name = unique_name (file);
1370   xfree (file);
1371   return name;
1372 }
1373
/* Like strlen(), but stop counting at the first '?', ';' or '#' in
   URL, if there is one.  */
static int
urlpath_length (const char *url)
{
  const char *path_end = strpbrk_or_eos (url, "?;#");

  return path_end - url;
}
1381
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string.

   The character at E itself is outside the range and is never
   examined; the character at B is inside it.  An empty range
   (e <= b) yields NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back first, then test, so that the scan covers e-1 down to
     b inclusive.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1394
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated:
   only the first LINKLENGTH characters of LINK are used.  BASE must
   be zero-terminated.  See uri_merge for a gentler interface to this
   functionality.

   NO_SCHEME non-zero means LINK carries no scheme and has to be
   combined with BASE; zero means LINK is complete and is simply
   copied.  The result is always newly allocated and owned by the
   caller.

   Perhaps this function should handle `./' and `../' so that the evil
   path_simplify can go.  */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      /* END is where BASE's path stops: at '?', ';', '#' or '\0'.  */
      const char *end = base + urlpath_length (base);

      /* Consult LINKLENGTH before looking at *LINK: with a LINK that
         is not zero-terminated, the character at LINK is not part of
         an empty link.  */
      if (linklength == 0 || !*link)
        {
          /* Empty LINK points back to BASE, query string and all. */
          constr = xstrdup (base);
        }
      else if (*link == '?')
        {
          /* LINK points to the same location, but changes the query
             string.  Examples: */
          /* uri_merge("path",         "?new") -> "path?new"     */
          /* uri_merge("path?foo",     "?new") -> "path?new"     */
          /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
          /* uri_merge("path#foo",     "?new") -> "path?new"     */
          int baselength = end - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '#')
        {
          /* LINK only changes the fragment; everything in BASE up to
             its own fragment, query included, is kept.  */
          /* uri_merge("path",         "#new") -> "path#new"     */
          /* uri_merge("path#foo",     "#new") -> "path#new"     */
          /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
          /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
          int baselength;
          const char *end1 = strchr (base, '#');
          if (!end1)
            end1 = base + strlen (base);
          baselength = end1 - base;
          constr = xmalloc (baselength + linklength + 1);
          memcpy (constr, base, baselength);
          memcpy (constr + baselength, link, linklength);
          constr[baselength + linklength] = '\0';
        }
      else if (*link == '/')
        {
          /* LINK is an absolute path: we need to replace everything
             after (and including) the FIRST slash with LINK.

             So, if BASE is "http://host/whatever/foo/bar", and LINK is
             "/qux/xyzzy", our result should be
             "http://host/qux/xyzzy".  */
          int span;
          const char *slash;
          const char *start_insert;
          int seen_slash_slash = 0;

          /* We're looking for the first slash, but want to ignore
             double slash: if the first slash found is immediately
             followed by another one, search again after the pair.  */
          slash = memchr (base, '/', end - base);
          if (slash && *(slash + 1) == '/')
            {
              const char *pos = slash + 2;
              seen_slash_slash = 1;
              slash = memchr (pos, '/', end - pos);
            }

          /* At this point, SLASH is the location of the first / after
             "//", or the first slash altogether.  START_INSERT is the
             pointer to the location where LINK will be inserted.  Keep
             in mind that LINK begins with '/'.  */
          if (!seen_slash_slash)
            /* examples: "foo" and "foo/bar" */
            /*            ^         ^        */
            start_insert = base;
          else if (slash)
            /* example: "http://something/" */
            /*                           ^  */
            start_insert = slash;
          else
            /* example: "http://foo" */
            /*                     ^ */
            start_insert = end;

          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
      else
        {
          /* LINK is a relative URL: we need to replace everything
             after last slash (possibly empty) with LINK.

             So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
             our result should be "whatever/foo/qux/xyzzy".  */
          int need_explicit_slash = 0;
          int span;
          const char *start_insert;
          const char *last_slash = find_last_char (base, end, '/');
          if (!last_slash)
            {
              /* No slash found at all.  Append LINK to what we have,
                 but we'll need a slash as a separator.

                 Example: if base == "foo" and link == "qux/xyzzy", then
                 we cannot just append link to base, because we'd get
                 "fooqux/xyzzy", whereas what we want is
                 "foo/qux/xyzzy".

                 To make sure the / gets inserted, we set
                 need_explicit_slash to 1.  We also set start_insert
                 to end + 1, so that the length calculations work out
                 correctly for one more (slash) character.  Accessing
                 that character is fine, since it will be the
                 delimiter or the terminating '\0'.  */
              /* example: "foo?..." */
              /*               ^    ('?' gets changed to '/') */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
            {
              /* The last slash is the second one of a "//" pair, so
                 BASE has no path yet and a separator is needed.  */
              /* example: "http://host"  */
              /*                      ^  */
              start_insert = end + 1;
              need_explicit_slash = 1;
            }
          else
            {
              /* example: "whatever/foo/bar" */
              /*                        ^    */
              start_insert = last_slash + 1;
            }

          span = start_insert - base;
          constr = (char *)xmalloc (span + linklength + 1);
          if (span)
            memcpy (constr, base, span);
          if (need_explicit_slash)
            /* Overwrite the copied delimiter with the separator.  */
            constr[span - 1] = '/';
          if (linklength)
            memcpy (constr + span, link, linklength);
          constr[span + linklength] = '\0';
        }
    }
  else /* !no_scheme */
    {
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
1571
/* Merge BASE with LINK and return the resulting, newly allocated URI.
   This is the convenient front end to uri_merge_1 for the case where
   LINK is a zero-terminated string: the length is taken with strlen
   and url_has_scheme decides whether LINK needs BASE at all.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_lacks_scheme = !url_has_scheme (link);
  int link_length = strlen (link);

  return uri_merge_1 (base, link, link_length, link_lacks_scheme);
}
1580 \f
/* Copy the zero-terminated string S to where P points and advance P
   past the copied characters.  No terminating '\0' is written.

   P must be a modifiable pointer-to-char lvalue.  S is evaluated
   exactly once, and the temporaries use names that will not collide
   with identifiers appearing in the arguments.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1586
1587 /* Use this instead of password when the actual password is supposed
1588    to be hidden.  We intentionally use a generic string without giving
1589    away the number of characters in the password, like previous
1590    versions did.  */
1591 #define HIDDEN_PASSWORD "*password*"
1592
1593 /* Recreate the URL string from the data in URL.
1594
1595    If HIDE is non-zero (as it is when we're calling this on a URL we
1596    plan to print, but not when calling it to canonicalize a URL for
1597    use within the program), password will be hidden.  Unsafe
1598    characters in the URL will be quoted.  */
1599
1600 char *
1601 url_string (const struct url *url, int hide_password)
1602 {
1603   int size;
1604   char *result, *p;
1605   char *quoted_user = NULL, *quoted_passwd = NULL;
1606
1607   int scheme_port  = supported_schemes[url->scheme].default_port;
1608   char *scheme_str = supported_schemes[url->scheme].leading_string;
1609   int fplen = full_path_length (url);
1610
1611   assert (scheme_str != NULL);
1612
1613   /* Make sure the user name and password are quoted. */
1614   if (url->user)
1615     {
1616       quoted_user = encode_string_maybe (url->user);
1617       if (url->passwd)
1618         {
1619           if (hide_password)
1620             quoted_passwd = HIDDEN_PASSWORD;
1621           else
1622             quoted_passwd = encode_string_maybe (url->passwd);
1623         }
1624     }
1625
1626   size = (strlen (scheme_str)
1627           + strlen (url->host)
1628           + fplen
1629           + 1);
1630   if (url->port != scheme_port)
1631     size += 1 + numdigit (url->port);
1632   if (quoted_user)
1633     {
1634       size += 1 + strlen (quoted_user);
1635       if (quoted_passwd)
1636         size += 1 + strlen (quoted_passwd);
1637     }
1638
1639   p = result = xmalloc (size);
1640
1641   APPEND (p, scheme_str);
1642   if (quoted_user)
1643     {
1644       APPEND (p, quoted_user);
1645       if (quoted_passwd)
1646         {
1647           *p++ = ':';
1648           APPEND (p, quoted_passwd);
1649         }
1650       *p++ = '@';
1651     }
1652
1653   APPEND (p, url->host);
1654   if (url->port != scheme_port)
1655     {
1656       *p++ = ':';
1657       long_to_string (p, url->port);
1658       p += strlen (p);
1659     }
1660
1661   full_path_write (url, p);
1662   p += fplen;
1663   *p++ = '\0';
1664
1665   assert (p - result == size);
1666
1667   if (quoted_user && quoted_user != url->user)
1668     xfree (quoted_user);
1669   if (quoted_passwd && !hide_password
1670       && quoted_passwd != url->passwd)
1671     xfree (quoted_passwd);
1672
1673   return result;
1674 }
1675 \f
1676 /* Returns proxy host address, in accordance with SCHEME.  */
1677 char *
1678 getproxy (enum url_scheme scheme)
1679 {
1680   char *proxy = NULL;
1681   char *rewritten_url;
1682   static char rewritten_storage[1024];
1683
1684   switch (scheme)
1685     {
1686     case SCHEME_HTTP:
1687       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1688       break;
1689 #ifdef HAVE_SSL
1690     case SCHEME_HTTPS:
1691       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1692       break;
1693 #endif
1694     case SCHEME_FTP:
1695       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1696       break;
1697     case SCHEME_INVALID:
1698       break;
1699     }
1700   if (!proxy || !*proxy)
1701     return NULL;
1702
1703   /* Handle shorthands. */
1704   rewritten_url = rewrite_shorthand_url (proxy);
1705   if (rewritten_url)
1706     {
1707       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1708       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1709       proxy = rewritten_storage;
1710     }
1711
1712   return proxy;
1713 }
1714
/* Decide whether HOST may be accessed through the proxy, given the
   NO_PROXY list.  Returns 1 when NO_PROXY is NULL or when sufmatch
   reports no match for HOST (presumably a domain-suffix comparison --
   confirm against sufmatch), and 0 otherwise.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1724 \f
1725 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1726 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1727                                          const char *));
1728 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1729                                                       const char *, int));
1730 static char *local_quote_string PARAMS ((const char *));
1731
1732 /* Change the links in one HTML file.  LINKS is a list of links in the
1733    document, along with their positions and the desired direction of
1734    the conversion.  */
1735 void
1736 convert_links (const char *file, struct urlpos *links)
1737 {
1738   struct file_memory *fm;
1739   FILE *fp;
1740   const char *p;
1741   downloaded_file_t downloaded_file_return;
1742
1743   struct urlpos *link;
1744   int to_url_count = 0, to_file_count = 0;
1745
1746   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1747
1748   {
1749     /* First we do a "dry run": go through the list L and see whether
1750        any URL needs to be converted in the first place.  If not, just
1751        leave the file alone.  */
1752     int dry_count = 0;
1753     struct urlpos *dry = links;
1754     for (dry = links; dry; dry = dry->next)
1755       if (dry->convert != CO_NOCONVERT)
1756         ++dry_count;
1757     if (!dry_count)
1758       {
1759         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1760         return;
1761       }
1762   }
1763
1764   fm = read_file (file);
1765   if (!fm)
1766     {
1767       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1768                  file, strerror (errno));
1769       return;
1770     }
1771
1772   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1773   if (opt.backup_converted && downloaded_file_return)
1774     write_backup_file (file, downloaded_file_return);
1775
1776   /* Before opening the file for writing, unlink the file.  This is
1777      important if the data in FM is mmaped.  In such case, nulling the
1778      file, which is what fopen() below does, would make us read all
1779      zeroes from the mmaped region.  */
1780   if (unlink (file) < 0 && errno != ENOENT)
1781     {
1782       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1783                  file, strerror (errno));
1784       read_file_free (fm);
1785       return;
1786     }
1787   /* Now open the file for writing.  */
1788   fp = fopen (file, "wb");
1789   if (!fp)
1790     {
1791       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1792                  file, strerror (errno));
1793       read_file_free (fm);
1794       return;
1795     }
1796
1797   /* Here we loop through all the URLs in file, replacing those of
1798      them that are downloaded with relative references.  */
1799   p = fm->content;
1800   for (link = links; link; link = link->next)
1801     {
1802       char *url_start = fm->content + link->pos;
1803
1804       if (link->pos >= fm->length)
1805         {
1806           DEBUGP (("Something strange is going on.  Please investigate."));
1807           break;
1808         }
1809       /* If the URL is not to be converted, skip it.  */
1810       if (link->convert == CO_NOCONVERT)
1811         {
1812           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1813           continue;
1814         }
1815
1816       /* Echo the file contents, up to the offending URL's opening
1817          quote, to the outfile.  */
1818       fwrite (p, 1, url_start - p, fp);
1819       p = url_start;
1820
1821       switch (link->convert)
1822         {
1823         case CO_CONVERT_TO_RELATIVE:
1824           /* Convert absolute URL to relative. */
1825           {
1826             char *newname = construct_relative (file, link->local_name);
1827             char *quoted_newname = local_quote_string (newname);
1828
1829             if (!link->link_refresh_p)
1830               p = replace_attr (p, link->size, fp, quoted_newname);
1831             else
1832               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1833                                              link->refresh_timeout);
1834
1835             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1836                      link->url->url, newname, link->pos, file));
1837             xfree (newname);
1838             xfree (quoted_newname);
1839             ++to_file_count;
1840             break;
1841           }
1842         case CO_CONVERT_TO_COMPLETE:
1843           /* Convert the link to absolute URL. */
1844           {
1845             char *newlink = link->url->url;
1846             char *quoted_newlink = html_quote_string (newlink);
1847
1848             if (!link->link_refresh_p)
1849               p = replace_attr (p, link->size, fp, quoted_newlink);
1850             else
1851               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1852                                              link->refresh_timeout);
1853
1854             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1855                      newlink, link->pos, file));
1856             xfree (quoted_newlink);
1857             ++to_url_count;
1858             break;
1859           }
1860         case CO_NULLIFY_BASE:
1861           /* Change the base href to "". */
1862           p = replace_attr (p, link->size, fp, "");
1863           break;
1864         case CO_NOCONVERT:
1865           abort ();
1866           break;
1867         }
1868     }
1869
1870   /* Output the rest of the file. */
1871   if (p - fm->content < fm->length)
1872     fwrite (p, 1, fm->length - (p - fm->content), fp);
1873   fclose (fp);
1874   read_file_free (fm);
1875
1876   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1877 }
1878
/* Build and return a malloced relative link that leads from the
   local file S1 (the referring document) to the local file S2 (the
   referred one).

   Examples:

     S1 = "jagor.srce.hr/index.html"
     S2 = "jagor.srce.hr/images/news.gif"
       -> "images/news.gif"

     S1 = "fly.cc.fer.hr/ioccc/index.html"
     S2 = "fly.cc.fer.hr/images/fly.gif"
       -> "../images/fly.gif"

   If S2 begins with '/', a plain copy of S2 is returned.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scanning position in both names */
  int common = 0;               /* length of the shared directory prefix */
  int updirs = 0;               /* number of "../" steps needed */
  const char *tail;
  char *res, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings, one path component
     at a time.  */
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      /* Both components ended here, so the directory is shared.  */
      common = ++pos;
    }

  /* Every directory left in S1 has to be climbed out of.  */
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++updirs;

  /* The result is "../" repeated UPDIRS times, followed by the part
     of S2 that is not shared with S1.  */
  tail = s2 + common;
  res = (char *)xmalloc (3 * updirs + strlen (tail) + 1);
  out = res;
  while (updirs-- > 0)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, tail);
  return res;
}
1932 \f
1933 static void
1934 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1935 {
1936   /* Rather than just writing over the original .html file with the
1937      converted version, save the former to *.orig.  Note we only do
1938      this for files we've _successfully_ downloaded, so we don't
1939      clobber .orig files sitting around from previous invocations. */
1940
1941   /* Construct the backup filename as the original name plus ".orig". */
1942   size_t         filename_len = strlen(file);
1943   char*          filename_plus_orig_suffix;
1944   boolean        already_wrote_backup_file = FALSE;
1945   slist*         converted_file_ptr;
1946   static slist*  converted_files = NULL;
1947
1948   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1949     {
1950       /* Just write "orig" over "html".  We need to do it this way
1951          because when we're checking to see if we've downloaded the
1952          file before (to see if we can skip downloading it), we don't
1953          know if it's a text/html file.  Therefore we don't know yet
1954          at that stage that -E is going to cause us to tack on
1955          ".html", so we need to compare vs. the original URL plus
1956          ".orig", not the original URL plus ".html.orig". */
1957       filename_plus_orig_suffix = alloca (filename_len + 1);
1958       strcpy(filename_plus_orig_suffix, file);
1959       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1960     }
1961   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1962     {
1963       /* Append ".orig" to the name. */
1964       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1965       strcpy(filename_plus_orig_suffix, file);
1966       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1967     }
1968
1969   /* We can get called twice on the same URL thanks to the
1970      convert_all_links() call in main().  If we write the .orig file
1971      each time in such a case, it'll end up containing the first-pass
1972      conversion, not the original file.  So, see if we've already been
1973      called on this file. */
1974   converted_file_ptr = converted_files;
1975   while (converted_file_ptr != NULL)
1976     if (strcmp(converted_file_ptr->string, file) == 0)
1977       {
1978         already_wrote_backup_file = TRUE;
1979         break;
1980       }
1981     else
1982       converted_file_ptr = converted_file_ptr->next;
1983
1984   if (!already_wrote_backup_file)
1985     {
1986       /* Rename <file> to <file>.orig before former gets written over. */
1987       if (rename(file, filename_plus_orig_suffix) != 0)
1988         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1989                    file, filename_plus_orig_suffix, strerror (errno));
1990
1991       /* Remember that we've already written a .orig backup for this file.
1992          Note that we never free this memory since we need it till the
1993          convert_all_links() call, which is one of the last things the
1994          program does before terminating.  BTW, I'm not sure if it would be
1995          safe to just set 'converted_file_ptr->string' to 'file' below,
1996          rather than making a copy of the string...  Another note is that I
1997          thought I could just add a field to the urlpos structure saying
1998          that we'd written a .orig file for this URL, but that didn't work,
1999          so I had to make this separate list.
2000          -- Dan Harkless <wget@harkless.org>
2001
2002          This [adding a field to the urlpos structure] didn't work
2003          because convert_file() is called from convert_all_links at
2004          the end of the retrieval with a freshly built new urlpos
2005          list.
2006          -- Hrvoje Niksic <hniksic@arsdigita.com>
2007       */
2008       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2009       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2010       converted_file_ptr->next = converted_files;
2011       converted_files = converted_file_ptr;
2012     }
2013 }
2014
2015 static int find_fragment PARAMS ((const char *, int, const char **,
2016                                   const char **));
2017
2018 /* Replace an attribute's original text with NEW_TEXT. */
2019
2020 static const char *
2021 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2022 {
2023   int quote_flag = 0;
2024   char quote_char = '\"';       /* use "..." for quoting, unless the
2025                                    original value is quoted, in which
2026                                    case reuse its quoting char. */
2027   const char *frag_beg, *frag_end;
2028
2029   /* Structure of our string is:
2030        "...old-contents..."
2031        <---    size    --->  (with quotes)
2032      OR:
2033        ...old-contents...
2034        <---    size   -->    (no quotes)   */
2035
2036   if (*p == '\"' || *p == '\'')
2037     {
2038       quote_char = *p;
2039       quote_flag = 1;
2040       ++p;
2041       size -= 2;                /* disregard opening and closing quote */
2042     }
2043   putc (quote_char, fp);
2044   fputs (new_text, fp);
2045
2046   /* Look for fragment identifier, if any. */
2047   if (find_fragment (p, size, &frag_beg, &frag_end))
2048     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2049   p += size;
2050   if (quote_flag)
2051     ++p;
2052   putc (quote_char, fp);
2053
2054   return p;
2055 }
2056
/* Like replace_attr, but for the value of
   <meta http-equiv=refresh content="...">: the text written is
   "TIMEOUT; URL=" followed by NEW_TEXT rather than NEW_TEXT alone.
   Returns whatever replace_attr returns.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Room for the digits of TIMEOUT, the literal "; URL=", NEW_TEXT
     and the terminating NUL, e.g. "0; URL=...".
     NOTE(review): this assumes numdigit() covers every character
     that "%d" prints for TIMEOUT -- confirm for negative values.  */
  const size_t sep_len = sizeof ("; URL=") - 1;
  size_t bufsize = numdigit (timeout) + sep_len + strlen (new_text) + 1;
  char *content = (char *)alloca (bufsize);

  sprintf (content, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, content);
}
2074
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   immediately preceded by '&' (so the '#' in "&#38;" is skipped).
   If no such character is found, return zero and leave BP and EP
   untouched.  If it is found, return 1, set *BP to point to the '#'
   and *EP to point to BEG+SIZE.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      /* Only an '&' directly before a '#' shields it.  */
      after_amp = (*cur == '&');
    }
  return 0;
}
2108
2109 /* Quote FILE for use as local reference to an HTML file.
2110
2111    We quote ? as %3F to avoid passing part of the file name as the
2112    parameter when browsing the converted file through HTTP.  However,
2113    it is safe to do this only when `--html-extension' is turned on.
2114    This is because converting "index.html?foo=bar" to
2115    "index.html%3Ffoo=bar" would break local browsing, as the latter
2116    isn't even recognized as an HTML file!  However, converting
2117    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2118    safe for both local and HTTP-served browsing.  */
2119
2120 static char *
2121 local_quote_string (const char *file)
2122 {
2123   const char *file_sans_qmark;
2124   int qm;
2125
2126   if (!opt.html_extension)
2127     return html_quote_string (file);
2128
2129   qm = count_char (file, '?');
2130
2131   if (qm)
2132     {
2133       const char *from = file;
2134       char *to, *newname;
2135
2136       /* qm * 2 because we replace each question mark with "%3F",
2137          i.e. replace one char with three, hence two more.  */
2138       int fsqlen = strlen (file) + qm * 2;
2139
2140       to = newname = (char *)alloca (fsqlen + 1);
2141       for (; *from; from++)
2142         {
2143           if (*from != '?')
2144             *to++ = *from;
2145           else
2146             {
2147               *to++ = '%';
2148               *to++ = '3';
2149               *to++ = 'F';
2150             }
2151         }
2152       assert (to - newname == fsqlen);
2153       *to = '\0';
2154
2155       file_sans_qmark = newname;
2156     }
2157   else
2158     file_sans_qmark = file;
2159
2160   return html_quote_string (file_sans_qmark);
2161 }
2162
2163 /* We're storing "modes" of type downloaded_file_t in the hash table.
2164    However, our hash tables only accept pointers for keys and values.
2165    So when we need a pointer, we use the address of a
2166    downloaded_file_t variable of static storage.  */
2167
2168 static downloaded_file_t *
2169 downloaded_mode_to_ptr (downloaded_file_t mode)
2170 {
2171   static downloaded_file_t
2172     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2173     v2 = FILE_DOWNLOADED_NORMALLY,
2174     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2175     v4 = CHECK_FOR_FILE;
2176
2177   switch (mode)
2178     {
2179     case FILE_NOT_ALREADY_DOWNLOADED:
2180       return &v1;
2181     case FILE_DOWNLOADED_NORMALLY:
2182       return &v2;
2183     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2184       return &v3;
2185     case CHECK_FOR_FILE:
2186       return &v4;
2187     }
2188   return NULL;
2189 }
2190
2191 /* This should really be merged with dl_file_url_map and
2192    downloaded_html_files in recur.c.  This was originally a list, but
2193    I changed it to a hash table beause it was actually taking a lot of
2194    time to find things in it.  */
2195
2196 static struct hash_table *downloaded_files_hash;
2197
2198 /* Remembers which files have been downloaded.  In the standard case, should be
2199    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2200    download successfully (i.e. not for ones we have failures on or that we skip
2201    due to -N).
2202
2203    When we've downloaded a file and tacked on a ".html" extension due to -E,
2204    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2205    FILE_DOWNLOADED_NORMALLY.
2206
2207    If you just want to check if a file has been previously added without adding
2208    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2209    with local filenames, not remote URLs. */
2210 downloaded_file_t
2211 downloaded_file (downloaded_file_t mode, const char *file)
2212 {
2213   downloaded_file_t *ptr;
2214
2215   if (mode == CHECK_FOR_FILE)
2216     {
2217       if (!downloaded_files_hash)
2218         return FILE_NOT_ALREADY_DOWNLOADED;
2219       ptr = hash_table_get (downloaded_files_hash, file);
2220       if (!ptr)
2221         return FILE_NOT_ALREADY_DOWNLOADED;
2222       return *ptr;
2223     }
2224
2225   if (!downloaded_files_hash)
2226     downloaded_files_hash = make_string_hash_table (0);
2227
2228   ptr = hash_table_get (downloaded_files_hash, file);
2229   if (ptr)
2230     return *ptr;
2231
2232   ptr = downloaded_mode_to_ptr (mode);
2233   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2234
2235   return FILE_NOT_ALREADY_DOWNLOADED;
2236 }
2237
/* Mapper passed to hash_table_map by downloaded_files_free: releases
   KEY with xfree.  VALUE and IGNORED are not used.  Always returns 0
   (presumably "keep going" for hash_table_map -- confirm in hash.c).  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  (void) value;
  (void) ignored;

  xfree (key);
  return 0;
}
2244
2245 void
2246 downloaded_files_free (void)
2247 {
2248   if (downloaded_files_hash)
2249     {
2250       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2251       hash_table_destroy (downloaded_files_hash);
2252       downloaded_files_hash = NULL;
2253     }
2254 }