2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
#include <sys/types.h>

/* DOTP(x): non-zero iff the string X is exactly ".".  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))

/* DDOTP(x): non-zero iff the string X is exactly "..".  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Supported schemes: */
/* Each entry is { leading_string, default_port, enabled }; the array
   index doubles as the url_scheme enum value, so the order here must
   match that enumeration.  NOTE(review): the struct's surrounding
   braces and the NULL terminator entry fall outside this excerpt.  */
static struct scheme_data supported_schemes[] =
  { "http://", DEFAULT_HTTP_PORT, 1 },
  { "https://", DEFAULT_HTTPS_PORT, 1 },
  { "ftp://", DEFAULT_FTP_PORT, 1 },

/* Forward declarations: */

static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Shorthands for the urlchr_table initializer below: R marks a
   character "reserved" (preserved when encoding), U marks it
   "unsafe" (must be %XX-quoted).  NOTE(review): the table also uses
   RU, whose definition -- presumably urlchr_reserved|urlchr_unsafe --
   is not part of this excerpt.  */
#define R urlchr_reserved
#define U urlchr_unsafe

/* Test character C against MASK via the 256-entry lookup table.  The
   cast to unsigned char guards against sign-extended indexing.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars, preserved from encoding. */

#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* rfc1738 unsafe chars, plus some more. */

#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* One entry per 8-bit character code; the ASCII meaning of each row
   is annotated on the right.  */
const static unsigned char urlchr_table[256] =
  U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
  U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
  U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
  U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
  0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
  0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
  0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
  RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
  0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
  0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
  0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
  U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
  0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
  0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
  0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */

  /* Everything above 0x7f (non-ASCII) is unsafe.  */
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,

  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  NOTE(review): tail of this comment and parts of the
   function body are missing from this excerpt.  */
decode_string (char *s)
  /* Decoding happens in place: T (write head) never outruns H (read
     head), so the string can only shrink.  */
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare */

  /* Do nothing if '%' is not followed by two hex digits. */
  if (!*(h + 1) || !*(h + 2)
      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))

  /* Reassemble the byte from its two hex digits.  */
  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S if there are no unsafe chars. */

encode_string_maybe (const char *s)
  /* First pass: count how much extra room the encoded string needs.
     Each unsafe char expands to "%XX", i.e. two extra characters.  */
  for (p1 = s; *p1; p1++)
    if (UNSAFE_CHAR (*p1))
      addition += 2;		/* Two more characters (hex digits) */

  /* NOTE(review): the early "return S" when ADDITION is zero is not
     visible in this excerpt.  */
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  /* Second pass: copy into NEWSTR, expanding unsafe chars to %XX.  */
  if (UNSAFE_CHAR (*p1))
      unsigned char c = *p1++;

      *p2++ = XDIGIT_TO_XCHAR (c >> 4);
      *p2++ = XDIGIT_TO_XCHAR (c & 0xf);

  /* The length computed in the first pass must be exact.  */
  assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string. */

encode_string (const char *s)
  /* encode_string_maybe returns S itself when nothing needed quoting;
     NOTE(review): the copy made in that case so the caller always
     owns the result is outside this excerpt.  */
  char *encoded = encode_string_maybe (s);

/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage. */

#define ENCODE(ptr) do {			\
  char *e_new = encode_string_maybe (ptr);	\
/* The three possible treatments of a character while normalizing a
   %-quoted URL: decode a %XX triple, encode the raw byte, or copy it
   through unchanged.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted. */
static inline enum copy_method
decide_copy_method (const char *p)
  /* NOTE(review): the enclosing test that *P is '%' is not visible in
     this excerpt.  */
  if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
      /* %xx sequence: decode it, unless it would decode to an
	 unsafe or a reserved char; in that case, leave it as
	 is.  */
      char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	XCHAR_TO_XDIGIT (*(p + 2));

      if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
	return CM_PASSTHROUGH;

  /* Garbled %.. sequence: encode `%'. */

  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))

  /* Any other character is copied verbatim.  */
  return CM_PASSTHROUGH;
261 /* Translate a %-quoting (but possibly non-conformant) input string S
262 into a %-quoting (and conformant) output string. If no characters
263 are encoded or decoded, return the same string S; otherwise, return
264 a freshly allocated string with the new contents.
266 After a URL has been run through this function, the protocols that
267 use `%' as the quote character can use the resulting string as-is,
268 while those that don't call decode_string() to get to the intended
269 data. This function is also stable: after an input string is
270 transformed the first time, all further transformations of the
271 result yield the same result string.
273 Let's discuss why this function is needed.
275 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
276 space character would mess up the HTTP request, it needs to be
279 GET /abc%20def HTTP/1.0
281 So it appears that the unsafe chars need to be quoted, as with
282 encode_string. But what if we're requested to download
283 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
284 the user meant was a literal space, and he was kind enough to quote
285 it. In that case, Wget should obviously leave the `%20' as is, and
286 send the same request as above. So in this case we may not call
289 But what if the requested URI is `abc%20 def'? If we call
290 encode_string, we end up with `/abc%2520%20def', which is almost
291 certainly not intended. If we don't call encode_string, we are
292 left with the embedded space and cannot send the request. What the
293 user meant was for Wget to request `/abc%20%20def', and this is
294 where reencode_string kicks in.
296 Wget used to solve this by first decoding %-quotes, and then
297 encoding all the "unsafe" characters found in the resulting string.
298 This was wrong because it didn't preserve certain URL special
299 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
300 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
301 whether we considered `+' reserved (it is). One of these results
302 is inevitable because by the second step we would lose information
303 on whether the `+' was originally encoded or not. Both results
304 were wrong because in CGI parameters + means space, while %2B means
305 literal plus. reencode_string correctly translates the above to
306 "a%2B+b", i.e. returns the original string.
308 This function uses an algorithm proposed by Anon Sricharoenchai:
310 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
313 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
316 ...except that this code conflates the two steps, and decides
317 whether to encode, decode, or pass through each character in turn.
318 The function still uses two passes, but their logic is the same --
319 the first pass exists merely for the sake of allocation. Another
320 small difference is that we include `+' to URL_RESERVED.
324 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
326 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
330 "foo bar" -> "foo%20bar"
331 "foo%20bar" -> "foo%20bar"
332 "foo %20bar" -> "foo%20%20bar"
333 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
334 "foo%25%20bar" -> "foo%25%20bar"
335 "foo%2%20bar" -> "foo%252%20bar"
336 "foo+bar" -> "foo+bar" (plus is reserved!)
337 "foo%2b+bar" -> "foo%2b+bar" */
reencode_string (const char *s)
  /* Tallies from the sizing pass: each encode adds two characters,
     each decode removes two.  */
  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length. */
  for (p1 = s; *p1; p1++)
      switch (decide_copy_method (p1))

  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters. */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);

  /* Second pass: rewrite S into NEWSTR using the same decision
     procedure as above, so the lengths necessarily agree.  */
  switch (decide_copy_method (p1))
      /* CM_ENCODE: expand the byte at P1 to a %XX triple.  */
      unsigned char c = *p1++;

      *p2++ = XDIGIT_TO_XCHAR (c >> 4);
      *p2++ = XDIGIT_TO_XCHAR (c & 0xf);

      /* CM_DECODE: collapse the %xx triple at P1 to a single byte.  */
      *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
	       + (XCHAR_TO_XDIGIT (*(p1 + 2))));
      p1 += 3;		/* skip %xx */

  assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue. */

#define REENCODE(ptr_var) do {			\
  char *rf_new = reencode_string (ptr_var);	\
  if (rf_new != ptr_var)			\
/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not. */

url_scheme (const char *url)
  /* Try each supported scheme's leading string ("http://", ...) as a
     case-insensitive prefix of URL.  */
  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
			  strlen (supported_schemes[i].leading_string)))
	if (supported_schemes[i].enabled)
	  /* The array index doubles as the enum value.  */
	  return (enum url_scheme) i;
	  /* A recognized but disabled scheme reports as invalid.  */
	  return SCHEME_INVALID;

  return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0. */

url_skip_scheme (const char *url)
  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  NOTE(review): remainder of this comment and of the body is
     outside this excerpt.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')

/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise. */

url_has_scheme (const char *url)
  /* A scheme is a run of alphanumerics (plus `-'/`+'); the trailing
     check for `:' is outside this excerpt.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
scheme_default_port (enum url_scheme scheme)
  /* SCHEME indexes directly into the supported_schemes table.  */
  return supported_schemes[scheme].default_port;

scheme_disable (enum url_scheme scheme)
  /* Once disabled, url_scheme will report this scheme as invalid.  */
  supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0. */

url_skip_uname (const char *url)
  /* Look for '@' that comes before '/' or '?'. */
  p = (const char *)strpbrk (url, "/?@");

/* Split the "user[:password]" string STR (of length LEN) into
   malloc'd *USER and *PASSWD.  NOTE(review): parts of the body,
   including the success/failure returns, fall outside this
   excerpt.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */

  colon = memchr (str, ':', len);

  /* Empty user name again. */

      /* Everything after the colon is the password.  */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';

  /* By now LEN covers just the user-name portion.  */
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);

  /* Both components arrived %-quoted in the URL; decode in place.  */
  decode_string (*user);

  decode_string (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port] -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL. */

rewrite_shorthand_url (const char *url)
  /* A URL that already carries a scheme needs no rewriting.  */
  if (url_has_scheme (url))

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter HTTP.  NOTE(review): continuation outside this excerpt.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)

      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP. */

      for (pp = p + 1; ISDIGIT (*pp); pp++)

      if (digits > 0 && (*pp == '/' || *pp == '\0'))

      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';

  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
597 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but instead of returning NULL when no character of
   ACCEPT occurs in S, return a pointer to S's terminating '\0'.
   Callers can thus uniformly treat end-of-string as a delimiter.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    /* No delimiter found: point at the terminating zero instead of
       returning NULL.  (Without this guard a successful strpbrk
       result would be clobbered.)  */
    p = (char *)s + strlen (s);
  return p;
}
/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  NOTE(review): the tail of this comment is
   outside the excerpt.  */

lowercase_str (char *str)
      /* NOTE(review): the loop over STR and the uppercase test
	 guarding this assignment are outside this excerpt.  */
      *str = TOLOWER (*str);
/* Human-readable messages for url_parse failures, indexed by the
   PE_* constants defined alongside each entry.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  /* NOTE(review): the message string for PE_NO_ERROR is outside this
     excerpt.  */
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
#define PE_BAD_PORT_NUMBER 3
#define PE_INVALID_USER_NAME 4
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_INVALID_IPV6_ADDRESS 6
  "Invalid char in IPv6 numeric address"
/* Store error code V through the caller's error pointer P.
   NOTE(review): the macro tail (presumably a NULL check on P) is
   outside this excerpt.  */
#define SETERR(p, v) do {			\

/* Parse URL into a struct url.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code.  */
url_parse (const char *url, int *error)
  int path_modified, host_modified;

  enum url_scheme scheme;

  /* Begin/end pointer pairs delimiting each component inside the
     (re)encoded URL string.  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;

  char *user = NULL, *passwd = NULL;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNSUPPORTED_SCHEME);

  /* Normalize %-quoting; reencode_string returns URL itself when
     nothing changed.  */
  url_encoded = reencode_string (url);

  p += strlen (supported_schemes[scheme].leading_string);

  p += url_skip_uname (p);

  /* scheme://user:pass@host[:port]... */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

     scheme://host[:port][/path][;params][?query][#fragment] */

  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;

  /* Support http://[::1]/ used by IPv6. */

      SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);

      SETERR (error, PE_INVALID_IPV6_ADDRESS);

      /* Don't include brackets in [host_b, host_p). */

  p = strpbrk_or_eos (p, ":/;?#");

  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);

  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */

      p = strpbrk_or_eos (p, "/;?#");

      if (port_b == port_e)
	  /* http://host:/whatever */

	  SETERR (error, PE_BAD_PORT_NUMBER);

      /* Parse the port as a decimal number, rejecting non-digits.  */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */

	      SETERR (error, PE_BAD_PORT_NUMBER);

	  port = 10 * port + (*pp - '0');

  /* Carve out path, then params, then query, then fragment, each
     delimited by the remaining terminator characters.  */
  p = strpbrk_or_eos (p, ";?#");

  /* Path is not allowed not to exist. */

  p = strpbrk_or_eos (p, "?#");

  p = strpbrk_or_eos (p, "#");

  /* Hack that allows users to use '?' (a wildcard character) in
     FTP URLs without it being interpreted as a query string
     delimiter.  */
  if (scheme == SCHEME_FTP)
      query_b = query_e = NULL;

  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*        ^        ^     */
      /*     uname_b  uname_e  */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);

  /* All components validated; allocate and fill the result.  */
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));

  u->host = strdupdelim (host_b, host_e);

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  parse_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  u->params = strdupdelim (params_b, params_e);

  u->query = strdupdelim (query_b, query_e);

  u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string. */
      u->url = url_string (u, 0);

      if (url_encoded != url)
	xfree ((char *) url_encoded);

  if (url_encoded == url)
    u->url = xstrdup (url);
    /* Otherwise take ownership of the reencoded copy.  */
    u->url = url_encoded;
/* Return the message string for ERROR_CODE, one of the PE_* constants
   produced by url_parse.  */
url_error (int error_code)
  assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
  return parse_errors[error_code];

/* Split QUOTED_PATH into directory (*DIR) and file (*FILE)
   components, %-decoding it first.  Both outputs are malloc'd.  */
parse_path (const char *quoted_path, char **dir, char **file)
  char *path, *last_slash;

  /* Work on a stack copy so QUOTED_PATH itself stays untouched.  */
  STRDUP_ALLOCA (path, quoted_path);
  decode_string (path);

  last_slash = strrchr (path, '/');
      /* No slash at all: the whole path is the file name.  */
      *file = xstrdup (path);

      /* Split around the last slash; *DIR gets [path, last_slash),
	 i.e. without the slash itself.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   zero.  */

full_path_length (const struct url *url)
  /* Each present component costs one separator character plus its own
     length.  */
#define FROB(el) if (url->el) len += 1 + strlen (url->el)

/* Write out the full path. */

full_path_write (const struct url *url, char *where)
  /* Emit separator CHR followed by the component, when present.  */
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
      int l = strlen (f_el);			\
      memcpy (where, f_el, l);			\

/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value". */

url_full_path (const struct url *url)
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc(length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';
/* Sync u->path and u->url with u->dir and u->file. */

sync_path (struct url *url)
      /* No directory component: the path is just the file name.  */
      newpath = xstrdup (url->file);

      int dirlen = strlen (url->dir);
      int filelen = strlen (url->file);

      /* Rebuild the path as "<dir>/<file>".  */
      newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (newpath, url->dir, dirlen);
      newpath[dirlen] = '/';
      memcpy (newpath + dirlen + 1, url->file, filelen);
      newpath[dirlen + 1 + filelen] = '\0';

  /* Synchronize u->url. */

  url->url = url_string (url, 0);

/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed. */

url_set_dir (struct url *url, const char *newdir)
  url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
  url->file = xstrdup (newfile);
/* Release heap storage owned by URL.  NOTE(review): the frees for
   host/path/dir/file/url and of URL itself are outside this
   excerpt.  */
url_free (struct url *url)
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
/* Read FILE, one URL per line, and return them as a linked list of
   struct urlpos.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;

  /* Load the file. */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));

  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  text_end = fm->content + fm->length;
  while (text < text_end)
      /* Process one line per iteration; the final line may lack a
	 terminating '\n'.  */
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
	line_end = text_end;

      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && ISSPACE (*line_beg))
      while (line_end > line_beg && ISSPACE (*(line_end - 1)))

      if (line_end > line_beg)
	  /* URL is in the [line_beg, line_end) region. */

	  struct urlpos *entry;

	  /* We must copy the URL to a zero-terminated string, and we
	     can't use alloca because we're in a loop.  *sigh*. */
	  url_text = strdupdelim (line_beg, line_end);

	      /* Merge opt.base_href with URL. */
	      char *merged = uri_merge (opt.base_href, url_text);

	  url = url_parse (url_text, &up_error_code);
	      /* Report unparsable lines but keep going.  */
	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
			 file, url_text, url_error (up_error_code));

	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
	  memset (entry, 0, sizeof (*entry));

  read_file_free (fm);
/* Free the linked list of urlpos. */

free_urlpos (struct urlpos *l)
      /* Save the successor before the node is freed.  */
      struct urlpos *next = l->next;

      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */

rotate_backups(const char *fname)
  /* Room for "<fname>.<number>" plus the terminating zero.  */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  /* Only regular files get rotated.  */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  /* Shift fname.1 -> fname.2 -> ..., oldest first so nothing is
     overwritten prematurely.  */
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
	 call.  NOTE(review): comment tail outside this excerpt.  */

  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */

mkalldirs (const char *path)
  /* Locate the last path separator.  */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	     anew.  */
	  DEBUGP (("Removing %s because of directory danger!\n", t));

  res = make_directory (t);
      logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of occurrences of '/' in the string S.  Used by
   mkstruct to size the directory structure.  */
static int
count_slashes (const char *s)
{
  int count = 0;
  for (; *s; s++)
    if (*s == '/')
      ++count;
  return count;
}
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories. */

mkstruct (const struct url *u)
  char *res, *dirpref;

      /* Skip a leading '/', then drop opt.cut_dirs leading path
	 elements (the --cut-dirs option).  */
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)

      STRDUP_ALLOCA (dir, ptr);

    dir = u->dir + (*u->dir == '/');

  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir)
      /* Add dir_prefix and hostname (if required) to the beginning of
	 dir.  NOTE(review): the comment tail and the rest of the
	 alloca size expression are outside this excerpt.  */
      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
				+ 1 + numdigit (u->port)
      /* Skip the prefix when it is merely ".".  */
      if (!DOTP (opt.dir_prefix))
	sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
	strcpy (dirpref, u->host);

      /* Append the port only when it differs from the scheme's
	 default.  */
      if (u->port != scheme_default_port (u->scheme))
	  int len = strlen (dirpref);
	  number_to_string (dirpref + len + 1, u->port);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
	dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);

  /* Chop a trailing slash off DIR, if present.  */
  if (l && dir[l - 1] == '/')

    /* A directory URL is saved under the name index.html.  */
    file = "index.html";

  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped.  NOTE(review): comment tail outside this excerpt.  */

compose_file_name (char *base, char *query)
  /* Copy BASE to RESULT and encode all unsafe characters. */

  while (*from && to - result < sizeof (result))
      if (UNSAFE_CHAR (*from))
	  /* Expand the unsafe byte to a %XX triple.  */
	  unsigned char c = *from++;

	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);

  if (query && to - result < sizeof (result))

      /* Copy QUERY to RESULT and encode all '/' characters. */

      while (*from && to - result < sizeof (result))

  if (to - result < sizeof (result))

      /* Truncate input which is too long, presumably due to a huge
	 query string.  */
      result[sizeof (result) - 1] = '\0';

  return xstrdup (result);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories. */

url_filename (const struct url *u)
  /* Only a non-empty query string participates in the file name.  */
  char *query = u->query && *u->query ? u->query : NULL;

      char *base = mkstruct (u);
      file = compose_file_name (base, query);

      /* An empty file component defaults to index.html.  */
      char *base = *u->file ? u->file : "index.html";
      file = compose_file_name (base, query);

      /* Check whether the prefix directory is something other than "."
	 before prepending it. */
      if (!DOTP (opt.dir_prefix))
	  /* #### should just realloc FILE and prepend dir_prefix. */
	  char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
					 + 1 + strlen (file) + 1);
	  sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we change it.
     NOTE(review): the replacement character and loop body are outside
     this excerpt.  */

  for (p = file; *p; p++)

#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s). */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name. */
  name = unique_name (file);
/* Return the length of URL's path component: the number of characters
   before the first '?', ';', or '#' terminator, or the whole string's
   length when no terminator is present.  */
static int
path_length (const char *url)
{
  const char *q = strpbrk (url, "?;#");
  /* With no terminator found, the path runs to the end of the
     string.  */
  return (int) ((q ? q : url + strlen (url)) - url);
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Scan backwards starting just before E; E itself is excluded from
     the search, honoring the half-open [b, e) contract.  */
  while (e > b)
    {
      --e;
      if (*e == c)
	return e;
    }
  return NULL;
}
1445 /* Resolve "." and ".." elements of PATH by destructively modifying
1446 PATH. "." is resolved by removing that path element, and ".." is
1447 resolved by removing the preceding path element. Leading and
1448 trailing slashes are preserved.
1450 Return non-zero if any changes have been made.
1452 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1453 test examples are provided below. If you change anything in this
1454 function, run test_path_simplify to make sure you haven't broken a
1457 A previous version of this function was based on path_simplify()
1458 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1461 path_simplify (char *path)
1467 ++path; /* preserve the leading '/'. */
1470 end = p + strlen (p) + 1; /* position past the terminating zero. */
1475 /* P should point to the beginning of a path element. */
1477 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1479 /* Handle "./foo" by moving "foo" two characters to the
1481 if (*(p + 1) == '/')
1484 memmove (p, p + 2, end - p);
1495 else if (*p == '.' && *(p + 1) == '.'
1496 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1498 /* Handle "../foo" by moving "foo" one path element to the
1500 char *b = p; /* not p-1 because P can equal PATH */
1502 /* Backtrack by one path element, but not past the beginning
1505 /* foo/bar/../baz */
1511 /* Move backwards until B hits the beginning of the
1512 previous path element or the beginning of path. */
1513 for (--b; b > path && *(b - 1) != '/'; b--)
1518 if (*(p + 2) == '/')
1520 memmove (b, p + 3, end - (p + 3));
1534 /* Remove empty path elements. Not mandated by rfc1808 et
1535 al, but empty path elements are not all that useful, and
1536 the rest of Wget might not deal with them well. */
1546 memmove (p, q, end - q);
1551 /* Skip to the next path element. */
1552 while (*p && *p != '/')
1557 /* Make sure P points to the beginning of the next path element,
1558 which is location after the slash. */
1565 /* Resolve the result of "linking" a base URI (BASE) to a
1566 link-specified URI (LINK).
1568 Either of the URIs may be absolute or relative, complete with the
1569 host name, or path only. This tries to behave "reasonably" in all
1570 foreseeable cases. It employs little specific knowledge about
1571 schemes or URL-specific stuff -- it just works on strings.
1573 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1574 See uri_merge for a gentler interface to this functionality.
1576 Perhaps this function should call path_simplify so that the callers
1577 don't have to call url_parse unconditionally. */
1579 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1585 const char *end = base + path_length (base);
1589 /* Empty LINK points back to BASE, query string and all. */
1590 constr = xstrdup (base);
1592 else if (*link == '?')
1594 /* LINK points to the same location, but changes the query
1595 string. Examples: */
1596 /* uri_merge("path", "?new") -> "path?new" */
1597 /* uri_merge("path?foo", "?new") -> "path?new" */
1598 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1599 /* uri_merge("path#foo", "?new") -> "path?new" */
1600 int baselength = end - base;
1601 constr = xmalloc (baselength + linklength + 1);
1602 memcpy (constr, base, baselength);
1603 memcpy (constr + baselength, link, linklength);
1604 constr[baselength + linklength] = '\0';
1606 else if (*link == '#')
1608 /* uri_merge("path", "#new") -> "path#new" */
1609 /* uri_merge("path#foo", "#new") -> "path#new" */
1610 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1611 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1613 const char *end1 = strchr (base, '#');
1615 end1 = base + strlen (base);
1616 baselength = end1 - base;
1617 constr = xmalloc (baselength + linklength + 1);
1618 memcpy (constr, base, baselength);
1619 memcpy (constr + baselength, link, linklength);
1620 constr[baselength + linklength] = '\0';
1622 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1624 /* LINK begins with "//" and so is a net path: we need to
1625 replace everything after (and including) the double slash
1628 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1629 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1630 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1634 const char *start_insert;
1636 /* Look for first slash. */
1637 slash = memchr (base, '/', end - base);
1638 /* If found slash and it is a double slash, then replace
1639 from this point, else default to replacing from the
1641 if (slash && *(slash + 1) == '/')
1642 start_insert = slash;
1644 start_insert = base;
1646 span = start_insert - base;
1647 constr = (char *)xmalloc (span + linklength + 1);
1649 memcpy (constr, base, span);
1650 memcpy (constr + span, link, linklength);
1651 constr[span + linklength] = '\0';
1653 else if (*link == '/')
1655 /* LINK is an absolute path: we need to replace everything
1656 after (and including) the FIRST slash with LINK.
1658 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1659 "/qux/xyzzy", our result should be
1660 "http://host/qux/xyzzy". */
1663 const char *start_insert = NULL; /* for gcc to shut up. */
1664 const char *pos = base;
1665 int seen_slash_slash = 0;
1666 /* We're looking for the first slash, but want to ignore
1669 slash = memchr (pos, '/', end - pos);
1670 if (slash && !seen_slash_slash)
1671 if (*(slash + 1) == '/')
1674 seen_slash_slash = 1;
1678 /* At this point, SLASH is the location of the first / after
1679 "//", or the first slash altogether. START_INSERT is the
1680 pointer to the location where LINK will be inserted. When
1681 examining the last two examples, keep in mind that LINK
1684 if (!slash && !seen_slash_slash)
1685 /* example: "foo" */
1687 start_insert = base;
1688 else if (!slash && seen_slash_slash)
1689 /* example: "http://foo" */
1692 else if (slash && !seen_slash_slash)
1693 /* example: "foo/bar" */
1695 start_insert = base;
1696 else if (slash && seen_slash_slash)
1697 /* example: "http://something/" */
1699 start_insert = slash;
1701 span = start_insert - base;
1702 constr = (char *)xmalloc (span + linklength + 1);
1704 memcpy (constr, base, span);
1706 memcpy (constr + span, link, linklength);
1707 constr[span + linklength] = '\0';
1711 /* LINK is a relative URL: we need to replace everything
1712 after last slash (possibly empty) with LINK.
1714 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1715 our result should be "whatever/foo/qux/xyzzy". */
1716 int need_explicit_slash = 0;
1718 const char *start_insert;
1719 const char *last_slash = find_last_char (base, end, '/');
1722 /* No slash found at all. Append LINK to what we have,
1723 but we'll need a slash as a separator.
1725 Example: if base == "foo" and link == "qux/xyzzy", then
1726 we cannot just append link to base, because we'd get
1727 "fooqux/xyzzy", whereas what we want is
1730 To make sure the / gets inserted, we set
1731 need_explicit_slash to 1. We also set start_insert
1732 to end + 1, so that the length calculations work out
1733 correctly for one more (slash) character. Accessing
1734 that character is fine, since it will be the
1735 delimiter, '\0' or '?'. */
1736 /* example: "foo?..." */
1737 /* ^ ('?' gets changed to '/') */
1738 start_insert = end + 1;
1739 need_explicit_slash = 1;
1741 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1743 /* example: http://host" */
1745 start_insert = end + 1;
1746 need_explicit_slash = 1;
1750 /* example: "whatever/foo/bar" */
1752 start_insert = last_slash + 1;
1755 span = start_insert - base;
1756 constr = (char *)xmalloc (span + linklength + 1);
1758 memcpy (constr, base, span);
1759 if (need_explicit_slash)
1760 constr[span - 1] = '/';
1762 memcpy (constr + span, link, linklength);
1763 constr[span + linklength] = '\0';
1766 else /* !no_scheme */
1768 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_length = strlen (link);
  int link_lacks_scheme = !url_has_scheme (link);
  return uri_merge_1 (base, link, link_length, link_lacks_scheme);
}
/* Append string S to the buffer pointed to by P and advance P past
   the copied bytes.  Wrapped in do { } while (0) so the macro behaves
   as a single statement.  Note: S is evaluated twice.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
  p += len;					\
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1794 /* Recreate the URL string from the data in URL.
1796 If HIDE is non-zero (as it is when we're calling this on a URL we
1797 plan to print, but not when calling it to canonicalize a URL for
1798 use within the program), password will be hidden. Unsafe
1799 characters in the URL will be quoted. */
1802 url_string (const struct url *url, int hide_password)
1806 char *quoted_user = NULL, *quoted_passwd = NULL;
1808 int scheme_port = supported_schemes[url->scheme].default_port;
1809 char *scheme_str = supported_schemes[url->scheme].leading_string;
1810 int fplen = full_path_length (url);
1812 int brackets_around_host = 0;
1814 assert (scheme_str != NULL);
1816 /* Make sure the user name and password are quoted. */
1819 quoted_user = encode_string_maybe (url->user);
1823 quoted_passwd = HIDDEN_PASSWORD;
1825 quoted_passwd = encode_string_maybe (url->passwd);
1829 if (strchr (url->host, ':'))
1830 brackets_around_host = 1;
1832 size = (strlen (scheme_str)
1833 + strlen (url->host)
1834 + (brackets_around_host ? 2 : 0)
1837 if (url->port != scheme_port)
1838 size += 1 + numdigit (url->port);
1841 size += 1 + strlen (quoted_user);
1843 size += 1 + strlen (quoted_passwd);
1846 p = result = xmalloc (size);
1848 APPEND (p, scheme_str);
1851 APPEND (p, quoted_user);
1855 APPEND (p, quoted_passwd);
1860 if (brackets_around_host)
1862 APPEND (p, url->host);
1863 if (brackets_around_host)
1865 if (url->port != scheme_port)
1868 p = number_to_string (p, url->port);
1871 full_path_write (url, p);
1875 assert (p - result == size);
1877 if (quoted_user && quoted_user != url->user)
1878 xfree (quoted_user);
1879 if (quoted_passwd && !hide_password
1880 && quoted_passwd != url->passwd)
1881 xfree (quoted_passwd);
1886 /* Return the URL of the proxy appropriate for url U. */
1888 getproxy (struct url *u)
1891 char *rewritten_url;
1892 static char rewritten_storage[1024];
1896 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
1902 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1906 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1910 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1912 case SCHEME_INVALID:
1915 if (!proxy || !*proxy)
1918 /* Handle shorthands. `rewritten_storage' is a kludge to allow
1919 getproxy() to return static storage. */
1920 rewritten_url = rewrite_shorthand_url (proxy);
1923 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1924 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1925 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST is not excluded by the NO_PROXY suffix
   list.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  /* With no exclusion list, every host goes through the proxy.  */
  if (!no_proxy)
    return 1;
  return !sufmatch (no_proxy, host);
}
1941 /* Support for converting links for local viewing in downloaded HTML
1942 files. This should be moved to another file, because it has
1943 nothing to do with processing URLs. */
1945 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1946 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1948 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1949 const char *, int));
1950 static char *local_quote_string PARAMS ((const char *));
1952 /* Change the links in one HTML file. LINKS is a list of links in the
1953 document, along with their positions and the desired direction of
1956 convert_links (const char *file, struct urlpos *links)
1958 struct file_memory *fm;
1961 downloaded_file_t downloaded_file_return;
1963 struct urlpos *link;
1964 int to_url_count = 0, to_file_count = 0;
1966 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1969 /* First we do a "dry run": go through the list L and see whether
1970 any URL needs to be converted in the first place. If not, just
1971 leave the file alone. */
1973 struct urlpos *dry = links;
1974 for (dry = links; dry; dry = dry->next)
1975 if (dry->convert != CO_NOCONVERT)
1979 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1984 fm = read_file (file);
1987 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1988 file, strerror (errno));
1992 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1993 if (opt.backup_converted && downloaded_file_return)
1994 write_backup_file (file, downloaded_file_return);
1996 /* Before opening the file for writing, unlink the file. This is
1997 important if the data in FM is mmaped. In such case, nulling the
1998 file, which is what fopen() below does, would make us read all
1999 zeroes from the mmaped region. */
2000 if (unlink (file) < 0 && errno != ENOENT)
2002 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2003 file, strerror (errno));
2004 read_file_free (fm);
2007 /* Now open the file for writing. */
2008 fp = fopen (file, "wb");
2011 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2012 file, strerror (errno));
2013 read_file_free (fm);
2017 /* Here we loop through all the URLs in file, replacing those of
2018 them that are downloaded with relative references. */
2020 for (link = links; link; link = link->next)
2022 char *url_start = fm->content + link->pos;
2024 if (link->pos >= fm->length)
2026 DEBUGP (("Something strange is going on. Please investigate."));
2029 /* If the URL is not to be converted, skip it. */
2030 if (link->convert == CO_NOCONVERT)
2032 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2036 /* Echo the file contents, up to the offending URL's opening
2037 quote, to the outfile. */
2038 fwrite (p, 1, url_start - p, fp);
2041 switch (link->convert)
2043 case CO_CONVERT_TO_RELATIVE:
2044 /* Convert absolute URL to relative. */
2046 char *newname = construct_relative (file, link->local_name);
2047 char *quoted_newname = local_quote_string (newname);
2049 if (!link->link_refresh_p)
2050 p = replace_attr (p, link->size, fp, quoted_newname);
2052 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2053 link->refresh_timeout);
2055 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2056 link->url->url, newname, link->pos, file));
2058 xfree (quoted_newname);
2062 case CO_CONVERT_TO_COMPLETE:
2063 /* Convert the link to absolute URL. */
2065 char *newlink = link->url->url;
2066 char *quoted_newlink = html_quote_string (newlink);
2068 if (!link->link_refresh_p)
2069 p = replace_attr (p, link->size, fp, quoted_newlink);
2071 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2072 link->refresh_timeout);
2074 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2075 newlink, link->pos, file));
2076 xfree (quoted_newlink);
2080 case CO_NULLIFY_BASE:
2081 /* Change the base href to "". */
2082 p = replace_attr (p, link->size, fp, "");
2090 /* Output the rest of the file. */
2091 if (p - fm->content < fm->length)
2092 fwrite (p, 1, fm->length - (p - fm->content), fp);
2094 read_file_free (fm);
2096 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  /* An absolute target needs no relativizing.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  cnt = 0;
  /* Skip the directories common to both strings.  After this loop CNT
     indexes the first character past the last shared "dir/" prefix.  */
  for (i = 0; ; i++)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = i + 1;
      else
	break;
    }
  /* Count the directory separators remaining in S1; each one costs a
     "../" hop in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
2154 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2156 /* Rather than just writing over the original .html file with the
2157 converted version, save the former to *.orig. Note we only do
2158 this for files we've _successfully_ downloaded, so we don't
2159 clobber .orig files sitting around from previous invocations. */
2161 /* Construct the backup filename as the original name plus ".orig". */
2162 size_t filename_len = strlen(file);
2163 char* filename_plus_orig_suffix;
2164 boolean already_wrote_backup_file = FALSE;
2165 slist* converted_file_ptr;
2166 static slist* converted_files = NULL;
2168 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2170 /* Just write "orig" over "html". We need to do it this way
2171 because when we're checking to see if we've downloaded the
2172 file before (to see if we can skip downloading it), we don't
2173 know if it's a text/html file. Therefore we don't know yet
2174 at that stage that -E is going to cause us to tack on
2175 ".html", so we need to compare vs. the original URL plus
2176 ".orig", not the original URL plus ".html.orig". */
2177 filename_plus_orig_suffix = alloca (filename_len + 1);
2178 strcpy(filename_plus_orig_suffix, file);
2179 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2181 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2183 /* Append ".orig" to the name. */
2184 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2185 strcpy(filename_plus_orig_suffix, file);
2186 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2189 /* We can get called twice on the same URL thanks to the
2190 convert_all_links() call in main(). If we write the .orig file
2191 each time in such a case, it'll end up containing the first-pass
2192 conversion, not the original file. So, see if we've already been
2193 called on this file. */
2194 converted_file_ptr = converted_files;
2195 while (converted_file_ptr != NULL)
2196 if (strcmp(converted_file_ptr->string, file) == 0)
2198 already_wrote_backup_file = TRUE;
2202 converted_file_ptr = converted_file_ptr->next;
2204 if (!already_wrote_backup_file)
2206 /* Rename <file> to <file>.orig before former gets written over. */
2207 if (rename(file, filename_plus_orig_suffix) != 0)
2208 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2209 file, filename_plus_orig_suffix, strerror (errno));
2211 /* Remember that we've already written a .orig backup for this file.
2212 Note that we never free this memory since we need it till the
2213 convert_all_links() call, which is one of the last things the
2214 program does before terminating. BTW, I'm not sure if it would be
2215 safe to just set 'converted_file_ptr->string' to 'file' below,
2216 rather than making a copy of the string... Another note is that I
2217 thought I could just add a field to the urlpos structure saying
2218 that we'd written a .orig file for this URL, but that didn't work,
2219 so I had to make this separate list.
2220 -- Dan Harkless <wget@harkless.org>
2222 This [adding a field to the urlpos structure] didn't work
2223 because convert_file() is called from convert_all_links at
2224 the end of the retrieval with a freshly built new urlpos
2226 -- Hrvoje Niksic <hniksic@arsdigita.com>
2228 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2229 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2230 converted_file_ptr->next = converted_files;
2231 converted_files = converted_file_ptr;
2235 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT.  P points at
   the attribute value (optionally quoted) in the input buffer, SIZE
   is its length including any quotes, and FP is the output stream.
   Any fragment identifier present in the old value is preserved.
   Returns the position in the input just past the old value.  */
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char.  */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---    size   -->    (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;
  putc (quote_char, fp);

  return p;
}
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "TIMEOUT; URL=" to NEW_TEXT.  */
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Worst-case size: digits of TIMEOUT + "; URL=" + NEW_TEXT + '\0'.  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
					   + 6 /* "; URL=" */
					   + strlen (new_text)
					   + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  The
   '&' guard avoids mistaking SGML entities such as "&#38;" for
   fragments.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fallthrough */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
2329 /* Quote FILE for use as local reference to an HTML file.
2331 We quote ? as %3F to avoid passing part of the file name as the
2332 parameter when browsing the converted file through HTTP. However,
2333 it is safe to do this only when `--html-extension' is turned on.
2334 This is because converting "index.html?foo=bar" to
2335 "index.html%3Ffoo=bar" would break local browsing, as the latter
2336 isn't even recognized as an HTML file! However, converting
2337 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2338 safe for both local and HTTP-served browsing. */
2341 local_quote_string (const char *file)
2343 const char *file_sans_qmark;
2346 if (!opt.html_extension)
2347 return html_quote_string (file);
2349 qm = count_char (file, '?');
2353 const char *from = file;
2356 /* qm * 2 because we replace each question mark with "%3F",
2357 i.e. replace one char with three, hence two more. */
2358 int fsqlen = strlen (file) + qm * 2;
2360 to = newname = (char *)alloca (fsqlen + 1);
2361 for (; *from; from++)
2372 assert (to - newname == fsqlen);
2375 file_sans_qmark = newname;
2378 file_sans_qmark = file;
2380 return html_quote_string (file_sans_qmark);
2383 /* We're storing "modes" of type downloaded_file_t in the hash table.
2384 However, our hash tables only accept pointers for keys and values.
2385 So when we need a pointer, we use the address of a
2386 downloaded_file_t variable of static storage. */
2388 static downloaded_file_t *
2389 downloaded_mode_to_ptr (downloaded_file_t mode)
2391 static downloaded_file_t
2392 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2393 v2 = FILE_DOWNLOADED_NORMALLY,
2394 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2395 v4 = CHECK_FOR_FILE;
2399 case FILE_NOT_ALREADY_DOWNLOADED:
2401 case FILE_DOWNLOADED_NORMALLY:
2403 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2405 case CHECK_FOR_FILE:
2411 /* This should really be merged with dl_file_url_map and
2412 downloaded_html_files in recur.c. This was originally a list, but
2413 I changed it to a hash table beause it was actually taking a lot of
2414 time to find things in it. */
2416 static struct hash_table *downloaded_files_hash;
2418 /* Remembers which files have been downloaded. In the standard case, should be
2419 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2420 download successfully (i.e. not for ones we have failures on or that we skip
2423 When we've downloaded a file and tacked on a ".html" extension due to -E,
2424 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2425 FILE_DOWNLOADED_NORMALLY.
2427 If you just want to check if a file has been previously added without adding
2428 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2429 with local filenames, not remote URLs. */
2431 downloaded_file (downloaded_file_t mode, const char *file)
2433 downloaded_file_t *ptr;
2435 if (mode == CHECK_FOR_FILE)
2437 if (!downloaded_files_hash)
2438 return FILE_NOT_ALREADY_DOWNLOADED;
2439 ptr = hash_table_get (downloaded_files_hash, file);
2441 return FILE_NOT_ALREADY_DOWNLOADED;
2445 if (!downloaded_files_hash)
2446 downloaded_files_hash = make_string_hash_table (0);
2448 ptr = hash_table_get (downloaded_files_hash, file);
2452 ptr = downloaded_mode_to_ptr (mode);
2453 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2455 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper for downloaded_files_free: free the key (the
   strdup'ed file name).  Returning 0 keeps hash_table_map iterating
   over the remaining entries.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}
2466 downloaded_files_free (void)
2468 if (downloaded_files_hash)
2470 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2471 hash_table_destroy (downloaded_files_hash);
2472 downloaded_files_hash = NULL;
2476 /* Return non-zero if scheme a is similar to scheme b.
2478 Schemes are similar if they are equal. If SSL is supported, schemes
2479 are also similar if one is http (SCHEME_HTTP) and the other is https
2482 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2487 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2488 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify.  */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on TEST and complain on stdout if the result
   differs from EXPECTED_RESULT, or if whether the path was modified
   differs from EXPECTED_CHANGE (1 = we expect a modification, 0 = we
   expect none).  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* BUG FIX: the two diagnostics were swapped.  When
	 EXPECTED_CHANGE is 1 we expected path_simplify to modify the
	 path and it did not, so the correct complaint is "Expected
	 modification", and vice versa.  */
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else if (expected_change == 0)
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
2531 test_path_simplify (void)
2534 char *test, *result;
2540 { "foo", "foo", 0 },
2541 { "foo/bar", "foo/bar", 0 },
2542 { "foo///bar", "foo/bar", 1 },
2543 { "foo/.", "foo/", 1 },
2544 { "foo/./", "foo/", 1 },
2545 { "foo./", "foo./", 0 },
2546 { "foo/../bar", "bar", 1 },
2547 { "foo/../bar/", "bar/", 1 },
2548 { "foo/bar/..", "foo/", 1 },
2549 { "foo/bar/../x", "foo/x", 1 },
2550 { "foo/bar/../x/", "foo/x/", 1 },
2551 { "foo/..", "", 1 },
2552 { "foo/../..", "", 1 },
2553 { "a/b/../../c", "c", 1 },
2554 { "./a/../b", "b", 1 }
2558 for (i = 0; i < ARRAY_SIZE (tests); i++)
2560 char *test = tests[i].test;
2561 char *expected_result = tests[i].result;
2562 int expected_change = tests[i].should_modify;
2563 run_test (test, expected_result, expected_change);
2566 /* Now run all the tests with a leading slash before the test case,
2567 to prove that the slash is being preserved. */
2568 for (i = 0; i < ARRAY_SIZE (tests); i++)
2570 char *test, *expected_result;
2571 int expected_change = tests[i].should_modify;
2573 test = xmalloc (1 + strlen (tests[i].test) + 1);
2574 sprintf (test, "/%s", tests[i].test);
2576 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2577 sprintf (expected_result, "/%s", tests[i].result);
2579 run_test (test, expected_result, expected_change);
2582 xfree (expected_result);