   Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
   Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version. */
40 #include <sys/types.h>
/* Is X the string "." (a single dot)?  X is evaluated more than
   once, so it must be free of side effects.  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))

/* Is X the string ".." (two dots)?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Sizes, in bytes, of an IPv4 address, an IPv6 address, and a 16-bit
   IPv6 address group; used by the IPv6 address validator below.  */
static const int NS_INADDRSZ  = 4;
static const int NS_IN6ADDRSZ = 16;
static const int NS_INT16SZ   = 2;
74 /* Supported schemes: */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars, preserved from encoding.  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus some more.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

const static unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,   U,  0,  /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
	{
	copychar:
	  *t = *h;
	}
      else
	{
	  /* Do nothing if '%' is not followed by two hex digits. */
	  if (!*(h + 1) || !*(h + 2)
	      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
	    goto copychar;
	  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
	  h += 2;
	}
    }
  *t = '\0';
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;		/* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
	{
	  unsigned char c = *p1++;
	  *p2++ = '%';
	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
	}
      else
	*p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}
229 /* URL-escape the unsafe characters (see urlchr_table) in a given
230 string, returning a freshly allocated string. */
233 url_escape (const char *s)
235 return url_escape_1 (s, urlchr_unsafe, 0);
238 /* URL-escape the unsafe characters (see urlchr_table) in a given
239 string. If no characters are unsafe, S is returned. */
242 url_escape_allow_passthrough (const char *s)
244 return url_escape_1 (s, urlchr_unsafe, 1);
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */

static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
	{
	  /* %xx sequence: decode it, unless it would decode to an
	     unsafe or a reserved char; in that case, leave it as
	     is. */
	  char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	    XCHAR_TO_XDIGIT (*(p + 2));

	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
	    return CM_PASSTHROUGH;
	  else
	    return CM_DECODE;
	}
      else
	/* Garbled %.. sequence: encode `%'. */
	return CM_ENCODE;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return CM_ENCODE;
  else
    return CM_PASSTHROUGH;
}
279 /* Translate a %-escaped (but possibly non-conformant) input string S
280 into a %-escaped (and conformant) output string. If no characters
281 are encoded or decoded, return the same string S; otherwise, return
282 a freshly allocated string with the new contents.
284 After a URL has been run through this function, the protocols that
285 use `%' as the quote character can use the resulting string as-is,
286 while those that don't call url_unescape() to get to the intended
287 data. This function is also stable: after an input string is
288 transformed the first time, all further transformations of the
289 result yield the same result string.
291 Let's discuss why this function is needed.
293 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
294 space character would mess up the HTTP request, it needs to be
297 GET /abc%20def HTTP/1.0
299 It appears that the unsafe chars need to be quoted, for example
300 with url_escape. But what if we're requested to download
301 `abc%20def'? url_escape transforms "%" to "%25", which would leave
302 us with `abc%2520def'. This is incorrect -- since %-escapes are
303 part of URL syntax, "%20" is the correct way to denote a literal
304 space on the Wget command line. This leaves us in the conclusion
305 that in that case Wget should not call url_escape, but leave the
308 And what if the requested URI is `abc%20 def'? If we call
309 url_escape, we end up with `/abc%2520%20def', which is almost
310 certainly not intended. If we don't call url_escape, we are left
311 with the embedded space and cannot complete the request. What the
312 user meant was for Wget to request `/abc%20%20def', and this is
313 where reencode_escapes kicks in.
315 Wget used to solve this by first decoding %-quotes, and then
316 encoding all the "unsafe" characters found in the resulting string.
317 This was wrong because it didn't preserve certain URL special
318 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
319 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
320 whether we considered `+' reserved (it is). One of these results
321 is inevitable because by the second step we would lose information
322 on whether the `+' was originally encoded or not. Both results
323 were wrong because in CGI parameters + means space, while %2B means
324 literal plus. reencode_escapes correctly translates the above to
325 "a%2B+b", i.e. returns the original string.
327 This function uses an algorithm proposed by Anon Sricharoenchai:
329 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
332 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
335 ...except that this code conflates the two steps, and decides
336 whether to encode, decode, or pass through each character in turn.
337 The function still uses two passes, but their logic is the same --
338 the first pass exists merely for the sake of allocation. Another
339 small difference is that we include `+' to URL_RESERVED.
343 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
345 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
349 "foo bar" -> "foo%20bar"
350 "foo%20bar" -> "foo%20bar"
351 "foo %20bar" -> "foo%20%20bar"
352 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
353 "foo%25%20bar" -> "foo%25%20bar"
354 "foo%2%20bar" -> "foo%252%20bar"
355 "foo+bar" -> "foo+bar" (plus is reserved!)
356 "foo%2b+bar" -> "foo%2b+bar" */
359 reencode_escapes (const char *s)
365 int encode_count = 0;
366 int decode_count = 0;
368 /* First, pass through the string to see if there's anything to do,
369 and to calculate the new length. */
370 for (p1 = s; *p1; p1++)
372 switch (decide_copy_method (p1))
385 if (!encode_count && !decode_count)
386 /* The string is good as it is. */
387 return (char *)s; /* C const model sucks. */
390 /* Each encoding adds two characters (hex digits), while each
391 decoding removes two characters. */
392 newlen = oldlen + 2 * (encode_count - decode_count);
393 newstr = xmalloc (newlen + 1);
400 switch (decide_copy_method (p1))
404 unsigned char c = *p1++;
406 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
407 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
411 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
412 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
413 p1 += 3; /* skip %xx */
420 assert (p2 - newstr == newlen);
424 /* Returns the scheme type if the scheme is supported, or
425 SCHEME_INVALID if not. */
427 url_scheme (const char *url)
431 for (i = 0; supported_schemes[i].leading_string; i++)
432 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433 strlen (supported_schemes[i].leading_string)))
435 if (supported_schemes[i].enabled)
436 return (enum url_scheme) i;
438 return SCHEME_INVALID;
441 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */

int
url_skip_scheme (const char *url)
{
  const char *p = url;

  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc. */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  if (*p != ':')
    return 0;

  /* Skip ':'. */
  ++p;

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
    p += 2;

  return p - url;
}
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  */

int
url_has_scheme (const char *url)
{
  const char *p = url;
  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
479 scheme_default_port (enum url_scheme scheme)
481 return supported_schemes[scheme].default_port;
485 scheme_disable (enum url_scheme scheme)
487 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0.  */

static int
url_skip_uname (const char *url)
{
  const char *p;

  /* Look for '@' that comes before '/' or '?'. */
  p = (const char *)strpbrk (url, "/?@");
  if (!p || *p != '@')
    return 0;

  /* Skip past the '@' as well. */
  return p - url + 1;
}
/* Parse the "user[:password]" part of a URL, [STR, STR+LEN), into
   freshly allocated *USER and *PASSWD (the latter NULL if no ':' is
   present).  Both are URL-unescaped.  Returns 0 on failure (empty
   user name), non-zero on success.  */
static int
parse_uname (const char *str, int len, char **user, char **passwd)
{
  char *colon;

  /* Empty user name not allowed. */
  if (len == 0)
    return 0;

  colon = memchr (str, ':', len);
  if (colon == str)
    /* Empty user name again. */
    return 0;

  if (colon)
    {
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';
      /* Only the part before the colon is the user name. */
      len -= pwlen + 1;
    }
  else
    *passwd = NULL;

  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
  (*user)[len] = '\0';

  url_unescape (*user);
  if (*passwd)
    url_unescape (*passwd);

  return 1;
}
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.  */

char *
rewrite_shorthand_url (const char *url)
{
  const char *p;

  if (url_has_scheme (url))
    return NULL;

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter Netscape.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
    ;

  if (p == url)
    return NULL;

  if (*p == ':')
    {
      const char *pp;
      char *res;
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP.  */
      int digits = 0;
      for (pp = p + 1; ISDIGIT (*pp); pp++)
	++digits;
      if (digits > 0 && (*pp == '/' || *pp == '\0'))
	goto http;

      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';
      return res;
    }
  else
    {
      char *res;
    http:
      /* Just prepend "http://" to what we have. */
      res = xmalloc (7 + strlen (url) + 1);
      sprintf (res, "http://%s", url);
      return res;
    }
}
603 static void parse_path PARAMS ((const char *, char **, char **));
605 /* Like strpbrk, with the exception that it returns the pointer to the
606 terminating zero (end-of-string aka "eos") if no matching character
609 Although I normally balk at Gcc-specific optimizations, it probably
610 makes sense here: glibc has optimizations that detect strpbrk being
611 called with literal string as ACCEPT and inline the search. That
612 optimization is defeated if strpbrk is hidden within the call to
613 another function. (And no, making strpbrk_or_eos inline doesn't
614 help because the check for literal accept is in the
619 #define strpbrk_or_eos(s, accept) ({ \
620 char *SOE_p = strpbrk (s, accept); \
622 SOE_p = (char *)s + strlen (s); \
626 #else /* not __GNUC__ */
629 strpbrk_or_eos (const char *s, const char *accept)
631 char *p = strpbrk (s, accept);
633 p = (char *)s + strlen (s);
/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  */

static int
lowercase_str (char *str)
{
  int change = 0;
  for (; *str; str++)
    if (ISUPPER (*str))
      {
	change = 1;
	*str = TOLOWER (*str);
      }
  return change;
}
/* Error strings for url_parse; indexed by the PE_* constants defined
   alongside each entry.  NOTE(review): the strings for indices 0, 2,
   3 and 4 were reconstructed — confirm against the original.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  "No error",
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
  "Empty host",
#define PE_BAD_PORT_NUMBER 3
  "Bad port number",
#define PE_INVALID_USER_NAME 4
  "Invalid user name",
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED 6
  "IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS 7
  "Invalid IPv6 numeric address"
};

/* If P (an int pointer, possibly NULL) is non-NULL, store the error
   code V through it.  */
#define SETERR(p, v) do {			\
  if (p)					\
    *(p) = (v);					\
} while (0)
/* The following two functions were adapted from glibc. */

/* Return 1 if [STR, END) is a valid dotted-quad IPv4 address
   (exactly four octets, each 0-255), 0 otherwise.  */
static int
is_valid_ipv4_address (const char *str, const char *end)
{
  int saw_digit, octets;
  int val;

  saw_digit = 0;
  octets = 0;
  val = 0;

  while (str < end) {
    int ch = *str++;

    if (ch >= '0' && ch <= '9') {
      val = val * 10 + (ch - '0');

      if (val > 255)
	return 0;
      if (saw_digit == 0) {
	if (++octets > 4)
	  return 0;
	saw_digit = 1;
      }
    } else if (ch == '.' && saw_digit == 1) {
      if (octets == 4)
	return 0;
      val = 0;
      saw_digit = 0;
    } else
      return 0;
  }
  if (octets < 4)
    return 0;

  return 1;
}
719 is_valid_ipv6_address (const char *str, const char *end)
721 static const char xdigits[] = "0123456789abcdef";
734 /* Leading :: requires some special handling. */
738 if (str == end || *str != ':')
750 /* if ch is a number, add it to val. */
751 pch = strchr(xdigits, ch);
754 val |= (pch - xdigits);
761 /* if ch is a colon ... */
764 if (saw_xdigit == 0) {
769 } else if (str == end) {
772 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
780 /* if ch is a dot ... */
781 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
782 is_valid_ipv4_address(curtok, end) == 1) {
791 if (saw_xdigit == 1) {
792 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
797 if (colonp != NULL) {
798 if (tp == NS_IN6ADDRSZ)
803 if (tp != NS_IN6ADDRSZ)
812 Return a new struct url if successful, NULL on error. In case of
813 error, and if ERROR is not NULL, also set *ERROR to the appropriate
816 url_parse (const char *url, int *error)
820 int path_modified, host_modified;
822 enum url_scheme scheme;
824 const char *uname_b, *uname_e;
825 const char *host_b, *host_e;
826 const char *path_b, *path_e;
827 const char *params_b, *params_e;
828 const char *query_b, *query_e;
829 const char *fragment_b, *fragment_e;
832 char *user = NULL, *passwd = NULL;
836 scheme = url_scheme (url);
837 if (scheme == SCHEME_INVALID)
839 SETERR (error, PE_UNSUPPORTED_SCHEME);
843 url_encoded = reencode_escapes (url);
846 p += strlen (supported_schemes[scheme].leading_string);
848 p += url_skip_uname (p);
851 /* scheme://user:pass@host[:port]... */
854 /* We attempt to break down the URL into the components path,
855 params, query, and fragment. They are ordered like this:
857 scheme://host[:port][/path][;params][?query][#fragment] */
859 params_b = params_e = NULL;
860 query_b = query_e = NULL;
861 fragment_b = fragment_e = NULL;
867 /* Handle IPv6 address inside square brackets. Ideally we'd
868 just look for the terminating ']', but rfc2732 mandates
869 rejecting invalid IPv6 addresses. */
871 /* The address begins after '['. */
873 host_e = strchr (host_b, ']');
877 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
882 /* Check if the IPv6 address is valid. */
883 if (!is_valid_ipv6_address(host_b, host_e))
885 SETERR (error, PE_INVALID_IPV6_ADDRESS);
889 /* Continue parsing after the closing ']'. */
892 SETERR (error, PE_IPV6_NOT_SUPPORTED);
898 p = strpbrk_or_eos (p, ":/;?#");
902 if (host_b == host_e)
904 SETERR (error, PE_EMPTY_HOST);
908 port = scheme_default_port (scheme);
911 const char *port_b, *port_e, *pp;
913 /* scheme://host:port/tralala */
917 p = strpbrk_or_eos (p, "/;?#");
920 if (port_b == port_e)
922 /* http://host:/whatever */
924 SETERR (error, PE_BAD_PORT_NUMBER);
928 for (port = 0, pp = port_b; pp < port_e; pp++)
932 /* http://host:12randomgarbage/blah */
934 SETERR (error, PE_BAD_PORT_NUMBER);
938 port = 10 * port + (*pp - '0');
946 p = strpbrk_or_eos (p, ";?#");
951 /* Path is not allowed not to exist. */
959 p = strpbrk_or_eos (p, "?#");
966 p = strpbrk_or_eos (p, "#");
969 /* Hack that allows users to use '?' (a wildcard character) in
970 FTP URLs without it being interpreted as a query string
972 if (scheme == SCHEME_FTP)
974 query_b = query_e = NULL;
987 if (uname_b != uname_e)
989 /* http://user:pass@host */
991 /* uname_b uname_e */
992 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
994 SETERR (error, PE_INVALID_USER_NAME);
999 u = (struct url *)xmalloc (sizeof (struct url));
1000 memset (u, 0, sizeof (*u));
1003 u->host = strdupdelim (host_b, host_e);
1008 u->path = strdupdelim (path_b, path_e);
1009 path_modified = path_simplify (u->path);
1010 parse_path (u->path, &u->dir, &u->file);
1012 host_modified = lowercase_str (u->host);
1015 u->params = strdupdelim (params_b, params_e);
1017 u->query = strdupdelim (query_b, query_e);
1019 u->fragment = strdupdelim (fragment_b, fragment_e);
1021 if (path_modified || u->fragment || host_modified || path_b == path_e)
1023 /* If we suspect that a transformation has rendered what
1024 url_string might return different from URL_ENCODED, rebuild
1025 u->url using url_string. */
1026 u->url = url_string (u, 0);
1028 if (url_encoded != url)
1029 xfree ((char *) url_encoded);
1033 if (url_encoded == url)
1034 u->url = xstrdup (url);
1036 u->url = url_encoded;
1044 url_error (int error_code)
1046 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047 return parse_errors[error_code];
/* Parse PATH into dir and file.  PATH is extracted from the URL and
   is URL-escaped.  The function returns unescaped DIR and FILE,
   both freshly allocated.  */

static void
parse_path (const char *path, char **dir, char **file)
{
  const char *last_slash;

  last_slash = strrchr (path, '/');
  if (!last_slash)
    {
      /* No directory component at all. */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  url_unescape (*dir);
  url_unescape (*file);
}
1073 /* Note: URL's "full path" is the path with the query string and
1074 params appended. The "fragment" (#foo) is intentionally ignored,
1075 but that might be changed. For example, if the original URL was
1076 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1077 the full path will be "/foo/bar/baz;bullshit?querystring". */
1079 /* Return the length of the full path, without the terminating
1083 full_path_length (const struct url *url)
1087 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1098 /* Write out the full path. */
1101 full_path_write (const struct url *url, char *where)
1103 #define FROB(el, chr) do { \
1104 char *f_el = url->el; \
1106 int l = strlen (f_el); \
1108 memcpy (where, f_el, l); \
1120 /* Public function for getting the "full path". E.g. if u->path is
1121 "foo/bar" and u->query is "param=value", full_path will be
1122 "/foo/bar?param=value". */
1125 url_full_path (const struct url *url)
1127 int length = full_path_length (url);
1128 char *full_path = (char *)xmalloc(length + 1);
1130 full_path_write (url, full_path);
1131 full_path[length] = '\0';
1136 /* Escape unsafe and reserved characters, except for the slash
1140 url_escape_dir (const char *dir)
1142 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1147 /* Unescape slashes in NEWDIR. */
1149 h = newdir; /* hare */
1150 t = newdir; /* tortoise */
1152 for (; *h; h++, t++)
1154 if (*h == '%' && h[1] == '2' && h[2] == 'F')
1167 /* Sync u->path and u->url with u->dir and u->file. Called after
1168 u->file or u->dir have been changed, typically by the FTP code. */
1171 sync_path (struct url *u)
1173 char *newpath, *efile, *edir;
1177 /* u->dir and u->file are not escaped. URL-escape them before
1178 reassembling them into u->path. That way, if they contain
1179 separators like '?' or even if u->file contains slashes, the
1180 path will be correctly assembled. (u->file can contain slashes
1181 if the URL specifies it with %2f, or if an FTP server returns
1183 edir = url_escape_dir (u->dir);
1184 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1187 newpath = xstrdup (efile);
1190 int dirlen = strlen (edir);
1191 int filelen = strlen (efile);
1193 /* Copy "DIR/FILE" to newpath. */
1194 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1195 memcpy (p, edir, dirlen);
1198 memcpy (p, efile, filelen);
1207 if (efile != u->file)
1210 /* Regenerate u->url as well. */
1212 u->url = url_string (u, 0);
1215 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1216 This way we can sync u->path and u->url when they get changed. */
1219 url_set_dir (struct url *url, const char *newdir)
1222 url->dir = xstrdup (newdir);
1227 url_set_file (struct url *url, const char *newfile)
1230 url->file = xstrdup (newfile);
1235 url_free (struct url *url)
1241 FREE_MAYBE (url->params);
1242 FREE_MAYBE (url->query);
1243 FREE_MAYBE (url->fragment);
1244 FREE_MAYBE (url->user);
1245 FREE_MAYBE (url->passwd);
1254 get_urls_file (const char *file)
1256 struct file_memory *fm;
1257 struct urlpos *head, *tail;
1258 const char *text, *text_end;
1260 /* Load the file. */
1261 fm = read_file (file);
1264 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1267 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1271 text_end = fm->content + fm->length;
1272 while (text < text_end)
1274 const char *line_beg = text;
1275 const char *line_end = memchr (text, '\n', text_end - text);
1277 line_end = text_end;
1282 /* Strip whitespace from the beginning and end of line. */
1283 while (line_beg < line_end && ISSPACE (*line_beg))
1285 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1288 if (line_end > line_beg)
1290 /* URL is in the [line_beg, line_end) region. */
1294 struct urlpos *entry;
1297 /* We must copy the URL to a zero-terminated string, and we
1298 can't use alloca because we're in a loop. *sigh*. */
1299 url_text = strdupdelim (line_beg, line_end);
1303 /* Merge opt.base_href with URL. */
1304 char *merged = uri_merge (opt.base_href, url_text);
1309 url = url_parse (url_text, &up_error_code);
1312 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1313 file, url_text, url_error (up_error_code));
1319 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1320 memset (entry, 0, sizeof (*entry));
1331 read_file_free (fm);
1335 /* Free the linked list of urlpos. */
1337 free_urlpos (struct urlpos *l)
1341 struct urlpos *next = l->next;
1344 FREE_MAYBE (l->local_name);
1350 /* Rotate FNAME opt.backups times */
1352 rotate_backups(const char *fname)
1354 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1355 char *from = (char *)alloca (maxlen);
1356 char *to = (char *)alloca (maxlen);
1360 if (stat (fname, &sb) == 0)
1361 if (S_ISREG (sb.st_mode) == 0)
1364 for (i = opt.backups; i > 1; i--)
1366 sprintf (from, "%s.%d", fname, i - 1);
1367 sprintf (to, "%s.%d", fname, i);
1371 sprintf (to, "%s.%d", fname, 1);
1375 /* Create all the necessary directories for PATH (a file). Calls
1376 mkdirhier() internally. */
1378 mkalldirs (const char *path)
1385 p = path + strlen (path);
1386 for (; *p != '/' && p != path; p--)
1389 /* Don't create if it's just a file. */
1390 if ((p == path) && (*p != '/'))
1392 t = strdupdelim (path, p);
1394 /* Check whether the directory exists. */
1395 if ((stat (t, &st) == 0))
1397 if (S_ISDIR (st.st_mode))
1404 /* If the dir exists as a file name, remove it first. This
1405 is *only* for Wget to work with buggy old CERN http
1406 servers. Here is the scenario: When Wget tries to
1407 retrieve a directory without a slash, e.g.
1408 http://foo/bar (bar being a directory), CERN server will
1409 not redirect it too http://foo/bar/ -- it will generate a
1410 directory listing containing links to bar/file1,
1411 bar/file2, etc. Wget will lose because it saves this
1412 HTML listing to a file `bar', so it cannot create the
1413 directory. To work around this, if the file of the same
1414 name exists, we just remove it and create the directory
1416 DEBUGP (("Removing %s because of directory danger!\n", t));
1420 res = make_directory (t);
1422 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1427 /* Functions for constructing the file name out of URL components. */
1429 /* A growable string structure, used by url_file_name and friends.
1430 This should perhaps be moved to utils.c.
1432 The idea is to have a convenient and efficient way to construct a
1433 string by having various functions append data to it. Instead of
1434 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435 functions in questions, we pass the pointer to this struct. */
1443 /* Ensure that the string can accept APPEND_COUNT more characters past
1444 the current TAIL position. If necessary, this will grow the string
1445 and update its allocated size. If the string is already large
1446 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1447 #define GROW(g, append_size) do { \
1448 struct growable *G_ = g; \
1449 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1452 /* Return the tail position of the string. */
1453 #define TAIL(r) ((r)->base + (r)->tail)
1455 /* Move the tail position by APPEND_COUNT characters. */
1456 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1458 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1462 append_string (const char *str, struct growable *dest)
1464 int l = strlen (str);
1466 memcpy (TAIL (dest), str, l);
1467 TAIL_INCR (dest, l);
1470 /* Append CH to DEST. For example, append_char (0, DEST)
1471 zero-terminates DEST. */
1474 append_char (char ch, struct growable *dest)
1478 TAIL_INCR (dest, 1);
enum {
  filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
  filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4	/* a control character, e.g. 0-31 */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become important
   crucial.  Right now, it's better to be minimal in escaping.  */

const static unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names.  */
#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
   the quoted string to DEST.  Each character is quoted as per
   file_unsafe_char and the corresponding table.  */
append_uri_pathel (const char *b, const char *e, struct growable *dest)
  /* Choose the "unsafe for file names" character class based on the
     OS selected with --restrict-file-names; optionally also treat
     control characters as unsafe.  */
  if (opt.restrict_files_os == restrict_unix)
    mask = filechr_not_unix;
    mask = filechr_not_windows;
  if (opt.restrict_files_ctrl)
    mask |= filechr_control;
  /* Copy [b, e) to PATHEL and URL-unescape it. */
  BOUNDED_TO_ALLOCA (b, e, pathel);
  url_unescape (pathel);
  pathlen = strlen (pathel);
  /* Go through PATHEL and check how many characters we'll need to
     add for file quoting. */
  for (p = pathel; *p; p++)
    if (FILE_CHAR_TEST (*p, mask))
  /* p - pathel is the string length.  Each quoted char means two
     additional characters in the string, hence 2*quoted. */
  outlen = (p - pathel) + (2 * quoted);
  GROW (dest, outlen);
  /* If there's nothing to quote, we don't need to go through the
     string the second time. */
      memcpy (TAIL (dest), pathel, outlen);
      /* Second pass: copy safe characters verbatim and expand each
	 unsafe character to a three-character %XY escape.  */
      char *q = TAIL (dest);
      for (p = pathel; *p; p++)
	  if (!FILE_CHAR_TEST (*p, mask))
	      unsigned char ch = *p;
	      *q++ = XDIGIT_TO_XCHAR (ch >> 4);
	      *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
      /* Both passes must agree on the output length.  */
      assert (q - TAIL (dest) == outlen);
  TAIL_INCR (dest, outlen);
/* Append to DEST the directory structure that corresponds the
   directory part of URL's path.  For example, if the URL is
   http://server/dir1/dir2/file, this appends "/dir1/dir2".
   Each path element ("dir1" and "dir2" in the above example) is
   examined, url-unescaped, and re-escaped as file name element.
   Additionally, it cuts as many directories from the path as
   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
   will produce "bar" for the above example.  For 2 or more, it will
   Each component of the path is quoted for use as file name.  */
append_dir_structure (const struct url *u, struct growable *dest)
  char *pathel, *next;
  /* CUT counts how many leading path components remain to be dropped
     (--cut-dirs); presumably decremented inside the loop — confirm
     against the full function body.  */
  int cut = opt.cut_dirs;
  /* Go through the path components, de-URL-quote them, and quote them
     (if necessary) as file names.  */
  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
      /* Ignore empty pathels.  path_simplify should remove
	 occurrences of "//" from the path, but it has special cases
	 for starting / which generates an empty pathel here.  */
      append_char ('/', dest);
      append_uri_pathel (pathel, next, dest);
/* Return a unique file name that matches the given URL as good as
   possible.  Does not create directories on the file system.  */
url_file_name (const struct url *u)
  struct growable fnres;
  char *u_file, *u_query;
  char *fname, *unique;
  /* Start with the directory prefix, if specified. */
  if (!DOTP (opt.dir_prefix))
    append_string (opt.dir_prefix, &fnres);
  /* If "dirstruct" is turned on (typically the case with -r), add
     the host and port (unless those have been turned off) and
     directory structure.  */
      if (opt.add_hostdir)
	  append_char ('/', &fnres);
	  append_string (u->host, &fnres);
	  /* Non-default port is encoded into the file name, separated
	     by FN_PORT_SEP (':' on Unix, '+' on Windows).  */
	  if (u->port != scheme_default_port (u->scheme))
	      number_to_string (portstr, u->port);
	      append_char (FN_PORT_SEP, &fnres);
	      append_string (portstr, &fnres);
      append_dir_structure (u, &fnres);
  /* Add the file name. */
    append_char ('/', &fnres);
  u_file = *u->file ? u->file : "index.html";
  append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
  /* Append "?query" to the file name. */
  u_query = u->query && *u->query ? u->query : NULL;
      append_char (FN_QUERY_SEP, &fnres);
      append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
  /* Zero-terminate the file name. */
  append_char ('\0', &fnres);
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.
     The exception is the case when file does exist and is a
     directory (see `mkalldirs' for explanation).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (fname) && !file_non_directory_p (fname)))
      /* Otherwise, construct a name that does not collide with an
	 existing file.  */
      unique = unique_name (fname, 1);
      if (unique != fname)
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
path_length (const char *url)
  /* strpbrk_or_eos points Q at the first delimiter or at the
     terminating NUL, so the path length is presumably Q - URL.  */
  const char *q = strpbrk_or_eos (url, "?;#");
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */
/* Scans backwards from E-1 toward B; used by uri_merge_1 to locate
   the last '/' of BASE's path.  */
find_last_char (const char *b, const char *e, char c)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Leading and
   trailing slashes are preserved.
   Return non-zero if any changes have been made.
   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
path_simplify (char *path)
    ++path;			/* preserve the leading '/'. */
  end = p + strlen (p) + 1;	/* position past the terminating zero. */
      /* P should point to the beginning of a path element. */
      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
	  /* Handle "./foo" by moving "foo" two characters to the
	  if (*(p + 1) == '/')
	      /* memmove, not memcpy: source and destination overlap.  */
	      memmove (p, p + 2, end - p);
      else if (*p == '.' && *(p + 1) == '.'
	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
	  /* Handle "../foo" by moving "foo" one path element to the
	  char *b = p;		/* not p-1 because P can equal PATH */
	  /* Backtrack by one path element, but not past the beginning
	  /* foo/bar/../baz */
	  /* Move backwards until B hits the beginning of the
	     previous path element or the beginning of path. */
	  for (--b; b > path && *(b - 1) != '/'; b--)
	  if (*(p + 2) == '/')
	      memmove (b, p + 3, end - (p + 3));
	  /* Remove empty path elements.  Not mandated by rfc1808 et
	     al, but it seems like a good idea to get rid of them.
	     Supporting them properly is hard (in which directory do
	     you save http://x.com///y.html?) and they don't seem to
	  memmove (p, q, end - q);
      /* Skip to the next path element. */
      while (*p && *p != '/')
      /* Make sure P points to the beginning of the next path element,
	 which is location after the slash. */
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).
   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.
   The parameters LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.
   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
      /* END marks the end of BASE's path; any query/fragment tail
	 begins there (see path_length).  */
      const char *end = base + path_length (base);
	  /* Empty LINK points back to BASE, query string and all. */
	  constr = xstrdup (base);
      else if (*link == '?')
	  /* LINK points to the same location, but changes the query
	     string.  Examples: */
	  /* uri_merge("path", "?new") -> "path?new" */
	  /* uri_merge("path?foo", "?new") -> "path?new" */
	  /* uri_merge("path?foo#bar", "?new") -> "path?new" */
	  /* uri_merge("path#foo", "?new") -> "path?new" */
	  int baselength = end - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
      else if (*link == '#')
	  /* uri_merge("path", "#new") -> "path#new" */
	  /* uri_merge("path#foo", "#new") -> "path#new" */
	  /* uri_merge("path?foo", "#new") -> "path?foo#new" */
	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
	  const char *end1 = strchr (base, '#');
	    end1 = base + strlen (base);
	  baselength = end1 - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
      else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
	  /* LINK begins with "//" and so is a net path: we need to
	     replace everything after (and including) the double slash
	  /* uri_merge("foo", "//new/bar") -> "//new/bar" */
	  /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
	  /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
	  const char *start_insert;
	  /* Look for first slash. */
	  slash = memchr (base, '/', end - base);
	  /* If found slash and it is a double slash, then replace
	     from this point, else default to replacing from the
	  if (slash && *(slash + 1) == '/')
	    start_insert = slash;
	    start_insert = base;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
      else if (*link == '/')
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.
	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
		seen_slash_slash = 1;
	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    start_insert = slash;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.
	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.
		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	      /* example: "http://host" */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	      /* example: "whatever/foo/bar" */
	      start_insert = last_slash + 1;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
  else /* !no_scheme */
      /* LINK carries its own scheme and is therefore absolute: return
	 a copy of LINK unchanged.  */
      constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
/* Caller owns (and must free) the returned string.  */
uri_merge (const char *base, const char *link)
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
2090 #define APPEND(p, s) do { \
2091 int len = strlen (s); \
2092 memcpy (p, s, len); \
2096 /* Use this instead of password when the actual password is supposed
2097 to be hidden. We intentionally use a generic string without giving
2098 away the number of characters in the password, like previous
2100 #define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.
   If HIDE is non-zero (as it is when we're calling this on a URL we
   plan to print, but not when calling it to canonicalize a URL for
   use within the program), password will be hidden.  Unsafe
   characters in the URL will be quoted.  */
url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;
  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);
  int brackets_around_host = 0;
  assert (scheme_str != NULL);
  /* Make sure the user name and password are quoted. */
      quoted_user = url_escape_allow_passthrough (url->user);
	  /* When hiding, substitute a fixed placeholder so the output
	     does not leak the password or its length.  */
	  quoted_passwd = HIDDEN_PASSWORD;
	  quoted_passwd = url_escape_allow_passthrough (url->passwd);
  /* A ':' in the host (IPv6 numeric address) requires [] brackets
     to disambiguate it from the port separator.  */
  if (strchr (url->host, ':'))
    brackets_around_host = 1;
  /* Precompute the exact output size so a single xmalloc suffices;
     verified by the assert below.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
	  + (brackets_around_host ? 2 : 0)
  if (url->port != scheme_port)
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);
  p = result = xmalloc (size);
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  if (brackets_around_host)
  APPEND (p, url->host);
  if (brackets_around_host)
  if (url->port != scheme_port)
      p = number_to_string (p, url->port);
  full_path_write (url, p);
  assert (p - result == size);
  /* Free the escaped copies, but only when escaping actually
     allocated (url_escape_allow_passthrough may return its argument)
     and the password is not the static HIDDEN_PASSWORD literal.  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Return the URL of the proxy appropriate for url U.  */
/* NOTE(review): may return REWRITTEN_STORAGE, a static buffer, so the
   result is only valid until the next call and the function is not
   reentrant.  */
getproxy (struct url *u)
  char *rewritten_url;
  static char rewritten_storage[1024];
  /* Respect --no-proxy / no_proxy matches for this host.  */
  if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:
  if (!proxy || !*proxy)
  /* Handle shorthands.  `rewritten_storage' is a kludge to allow
     getproxy() to return static storage. */
  rewritten_url = rewrite_shorthand_url (proxy);
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      /* strncpy does not guarantee termination; force it.  */
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?  */
/* Returns non-zero when HOST should go through the proxy, i.e. when
   no suffix in the NO_PROXY list matches it.  */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
2249 /* Support for converting links for local viewing in downloaded HTML
2250 files. This should be moved to another file, because it has
2251 nothing to do with processing URLs. */
2253 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2254 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2256 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2257 const char *, int));
2258 static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  /* Read the whole file into memory (possibly mmaped).  */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);
  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;
      /* Sanity check: a link position past the buffer indicates a
	 stale or corrupted LINKS list.  */
      if (link->pos >= fm->length)
	  DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
	  DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
      /* Echo the file contents, up to the offending URL's opening
	 quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      switch (link->convert)
	case CO_CONVERT_TO_RELATIVE:
	  /* Convert absolute URL to relative. */
	    char *newname = construct_relative (file, link->local_name);
	    char *quoted_newname = local_quote_string (newname);
	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newname);
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
					     link->refresh_timeout);
	    DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		     link->url->url, newname, link->pos, file));
	    xfree (quoted_newname);
	case CO_CONVERT_TO_COMPLETE:
	  /* Convert the link to absolute URL. */
	    char *newlink = link->url->url;
	    char *quoted_newlink = html_quote_string (newlink);
	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newlink);
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
					     link->refresh_timeout);
	    DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		     newlink, link->pos, file));
	    xfree (quoted_newlink);
	case CO_NULLIFY_BASE:
	  /* Change the base href to "". */
	  p = replace_attr (p, link->size, fp, "");
  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.
   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".
   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');
  /* Skip the directories common to both strings. */
  while (s1[i] && s2[i]
	 /* CNT presumably tracks the index just past the last common
	    '/' — confirm against the elided loop body.  */
	 if (s1[i] == '/' && s2[i] == '/')
  /* Count the remaining directory separators in S1; each one becomes
     a "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2. */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */
  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;
  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
	 because when we're checking to see if we've downloaded the
	 file before (to see if we can skip downloading it), we don't
	 know if it's a text/html file.  Therefore we don't know yet
	 at that stage that -E is going to cause us to tack on
	 ".html", so we need to compare vs. the original URL plus
	 ".orig", not the original URL plus ".html.orig". */
      /* NOTE(review): the "- 4" overwrite assumes FILE ends in "html"
	 (which the -E branch implies) — confirm callers guarantee it.  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
	already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;
  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));
      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>
	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called from convert_all_links at
	 the end of the retrieval with a freshly built new urlpos
	 -- Hrvoje Niksic <hniksic@arsdigita.com>
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
2543 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value in the source buffer (opening quote
   included, if any); SIZE is its length.  Writes the replacement to
   FP, preserving the original quoting style and any #fragment, and
   returns the position in the source buffer just past the value.  */
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;
  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     or:
       <---  size -->        (no quotes)   */
  if (*p == '\"' || *p == '\'')
      size -= 2;		/* disregard opening and closing quote */
  putc (quote_char, fp);
  fputs (new_text, fp);
  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   append "timeout_value; URL=" before the next_text.  */
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
  /* Buffer sized for the decimal timeout plus the "; URL=" glue and
     NEW_TEXT (remaining size terms not shown here).  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
  return replace_attr (p, size, fp, new_with_timeout);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.
   This is used for finding the fragment identifiers in URLs.  (The
   '&' guard avoids mistaking SGML entities like "&#32;" for
   fragments.)  */
find_fragment (const char *beg, int size, const char **bp, const char **ep)
  const char *end = beg + size;
  for (; beg < end; beg++)
/* Quote FILE for use as local reference to an HTML file.
   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.  */
local_quote_string (const char *file)
  const char *file_sans_qmark;
  if (!opt.html_extension)
    return html_quote_string (file);
  /* Count the question marks; each one expands to the 3-char "%3F".  */
  qm = count_char (file, '?');
      const char *from = file;
      /* qm * 2 because we replace each question mark with "%3F",
	 i.e. replace one char with three, hence two more. */
      int fsqlen = strlen (file) + qm * 2;
      to = newname = (char *)alloca (fsqlen + 1);
      for (; *from; from++)
      assert (to - newname == fsqlen);
      file_sans_qmark = newname;
    file_sans_qmark = file;
  return html_quote_string (file_sans_qmark);
/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */
/* Maps each enum value to a stable pointer suitable for storing in
   (and comparing out of) the hash table.  */
static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
    v4 = CHECK_FOR_FILE;
    case FILE_NOT_ALREADY_DOWNLOADED:
    case FILE_DOWNLOADED_NORMALLY:
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
    case CHECK_FOR_FILE:
2719 /* This should really be merged with dl_file_url_map and
2720 downloaded_html_files in recur.c. This was originally a list, but
2721 I changed it to a hash table beause it was actually taking a lot of
2722 time to find things in it. */
2724 static struct hash_table *downloaded_files_hash;
2726 /* Remembers which files have been downloaded. In the standard case, should be
2727 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2728 download successfully (i.e. not for ones we have failures on or that we skip
2731 When we've downloaded a file and tacked on a ".html" extension due to -E,
2732 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2733 FILE_DOWNLOADED_NORMALLY.
2735 If you just want to check if a file has been previously added without adding
2736 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2737 with local filenames, not remote URLs. */
2739 downloaded_file (downloaded_file_t mode, const char *file)
2741 downloaded_file_t *ptr;
2743 if (mode == CHECK_FOR_FILE)
2745 if (!downloaded_files_hash)
2746 return FILE_NOT_ALREADY_DOWNLOADED;
2747 ptr = hash_table_get (downloaded_files_hash, file);
2749 return FILE_NOT_ALREADY_DOWNLOADED;
2753 if (!downloaded_files_hash)
2754 downloaded_files_hash = make_string_hash_table (0);
2756 ptr = hash_table_get (downloaded_files_hash, file);
2760 ptr = downloaded_mode_to_ptr (mode);
2761 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2763 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback: frees one key (the xstrdup'ed file name);
   values are pointers to static storage and are not freed.  */
df_free_mapper (void *key, void *value, void *ignored)
/* Dispose of the downloaded-files table and everything it owns.  */
downloaded_files_free (void)
  if (downloaded_files_hash)
      hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;
/* Return non-zero if scheme a is similar to scheme b.
   Schemes are similar if they are equal.  If SSL is supported, schemes
   are also similar if one is http (SCHEME_HTTP) and the other is https
schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
  /* http and https are considered interchangeable for the purpose of
     this comparison (only compiled in with SSL support).  */
  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify.  */
/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  */
  /* Works on a copy so the caller's PATH is left untouched.  */
  char *copy = xstrdup (path);
  path_simplify (copy);
/* Run path_simplify on a copy of TEST and verify that the result
   equals EXPECTED_RESULT and that the returned modification flag
   matches EXPECTED_CHANGE.  Prints a diagnostic for each mismatch;
   used only by test_path_simplify.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Fix: these two diagnostics were swapped.  When a modification
	 was expected (expected_change == 1) but path_simplify reported
	 none, the old code printed "Expected no modification", and
	 vice versa.  */
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify on a table of fixed cases, twice: once as
   given, and once with a leading '/' to prove the slash is kept.  */
test_path_simplify (void)
  char *test, *result;
    /* Each entry: input, expected output, expected "modified" flag.  */
    { "foo",			"foo",		0 },
    { "foo/bar",		"foo/bar",	0 },
    { "foo///bar",		"foo/bar",	1 },
    { "foo/.",			"foo/",		1 },
    { "foo/./",			"foo/",		1 },
    { "foo./",			"foo./",	0 },
    { "foo/../bar",		"bar",		1 },
    { "foo/../bar/",		"bar/",		1 },
    { "foo/bar/..",		"foo/",		1 },
    { "foo/bar/../x",		"foo/x",	1 },
    { "foo/bar/../x/",		"foo/x/",	1 },
    { "foo/..",			"",		1 },
    { "foo/../..",		"",		1 },
    { "a/b/../../c",		"c",		1 },
    { "./a/../b",		"b",		1 }
  for (i = 0; i < ARRAY_SIZE (tests); i++)
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
  /* Now run all the tests with a leading slash before the test case,
     to prove that the slash is being preserved.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
      char *test, *expected_result;
      int expected_change = tests[i].should_modify;
      /* 1 extra byte for the leading '/', 1 for the NUL.  */
      test = xmalloc (1 + strlen (tests[i].test) + 1);
      sprintf (test, "/%s", tests[i].test);
      expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
      sprintf (expected_result, "/%s", tests[i].result);
      run_test (test, expected_result, expected_change);
      xfree (expected_result);