sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50 #include "host.h"
  51 #include "hash.h"
  52
  53 #ifndef errno
  54 extern int errno;
  55 #endif
  56
  57 /* Is X "."?  */
  58 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  59 /* Is X ".."?  */
  60 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  61
  62 static const int NS_INADDRSZ  = 4;
  63 static const int NS_IN6ADDRSZ = 16;
  64 static const int NS_INT16SZ = 2;
  65
  66
  67 struct scheme_data
  68 {
  69   char *leading_string;
  70   int default_port;
  71   int enabled;
  72 };
  73
  74 /* Supported schemes: */
  75 static struct scheme_data supported_schemes[] =
  76 {
  77   { "http://",  DEFAULT_HTTP_PORT,  1 },
  78 #ifdef HAVE_SSL
  79   { "https://", DEFAULT_HTTPS_PORT, 1 },
  80 #endif
  81   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  82
  83   /* SCHEME_INVALID */
  84   { NULL,       -1,                 0 }
  85 };
  86
  87 /* Forward declarations: */
  88
  89 static char *construct_relative PARAMS ((const char *, const char *));
  90 static int path_simplify PARAMS ((char *));
  91
  92
  93 \f
  94 /* Support for encoding and decoding of URL strings.  We determine
  95    whether a character is unsafe through static table lookup.  This
  96    code assumes ASCII character set and 8-bit chars.  */
  97
  98 enum {
  99   /* rfc1738 reserved chars, preserved from encoding.  */
 100   urlchr_reserved = 1,
 101
 102   /* rfc1738 unsafe chars, plus some more.  */
 103   urlchr_unsafe   = 2
 104 };
 105
 106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 107 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 108 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 109
 110 /* Shorthands for the table: */
 111 #define R  urlchr_reserved
 112 #define U  urlchr_unsafe
 113 #define RU R|U
 114
 115 const static unsigned char urlchr_table[256] =
 116 {
 117   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 118   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 119   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 120   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 121   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 122   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 123   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 124   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 125  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 126   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 127   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 128   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 129   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 130   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 132   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 133
 134   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 135   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 136   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 137   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 138
 139   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143 };
 144 #undef R
 145 #undef U
 146 #undef RU
 147
 148 /* URL-unescape the string S.
 149
 150    This is done by transforming the sequences "%HH" to the character
 151    represented by the hexadecimal digits HH.  If % is not followed by
 152    two hexadecimal digits, it is inserted literally.
 153
 154    The transformation is done in place.  If you need the original
 155    string intact, make a copy before calling this function.  */
 156
 157 static void
 158 url_unescape (char *s)
 159 {
 160   char *t = s;                  /* t - tortoise */
 161   char *h = s;                  /* h - hare     */
 162
 163   for (; *h; h++, t++)
 164     {
 165       if (*h != '%')
 166         {
 167         copychar:
 168           *t = *h;
 169         }
 170       else
 171         {
 172           /* Do nothing if '%' is not followed by two hex digits. */
 173           if (!*(h + 1) || !*(h + 2)
 174               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 175             goto copychar;
 176           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 177           h += 2;
 178         }
 179     }
 180   *t = '\0';
 181 }
 182
 183 /* The core of url_escape_* functions.  Escapes the characters that
 184    match the provided mask in urlchr_table.
 185
 186    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 187    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 188    freshly allocated string will be returned in all cases.  */
 189
 190 static char *
 191 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 192 {
 193   const char *p1;
 194   char *p2, *newstr;
 195   int newlen;
 196   int addition = 0;
 197
 198   for (p1 = s; *p1; p1++)
 199     if (urlchr_test (*p1, mask))
 200       addition += 2;            /* Two more characters (hex digits) */
 201
 202   if (!addition)
 203     return allow_passthrough ? (char *)s : xstrdup (s);
 204
 205   newlen = (p1 - s) + addition;
 206   newstr = (char *)xmalloc (newlen + 1);
 207
 208   p1 = s;
 209   p2 = newstr;
 210   while (*p1)
 211     {
 212       /* Quote the characters that match the test mask. */
 213       if (urlchr_test (*p1, mask))
 214         {
 215           unsigned char c = *p1++;
 216           *p2++ = '%';
 217           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 218           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 219         }
 220       else
 221         *p2++ = *p1++;
 222     }
 223   assert (p2 - newstr == newlen);
 224   *p2 = '\0';
 225
 226   return newstr;
 227 }
 228
 229 /* URL-escape the unsafe characters (see urlchr_table) in a given
 230    string, returning a freshly allocated string.  */
 231
 232 char *
 233 url_escape (const char *s)
 234 {
 235   return url_escape_1 (s, urlchr_unsafe, 0);
 236 }
 237
 238 /* URL-escape the unsafe characters (see urlchr_table) in a given
 239    string.  If no characters are unsafe, S is returned.  */
 240
 241 static char *
 242 url_escape_allow_passthrough (const char *s)
 243 {
 244   return url_escape_1 (s, urlchr_unsafe, 1);
 245 }
 246 \f
 247 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 248
 249 /* Decide whether to encode, decode, or pass through the char at P.
 250    This used to be a macro, but it got a little too convoluted.  */
 251 static inline enum copy_method
 252 decide_copy_method (const char *p)
 253 {
 254   if (*p == '%')
 255     {
 256       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 257         {
 258           /* %xx sequence: decode it, unless it would decode to an
 259              unsafe or a reserved char; in that case, leave it as
 260              is. */
 261           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 262             XCHAR_TO_XDIGIT (*(p + 2));
 263
 264           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 265             return CM_PASSTHROUGH;
 266           else
 267             return CM_DECODE;
 268         }
 269       else
 270         /* Garbled %.. sequence: encode `%'. */
 271         return CM_ENCODE;
 272     }
 273   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 274     return CM_ENCODE;
 275   else
 276     return CM_PASSTHROUGH;
 277 }
 278
 279 /* Translate a %-escaped (but possibly non-conformant) input string S
 280    into a %-escaped (and conformant) output string.  If no characters
 281    are encoded or decoded, return the same string S; otherwise, return
 282    a freshly allocated string with the new contents.
 283
 284    After a URL has been run through this function, the protocols that
 285    use `%' as the quote character can use the resulting string as-is,
 286    while those that don't call url_unescape() to get to the intended
 287    data.  This function is also stable: after an input string is
 288    transformed the first time, all further transformations of the
 289    result yield the same result string.
 290
 291    Let's discuss why this function is needed.
 292
 293    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 294    space character would mess up the HTTP request, it needs to be
 295    quoted, like this:
 296
 297        GET /abc%20def HTTP/1.0
 298
 299    It appears that the unsafe chars need to be quoted, for example
 300    with url_escape.  But what if we're requested to download
 301    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 302    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 303    part of URL syntax, "%20" is the correct way to denote a literal
 304    space on the Wget command line.  This leaves us in the conclusion
 305    that in that case Wget should not call url_escape, but leave the
 306    `%20' as is.
 307
 308    And what if the requested URI is `abc%20 def'?  If we call
 309    url_escape, we end up with `/abc%2520%20def', which is almost
 310    certainly not intended.  If we don't call url_escape, we are left
 311    with the embedded space and cannot complete the request.  What the
 312    user meant was for Wget to request `/abc%20%20def', and this is
 313    where reencode_escapes kicks in.
 314
 315    Wget used to solve this by first decoding %-quotes, and then
 316    encoding all the "unsafe" characters found in the resulting string.
 317    This was wrong because it didn't preserve certain URL special
 318    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 319    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 320    whether we considered `+' reserved (it is).  One of these results
 321    is inevitable because by the second step we would lose information
 322    on whether the `+' was originally encoded or not.  Both results
 323    were wrong because in CGI parameters + means space, while %2B means
 324    literal plus.  reencode_escapes correctly translates the above to
 325    "a%2B+b", i.e. returns the original string.
 326
 327    This function uses an algorithm proposed by Anon Sricharoenchai:
 328
 329    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 330       hexdigits.
 331
 332    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 333       "+".
 334
 335    ...except that this code conflates the two steps, and decides
 336    whether to encode, decode, or pass through each character in turn.
 337    The function still uses two passes, but their logic is the same --
 338    the first pass exists merely for the sake of allocation.  Another
 339    small difference is that we include `+' to URL_RESERVED.
 340
 341    Anon's test case:
 342
 343    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 344    ->
 345    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 346
 347    Simpler test cases:
 348
 349    "foo bar"         -> "foo%20bar"
 350    "foo%20bar"       -> "foo%20bar"
 351    "foo %20bar"      -> "foo%20%20bar"
 352    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 353    "foo%25%20bar"    -> "foo%25%20bar"
 354    "foo%2%20bar"     -> "foo%252%20bar"
 355    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 356    "foo%2b+bar"      -> "foo%2b+bar"  */
 357
 358 static char *
 359 reencode_escapes (const char *s)
 360 {
 361   const char *p1;
 362   char *newstr, *p2;
 363   int oldlen, newlen;
 364
 365   int encode_count = 0;
 366   int decode_count = 0;
 367
 368   /* First, pass through the string to see if there's anything to do,
 369      and to calculate the new length.  */
 370   for (p1 = s; *p1; p1++)
 371     {
 372       switch (decide_copy_method (p1))
 373         {
 374         case CM_ENCODE:
 375           ++encode_count;
 376           break;
 377         case CM_DECODE:
 378           ++decode_count;
 379           break;
 380         case CM_PASSTHROUGH:
 381           break;
 382         }
 383     }
 384
 385   if (!encode_count && !decode_count)
 386     /* The string is good as it is. */
 387     return (char *)s;           /* C const model sucks. */
 388
 389   oldlen = p1 - s;
 390   /* Each encoding adds two characters (hex digits), while each
 391      decoding removes two characters.  */
 392   newlen = oldlen + 2 * (encode_count - decode_count);
 393   newstr = xmalloc (newlen + 1);
 394
 395   p1 = s;
 396   p2 = newstr;
 397
 398   while (*p1)
 399     {
 400       switch (decide_copy_method (p1))
 401         {
 402         case CM_ENCODE:
 403           {
 404             unsigned char c = *p1++;
 405             *p2++ = '%';
 406             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 407             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 408           }
 409           break;
 410         case CM_DECODE:
 411           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 412                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 413           p1 += 3;              /* skip %xx */
 414           break;
 415         case CM_PASSTHROUGH:
 416           *p2++ = *p1++;
 417         }
 418     }
 419   *p2 = '\0';
 420   assert (p2 - newstr == newlen);
 421   return newstr;
 422 }
 423 \f
 424 /* Returns the scheme type if the scheme is supported, or
 425    SCHEME_INVALID if not.  */
 426 enum url_scheme
 427 url_scheme (const char *url)
 428 {
 429   int i;
 430
 431   for (i = 0; supported_schemes[i].leading_string; i++)
 432     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 433                           strlen (supported_schemes[i].leading_string)))
 434       {
 435         if (supported_schemes[i].enabled)
 436           return (enum url_scheme) i;
 437         else
 438           return SCHEME_INVALID;
 439       }
 440
 441   return SCHEME_INVALID;
 442 }
 443
 444 /* Return the number of characters needed to skip the scheme part of
 445    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 446 int
 447 url_skip_scheme (const char *url)
 448 {
 449   const char *p = url;
 450
 451   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 452      etc. */
 453   while (ISALNUM (*p) || *p == '-' || *p == '+')
 454     ++p;
 455   if (*p != ':')
 456     return 0;
 457   /* Skip ':'. */
 458   ++p;
 459
 460   /* Skip "//" if found. */
 461   if (*p == '/' && *(p + 1) == '/')
 462     p += 2;
 463
 464   return p - url;
 465 }
 466
 467 /* Returns 1 if the URL begins with a scheme (supported or
 468    unsupported), 0 otherwise.  */
 469 int
 470 url_has_scheme (const char *url)
 471 {
 472   const char *p = url;
 473   while (ISALNUM (*p) || *p == '-' || *p == '+')
 474     ++p;
 475   return *p == ':';
 476 }
 477
 478 int
 479 scheme_default_port (enum url_scheme scheme)
 480 {
 481   return supported_schemes[scheme].default_port;
 482 }
 483
 484 void
 485 scheme_disable (enum url_scheme scheme)
 486 {
 487   supported_schemes[scheme].enabled = 0;
 488 }
 489
 490 /* Skip the username and password, if present here.  The function
 491    should be called *not* with the complete URL, but with the part
 492    right after the scheme.
 493
 494    If no username and password are found, return 0.  */
 495 int
 496 url_skip_uname (const char *url)
 497 {
 498   const char *p;
 499
 500   /* Look for '@' that comes before '/' or '?'. */
 501   p = (const char *)strpbrk (url, "/?@");
 502   if (!p || *p != '@')
 503     return 0;
 504
 505   return p - url + 1;
 506 }
 507
 508 static int
 509 parse_uname (const char *str, int len, char **user, char **passwd)
 510 {
 511   char *colon;
 512
 513   if (len == 0)
 514     /* Empty user name not allowed. */
 515     return 0;
 516
 517   colon = memchr (str, ':', len);
 518   if (colon == str)
 519     /* Empty user name again. */
 520     return 0;
 521
 522   if (colon)
 523     {
 524       int pwlen = len - (colon + 1 - str);
 525       *passwd = xmalloc (pwlen + 1);
 526       memcpy (*passwd, colon + 1, pwlen);
 527       (*passwd)[pwlen] = '\0';
 528       len -= pwlen + 1;
 529     }
 530   else
 531     *passwd = NULL;
 532
 533   *user = xmalloc (len + 1);
 534   memcpy (*user, str, len);
 535   (*user)[len] = '\0';
 536
 537   if (*user)
 538     url_unescape (*user);
 539   if (*passwd)
 540     url_unescape (*passwd);
 541
 542   return 1;
 543 }
 544
 545 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 546    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 547
 548    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 549    www.foo.com[:port]            -> http://www.foo.com[:port]
 550
 551    FTP shorthands look like this:
 552
 553    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 554    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 555
 556    If the URL needs not or cannot be rewritten, return NULL.  */
 557 char *
 558 rewrite_shorthand_url (const char *url)
 559 {
 560   const char *p;
 561
 562   if (url_has_scheme (url))
 563     return NULL;
 564
 565   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 566      latter Netscape.  */
 567   for (p = url; *p && *p != ':' && *p != '/'; p++)
 568     ;
 569
 570   if (p == url)
 571     return NULL;
 572
 573   if (*p == ':')
 574     {
 575       const char *pp;
 576       char *res;
 577       /* If the characters after the colon and before the next slash
 578          or end of string are all digits, it's HTTP.  */
 579       int digits = 0;
 580       for (pp = p + 1; ISDIGIT (*pp); pp++)
 581         ++digits;
 582       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 583         goto http;
 584
 585       /* Prepend "ftp://" to the entire URL... */
 586       res = xmalloc (6 + strlen (url) + 1);
 587       sprintf (res, "ftp://%s", url);
 588       /* ...and replace ':' with '/'. */
 589       res[6 + (p - url)] = '/';
 590       return res;
 591     }
 592   else
 593     {
 594       char *res;
 595     http:
 596       /* Just prepend "http://" to what we have. */
 597       res = xmalloc (7 + strlen (url) + 1);
 598       sprintf (res, "http://%s", url);
 599       return res;
 600     }
 601 }
 602 \f
 603 static void parse_path PARAMS ((const char *, char **, char **));
 604
 605 /* Like strpbrk, with the exception that it returns the pointer to the
 606    terminating zero (end-of-string aka "eos") if no matching character
 607    is found.
 608
 609    Although I normally balk at Gcc-specific optimizations, it probably
 610    makes sense here: glibc has optimizations that detect strpbrk being
 611    called with literal string as ACCEPT and inline the search.  That
 612    optimization is defeated if strpbrk is hidden within the call to
 613    another function.  (And no, making strpbrk_or_eos inline doesn't
 614    help because the check for literal accept is in the
 615    preprocessor.)  */
 616
 617 #ifdef __GNUC__
 618
 619 #define strpbrk_or_eos(s, accept) ({            \
 620   char *SOE_p = strpbrk (s, accept);            \
 621   if (!SOE_p)                                   \
 622     SOE_p = (char *)s + strlen (s);             \
 623   SOE_p;                                        \
 624 })
 625
 626 #else  /* not __GNUC__ */
 627
 628 static char *
 629 strpbrk_or_eos (const char *s, const char *accept)
 630 {
 631   char *p = strpbrk (s, accept);
 632   if (!p)
 633     p = (char *)s + strlen (s);
 634   return p;
 635 }
 636 #endif
 637
 638 /* Turn STR into lowercase; return non-zero if a character was
 639    actually changed. */
 640
 641 static int
 642 lowercase_str (char *str)
 643 {
 644   int change = 0;
 645   for (; *str; str++)
 646     if (ISUPPER (*str))
 647       {
 648         change = 1;
 649         *str = TOLOWER (*str);
 650       }
 651   return change;
 652 }
 653
 654 static char *parse_errors[] = {
 655 #define PE_NO_ERROR                     0
 656   "No error",
 657 #define PE_UNSUPPORTED_SCHEME           1
 658   "Unsupported scheme",
 659 #define PE_EMPTY_HOST                   2
 660   "Empty host",
 661 #define PE_BAD_PORT_NUMBER              3
 662   "Bad port number",
 663 #define PE_INVALID_USER_NAME            4
 664   "Invalid user name",
 665 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 666   "Unterminated IPv6 numeric address",
 667 #define PE_IPV6_NOT_SUPPORTED           6
 668   "IPv6 addresses not supported",
 669 #define PE_INVALID_IPV6_ADDRESS         7
 670   "Invalid IPv6 numeric address"
 671 };
 672
 673 #define SETERR(p, v) do {                       \
 674   if (p)                                        \
 675     *(p) = (v);                                 \
 676 } while (0)
 677
 678 #ifdef ENABLE_IPV6
 679 /* The following two functions were adapted from glibc. */
 680
 681 static int
 682 is_valid_ipv4_address (const char *str, const char *end)
 683 {
 684   int saw_digit, octets;
 685   int val;
 686
 687   saw_digit = 0;
 688   octets = 0;
 689   val = 0;
 690
 691   while (str < end) {
 692     int ch = *str++;
 693
 694     if (ch >= '0' && ch <= '9') {
 695       val = val * 10 + (ch - '0');
 696
 697       if (val > 255)
 698         return 0;
 699       if (saw_digit == 0) {
 700         if (++octets > 4)
 701           return 0;
 702         saw_digit = 1;
 703       }
 704     } else if (ch == '.' && saw_digit == 1) {
 705       if (octets == 4)
 706         return 0;
 707       val = 0;
 708       saw_digit = 0;
 709     } else
 710       return 0;
 711   }
 712   if (octets < 4)
 713     return 0;
 714
 715   return 1;
 716 }
 717
 718 static int
 719 is_valid_ipv6_address (const char *str, const char *end)
 720 {
 721   static const char xdigits[] = "0123456789abcdef";
 722   const char *curtok;
 723   int tp;
 724   const char *colonp;
 725   int saw_xdigit;
 726   unsigned int val;
 727
 728   tp = 0;
 729   colonp = NULL;
 730
 731   if (str == end)
 732     return 0;
 733
 734   /* Leading :: requires some special handling. */
 735   if (*str == ':')
 736     {
 737       ++str;
 738       if (str == end || *str != ':')
 739         return 0;
 740     }
 741
 742   curtok = str;
 743   saw_xdigit = 0;
 744   val = 0;
 745
 746   while (str < end) {
 747     int ch = *str++;
 748     const char *pch;
 749
 750     /* if ch is a number, add it to val. */
 751     pch = strchr(xdigits, ch);
 752     if (pch != NULL) {
 753       val <<= 4;
 754       val |= (pch - xdigits);
 755       if (val > 0xffff)
 756         return 0;
 757       saw_xdigit = 1;
 758       continue;
 759     }
 760
 761     /* if ch is a colon ... */
 762     if (ch == ':') {
 763       curtok = str;
 764       if (saw_xdigit == 0) {
 765         if (colonp != NULL)
 766           return 0;
 767         colonp = str + tp;
 768         continue;
 769       } else if (str == end) {
 770         return 0;
 771       }
 772       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 773         return 0;
 774       tp += NS_INT16SZ;
 775       saw_xdigit = 0;
 776       val = 0;
 777       continue;
 778     }
 779
 780     /* if ch is a dot ... */
 781     if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
 782         is_valid_ipv4_address(curtok, end) == 1) {
 783       tp += NS_INADDRSZ;
 784       saw_xdigit = 0;
 785       break;
 786     }
 787
 788     return 0;
 789   }
 790
 791   if (saw_xdigit == 1) {
 792     if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 793       return 0;
 794     tp += NS_INT16SZ;
 795   }
 796
 797   if (colonp != NULL) {
 798     if (tp == NS_IN6ADDRSZ)
 799       return 0;
 800     tp = NS_IN6ADDRSZ;
 801   }
 802
 803   if (tp != NS_IN6ADDRSZ)
 804     return 0;
 805
 806   return 1;
 807 }
 808 #endif
 809
 810 /* Parse a URL.
 811
 812    Return a new struct url if successful, NULL on error.  In case of
 813    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 814    error code. */
 815 struct url *
 816 url_parse (const char *url, int *error)
 817 {
 818   struct url *u;
 819   const char *p;
 820   int path_modified, host_modified;
 821
 822   enum url_scheme scheme;
 823
 824   const char *uname_b,     *uname_e;
 825   const char *host_b,      *host_e;
 826   const char *path_b,      *path_e;
 827   const char *params_b,    *params_e;
 828   const char *query_b,     *query_e;
 829   const char *fragment_b,  *fragment_e;
 830
 831   int port;
 832   char *user = NULL, *passwd = NULL;
 833
 834   char *url_encoded;
 835
 836   scheme = url_scheme (url);
 837   if (scheme == SCHEME_INVALID)
 838     {
 839       SETERR (error, PE_UNSUPPORTED_SCHEME);
 840       return NULL;
 841     }
 842
 843   url_encoded = reencode_escapes (url);
 844   p = url_encoded;
 845
 846   p += strlen (supported_schemes[scheme].leading_string);
 847   uname_b = p;
 848   p += url_skip_uname (p);
 849   uname_e = p;
 850
 851   /* scheme://user:pass@host[:port]... */
 852   /*                    ^              */
 853
 854   /* We attempt to break down the URL into the components path,
 855      params, query, and fragment.  They are ordered like this:
 856
 857        scheme://host[:port][/path][;params][?query][#fragment]  */
 858
 859   params_b   = params_e   = NULL;
 860   query_b    = query_e    = NULL;
 861   fragment_b = fragment_e = NULL;
 862
 863   host_b = p;
 864
 865   if (*p == '[')
 866     {
 867       /* Handle IPv6 address inside square brackets.  Ideally we'd
 868          just look for the terminating ']', but rfc2732 mandates
 869          rejecting invalid IPv6 addresses.  */
 870
 871       /* The address begins after '['. */
 872       host_b = p + 1;
 873       host_e = strchr (host_b, ']');
 874
 875       if (!host_e)
 876         {
 877           SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
 878           return NULL;
 879         }
 880
 881 #ifdef ENABLE_IPV6
 882       /* Check if the IPv6 address is valid. */
 883       if (!is_valid_ipv6_address(host_b, host_e))
 884         {
 885           SETERR (error, PE_INVALID_IPV6_ADDRESS);
 886           return NULL;
 887         }
 888
 889       /* Continue parsing after the closing ']'. */
 890       p = host_e + 1;
 891 #else
 892       SETERR (error, PE_IPV6_NOT_SUPPORTED);
 893       return NULL;
 894 #endif
 895     }
 896   else
 897     {
 898       p = strpbrk_or_eos (p, ":/;?#");
 899       host_e = p;
 900     }
 901
 902   if (host_b == host_e)
 903     {
 904       SETERR (error, PE_EMPTY_HOST);
 905       return NULL;
 906     }
 907
 908   port = scheme_default_port (scheme);
 909   if (*p == ':')
 910     {
 911       const char *port_b, *port_e, *pp;
 912
 913       /* scheme://host:port/tralala */
 914       /*              ^             */
 915       ++p;
 916       port_b = p;
 917       p = strpbrk_or_eos (p, "/;?#");
 918       port_e = p;
 919
 920       if (port_b == port_e)
 921         {
 922           /* http://host:/whatever */
 923           /*             ^         */
 924           SETERR (error, PE_BAD_PORT_NUMBER);
 925           return NULL;
 926         }
 927
 928       for (port = 0, pp = port_b; pp < port_e; pp++)
 929         {
 930           if (!ISDIGIT (*pp))
 931             {
 932               /* http://host:12randomgarbage/blah */
 933               /*               ^                  */
 934               SETERR (error, PE_BAD_PORT_NUMBER);
 935               return NULL;
 936             }
 937
 938           port = 10 * port + (*pp - '0');
 939         }
 940     }
 941
 942   if (*p == '/')
 943     {
 944       ++p;
 945       path_b = p;
 946       p = strpbrk_or_eos (p, ";?#");
 947       path_e = p;
 948     }
 949   else
 950     {
 951       /* Path is not allowed not to exist. */
 952       path_b = path_e = p;
 953     }
 954
 955   if (*p == ';')
 956     {
 957       ++p;
 958       params_b = p;
 959       p = strpbrk_or_eos (p, "?#");
 960       params_e = p;
 961     }
 962   if (*p == '?')
 963     {
 964       ++p;
 965       query_b = p;
 966       p = strpbrk_or_eos (p, "#");
 967       query_e = p;
 968
 969       /* Hack that allows users to use '?' (a wildcard character) in
 970          FTP URLs without it being interpreted as a query string
 971          delimiter.  */
 972       if (scheme == SCHEME_FTP)
 973         {
 974           query_b = query_e = NULL;
 975           path_e = p;
 976         }
 977     }
 978   if (*p == '#')
 979     {
 980       ++p;
 981       fragment_b = p;
 982       p += strlen (p);
 983       fragment_e = p;
 984     }
 985   assert (*p == 0);
 986
 987   if (uname_b != uname_e)
 988     {
 989       /* http://user:pass@host */
 990       /*        ^         ^    */
 991       /*     uname_b   uname_e */
 992       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 993         {
 994           SETERR (error, PE_INVALID_USER_NAME);
 995           return NULL;
 996         }
 997     }
 998
 999   u = (struct url *)xmalloc (sizeof (struct url));
1000   memset (u, 0, sizeof (*u));
1001
1002   u->scheme = scheme;
1003   u->host   = strdupdelim (host_b, host_e);
1004   u->port   = port;
1005   u->user   = user;
1006   u->passwd = passwd;
1007
1008   u->path = strdupdelim (path_b, path_e);
1009   path_modified = path_simplify (u->path);
1010   parse_path (u->path, &u->dir, &u->file);
1011
1012   host_modified = lowercase_str (u->host);
1013
1014   if (params_b)
1015     u->params = strdupdelim (params_b, params_e);
1016   if (query_b)
1017     u->query = strdupdelim (query_b, query_e);
1018   if (fragment_b)
1019     u->fragment = strdupdelim (fragment_b, fragment_e);
1020
1021   if (path_modified || u->fragment || host_modified || path_b == path_e)
1022     {
1023       /* If we suspect that a transformation has rendered what
1024          url_string might return different from URL_ENCODED, rebuild
1025          u->url using url_string.  */
1026       u->url = url_string (u, 0);
1027
1028       if (url_encoded != url)
1029         xfree ((char *) url_encoded);
1030     }
1031   else
1032     {
1033       if (url_encoded == url)
1034         u->url = xstrdup (url);
1035       else
1036         u->url = url_encoded;
1037     }
1038   url_encoded = NULL;
1039
1040   return u;
1041 }
1042
1043 const char *
1044 url_error (int error_code)
1045 {
1046   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047   return parse_errors[error_code];
1048 }
1049
1050 /* Parse PATH into dir and file.  PATH is extracted from the URL and
1051    is URL-escaped.  The function returns unescaped DIR and FILE.  */
1052
1053 static void
1054 parse_path (const char *path, char **dir, char **file)
1055 {
1056   char *last_slash;
1057
1058   last_slash = strrchr (path, '/');
1059   if (!last_slash)
1060     {
1061       *dir = xstrdup ("");
1062       *file = xstrdup (path);
1063     }
1064   else
1065     {
1066       *dir = strdupdelim (path, last_slash);
1067       *file = xstrdup (last_slash + 1);
1068     }
1069   url_unescape (*dir);
1070   url_unescape (*file);
1071 }
1072
1073 /* Note: URL's "full path" is the path with the query string and
1074    params appended.  The "fragment" (#foo) is intentionally ignored,
1075    but that might be changed.  For example, if the original URL was
1076    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1077    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1078
1079 /* Return the length of the full path, without the terminating
1080    zero.  */
1081
1082 static int
1083 full_path_length (const struct url *url)
1084 {
1085   int len = 0;
1086
1087 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1088
1089   FROB (path);
1090   FROB (params);
1091   FROB (query);
1092
1093 #undef FROB
1094
1095   return len;
1096 }
1097
1098 /* Write out the full path. */
1099
1100 static void
1101 full_path_write (const struct url *url, char *where)
1102 {
1103 #define FROB(el, chr) do {                      \
1104   char *f_el = url->el;                         \
1105   if (f_el) {                                   \
1106     int l = strlen (f_el);                      \
1107     *where++ = chr;                             \
1108     memcpy (where, f_el, l);                    \
1109     where += l;                                 \
1110   }                                             \
1111 } while (0)
1112
1113   FROB (path, '/');
1114   FROB (params, ';');
1115   FROB (query, '?');
1116
1117 #undef FROB
1118 }
1119
1120 /* Public function for getting the "full path".  E.g. if u->path is
1121    "foo/bar" and u->query is "param=value", full_path will be
1122    "/foo/bar?param=value". */
1123
1124 char *
1125 url_full_path (const struct url *url)
1126 {
1127   int length = full_path_length (url);
1128   char *full_path = (char *)xmalloc(length + 1);
1129
1130   full_path_write (url, full_path);
1131   full_path[length] = '\0';
1132
1133   return full_path;
1134 }
1135
1136 /* Escape unsafe and reserved characters, except for the slash
1137    characters.  */
1138
1139 static char *
1140 url_escape_dir (const char *dir)
1141 {
1142   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1143   char *h, *t;
1144   if (newdir == dir)
1145     return (char *)dir;
1146
1147   /* Unescape slashes in NEWDIR. */
1148
1149   h = newdir;                   /* hare */
1150   t = newdir;                   /* tortoise */
1151
1152   for (; *h; h++, t++)
1153     {
1154       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1155         {
1156           *t = '/';
1157           h += 2;
1158         }
1159       else
1160         *t = *h;
1161     }
1162   *t = '\0';
1163
1164   return newdir;
1165 }
1166
1167 /* Sync u->path and u->url with u->dir and u->file.  Called after
1168    u->file or u->dir have been changed, typically by the FTP code.  */
1169
1170 static void
1171 sync_path (struct url *u)
1172 {
1173   char *newpath, *efile, *edir;
1174
1175   xfree (u->path);
1176
1177   /* u->dir and u->file are not escaped.  URL-escape them before
1178      reassembling them into u->path.  That way, if they contain
1179      separators like '?' or even if u->file contains slashes, the
1180      path will be correctly assembled.  (u->file can contain slashes
1181      if the URL specifies it with %2f, or if an FTP server returns
1182      it.)  */
1183   edir = url_escape_dir (u->dir);
1184   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1185
1186   if (!*edir)
1187     newpath = xstrdup (efile);
1188   else
1189     {
1190       int dirlen = strlen (edir);
1191       int filelen = strlen (efile);
1192
1193       /* Copy "DIR/FILE" to newpath. */
1194       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1195       memcpy (p, edir, dirlen);
1196       p += dirlen;
1197       *p++ = '/';
1198       memcpy (p, efile, filelen);
1199       p += filelen;
1200       *p++ = '\0';
1201     }
1202
1203   u->path = newpath;
1204
1205   if (edir != u->dir)
1206     xfree (edir);
1207   if (efile != u->file)
1208     xfree (efile);
1209
1210   /* Regenerate u->url as well.  */
1211   xfree (u->url);
1212   u->url = url_string (u, 0);
1213 }
1214
1215 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1216    This way we can sync u->path and u->url when they get changed.  */
1217
1218 void
1219 url_set_dir (struct url *url, const char *newdir)
1220 {
1221   xfree (url->dir);
1222   url->dir = xstrdup (newdir);
1223   sync_path (url);
1224 }
1225
1226 void
1227 url_set_file (struct url *url, const char *newfile)
1228 {
1229   xfree (url->file);
1230   url->file = xstrdup (newfile);
1231   sync_path (url);
1232 }
1233
1234 void
1235 url_free (struct url *url)
1236 {
1237   xfree (url->host);
1238   xfree (url->path);
1239   xfree (url->url);
1240
1241   FREE_MAYBE (url->params);
1242   FREE_MAYBE (url->query);
1243   FREE_MAYBE (url->fragment);
1244   FREE_MAYBE (url->user);
1245   FREE_MAYBE (url->passwd);
1246
1247   xfree (url->dir);
1248   xfree (url->file);
1249
1250   xfree (url);
1251 }
1252 \f
1253 struct urlpos *
1254 get_urls_file (const char *file)
1255 {
1256   struct file_memory *fm;
1257   struct urlpos *head, *tail;
1258   const char *text, *text_end;
1259
1260   /* Load the file.  */
1261   fm = read_file (file);
1262   if (!fm)
1263     {
1264       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1265       return NULL;
1266     }
1267   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1268
1269   head = tail = NULL;
1270   text = fm->content;
1271   text_end = fm->content + fm->length;
1272   while (text < text_end)
1273     {
1274       const char *line_beg = text;
1275       const char *line_end = memchr (text, '\n', text_end - text);
1276       if (!line_end)
1277         line_end = text_end;
1278       else
1279         ++line_end;
1280       text = line_end;
1281
1282       /* Strip whitespace from the beginning and end of line. */
1283       while (line_beg < line_end && ISSPACE (*line_beg))
1284         ++line_beg;
1285       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1286         --line_end;
1287
1288       if (line_end > line_beg)
1289         {
1290           /* URL is in the [line_beg, line_end) region. */
1291
1292           int up_error_code;
1293           char *url_text;
1294           struct urlpos *entry;
1295           struct url *url;
1296
1297           /* We must copy the URL to a zero-terminated string, and we
1298              can't use alloca because we're in a loop.  *sigh*.  */
1299           url_text = strdupdelim (line_beg, line_end);
1300
1301           if (opt.base_href)
1302             {
1303               /* Merge opt.base_href with URL. */
1304               char *merged = uri_merge (opt.base_href, url_text);
1305               xfree (url_text);
1306               url_text = merged;
1307             }
1308
1309           url = url_parse (url_text, &up_error_code);
1310           if (!url)
1311             {
1312               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1313                          file, url_text, url_error (up_error_code));
1314               xfree (url_text);
1315               continue;
1316             }
1317           xfree (url_text);
1318
1319           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1320           memset (entry, 0, sizeof (*entry));
1321           entry->next = NULL;
1322           entry->url = url;
1323
1324           if (!head)
1325             head = entry;
1326           else
1327             tail->next = entry;
1328           tail = entry;
1329         }
1330     }
1331   read_file_free (fm);
1332   return head;
1333 }
1334 \f
1335 /* Free the linked list of urlpos.  */
1336 void
1337 free_urlpos (struct urlpos *l)
1338 {
1339   while (l)
1340     {
1341       struct urlpos *next = l->next;
1342       if (l->url)
1343         url_free (l->url);
1344       FREE_MAYBE (l->local_name);
1345       xfree (l);
1346       l = next;
1347     }
1348 }
1349
1350 /* Rotate FNAME opt.backups times */
1351 void
1352 rotate_backups(const char *fname)
1353 {
1354   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1355   char *from = (char *)alloca (maxlen);
1356   char *to = (char *)alloca (maxlen);
1357   struct stat sb;
1358   int i;
1359
1360   if (stat (fname, &sb) == 0)
1361     if (S_ISREG (sb.st_mode) == 0)
1362       return;
1363
1364   for (i = opt.backups; i > 1; i--)
1365     {
1366       sprintf (from, "%s.%d", fname, i - 1);
1367       sprintf (to, "%s.%d", fname, i);
1368       rename (from, to);
1369     }
1370
1371   sprintf (to, "%s.%d", fname, 1);
1372   rename(fname, to);
1373 }
1374
1375 /* Create all the necessary directories for PATH (a file).  Calls
1376    mkdirhier() internally.  */
1377 int
1378 mkalldirs (const char *path)
1379 {
1380   const char *p;
1381   char *t;
1382   struct stat st;
1383   int res;
1384
1385   p = path + strlen (path);
1386   for (; *p != '/' && p != path; p--)
1387     ;
1388
1389   /* Don't create if it's just a file.  */
1390   if ((p == path) && (*p != '/'))
1391     return 0;
1392   t = strdupdelim (path, p);
1393
1394   /* Check whether the directory exists.  */
1395   if ((stat (t, &st) == 0))
1396     {
1397       if (S_ISDIR (st.st_mode))
1398         {
1399           xfree (t);
1400           return 0;
1401         }
1402       else
1403         {
1404           /* If the dir exists as a file name, remove it first.  This
1405              is *only* for Wget to work with buggy old CERN http
1406              servers.  Here is the scenario: When Wget tries to
1407              retrieve a directory without a slash, e.g.
1408              http://foo/bar (bar being a directory), CERN server will
1409              not redirect it too http://foo/bar/ -- it will generate a
1410              directory listing containing links to bar/file1,
1411              bar/file2, etc.  Wget will lose because it saves this
1412              HTML listing to a file `bar', so it cannot create the
1413              directory.  To work around this, if the file of the same
1414              name exists, we just remove it and create the directory
1415              anyway.  */
1416           DEBUGP (("Removing %s because of directory danger!\n", t));
1417           unlink (t);
1418         }
1419     }
1420   res = make_directory (t);
1421   if (res != 0)
1422     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1423   xfree (t);
1424   return res;
1425 }
1426 \f
1427 /* Functions for constructing the file name out of URL components.  */
1428
1429 /* A growable string structure, used by url_file_name and friends.
1430    This should perhaps be moved to utils.c.
1431
1432    The idea is to have a convenient and efficient way to construct a
1433    string by having various functions append data to it.  Instead of
1434    passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435    functions in questions, we pass the pointer to this struct.  */
1436
1437 struct growable {
1438   char *base;
1439   int size;
1440   int tail;
1441 };
1442
1443 /* Ensure that the string can accept APPEND_COUNT more characters past
1444    the current TAIL position.  If necessary, this will grow the string
1445    and update its allocated size.  If the string is already large
1446    enough to take TAIL+APPEND_COUNT characters, this does nothing.  */
1447 #define GROW(g, append_size) do {                                       \
1448   struct growable *G_ = g;                                              \
1449   DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);        \
1450 } while (0)
1451
1452 /* Return the tail position of the string. */
1453 #define TAIL(r) ((r)->base + (r)->tail)
1454
1455 /* Move the tail position by APPEND_COUNT characters. */
1456 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1457
1458 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1459    terminated.  */
1460
1461 static void
1462 append_string (const char *str, struct growable *dest)
1463 {
1464   int l = strlen (str);
1465   GROW (dest, l);
1466   memcpy (TAIL (dest), str, l);
1467   TAIL_INCR (dest, l);
1468 }
1469
1470 /* Append CH to DEST.  For example, append_char (0, DEST)
1471    zero-terminates DEST.  */
1472
1473 static void
1474 append_char (char ch, struct growable *dest)
1475 {
1476   GROW (dest, 1);
1477   *TAIL (dest) = ch;
1478   TAIL_INCR (dest, 1);
1479 }
1480
1481 enum {
1482   filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
1483   filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
1484   filechr_control     = 4,      /* a control character, e.g. 0-31 */
1485 };
1486
1487 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1488
1489 /* Shorthands for the table: */
1490 #define U filechr_not_unix
1491 #define W filechr_not_windows
1492 #define C filechr_control
1493
1494 #define UW U|W
1495 #define UWC U|W|C
1496
1497 /* Table of characters unsafe under various conditions (see above).
1498
1499    Arguably we could also claim `%' to be unsafe, since we use it as
1500    the escape character.  If we ever want to be able to reliably
1501    translate file name back to URL, this would become important
1502    crucial.  Right now, it's better to be minimal in escaping.  */
1503
1504 const static unsigned char filechr_table[256] =
1505 {
1506 UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
1507   C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
1508   C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
1509   C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
1510   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
1511   0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
1512   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
1513   0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
1514   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
1515   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
1516   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
1517   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
1518   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
1519   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
1520   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
1521   0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */
1522
1523   C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
1524   C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
1525   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1526   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1527
1528   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1529   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1530   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1531   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1532 };
1533 #undef U
1534 #undef W
1535 #undef C
1536 #undef UW
1537 #undef UWC
1538
1539 /* FN_PORT_SEP is the separator between host and port in file names
1540    for non-standard port numbers.  On Unix this is normally ':', as in
1541    "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
1542    because Windows can't handle ':' in file names.  */
1543 #define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
1544
1545 /* FN_QUERY_SEP is the separator between the file name and the URL
1546    query, normally '?'.  Since Windows cannot handle '?' as part of
1547    file name, we use '@' instead there.  */
1548 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1549
1550 /* Quote path element, characters in [b, e), as file name, and append
1551    the quoted string to DEST.  Each character is quoted as per
1552    file_unsafe_char and the corresponding table.  */
1553
1554 static void
1555 append_uri_pathel (const char *b, const char *e, struct growable *dest)
1556 {
1557   char *pathel;
1558   int pathlen;
1559
1560   const char *p;
1561   int quoted, outlen;
1562
1563   int mask;
1564   if (opt.restrict_files_os == restrict_unix)
1565     mask = filechr_not_unix;
1566   else
1567     mask = filechr_not_windows;
1568   if (opt.restrict_files_ctrl)
1569     mask |= filechr_control;
1570
1571   /* Copy [b, e) to PATHEL and URL-unescape it. */
1572   BOUNDED_TO_ALLOCA (b, e, pathel);
1573   url_unescape (pathel);
1574   pathlen = strlen (pathel);
1575
1576   /* Go through PATHEL and check how many characters we'll need to
1577      add for file quoting. */
1578   quoted = 0;
1579   for (p = pathel; *p; p++)
1580     if (FILE_CHAR_TEST (*p, mask))
1581       ++quoted;
1582
1583   /* p - pathel is the string length.  Each quoted char means two
1584      additional characters in the string, hence 2*quoted.  */
1585   outlen = (p - pathel) + (2 * quoted);
1586   GROW (dest, outlen);
1587
1588   if (!quoted)
1589     {
1590       /* If there's nothing to quote, we don't need to go through the
1591          string the second time.  */
1592       memcpy (TAIL (dest), pathel, outlen);
1593     }
1594   else
1595     {
1596       char *q = TAIL (dest);
1597       for (p = pathel; *p; p++)
1598         {
1599           if (!FILE_CHAR_TEST (*p, mask))
1600             *q++ = *p;
1601           else
1602             {
1603               unsigned char ch = *p;
1604               *q++ = '%';
1605               *q++ = XDIGIT_TO_XCHAR (ch >> 4);
1606               *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
1607             }
1608         }
1609       assert (q - TAIL (dest) == outlen);
1610     }
1611   TAIL_INCR (dest, outlen);
1612 }
1613
1614 /* Append to DEST the directory structure that corresponds the
1615    directory part of URL's path.  For example, if the URL is
1616    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1617
1618    Each path element ("dir1" and "dir2" in the above example) is
1619    examined, url-unescaped, and re-escaped as file name element.
1620
1621    Additionally, it cuts as many directories from the path as
1622    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1623    will produce "bar" for the above example.  For 2 or more, it will
1624    produce "".
1625
1626    Each component of the path is quoted for use as file name.  */
1627
1628 static void
1629 append_dir_structure (const struct url *u, struct growable *dest)
1630 {
1631   char *pathel, *next;
1632   int cut = opt.cut_dirs;
1633
1634   /* Go through the path components, de-URL-quote them, and quote them
1635      (if necessary) as file names.  */
1636
1637   pathel = u->path;
1638   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1639     {
1640       if (cut-- > 0)
1641         continue;
1642       if (pathel == next)
1643         /* Ignore empty pathels.  path_simplify should remove
1644            occurrences of "//" from the path, but it has special cases
1645            for starting / which generates an empty pathel here.  */
1646         continue;
1647
1648       if (dest->tail)
1649         append_char ('/', dest);
1650       append_uri_pathel (pathel, next, dest);
1651     }
1652 }
1653
1654 /* Return a unique file name that matches the given URL as good as
1655    possible.  Does not create directories on the file system.  */
1656
1657 char *
1658 url_file_name (const struct url *u)
1659 {
1660   struct growable fnres;
1661
1662   char *u_file, *u_query;
1663   char *fname, *unique;
1664
1665   fnres.base = NULL;
1666   fnres.size = 0;
1667   fnres.tail = 0;
1668
1669   /* Start with the directory prefix, if specified. */
1670   if (!DOTP (opt.dir_prefix))
1671     append_string (opt.dir_prefix, &fnres);
1672
1673   /* If "dirstruct" is turned on (typically the case with -r), add
1674      the host and port (unless those have been turned off) and
1675      directory structure.  */
1676   if (opt.dirstruct)
1677     {
1678       if (opt.add_hostdir)
1679         {
1680           if (fnres.tail)
1681             append_char ('/', &fnres);
1682           append_string (u->host, &fnres);
1683           if (u->port != scheme_default_port (u->scheme))
1684             {
1685               char portstr[24];
1686               number_to_string (portstr, u->port);
1687               append_char (FN_PORT_SEP, &fnres);
1688               append_string (portstr, &fnres);
1689             }
1690         }
1691
1692       append_dir_structure (u, &fnres);
1693     }
1694
1695   /* Add the file name. */
1696   if (fnres.tail)
1697     append_char ('/', &fnres);
1698   u_file = *u->file ? u->file : "index.html";
1699   append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1700
1701   /* Append "?query" to the file name. */
1702   u_query = u->query && *u->query ? u->query : NULL;
1703   if (u_query)
1704     {
1705       append_char (FN_QUERY_SEP, &fnres);
1706       append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1707     }
1708
1709   /* Zero-terminate the file name. */
1710   append_char ('\0', &fnres);
1711
1712   fname = fnres.base;
1713
1714   /* Check the cases in which the unique extensions are not used:
1715      1) Clobbering is turned off (-nc).
1716      2) Retrieval with regetting.
1717      3) Timestamping is used.
1718      4) Hierarchy is built.
1719
1720      The exception is the case when file does exist and is a
1721      directory (see `mkalldirs' for explanation).  */
1722
1723   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1724       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1725     return fname;
1726
1727   unique = unique_name (fname, 1);
1728   if (unique != fname)
1729     xfree (fname);
1730   return unique;
1731 }
1732
1733 /* Return the length of URL's path.  Path is considered to be
1734    terminated by one of '?', ';', '#', or by the end of the
1735    string.  */
1736 static int
1737 path_length (const char *url)
1738 {
1739   const char *q = strpbrk_or_eos (url, "?;#");
1740   return q - url;
1741 }
1742
1743 /* Find the last occurrence of character C in the range [b, e), or
1744    NULL, if none are present.  This is equivalent to strrchr(b, c),
1745    except that it accepts an END argument instead of requiring the
1746    string to be zero-terminated.  Why is there no memrchr()?  */
1747 static const char *
1748 find_last_char (const char *b, const char *e, char c)
1749 {
1750   for (; e > b; e--)
1751     if (*e == c)
1752       return e;
1753   return NULL;
1754 }
1755 \f
1756 /* Resolve "." and ".." elements of PATH by destructively modifying
1757    PATH.  "." is resolved by removing that path element, and ".." is
1758    resolved by removing the preceding path element.  Leading and
1759    trailing slashes are preserved.
1760
1761    Return non-zero if any changes have been made.
1762
1763    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1764    test examples are provided below.  If you change anything in this
1765    function, run test_path_simplify to make sure you haven't broken a
1766    test case.
1767
1768    A previous version of this function was based on path_simplify()
1769    from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
1770
1771 static int
1772 path_simplify (char *path)
1773 {
1774   int change = 0;
1775   char *p, *end;
1776
1777   if (path[0] == '/')
1778     ++path;                     /* preserve the leading '/'. */
1779
1780   p = path;
1781   end = p + strlen (p) + 1;     /* position past the terminating zero. */
1782
1783   while (1)
1784     {
1785     again:
1786       /* P should point to the beginning of a path element. */
1787
1788       if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1789         {
1790           /* Handle "./foo" by moving "foo" two characters to the
1791              left. */
1792           if (*(p + 1) == '/')
1793             {
1794               change = 1;
1795               memmove (p, p + 2, end - p);
1796               end -= 2;
1797               goto again;
1798             }
1799           else
1800             {
1801               change = 1;
1802               *p = '\0';
1803               break;
1804             }
1805         }
1806       else if (*p == '.' && *(p + 1) == '.'
1807                && (*(p + 2) == '/' || *(p + 2) == '\0'))
1808         {
1809           /* Handle "../foo" by moving "foo" one path element to the
1810              left.  */
1811           char *b = p;          /* not p-1 because P can equal PATH */
1812
1813           /* Backtrack by one path element, but not past the beginning
1814              of PATH. */
1815
1816           /* foo/bar/../baz */
1817           /*         ^ p    */
1818           /*     ^ b        */
1819
1820           if (b > path)
1821             {
1822               /* Move backwards until B hits the beginning of the
1823                  previous path element or the beginning of path. */
1824               for (--b; b > path && *(b - 1) != '/'; b--)
1825                 ;
1826             }
1827
1828           change = 1;
1829           if (*(p + 2) == '/')
1830             {
1831               memmove (b, p + 3, end - (p + 3));
1832               end -= (p + 3) - b;
1833               p = b;
1834             }
1835           else
1836             {
1837               *b = '\0';
1838               break;
1839             }
1840
1841           goto again;
1842         }
1843       else if (*p == '/')
1844         {
1845           /* Remove empty path elements.  Not mandated by rfc1808 et
1846              al, but it seems like a good idea to get rid of them.
1847              Supporting them properly is hard (in which directory do
1848              you save http://x.com///y.html?) and they don't seem to
1849              bring much gain.  */
1850           char *q = p;
1851           while (*q == '/')
1852             ++q;
1853           change = 1;
1854           if (*q == '\0')
1855             {
1856               *p = '\0';
1857               break;
1858             }
1859           memmove (p, q, end - q);
1860           end -= q - p;
1861           goto again;
1862         }
1863
1864       /* Skip to the next path element. */
1865       while (*p && *p != '/')
1866         ++p;
1867       if (*p == '\0')
1868         break;
1869
1870       /* Make sure P points to the beginning of the next path element,
1871          which is location after the slash. */
1872       ++p;
1873     }
1874
1875   return change;
1876 }
1877 \f
1878 /* Resolve the result of "linking" a base URI (BASE) to a
1879    link-specified URI (LINK).
1880
1881    Either of the URIs may be absolute or relative, complete with the
1882    host name, or path only.  This tries to behave "reasonably" in all
1883    foreseeable cases.  It employs little specific knowledge about
1884    schemes or URL-specific stuff -- it just works on strings.
1885
1886    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1887    See uri_merge for a gentler interface to this functionality.
1888
1889    Perhaps this function should call path_simplify so that the callers
1890    don't have to call url_parse unconditionally.  */
1891 static char *
1892 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1893 {
1894   char *constr;
1895
1896   if (no_scheme)
1897     {
1898       const char *end = base + path_length (base);
1899
1900       if (!*link)
1901         {
1902           /* Empty LINK points back to BASE, query string and all. */
1903           constr = xstrdup (base);
1904         }
1905       else if (*link == '?')
1906         {
1907           /* LINK points to the same location, but changes the query
1908              string.  Examples: */
1909           /* uri_merge("path",         "?new") -> "path?new"     */
1910           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1911           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1912           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1913           int baselength = end - base;
1914           constr = xmalloc (baselength + linklength + 1);
1915           memcpy (constr, base, baselength);
1916           memcpy (constr + baselength, link, linklength);
1917           constr[baselength + linklength] = '\0';
1918         }
1919       else if (*link == '#')
1920         {
1921           /* uri_merge("path",         "#new") -> "path#new"     */
1922           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1923           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1924           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1925           int baselength;
1926           const char *end1 = strchr (base, '#');
1927           if (!end1)
1928             end1 = base + strlen (base);
1929           baselength = end1 - base;
1930           constr = xmalloc (baselength + linklength + 1);
1931           memcpy (constr, base, baselength);
1932           memcpy (constr + baselength, link, linklength);
1933           constr[baselength + linklength] = '\0';
1934         }
1935       else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1936         {
1937           /* LINK begins with "//" and so is a net path: we need to
1938              replace everything after (and including) the double slash
1939              with LINK. */
1940
1941           /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1942           /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1943           /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1944
1945           int span;
1946           const char *slash;
1947           const char *start_insert;
1948
1949           /* Look for first slash. */
1950           slash = memchr (base, '/', end - base);
1951           /* If found slash and it is a double slash, then replace
1952              from this point, else default to replacing from the
1953              beginning.  */
1954           if (slash && *(slash + 1) == '/')
1955             start_insert = slash;
1956           else
1957             start_insert = base;
1958
1959           span = start_insert - base;
1960           constr = (char *)xmalloc (span + linklength + 1);
1961           if (span)
1962             memcpy (constr, base, span);
1963           memcpy (constr + span, link, linklength);
1964           constr[span + linklength] = '\0';
1965         }
1966       else if (*link == '/')
1967         {
1968           /* LINK is an absolute path: we need to replace everything
1969              after (and including) the FIRST slash with LINK.
1970
1971              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1972              "/qux/xyzzy", our result should be
1973              "http://host/qux/xyzzy".  */
1974           int span;
1975           const char *slash;
1976           const char *start_insert = NULL; /* for gcc to shut up. */
1977           const char *pos = base;
1978           int seen_slash_slash = 0;
1979           /* We're looking for the first slash, but want to ignore
1980              double slash. */
1981         again:
1982           slash = memchr (pos, '/', end - pos);
1983           if (slash && !seen_slash_slash)
1984             if (*(slash + 1) == '/')
1985               {
1986                 pos = slash + 2;
1987                 seen_slash_slash = 1;
1988                 goto again;
1989               }
1990
1991           /* At this point, SLASH is the location of the first / after
1992              "//", or the first slash altogether.  START_INSERT is the
1993              pointer to the location where LINK will be inserted.  When
1994              examining the last two examples, keep in mind that LINK
1995              begins with '/'. */
1996
1997           if (!slash && !seen_slash_slash)
1998             /* example: "foo" */
1999             /*           ^    */
2000             start_insert = base;
2001           else if (!slash && seen_slash_slash)
2002             /* example: "http://foo" */
2003             /*                     ^ */
2004             start_insert = end;
2005           else if (slash && !seen_slash_slash)
2006             /* example: "foo/bar" */
2007             /*           ^        */
2008             start_insert = base;
2009           else if (slash && seen_slash_slash)
2010             /* example: "http://something/" */
2011             /*                           ^  */
2012             start_insert = slash;
2013
2014           span = start_insert - base;
2015           constr = (char *)xmalloc (span + linklength + 1);
2016           if (span)
2017             memcpy (constr, base, span);
2018           if (linklength)
2019             memcpy (constr + span, link, linklength);
2020           constr[span + linklength] = '\0';
2021         }
2022       else
2023         {
2024           /* LINK is a relative URL: we need to replace everything
2025              after last slash (possibly empty) with LINK.
2026
2027              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2028              our result should be "whatever/foo/qux/xyzzy".  */
2029           int need_explicit_slash = 0;
2030           int span;
2031           const char *start_insert;
2032           const char *last_slash = find_last_char (base, end, '/');
2033           if (!last_slash)
2034             {
2035               /* No slash found at all.  Append LINK to what we have,
2036                  but we'll need a slash as a separator.
2037
2038                  Example: if base == "foo" and link == "qux/xyzzy", then
2039                  we cannot just append link to base, because we'd get
2040                  "fooqux/xyzzy", whereas what we want is
2041                  "foo/qux/xyzzy".
2042
2043                  To make sure the / gets inserted, we set
2044                  need_explicit_slash to 1.  We also set start_insert
2045                  to end + 1, so that the length calculations work out
2046                  correctly for one more (slash) character.  Accessing
2047                  that character is fine, since it will be the
2048                  delimiter, '\0' or '?'.  */
2049               /* example: "foo?..." */
2050               /*               ^    ('?' gets changed to '/') */
2051               start_insert = end + 1;
2052               need_explicit_slash = 1;
2053             }
2054           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2055             {
2056               /* example: http://host"  */
2057               /*                      ^ */
2058               start_insert = end + 1;
2059               need_explicit_slash = 1;
2060             }
2061           else
2062             {
2063               /* example: "whatever/foo/bar" */
2064               /*                        ^    */
2065               start_insert = last_slash + 1;
2066             }
2067
2068           span = start_insert - base;
2069           constr = (char *)xmalloc (span + linklength + 1);
2070           if (span)
2071             memcpy (constr, base, span);
2072           if (need_explicit_slash)
2073             constr[span - 1] = '/';
2074           if (linklength)
2075             memcpy (constr + span, link, linklength);
2076           constr[span + linklength] = '\0';
2077         }
2078     }
2079   else /* !no_scheme */
2080     {
2081       constr = strdupdelim (link, link + linklength);
2082     }
2083   return constr;
2084 }
2085
2086 /* Merge BASE with LINK and return the resulting URI.  This is an
2087    interface to uri_merge_1 that assumes that LINK is a
2088    zero-terminated string.  */
2089 char *
2090 uri_merge (const char *base, const char *link)
2091 {
2092   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
2093 }
2094 \f
2095 #define APPEND(p, s) do {                       \
2096   int len = strlen (s);                         \
2097   memcpy (p, s, len);                           \
2098   p += len;                                     \
2099 } while (0)
2100
2101 /* Use this instead of password when the actual password is supposed
2102    to be hidden.  We intentionally use a generic string without giving
2103    away the number of characters in the password, like previous
2104    versions did.  */
2105 #define HIDDEN_PASSWORD "*password*"
2106
2107 /* Recreate the URL string from the data in URL.
2108
2109    If HIDE is non-zero (as it is when we're calling this on a URL we
2110    plan to print, but not when calling it to canonicalize a URL for
2111    use within the program), password will be hidden.  Unsafe
2112    characters in the URL will be quoted.  */
2113
2114 char *
2115 url_string (const struct url *url, int hide_password)
2116 {
2117   int size;
2118   char *result, *p;
2119   char *quoted_user = NULL, *quoted_passwd = NULL;
2120
2121   int scheme_port  = supported_schemes[url->scheme].default_port;
2122   char *scheme_str = supported_schemes[url->scheme].leading_string;
2123   int fplen = full_path_length (url);
2124
2125   int brackets_around_host = 0;
2126
2127   assert (scheme_str != NULL);
2128
2129   /* Make sure the user name and password are quoted. */
2130   if (url->user)
2131     {
2132       quoted_user = url_escape_allow_passthrough (url->user);
2133       if (url->passwd)
2134         {
2135           if (hide_password)
2136             quoted_passwd = HIDDEN_PASSWORD;
2137           else
2138             quoted_passwd = url_escape_allow_passthrough (url->passwd);
2139         }
2140     }
2141
2142   if (strchr (url->host, ':'))
2143     brackets_around_host = 1;
2144
2145   size = (strlen (scheme_str)
2146           + strlen (url->host)
2147           + (brackets_around_host ? 2 : 0)
2148           + fplen
2149           + 1);
2150   if (url->port != scheme_port)
2151     size += 1 + numdigit (url->port);
2152   if (quoted_user)
2153     {
2154       size += 1 + strlen (quoted_user);
2155       if (quoted_passwd)
2156         size += 1 + strlen (quoted_passwd);
2157     }
2158
2159   p = result = xmalloc (size);
2160
2161   APPEND (p, scheme_str);
2162   if (quoted_user)
2163     {
2164       APPEND (p, quoted_user);
2165       if (quoted_passwd)
2166         {
2167           *p++ = ':';
2168           APPEND (p, quoted_passwd);
2169         }
2170       *p++ = '@';
2171     }
2172
2173   if (brackets_around_host)
2174     *p++ = '[';
2175   APPEND (p, url->host);
2176   if (brackets_around_host)
2177     *p++ = ']';
2178   if (url->port != scheme_port)
2179     {
2180       *p++ = ':';
2181       p = number_to_string (p, url->port);
2182     }
2183
2184   full_path_write (url, p);
2185   p += fplen;
2186   *p++ = '\0';
2187
2188   assert (p - result == size);
2189
2190   if (quoted_user && quoted_user != url->user)
2191     xfree (quoted_user);
2192   if (quoted_passwd && !hide_password
2193       && quoted_passwd != url->passwd)
2194     xfree (quoted_passwd);
2195
2196   return result;
2197 }
2198 \f
2199 /* Return the URL of the proxy appropriate for url U.  */
2200 char *
2201 getproxy (struct url *u)
2202 {
2203   char *proxy = NULL;
2204   char *rewritten_url;
2205   static char rewritten_storage[1024];
2206
2207   if (!opt.use_proxy)
2208     return NULL;
2209   if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2210     return NULL;
2211
2212   switch (u->scheme)
2213     {
2214     case SCHEME_HTTP:
2215       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2216       break;
2217 #ifdef HAVE_SSL
2218     case SCHEME_HTTPS:
2219       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2220       break;
2221 #endif
2222     case SCHEME_FTP:
2223       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2224       break;
2225     case SCHEME_INVALID:
2226       break;
2227     }
2228   if (!proxy || !*proxy)
2229     return NULL;
2230
2231   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
2232      getproxy() to return static storage. */
2233   rewritten_url = rewrite_shorthand_url (proxy);
2234   if (rewritten_url)
2235     {
2236       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2237       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2238       proxy = rewritten_storage;
2239     }
2240
2241   return proxy;
2242 }
2243
2244 /* Should a host be accessed through proxy, concerning no_proxy?  */
2245 int
2246 no_proxy_match (const char *host, const char **no_proxy)
2247 {
2248   if (!no_proxy)
2249     return 1;
2250   else
2251     return !sufmatch (no_proxy, host);
2252 }
2253 \f
2254 /* Support for converting links for local viewing in downloaded HTML
2255    files.  This should be moved to another file, because it has
2256    nothing to do with processing URLs.  */
2257
2258 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2259 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2260                                          const char *));
2261 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2262                                                       const char *, int));
2263 static char *local_quote_string PARAMS ((const char *));
2264
2265 /* Change the links in one HTML file.  LINKS is a list of links in the
2266    document, along with their positions and the desired direction of
2267    the conversion.  */
2268 void
2269 convert_links (const char *file, struct urlpos *links)
2270 {
2271   struct file_memory *fm;
2272   FILE *fp;
2273   const char *p;
2274   downloaded_file_t downloaded_file_return;
2275
2276   struct urlpos *link;
2277   int to_url_count = 0, to_file_count = 0;
2278
2279   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2280
2281   {
2282     /* First we do a "dry run": go through the list L and see whether
2283        any URL needs to be converted in the first place.  If not, just
2284        leave the file alone.  */
2285     int dry_count = 0;
2286     struct urlpos *dry = links;
2287     for (dry = links; dry; dry = dry->next)
2288       if (dry->convert != CO_NOCONVERT)
2289         ++dry_count;
2290     if (!dry_count)
2291       {
2292         logputs (LOG_VERBOSE, _("nothing to do.\n"));
2293         return;
2294       }
2295   }
2296
2297   fm = read_file (file);
2298   if (!fm)
2299     {
2300       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2301                  file, strerror (errno));
2302       return;
2303     }
2304
2305   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2306   if (opt.backup_converted && downloaded_file_return)
2307     write_backup_file (file, downloaded_file_return);
2308
2309   /* Before opening the file for writing, unlink the file.  This is
2310      important if the data in FM is mmaped.  In such case, nulling the
2311      file, which is what fopen() below does, would make us read all
2312      zeroes from the mmaped region.  */
2313   if (unlink (file) < 0 && errno != ENOENT)
2314     {
2315       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2316                  file, strerror (errno));
2317       read_file_free (fm);
2318       return;
2319     }
2320   /* Now open the file for writing.  */
2321   fp = fopen (file, "wb");
2322   if (!fp)
2323     {
2324       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2325                  file, strerror (errno));
2326       read_file_free (fm);
2327       return;
2328     }
2329
2330   /* Here we loop through all the URLs in file, replacing those of
2331      them that are downloaded with relative references.  */
2332   p = fm->content;
2333   for (link = links; link; link = link->next)
2334     {
2335       char *url_start = fm->content + link->pos;
2336
2337       if (link->pos >= fm->length)
2338         {
2339           DEBUGP (("Something strange is going on.  Please investigate."));
2340           break;
2341         }
2342       /* If the URL is not to be converted, skip it.  */
2343       if (link->convert == CO_NOCONVERT)
2344         {
2345           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2346           continue;
2347         }
2348
2349       /* Echo the file contents, up to the offending URL's opening
2350          quote, to the outfile.  */
2351       fwrite (p, 1, url_start - p, fp);
2352       p = url_start;
2353
2354       switch (link->convert)
2355         {
2356         case CO_CONVERT_TO_RELATIVE:
2357           /* Convert absolute URL to relative. */
2358           {
2359             char *newname = construct_relative (file, link->local_name);
2360             char *quoted_newname = local_quote_string (newname);
2361
2362             if (!link->link_refresh_p)
2363               p = replace_attr (p, link->size, fp, quoted_newname);
2364             else
2365               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2366                                              link->refresh_timeout);
2367
2368             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2369                      link->url->url, newname, link->pos, file));
2370             xfree (newname);
2371             xfree (quoted_newname);
2372             ++to_file_count;
2373             break;
2374           }
2375         case CO_CONVERT_TO_COMPLETE:
2376           /* Convert the link to absolute URL. */
2377           {
2378             char *newlink = link->url->url;
2379             char *quoted_newlink = html_quote_string (newlink);
2380
2381             if (!link->link_refresh_p)
2382               p = replace_attr (p, link->size, fp, quoted_newlink);
2383             else
2384               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2385                                              link->refresh_timeout);
2386
2387             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2388                      newlink, link->pos, file));
2389             xfree (quoted_newlink);
2390             ++to_url_count;
2391             break;
2392           }
2393         case CO_NULLIFY_BASE:
2394           /* Change the base href to "". */
2395           p = replace_attr (p, link->size, fp, "");
2396           break;
2397         case CO_NOCONVERT:
2398           abort ();
2399           break;
2400         }
2401     }
2402
2403   /* Output the rest of the file. */
2404   if (p - fm->content < fm->length)
2405     fwrite (p, 1, fm->length - (p - fm->content), fp);
2406   fclose (fp);
2407   read_file_free (fm);
2408
2409   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2410 }
2411
2412 /* Construct and return a malloced copy of the relative link from two
2413    pieces of information: local name S1 of the referring file and
2414    local name S2 of the referred file.
2415
2416    So, if S1 is "jagor.srce.hr/index.html" and S2 is
2417    "jagor.srce.hr/images/news.gif", the function will return
2418    "images/news.gif".
2419
2420    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2421    "fly.cc.fer.hr/images/fly.gif", the function will return
2422    "../images/fly.gif".
2423
2424    Caveats: S1 should not begin with `/', unless S2 also begins with
2425    '/'.  S1 should not contain things like ".." and such --
2426    construct_relative ("fly/ioccc/../index.html",
2427    "fly/images/fly.gif") will fail.  (A workaround is to call
2428    something like path_simplify() on S1).  */
2429 static char *
2430 construct_relative (const char *s1, const char *s2)
2431 {
2432   int i, cnt, sepdirs1;
2433   char *res;
2434
2435   if (*s2 == '/')
2436     return xstrdup (s2);
2437   /* S1 should *not* be absolute, if S2 wasn't.  */
2438   assert (*s1 != '/');
2439   i = cnt = 0;
2440   /* Skip the directories common to both strings.  */
2441   while (1)
2442     {
2443       while (s1[i] && s2[i]
2444              && (s1[i] == s2[i])
2445              && (s1[i] != '/')
2446              && (s2[i] != '/'))
2447         ++i;
2448       if (s1[i] == '/' && s2[i] == '/')
2449         cnt = ++i;
2450       else
2451         break;
2452     }
2453   for (sepdirs1 = 0; s1[i]; i++)
2454     if (s1[i] == '/')
2455       ++sepdirs1;
2456   /* Now, construct the file as of:
2457      - ../ repeated sepdirs1 time
2458      - all the non-mutual directories of S2.  */
2459   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2460   for (i = 0; i < sepdirs1; i++)
2461     memcpy (res + 3 * i, "../", 3);
2462   strcpy (res + 3 * i, s2 + cnt);
2463   return res;
2464 }
2465 \f
2466 static void
2467 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2468 {
2469   /* Rather than just writing over the original .html file with the
2470      converted version, save the former to *.orig.  Note we only do
2471      this for files we've _successfully_ downloaded, so we don't
2472      clobber .orig files sitting around from previous invocations. */
2473
2474   /* Construct the backup filename as the original name plus ".orig". */
2475   size_t         filename_len = strlen(file);
2476   char*          filename_plus_orig_suffix;
2477   boolean        already_wrote_backup_file = FALSE;
2478   slist*         converted_file_ptr;
2479   static slist*  converted_files = NULL;
2480
2481   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2482     {
2483       /* Just write "orig" over "html".  We need to do it this way
2484          because when we're checking to see if we've downloaded the
2485          file before (to see if we can skip downloading it), we don't
2486          know if it's a text/html file.  Therefore we don't know yet
2487          at that stage that -E is going to cause us to tack on
2488          ".html", so we need to compare vs. the original URL plus
2489          ".orig", not the original URL plus ".html.orig". */
2490       filename_plus_orig_suffix = alloca (filename_len + 1);
2491       strcpy(filename_plus_orig_suffix, file);
2492       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2493     }
2494   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2495     {
2496       /* Append ".orig" to the name. */
2497       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2498       strcpy(filename_plus_orig_suffix, file);
2499       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2500     }
2501
2502   /* We can get called twice on the same URL thanks to the
2503      convert_all_links() call in main().  If we write the .orig file
2504      each time in such a case, it'll end up containing the first-pass
2505      conversion, not the original file.  So, see if we've already been
2506      called on this file. */
2507   converted_file_ptr = converted_files;
2508   while (converted_file_ptr != NULL)
2509     if (strcmp(converted_file_ptr->string, file) == 0)
2510       {
2511         already_wrote_backup_file = TRUE;
2512         break;
2513       }
2514     else
2515       converted_file_ptr = converted_file_ptr->next;
2516
2517   if (!already_wrote_backup_file)
2518     {
2519       /* Rename <file> to <file>.orig before former gets written over. */
2520       if (rename(file, filename_plus_orig_suffix) != 0)
2521         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2522                    file, filename_plus_orig_suffix, strerror (errno));
2523
2524       /* Remember that we've already written a .orig backup for this file.
2525          Note that we never free this memory since we need it till the
2526          convert_all_links() call, which is one of the last things the
2527          program does before terminating.  BTW, I'm not sure if it would be
2528          safe to just set 'converted_file_ptr->string' to 'file' below,
2529          rather than making a copy of the string...  Another note is that I
2530          thought I could just add a field to the urlpos structure saying
2531          that we'd written a .orig file for this URL, but that didn't work,
2532          so I had to make this separate list.
2533          -- Dan Harkless <wget@harkless.org>
2534
2535          This [adding a field to the urlpos structure] didn't work
2536          because convert_file() is called from convert_all_links at
2537          the end of the retrieval with a freshly built new urlpos
2538          list.
2539          -- Hrvoje Niksic <hniksic@arsdigita.com>
2540       */
2541       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2542       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2543       converted_file_ptr->next = converted_files;
2544       converted_files = converted_file_ptr;
2545     }
2546 }
2547
2548 static int find_fragment PARAMS ((const char *, int, const char **,
2549                                   const char **));
2550
2551 /* Replace an attribute's original text with NEW_TEXT. */
2552
2553 static const char *
2554 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2555 {
2556   int quote_flag = 0;
2557   char quote_char = '\"';       /* use "..." for quoting, unless the
2558                                    original value is quoted, in which
2559                                    case reuse its quoting char. */
2560   const char *frag_beg, *frag_end;
2561
2562   /* Structure of our string is:
2563        "...old-contents..."
2564        <---    size    --->  (with quotes)
2565      OR:
2566        ...old-contents...
2567        <---    size   -->    (no quotes)   */
2568
2569   if (*p == '\"' || *p == '\'')
2570     {
2571       quote_char = *p;
2572       quote_flag = 1;
2573       ++p;
2574       size -= 2;                /* disregard opening and closing quote */
2575     }
2576   putc (quote_char, fp);
2577   fputs (new_text, fp);
2578
2579   /* Look for fragment identifier, if any. */
2580   if (find_fragment (p, size, &frag_beg, &frag_end))
2581     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2582   p += size;
2583   if (quote_flag)
2584     ++p;
2585   putc (quote_char, fp);
2586
2587   return p;
2588 }
2589
2590 /* The same as REPLACE_ATTR, but used when replacing
2591    <meta http-equiv=refresh content="new_text"> because we need to
2592    append "timeout_value; URL=" before the next_text.  */
2593
2594 static const char *
2595 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2596                            const char *new_text, int timeout)
2597 {
2598   /* "0; URL=..." */
2599   char *new_with_timeout = (char *)alloca (numdigit (timeout)
2600                                            + 6 /* "; URL=" */
2601                                            + strlen (new_text)
2602                                            + 1);
2603   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2604
2605   return replace_attr (p, size, fp, new_with_timeout);
2606 }
2607
2608 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2609    preceded by '&'.  If the character is not found, return zero.  If
2610    the character is found, return 1 and set BP and EP to point to the
2611    beginning and end of the region.
2612
2613    This is used for finding the fragment indentifiers in URLs.  */
2614
2615 static int
2616 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2617 {
2618   const char *end = beg + size;
2619   int saw_amp = 0;
2620   for (; beg < end; beg++)
2621     {
2622       switch (*beg)
2623         {
2624         case '&':
2625           saw_amp = 1;
2626           break;
2627         case '#':
2628           if (!saw_amp)
2629             {
2630               *bp = beg;
2631               *ep = end;
2632               return 1;
2633             }
2634           /* fallthrough */
2635         default:
2636           saw_amp = 0;
2637         }
2638     }
2639   return 0;
2640 }
2641
2642 /* Quote FILE for use as local reference to an HTML file.
2643
2644    We quote ? as %3F to avoid passing part of the file name as the
2645    parameter when browsing the converted file through HTTP.  However,
2646    it is safe to do this only when `--html-extension' is turned on.
2647    This is because converting "index.html?foo=bar" to
2648    "index.html%3Ffoo=bar" would break local browsing, as the latter
2649    isn't even recognized as an HTML file!  However, converting
2650    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2651    safe for both local and HTTP-served browsing.  */
2652
2653 static char *
2654 local_quote_string (const char *file)
2655 {
2656   const char *file_sans_qmark;
2657   int qm;
2658
2659   if (!opt.html_extension)
2660     return html_quote_string (file);
2661
2662   qm = count_char (file, '?');
2663
2664   if (qm)
2665     {
2666       const char *from = file;
2667       char *to, *newname;
2668
2669       /* qm * 2 because we replace each question mark with "%3F",
2670          i.e. replace one char with three, hence two more.  */
2671       int fsqlen = strlen (file) + qm * 2;
2672
2673       to = newname = (char *)alloca (fsqlen + 1);
2674       for (; *from; from++)
2675         {
2676           if (*from != '?')
2677             *to++ = *from;
2678           else
2679             {
2680               *to++ = '%';
2681               *to++ = '3';
2682               *to++ = 'F';
2683             }
2684         }
2685       assert (to - newname == fsqlen);
2686       *to = '\0';
2687
2688       file_sans_qmark = newname;
2689     }
2690   else
2691     file_sans_qmark = file;
2692
2693   return html_quote_string (file_sans_qmark);
2694 }
2695
2696 /* We're storing "modes" of type downloaded_file_t in the hash table.
2697    However, our hash tables only accept pointers for keys and values.
2698    So when we need a pointer, we use the address of a
2699    downloaded_file_t variable of static storage.  */
2700
2701 static downloaded_file_t *
2702 downloaded_mode_to_ptr (downloaded_file_t mode)
2703 {
2704   static downloaded_file_t
2705     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2706     v2 = FILE_DOWNLOADED_NORMALLY,
2707     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2708     v4 = CHECK_FOR_FILE;
2709
2710   switch (mode)
2711     {
2712     case FILE_NOT_ALREADY_DOWNLOADED:
2713       return &v1;
2714     case FILE_DOWNLOADED_NORMALLY:
2715       return &v2;
2716     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2717       return &v3;
2718     case CHECK_FOR_FILE:
2719       return &v4;
2720     }
2721   return NULL;
2722 }
2723
2724 /* This should really be merged with dl_file_url_map and
2725    downloaded_html_files in recur.c.  This was originally a list, but
2726    I changed it to a hash table beause it was actually taking a lot of
2727    time to find things in it.  */
2728
2729 static struct hash_table *downloaded_files_hash;
2730
2731 /* Remembers which files have been downloaded.  In the standard case, should be
2732    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2733    download successfully (i.e. not for ones we have failures on or that we skip
2734    due to -N).
2735
2736    When we've downloaded a file and tacked on a ".html" extension due to -E,
2737    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2738    FILE_DOWNLOADED_NORMALLY.
2739
2740    If you just want to check if a file has been previously added without adding
2741    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2742    with local filenames, not remote URLs. */
2743 downloaded_file_t
2744 downloaded_file (downloaded_file_t mode, const char *file)
2745 {
2746   downloaded_file_t *ptr;
2747
2748   if (mode == CHECK_FOR_FILE)
2749     {
2750       if (!downloaded_files_hash)
2751         return FILE_NOT_ALREADY_DOWNLOADED;
2752       ptr = hash_table_get (downloaded_files_hash, file);
2753       if (!ptr)
2754         return FILE_NOT_ALREADY_DOWNLOADED;
2755       return *ptr;
2756     }
2757
2758   if (!downloaded_files_hash)
2759     downloaded_files_hash = make_string_hash_table (0);
2760
2761   ptr = hash_table_get (downloaded_files_hash, file);
2762   if (ptr)
2763     return *ptr;
2764
2765   ptr = downloaded_mode_to_ptr (mode);
2766   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2767
2768   return FILE_NOT_ALREADY_DOWNLOADED;
2769 }
2770
2771 static int
2772 df_free_mapper (void *key, void *value, void *ignored)
2773 {
2774   xfree (key);
2775   return 0;
2776 }
2777
2778 void
2779 downloaded_files_free (void)
2780 {
2781   if (downloaded_files_hash)
2782     {
2783       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2784       hash_table_destroy (downloaded_files_hash);
2785       downloaded_files_hash = NULL;
2786     }
2787 }
2788
2789 /* Return non-zero if scheme a is similar to scheme b.
2790
2791    Schemes are similar if they are equal.  If SSL is supported, schemes
2792    are also similar if one is http (SCHEME_HTTP) and the other is https
2793    (SCHEME_HTTPS).  */
2794 int
2795 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2796 {
2797   if (a == b)
2798     return 1;
2799 #ifdef HAVE_SSL
2800   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2801       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2802     return 1;
2803 #endif
2804   return 0;
2805 }
2806 \f
2807 #if 0
2808 /* Debugging and testing support for path_simplify. */
2809
2810 /* Debug: run path_simplify on PATH and return the result in a new
2811    string.  Useful for calling from the debugger.  */
2812 static char *
2813 ps (char *path)
2814 {
2815   char *copy = xstrdup (path);
2816   path_simplify (copy);
2817   return copy;
2818 }
2819
2820 static void
2821 run_test (char *test, char *expected_result, int expected_change)
2822 {
2823   char *test_copy = xstrdup (test);
2824   int modified = path_simplify (test_copy);
2825
2826   if (0 != strcmp (test_copy, expected_result))
2827     {
2828       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2829               test, expected_result, test_copy);
2830     }
2831   if (modified != expected_change)
2832     {
2833       if (expected_change == 1)
2834         printf ("Expected no modification with path_simplify(\"%s\").\n",
2835                 test);
2836       else
2837         printf ("Expected modification with path_simplify(\"%s\").\n",
2838                 test);
2839     }
2840   xfree (test_copy);
2841 }
2842
2843 static void
2844 test_path_simplify (void)
2845 {
2846   static struct {
2847     char *test, *result;
2848     int should_modify;
2849   } tests[] = {
2850     { "",               "",             0 },
2851     { ".",              "",             1 },
2852     { "..",             "",             1 },
2853     { "foo",            "foo",          0 },
2854     { "foo/bar",        "foo/bar",      0 },
2855     { "foo///bar",      "foo/bar",      1 },
2856     { "foo/.",          "foo/",         1 },
2857     { "foo/./",         "foo/",         1 },
2858     { "foo./",          "foo./",        0 },
2859     { "foo/../bar",     "bar",          1 },
2860     { "foo/../bar/",    "bar/",         1 },
2861     { "foo/bar/..",     "foo/",         1 },
2862     { "foo/bar/../x",   "foo/x",        1 },
2863     { "foo/bar/../x/",  "foo/x/",       1 },
2864     { "foo/..",         "",             1 },
2865     { "foo/../..",      "",             1 },
2866     { "a/b/../../c",    "c",            1 },
2867     { "./a/../b",       "b",            1 }
2868   };
2869   int i;
2870
2871   for (i = 0; i < ARRAY_SIZE (tests); i++)
2872     {
2873       char *test = tests[i].test;
2874       char *expected_result = tests[i].result;
2875       int   expected_change = tests[i].should_modify;
2876       run_test (test, expected_result, expected_change);
2877     }
2878
2879   /* Now run all the tests with a leading slash before the test case,
2880      to prove that the slash is being preserved.  */
2881   for (i = 0; i < ARRAY_SIZE (tests); i++)
2882     {
2883       char *test, *expected_result;
2884       int expected_change = tests[i].should_modify;
2885
2886       test = xmalloc (1 + strlen (tests[i].test) + 1);
2887       sprintf (test, "/%s", tests[i].test);
2888
2889       expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2890       sprintf (expected_result, "/%s", tests[i].result);
2891
2892       run_test (test, expected_result, expected_change);
2893
2894       xfree (test);
2895       xfree (expected_result);
2896     }
2897 }
2898 #endif