2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
/* NOTE(review): the numbers embedded at the start of each line indicate
   that many original source lines are elided from this view; all code
   below is kept verbatim.  */
/* DOTP(x): non-zero iff string X is exactly ".".  */
58 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* DDOTP(x): non-zero iff string X is exactly "..".  */
60 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Byte sizes used by the IP-address validators below (IPv4 address,
   IPv6 address, and one 16-bit IPv6 group, respectively).  */
62 static const int NS_INADDRSZ = 4;
63 static const int NS_IN6ADDRSZ = 16;
64 static const int NS_INT16SZ = 2;
74 /* Supported schemes: */
/* Each entry: leading string, default port, enabled flag.  The table
   is indexed by enum url_scheme; scheme_disable() clears the flag.
   NOTE(review): the struct declaration and array braces are elided
   from this view.  */
75 static struct scheme_data supported_schemes[] =
77   { "http://", DEFAULT_HTTP_PORT, 1 },
79   { "https://", DEFAULT_HTTPS_PORT, 1 },
81   { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
/* PARAMS is the pre-ANSI prototype-compatibility wrapper used
   throughout Wget.  */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings.  We determine
95    whether a character is unsafe through static table lookup.  This
96    code assumes ASCII character set and 8-bit chars.  */
99 /* rfc1738 reserved chars, preserved from encoding.  */
102 /* rfc1738 unsafe chars, plus some more.  */
/* urlchr_test: true iff character C has any of the bits in MASK set
   in urlchr_table.  The (unsigned char) cast guards against negative
   plain-char values being used as an index.  */
106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
107 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
108 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
110 /* Shorthands for the table: */
111 #define R urlchr_reserved
112 #define U urlchr_unsafe
/* Per-character classification table: U = unsafe (must be %-escaped),
   R = reserved (has URL syntax meaning, preserved from escaping),
   RU = both.  Rows of eight entries follow the ASCII order shown in
   the trailing comments; everything >= 0x80 is unsafe.  */
115 const static unsigned char urlchr_table[256] =
117   U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
118   U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
119   U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
120   U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
121   U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
122   0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
123   0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
124   0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
125   RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
126   0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
127   0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
128   0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
129   U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
130   0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
131   0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
132   0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
134   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
135   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
136   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
137   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
139   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 /* URL-unescape the string S.
150    This is done by transforming the sequences "%HH" to the character
151    represented by the hexadecimal digits HH.  If % is not followed by
152    two hexadecimal digits, it is inserted literally.
154    The transformation is done in place.  If you need the original
155    string intact, make a copy before calling this function.  */
/* Two-pointer in-place rewrite: H scans ahead, T writes the (never
   longer) decoded result.  NOTE(review): the surrounding loop and the
   copy/advance statements are elided from this view.  */
158 url_unescape (char *s)
160   char *t = s;			/* t - tortoise */
161   char *h = s;			/* h - hare     */
172 	  /* Do nothing if '%' is not followed by two hex digits. */
173 	  if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
/* Decode %HH into the single byte it represents.  */
175 	  *t = X2DIGITS_TO_NUM (h[1], h[2]);
182 /* The core of url_escape_* functions.  Escapes the characters that
183    match the provided mask in urlchr_table.
185    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
186    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
187    freshly allocated string will be returned in all cases.  */
190 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
/* Pass 1: count how many characters need quoting so the output can be
   allocated exactly (each quoted char becomes "%XX", i.e. +2).  */
197   for (p1 = s; *p1; p1++)
198     if (urlchr_test (*p1, mask))
199       addition += 2;		/* Two more characters (hex digits) */
/* Nothing to escape: either hand back S itself or a fresh copy,
   depending on ALLOW_PASSTHROUGH.  Caller must know which case it is
   before freeing.  */
202     return allow_passthrough ? (char *)s : xstrdup (s);
204   newlen = (p1 - s) + addition;
205   newstr = (char *)xmalloc (newlen + 1);
/* Pass 2 (loop elided from this view): copy, expanding matches.  */
211       /* Quote the characters that match the test mask. */
212       if (urlchr_test (*p1, mask))
214 	  unsigned char c = *p1++;
216 	  *p2++ = XNUM_TO_digit (c >> 4);
217 	  *p2++ = XNUM_TO_digit (c & 0xf);
222   assert (p2 - newstr == newlen);
228 /* URL-escape the unsafe characters (see urlchr_table) in a given
229    string, returning a freshly allocated string.  */
/* Caller owns and must free the result.  */
232 url_escape (const char *s)
234   return url_escape_1 (s, urlchr_unsafe, 0);
237 /* URL-escape the unsafe characters (see urlchr_table) in a given
238    string.  If no characters are unsafe, S is returned.  */
/* Caller must compare the result against S to know whether to free.  */
241 url_escape_allow_passthrough (const char *s)
243   return url_escape_1 (s, urlchr_unsafe, 1);
246 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
248 /* Decide whether to encode, decode, or pass through the char at P.
249    This used to be a macro, but it got a little too convoluted.  */
250 static inline enum copy_method
251 decide_copy_method (const char *p)
/* NOTE(review): the leading test for '%' at *p is elided from this
   view; the branch below handles the char after a '%'.  */
255       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
257 	  /* %xx sequence: decode it, unless it would decode to an
258 	     unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %xx would decode to.  */
260 	  char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
261 	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
262 	    return CM_PASSTHROUGH;
267 	  /* Garbled %.. sequence: encode `%'. */
/* Ordinary char: encode only if unsafe and not reserved.  */
270   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
273     return CM_PASSTHROUGH;
276 /* Translate a %-escaped (but possibly non-conformant) input string S
277    into a %-escaped (and conformant) output string.  If no characters
278    are encoded or decoded, return the same string S; otherwise, return
279    a freshly allocated string with the new contents.
281    After a URL has been run through this function, the protocols that
282    use `%' as the quote character can use the resulting string as-is,
283    while those that don't call url_unescape() to get to the intended
284    data.  This function is also stable: after an input string is
285    transformed the first time, all further transformations of the
286    result yield the same result string.
288    Let's discuss why this function is needed.
290    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
291    space character would mess up the HTTP request, it needs to be
294    GET /abc%20def HTTP/1.0
296    It appears that the unsafe chars need to be quoted, for example
297    with url_escape.  But what if we're requested to download
298    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
299    us with `abc%2520def'.  This is incorrect -- since %-escapes are
300    part of URL syntax, "%20" is the correct way to denote a literal
301    space on the Wget command line.  This leaves us in the conclusion
302    that in that case Wget should not call url_escape, but leave the
305    And what if the requested URI is `abc%20 def'?  If we call
306    url_escape, we end up with `/abc%2520%20def', which is almost
307    certainly not intended.  If we don't call url_escape, we are left
308    with the embedded space and cannot complete the request.  What the
309    user meant was for Wget to request `/abc%20%20def', and this is
310    where reencode_escapes kicks in.
312    Wget used to solve this by first decoding %-quotes, and then
313    encoding all the "unsafe" characters found in the resulting string.
314    This was wrong because it didn't preserve certain URL special
315    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
316    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
317    whether we considered `+' reserved (it is).  One of these results
318    is inevitable because by the second step we would lose information
319    on whether the `+' was originally encoded or not.  Both results
320    were wrong because in CGI parameters + means space, while %2B means
321    literal plus.  reencode_escapes correctly translates the above to
322    "a%2B+b", i.e. returns the original string.
324    This function uses an algorithm proposed by Anon Sricharoenchai:
326    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
329    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
332    ...except that this code conflates the two steps, and decides
333    whether to encode, decode, or pass through each character in turn.
334    The function still uses two passes, but their logic is the same --
335    the first pass exists merely for the sake of allocation.  Another
336    small difference is that we include `+' to URL_RESERVED.
340    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
342    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
346    "foo bar"         -> "foo%20bar"
347    "foo%20bar"       -> "foo%20bar"
348    "foo %20bar"      -> "foo%20%20bar"
349    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
350    "foo%25%20bar"    -> "foo%25%20bar"
351    "foo%2%20bar"     -> "foo%252%20bar"
352    "foo+bar"         -> "foo+bar"            (plus is reserved!)
353    "foo%2b+bar"      -> "foo%2b+bar"  */
356 reencode_escapes (const char *s)
362   int encode_count = 0;
363   int decode_count = 0;
365   /* First, pass through the string to see if there's anything to do,
366      and to calculate the new length.  */
367   for (p1 = s; *p1; p1++)
/* NOTE(review): the switch arms counting encodes/decodes are elided
   from this view.  */
369       switch (decide_copy_method (p1))
382   if (!encode_count && !decode_count)
383     /* The string is good as it is. */
384     return (char *)s;		/* C const model sucks. */
387   /* Each encoding adds two characters (hex digits), while each
388      decoding removes two characters.  */
389   newlen = oldlen + 2 * (encode_count - decode_count);
390   newstr = xmalloc (newlen + 1);
/* Second pass: perform the per-character decision for real.  */
397       switch (decide_copy_method (p1))
/* CM_ENCODE: expand the byte at *p1 into "%XX".  */
401 	    unsigned char c = *p1++;
403 	    *p2++ = XNUM_TO_DIGIT (c >> 4);
404 	    *p2++ = XNUM_TO_DIGIT (c & 0xf);
/* CM_DECODE: collapse "%XX" back into one byte.  */
408 	  *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
409 	  p1 += 3;		/* skip %xx */
416   assert (p2 - newstr == newlen);
420 /* Returns the scheme type if the scheme is supported, or
421    SCHEME_INVALID if not.  */
/* Case-insensitive prefix match against supported_schemes; a matched
   but disabled scheme is also reported as SCHEME_INVALID.  */
423 url_scheme (const char *url)
427   for (i = 0; supported_schemes[i].leading_string; i++)
428     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
429 			  strlen (supported_schemes[i].leading_string)))
431 	if (supported_schemes[i].enabled)
/* The table index doubles as the enum value.  */
432 	  return (enum url_scheme) i;
434 	  return SCHEME_INVALID;
437   return SCHEME_INVALID;
440 /* Return the number of characters needed to skip the scheme part of
441    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
443 url_skip_scheme (const char *url)
447   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
449   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* NOTE(review): the check for the ':' after the scheme name is elided
   from this view.  */
456   /* Skip "//" if found.  */
457   if (*p == '/' && *(p + 1) == '/')
463 /* Returns 1 if the URL begins with a scheme (supported or
464    unsupported), 0 otherwise.  */
466 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme; the trailing ':'
   test is elided from this view.  */
469   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, per the supported_schemes
   table.  */
475 scheme_default_port (enum url_scheme scheme)
477   return supported_schemes[scheme].default_port;
/* Mark SCHEME as unsupported; url_scheme() will then report URLs
   using it as SCHEME_INVALID.  */
481 scheme_disable (enum url_scheme scheme)
483   supported_schemes[scheme].enabled = 0;
486 /* Skip the username and password, if present here.  The function
487    should be called *not* with the complete URL, but with the part
488    right after the scheme.
490    If no username and password are found, return 0.  */
492 url_skip_uname (const char *url)
496   /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds the first of '/', '?' or '@'; only a leading '@'
   (before any path/query) delimits user:pass.  Return-value logic is
   elided from this view.  */
497   p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte region STR ("user" or "user:password") into
   freshly allocated, URL-unescaped *USER and *PASSWD.  Returns
   non-zero on success (return statements elided from this view).  */
505 parse_uname (const char *str, int len, char **user, char **passwd)
510   /* Empty user name not allowed.  */
513   colon = memchr (str, ':', len);
515       /* Empty user name again.  */
/* Copy everything after the colon as the password.  */
520 	  int pwlen = len - (colon + 1 - str);
521 	  *passwd = xmalloc (pwlen + 1);
522 	  memcpy (*passwd, colon + 1, pwlen);
523 	  (*passwd)[pwlen] = '\0';
/* Copy the user name (LEN has been adjusted upstream if a colon was
   found; adjustment elided from this view).  */
529   *user = xmalloc (len + 1);
530   memcpy (*user, str, len);
/* Both components arrive %-escaped; decode them in place.  */
534   url_unescape (*user);
536     url_unescape (*passwd);
541 /* Used by main.c: detect URLs written using the "shorthand" URL forms
542    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
544    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
545    www.foo.com[:port]            -> http://www.foo.com[:port]
547    FTP shorthands look like this:
549    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
550    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
552    If the URL needs not or cannot be rewritten, return NULL.  */
554 rewrite_shorthand_url (const char *url)
/* A URL that already has a scheme needs no rewriting.  */
558   if (url_has_scheme (url))
561   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
563   for (p = url; *p && *p != ':' && *p != '/'; p++)
573       /* If the characters after the colon and before the next slash
574 	 or end of string are all digits, it's HTTP.  */
576       for (pp = p + 1; ISDIGIT (*pp); pp++)
578       if (digits > 0 && (*pp == '/' || *pp == '\0'))
/* Non-numeric after ':': treat as NcFTP "host:path" shorthand.  */
581       /* Prepend "ftp://" to the entire URL... */
582       res = xmalloc (6 + strlen (url) + 1);
583       sprintf (res, "ftp://%s", url);
584       /* ...and replace ':' with '/'. */
585       res[6 + (p - url)] = '/';
592   /* Just prepend "http://" to what we have. */
593   res = xmalloc (7 + strlen (url) + 1);
594   sprintf (res, "http://%s", url);
599 static void parse_path PARAMS ((const char *, char **, char **));
601 /* Like strpbrk, with the exception that it returns the pointer to the
602    terminating zero (end-of-string aka "eos") if no matching character
605    Although I normally balk at Gcc-specific optimizations, it probably
606    makes sense here: glibc has optimizations that detect strpbrk being
607    called with literal string as ACCEPT and inline the search.  That
608    optimization is defeated if strpbrk is hidden within the call to
609    another function.  (And no, making strpbrk_or_eos inline doesn't
610    help because the check for literal accept is in the
/* GCC build: statement-expression macro so the strpbrk call stays
   visible to the compiler at each use site.  */
615 #define strpbrk_or_eos(s, accept) ({		\
616   char *SOE_p = strpbrk (s, accept);		\
618     SOE_p = (char *)s + strlen (s);		\
622 #else  /* not __GNUC__ */
/* Portable fallback: a plain function with identical semantics.  */
625 strpbrk_or_eos (const char *s, const char *accept)
627   char *p = strpbrk (s, accept);
629     p = (char *)s + strlen (s);
634 /* Turn STR into lowercase; return non-zero if a character was
/* In-place; the change-tracking flag and loop are elided from this
   view.  */
638 lowercase_str (char *str)
645       *str = TOLOWER (*str);
/* Human-readable messages for url_parse() failures.  The #define'd
   indices must stay in sync with the array order; url_error() maps an
   index back to its message.  */
650 static char *parse_errors[] = {
651 #define PE_NO_ERROR			0
653 #define PE_UNSUPPORTED_SCHEME		1
654   "Unsupported scheme",
655 #define PE_EMPTY_HOST			2
657 #define PE_BAD_PORT_NUMBER		3
659 #define PE_INVALID_USER_NAME		4
661 #define PE_UNTERMINATED_IPV6_ADDRESS	5
662   "Unterminated IPv6 numeric address",
663 #define PE_IPV6_NOT_SUPPORTED		6
664   "IPv6 addresses not supported",
665 #define PE_INVALID_IPV6_ADDRESS	7
666   "Invalid IPv6 numeric address"
/* SETERR(p, v): store error code V through P if P is non-NULL (body
   elided from this view).  */
669 #define SETERR(p, v) do {			\
675 /* The following two functions were adapted from glibc. */
/* Validate the dotted-quad IPv4 address in [STR, END).  Accumulates
   each octet in VAL; octet-range and count checks are elided from
   this view.  */
678 is_valid_ipv4_address (const char *str, const char *end)
680   int saw_digit, octets;
690       if (ch >= '0' && ch <= '9') {
691 	  val = val * 10 + (ch - '0');
/* First digit of a new octet.  */
695 	  if (saw_digit == 0) {
/* A dot is only legal directly after at least one digit.  */
700       } else if (ch == '.' && saw_digit == 1) {
/* Validate the IPv6 literal in [STR, END), including "::" compression
   and an optional trailing dotted-quad IPv4 part (adapted from
   glibc's inet_pton6).  TP counts bytes consumed toward the 16-byte
   address; COLONP records where "::" occurred.  Several checks are
   elided from this view.  */
715 is_valid_ipv6_address (const char *str, const char *end)
717   static const char xdigits[] = "0123456789abcdef";
730   /* Leading :: requires some special handling.  */
734       if (str == end || *str != ':')
746       /* if ch is a number, add it to val. */
747       pch = strchr(xdigits, ch);
750 	  val |= (pch - xdigits);
757       /* if ch is a colon ... */
/* A colon with no hex digits before it is only valid as part of
   "::" (handled via COLONP, logic elided).  */
760 	  if (saw_xdigit == 0) {
765 	  } else if (str == end) {
/* Each completed 16-bit group consumes NS_INT16SZ bytes.  */
768 	  if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
776       /* if ch is a dot ... */
777       if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
778 	  is_valid_ipv4_address(curtok, end) == 1) {
787   if (saw_xdigit == 1) {
788       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* With "::" present the address must be shorter than 16 bytes;
   without it, exactly 16.  */
793   if (colonp != NULL) {
794       if (tp == NS_IN6ADDRSZ)
799   if (tp != NS_IN6ADDRSZ)
808    Return a new struct url if successful, NULL on error.  In case of
809    error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Main URL parser.  Splits URL into scheme, user:pass, host, port,
   path, params, query and fragment, and builds a struct url.  Many
   statements are elided from this view; code kept verbatim.  */
812 url_parse (const char *url, int *error)
816   int path_modified, host_modified;
818   enum url_scheme scheme;
820   const char *uname_b,    *uname_e;
821   const char *host_b,     *host_e;
822   const char *path_b,     *path_e;
823   const char *params_b,   *params_e;
824   const char *query_b,    *query_e;
825   const char *fragment_b, *fragment_e;
828   char *user = NULL, *passwd = NULL;
832   scheme = url_scheme (url);
833   if (scheme == SCHEME_INVALID)
835       SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-escapes once up front; see reencode_escapes.  */
839   url_encoded = reencode_escapes (url);
842   p += strlen (supported_schemes[scheme].leading_string);
844   p += url_skip_uname (p);
847   /* scheme://user:pass@host[:port]... */
850   /* We attempt to break down the URL into the components path,
851      params, query, and fragment.  They are ordered like this:
853        scheme://host[:port][/path][;params][?query][#fragment]  */
855   params_b   = params_e   = NULL;
856   query_b    = query_e    = NULL;
857   fragment_b = fragment_e = NULL;
863       /* Handle IPv6 address inside square brackets.  Ideally we'd
864 	 just look for the terminating ']', but rfc2732 mandates
865 	 rejecting invalid IPv6 addresses.  */
867       /* The address begins after '['. */
869       host_e = strchr (host_b, ']');
873 	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
878       /* Check if the IPv6 address is valid. */
879       if (!is_valid_ipv6_address(host_b, host_e))
881 	  SETERR (error, PE_INVALID_IPV6_ADDRESS);
885       /* Continue parsing after the closing ']'. */
/* Non-IPv6-enabled build rejects bracketed hosts outright.  */
888       SETERR (error, PE_IPV6_NOT_SUPPORTED);
/* Ordinary host: ends at the first of ':', '/', ';', '?', '#'.  */
894       p = strpbrk_or_eos (p, ":/;?#");
898   if (host_b == host_e)
900       SETERR (error, PE_EMPTY_HOST);
/* No explicit port: fall back to the scheme's default.  */
904   port = scheme_default_port (scheme);
907       const char *port_b, *port_e, *pp;
909       /* scheme://host:port/tralala */
913       p = strpbrk_or_eos (p, "/;?#");
916       if (port_b == port_e)
918 	  /* http://host:/whatever */
920 	  SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port digits; any non-digit is an error.  */
924       for (port = 0, pp = port_b; pp < port_e; pp++)
928 	      /* http://host:12randomgarbage/blah */
930 	      SETERR (error, PE_BAD_PORT_NUMBER);
934 	  port = 10 * port + (*pp - '0');
/* Delimit path, then params, then query, then fragment, in order.  */
942       p = strpbrk_or_eos (p, ";?#");
947   /* Path is not allowed not to exist. */
955       p = strpbrk_or_eos (p, "?#");
962       p = strpbrk_or_eos (p, "#");
965   /* Hack that allows users to use '?' (a wildcard character) in
966      FTP URLs without it being interpreted as a query string
968   if (scheme == SCHEME_FTP)
970       query_b = query_e = NULL;
983   if (uname_b != uname_e)
985       /* http://user:pass@host */
987       /*      uname_b      uname_e */
988       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
990 	  SETERR (error, PE_INVALID_USER_NAME);
/* All components accepted: build and fill the result struct.  */
995   u = (struct url *)xmalloc (sizeof (struct url));
996   memset (u, 0, sizeof (*u));
999   u->host = strdupdelim (host_b, host_e);
1004   u->path = strdupdelim (path_b, path_e);
1005   path_modified = path_simplify (u->path);
1006   parse_path (u->path, &u->dir, &u->file);
1008   host_modified = lowercase_str (u->host);
1011     u->params = strdupdelim (params_b, params_e);
1013     u->query = strdupdelim (query_b, query_e);
1015     u->fragment = strdupdelim (fragment_b, fragment_e);
1017   if (path_modified || u->fragment || host_modified || path_b == path_e)
1019       /* If we suspect that a transformation has rendered what
1020 	 url_string might return different from URL_ENCODED, rebuild
1021 	 u->url using url_string.  */
1022       u->url = url_string (u, 0);
1024       if (url_encoded != url)
1025 	xfree ((char *) url_encoded);
/* Otherwise reuse URL_ENCODED (or a copy of URL) as u->url.  */
1029       if (url_encoded == url)
1030 	u->url = xstrdup (url);
1032 	u->url = url_encoded;
/* Map a PE_* error code from url_parse() to its message string.  */
1040 url_error (int error_code)
1042   assert (error_code >= 0 && error_code < countof (parse_errors));
1043   return parse_errors[error_code];
1046 /* Parse PATH into dir and file.  PATH is extracted from the URL and
1047    is URL-escaped.  The function returns unescaped DIR and FILE.  */
1050 parse_path (const char *path, char **dir, char **file)
/* Split at the last '/': everything before it is DIR, after is FILE.
   No slash at all means an empty DIR.  */
1054   last_slash = strrchr (path, '/');
1057       *dir = xstrdup ("");
1058       *file = xstrdup (path);
1062       *dir = strdupdelim (path, last_slash);
1063       *file = xstrdup (last_slash + 1);
/* Both results are freshly allocated; decode %-escapes in place.  */
1065   url_unescape (*dir);
1066   url_unescape (*file);
1069 /* Note: URL's "full path" is the path with the query string and
1070    params appended.  The "fragment" (#foo) is intentionally ignored,
1071    but that might be changed.  For example, if the original URL was
1072    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1073    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1075 /* Return the length of the full path, without the terminating
1079 full_path_length (const struct url *url)
/* Each present component costs its length plus one separator char
   ('/', ';' or '?').  */
1083 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1094 /* Write out the full path. */
/* Writes into WHERE, which full_path_length() has sized; does not
   NUL-terminate (caller's job, see url_full_path).  */
1097 full_path_write (const struct url *url, char *where)
1099 #define FROB(el, chr) do {			\
1100   char *f_el = url->el;				\
1102       int l = strlen (f_el);			\
1104       memcpy (where, f_el, l);			\
1116 /* Public function for getting the "full path".  E.g. if u->path is
1117    "foo/bar" and u->query is "param=value", full_path will be
1118    "/foo/bar?param=value".  */
1121 url_full_path (const struct url *url)
1123   int length = full_path_length (url);
/* Allocate, fill via full_path_write, then terminate.  Caller frees.  */
1124   char *full_path = (char *)xmalloc(length + 1);
1126   full_path_write (url, full_path);
1127   full_path[length] = '\0';
1132 /* Escape unsafe and reserved characters, except for the slash
1136 url_escape_dir (const char *dir)
/* First escape everything, then undo the escaping of '/' ("%2F") so
   directory separators survive.  */
1138   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1143   /* Unescape slashes in NEWDIR. */
1145   h = newdir;			/* hare */
1146   t = newdir;			/* tortoise */
1148   for (; *h; h++, t++)
/* NOTE(review): matches uppercase "%2F" only; lowercase "%2f" from
   url_escape_1 output would be uppercase — confirm against
   XNUM_TO_DIGIT's case.  */
1150       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1163 /* Sync u->path and u->url with u->dir and u->file.  Called after
1164    u->file or u->dir have been changed, typically by the FTP code.  */
1167 sync_path (struct url *u)
1169   char *newpath, *efile, *edir;
1173   /* u->dir and u->file are not escaped.  URL-escape them before
1174      reassembling them into u->path.  That way, if they contain
1175      separators like '?' or even if u->file contains slashes, the
1176      path will be correctly assembled.  (u->file can contain slashes
1177      if the URL specifies it with %2f, or if an FTP server returns
1179   edir = url_escape_dir (u->dir);
1180   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
/* Empty dir: the path is just the escaped file name.  */
1183     newpath = xstrdup (efile);
1186       int dirlen = strlen (edir);
1187       int filelen = strlen (efile);
1189       /* Copy "DIR/FILE" to newpath. */
1190       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1191       memcpy (p, edir, dirlen);
1194       memcpy (p, efile, filelen);
/* url_escape_1 with passthrough may have returned the input itself;
   only free if a new string was allocated.  */
1203   if (efile != u->file)
1206   /* Regenerate u->url as well.  */
1208   u->url = url_string (u, 0);
1211 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1212    This way we can sync u->path and u->url when they get changed.  */
/* Replace u->dir with a copy of NEWDIR (old value freed upstream,
   elided from this view) and resync path/url.  */
1215 url_set_dir (struct url *url, const char *newdir)
1218   url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE and resync path/url.  */
1223 url_set_file (struct url *url, const char *newfile)
1226   url->file = xstrdup (newfile);
/* Release a struct url and all owned components.  FREE_MAYBE skips
   NULL members; host/path/dir/file frees are elided from this view.  */
1231 url_free (struct url *url)
1237   FREE_MAYBE (url->params);
1238   FREE_MAYBE (url->query);
1239   FREE_MAYBE (url->fragment);
1240   FREE_MAYBE (url->user);
1241   FREE_MAYBE (url->passwd);
/* Read FILE, one URL per line, and return a linked list of urlpos
   entries.  Blank lines and surrounding whitespace are ignored; bad
   URLs are logged and skipped.  List-linking statements are elided
   from this view.  */
1250 get_urls_file (const char *file)
1252   struct file_memory *fm;
1253   struct urlpos *head, *tail;
1254   const char *text, *text_end;
1256   /* Load the file. */
1257   fm = read_file (file);
1260       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1263   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1267   text_end = fm->content + fm->length;
1268   while (text < text_end)
1270       const char *line_beg = text;
1271       const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a newline.  */
1273 	line_end = text_end;
1278       /* Strip whitespace from the beginning and end of line. */
1279       while (line_beg < line_end && ISSPACE (*line_beg))
1281       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1284       if (line_end > line_beg)
1286 	  /* URL is in the [line_beg, line_end) region. */
1290 	  struct urlpos *entry;
1293 	  /* We must copy the URL to a zero-terminated string, and we
1294 	     can't use alloca because we're in a loop.  *sigh*.  */
1295 	  url_text = strdupdelim (line_beg, line_end);
1299 	      /* Merge opt.base_href with URL. */
1300 	      char *merged = uri_merge (opt.base_href, url_text);
1305 	  url = url_parse (url_text, &up_error_code);
1308 	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1309 			 file, url_text, url_error (up_error_code));
1315 	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1316 	  memset (entry, 0, sizeof (*entry));
/* Done with the memory-mapped file.  */
1327   read_file_free (fm);
1331 /* Free the linked list of urlpos. */
1333 free_urlpos (struct urlpos *l)
/* Walk the list, saving NEXT before freeing each node.  */
1337       struct urlpos *next = l->next;
1340       FREE_MAYBE (l->local_name);
1346 /* Rotate FNAME opt.backups times */
1348 rotate_backups(const char *fname)
/* Room for "FNAME.<digits>\0".  */
1350   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1351   char *from = (char *)alloca (maxlen);
1352   char *to = (char *)alloca (maxlen);
/* Only rotate regular files.  */
1356   if (stat (fname, &sb) == 0)
1357     if (S_ISREG (sb.st_mode) == 0)
/* Shift FNAME.(i-1) -> FNAME.i from the oldest down (rename call
   elided from this view), then FNAME -> FNAME.1.  */
1360   for (i = opt.backups; i > 1; i--)
1362       sprintf (from, "%s.%d", fname, i - 1);
1363       sprintf (to, "%s.%d", fname, i);
1367   sprintf (to, "%s.%d", fname, 1);
1371 /* Create all the necessary directories for PATH (a file).  Calls
1372    mkdirhier() internally.  */
1374 mkalldirs (const char *path)
/* Find the last '/' to isolate the directory part of PATH.  */
1381   p = path + strlen (path);
1382   for (; *p != '/' && p != path; p--)
1385   /* Don't create if it's just a file. */
1386   if ((p == path) && (*p != '/'))
1388   t = strdupdelim (path, p);
1390   /* Check whether the directory exists. */
1391   if ((stat (t, &st) == 0))
1393       if (S_ISDIR (st.st_mode))
1400 	  /* If the dir exists as a file name, remove it first.  This
1401 	     is *only* for Wget to work with buggy old CERN http
1402 	     servers.  Here is the scenario: When Wget tries to
1403 	     retrieve a directory without a slash, e.g.
1404 	     http://foo/bar (bar being a directory), CERN server will
1405 	     not redirect it too http://foo/bar/ -- it will generate a
1406 	     directory listing containing links to bar/file1,
1407 	     bar/file2, etc.  Wget will lose because it saves this
1408 	     HTML listing to a file `bar', so it cannot create the
1409 	     directory.  To work around this, if the file of the same
1410 	     name exists, we just remove it and create the directory
1412 	  DEBUGP (("Removing %s because of directory danger!\n", t));
/* Create the directory chain; log failure but continue.  */
1416   res = make_directory (t);
1418     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1423 /* Functions for constructing the file name out of URL components.  */
1425 /* A growable string structure, used by url_file_name and friends.
1426    This should perhaps be moved to utils.c.
1428    The idea is to have a convenient and efficient way to construct a
1429    string by having various functions append data to it.  Instead of
1430    passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1431    functions in questions, we pass the pointer to this struct.  */
1439 /* Ensure that the string can accept APPEND_COUNT more characters past
1440    the current TAIL position.  If necessary, this will grow the string
1441    and update its allocated size.  If the string is already large
1442    enough to take TAIL+APPEND_COUNT characters, this does nothing.  */
1443 #define GROW(g, append_size) do {					\
1444   struct growable *G_ = g;						\
1445   DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);	\
1448 /* Return the tail position of the string. */
1449 #define TAIL(r) ((r)->base + (r)->tail)
1451 /* Move the tail position by APPEND_COUNT characters. */
1452 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1454 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
/* Grows DEST as needed (GROW call elided from this view), copies STR
   without its terminating NUL, and advances the tail.  */
1458 append_string (const char *str, struct growable *dest)
1460   int l = strlen (str);
1462   memcpy (TAIL (dest), str, l);
1463   TAIL_INCR (dest, l);
1466 /* Append CH to DEST.  For example, append_char (0, DEST)
1467    zero-terminates DEST.  */
1470 append_char (char ch, struct growable *dest)
/* Store of CH at TAIL is elided from this view.  */
1474   TAIL_INCR (dest, 1);
/* Bit masks describing why a character is unusable in a file name on
   a given platform; combined per opt.restrict_files_* settings.  */
1478   filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
1479   filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
1480   filechr_control     = 4,	/* a control character, e.g. 0-31 */
1483 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1485 /* Shorthands for the table: */
1486 #define U filechr_not_unix
1487 #define W filechr_not_windows
1488 #define C filechr_control
1493 /* Table of characters unsafe under various conditions (see above).
1495    Arguably we could also claim `%' to be unsafe, since we use it as
1496    the escape character.  If we ever want to be able to reliably
1497    translate file name back to URL, this would become important
1498    crucial.  Right now, it's better to be minimal in escaping.  */
/* Per-character file-name safety table; UW/UWC combine the masks
   defined above.  Rows follow ASCII order per trailing comments.  */
1500 const static unsigned char filechr_table[256] =
1502 UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
1503   C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
1504   C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
1505   C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
1506   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
1507   0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
1508   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
1509   0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
1510   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
1511   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
1512   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
1513   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
1514   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
1515   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
1516   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
1517   0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */
1519 C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
1520 C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
1521 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1522 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1524 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1525 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1526 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1527 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1535 /* FN_PORT_SEP is the separator between host and port in file names
1536    for non-standard port numbers.  On Unix this is normally ':', as in
1537    "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
1538    because Windows can't handle ':' in file names.  */
1539 #define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
1541 /* FN_QUERY_SEP is the separator between the file name and the URL
1542    query, normally '?'.  Since Windows cannot handle '?' as part of
1543    file name, we use '@' instead there.  */
1544 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1546 /* Quote path element, characters in [b, e), as file name, and append
1547    the quoted string to DEST.  Each character is quoted as per
1548    file_unsafe_char and the corresponding table.  */
1551 append_uri_pathel (const char *b, const char *e, struct growable *dest)
/* Build the quoting mask from the user's OS/control-char settings.  */
1560   if (opt.restrict_files_os == restrict_unix)
1561     mask = filechr_not_unix;
1563     mask = filechr_not_windows;
1564   if (opt.restrict_files_ctrl)
1565     mask |= filechr_control;
1567   /* Copy [b, e) to PATHEL and URL-unescape it. */
1568   BOUNDED_TO_ALLOCA (b, e, pathel);
1569   url_unescape (pathel);
1570   pathlen = strlen (pathel);
1572   /* Go through PATHEL and check how many characters we'll need to
1573      add for file quoting.  */
1575   for (p = pathel; *p; p++)
1576     if (FILE_CHAR_TEST (*p, mask))
1579   /* p - pathel is the string length.  Each quoted char means two
1580      additional characters in the string, hence 2*quoted.  */
1581   outlen = (p - pathel) + (2 * quoted);
1582   GROW (dest, outlen);
1586       /* If there's nothing to quote, we don't need to go through the
1587 	 string the second time.  */
1588       memcpy (TAIL (dest), pathel, outlen);
/* Otherwise: second pass writes each char, quoting as "%XY".  */
1592       char *q = TAIL (dest);
1593       for (p = pathel; *p; p++)
1595 	  if (!FILE_CHAR_TEST (*p, mask))
1599 	      unsigned char ch = *p;
1601 	      *q++ = XNUM_TO_DIGIT (ch >> 4);
1602 	      *q++ = XNUM_TO_DIGIT (ch & 0xf);
1605       assert (q - TAIL (dest) == outlen);
1607   TAIL_INCR (dest, outlen);
1610 /* Append to DEST the directory structure that corresponds the
1611 directory part of URL's path. For example, if the URL is
1612 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1614 Each path element ("dir1" and "dir2" in the above example) is
1615 examined, url-unescaped, and re-escaped as file name element.
1617 Additionally, it cuts as many directories from the path as
1618 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1619 will produce "bar" for the above example. For 2 or more, it will
1622 Each component of the path is quoted for use as file name. */
/* NOTE(review): interior lines are elided here, including the body of
   the loop that handles empty pathels and the cut_dirs decrement. */
1625 append_dir_structure (const struct url *u, struct growable *dest)
1627 char *pathel, *next;
1628 int cut = opt.cut_dirs;
1630 /* Go through the path components, de-URL-quote them, and quote them
1631 (if necessary) as file names. */
/* Iterate over '/'-delimited components; NEXT marks the end of the
   current component and the scan resumes just past it. */
1634 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1639 /* Ignore empty pathels. path_simplify should remove
1640 occurrences of "//" from the path, but it has special cases
1641 for starting / which generates an empty pathel here. */
1645 append_char ('/', dest);
1646 append_uri_pathel (pathel, next, dest);
1650 /* Return a unique file name that matches the given URL as good as
1651 possible. Does not create directories on the file system. */
/* NOTE(review): fragmentary view — the growable initialization, the
   fname extraction from fnres, and the trailing returns are elided. */
1654 url_file_name (const struct url *u)
1656 struct growable fnres;
1658 char *u_file, *u_query;
1659 char *fname, *unique;
1665 /* Start with the directory prefix, if specified. */
/* DOTP(".") means "current directory": no prefix is emitted then. */
1666 if (!DOTP (opt.dir_prefix))
1667 append_string (opt.dir_prefix, &fnres);
1669 /* If "dirstruct" is turned on (typically the case with -r), add
1670 the host and port (unless those have been turned off) and
1671 directory structure. */
1674 if (opt.add_hostdir)
1677 append_char ('/', &fnres);
1678 append_string (u->host, &fnres);
/* Only append ":<port>" (or "+<port>" on Windows) for non-default ports. */
1679 if (u->port != scheme_default_port (u->scheme))
1682 number_to_string (portstr, u->port);
1683 append_char (FN_PORT_SEP, &fnres);
1684 append_string (portstr, &fnres);
1688 append_dir_structure (u, &fnres);
1691 /* Add the file name. */
1693 append_char ('/', &fnres);
/* An empty path component (URL ending in '/') maps to "index.html". */
1694 u_file = *u->file ? u->file : "index.html";
1695 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1697 /* Append "?query" to the file name. */
1698 u_query = u->query && *u->query ? u->query : NULL;
1701 append_char (FN_QUERY_SEP, &fnres);
1702 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1705 /* Zero-terminate the file name. */
1706 append_char ('\0', &fnres);
1710 /* Check the cases in which the unique extensions are not used:
1711 1) Clobbering is turned off (-nc).
1712 2) Retrieval with regetting.
1713 3) Timestamping is used.
1714 4) Hierarchy is built.
1716 The exception is the case when file does exist and is a
1717 directory (see `mkalldirs' for explanation). */
1719 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1720 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
/* Otherwise derive a unique variant ("file.1", "file.2", ...); the
   second argument presumably allows passthrough — TODO confirm. */
1723 unique = unique_name (fname, 1);
1724 if (unique != fname)
1729 /* Return the length of URL's path. Path is considered to be
1730 terminated by one of '?', ';', '#', or by the end of the
/* Returns q - url (the return statement is elided in this view). */
1733 path_length (const char *url)
1735 const char *q = strpbrk_or_eos (url, "?;#");
1739 /* Find the last occurrence of character C in the range [b, e), or
1740 NULL, if none are present. This is equivalent to strrchr(b, c),
1741 except that it accepts an END argument instead of requiring the
1742 string to be zero-terminated. Why is there no memrchr()? */
/* NOTE(review): the entire body of this function is elided in this
   view; only the signature is visible. */
1744 find_last_char (const char *b, const char *e, char c)
1752 /* Resolve "." and ".." elements of PATH by destructively modifying
1753 PATH. "." is resolved by removing that path element, and ".." is
1754 resolved by removing the preceding path element. Leading and
1755 trailing slashes are preserved.
1757 Return non-zero if any changes have been made.
1759 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1760 test examples are provided below. If you change anything in this
1761 function, run test_path_simplify to make sure you haven't broken a
1764 A previous version of this function was based on path_simplify()
1765 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
/* NOTE(review): heavily elided fragment — the main loop header, the
   "change" flag updates, and several closing braces are not visible.
   The memmove-based rewriting is order-sensitive; do not restructure
   without the full source. */
1768 path_simplify (char *path)
/* An absolute path keeps its leading '/' out of the simplification. */
1774 ++path; /* preserve the leading '/'. */
1777 end = p + strlen (p) + 1; /* position past the terminating zero. */
1782 /* P should point to the beginning of a path element. */
1784 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1786 /* Handle "./foo" by moving "foo" two characters to the
1788 if (*(p + 1) == '/')
/* end - p covers the tail including the terminating NUL. */
1791 memmove (p, p + 2, end - p);
1802 else if (*p == '.' && *(p + 1) == '.'
1803 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1805 /* Handle "../foo" by moving "foo" one path element to the
1807 char *b = p; /* not p-1 because P can equal PATH */
1809 /* Backtrack by one path element, but not past the beginning
1812 /* foo/bar/../baz */
1818 /* Move backwards until B hits the beginning of the
1819 previous path element or the beginning of path. */
1820 for (--b; b > path && *(b - 1) != '/'; b--)
1825 if (*(p + 2) == '/')
1827 memmove (b, p + 3, end - (p + 3));
1841 /* Remove empty path elements. Not mandated by rfc1808 et
1842 al, but it seems like a good idea to get rid of them.
1843 Supporting them properly is hard (in which directory do
1844 you save http://x.com///y.html?) and they don't seem to
1855 memmove (p, q, end - q);
1860 /* Skip to the next path element. */
1861 while (*p && *p != '/')
1866 /* Make sure P points to the beginning of the next path element,
1867 which is location after the slash. */
1874 /* Resolve the result of "linking" a base URI (BASE) to a
1875 link-specified URI (LINK).
1877 Either of the URIs may be absolute or relative, complete with the
1878 host name, or path only. This tries to behave "reasonably" in all
1879 foreseeable cases. It employs little specific knowledge about
1880 schemes or URL-specific stuff -- it just works on strings.
1882 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1883 See uri_merge for a gentler interface to this functionality.
1885 Perhaps this function should call path_simplify so that the callers
1886 don't have to call url_parse unconditionally. */
/* NOTE(review): fragment — the function's opening brace, the
   no_scheme/empty-link branching structure, and the final return of
   CONSTR are elided.  Each visible branch allocates CONSTR as
   base-prefix + LINK + NUL. */
1888 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks the end of BASE's path (before any '?', ';' or '#'). */
1894 const char *end = base + path_length (base);
1898 /* Empty LINK points back to BASE, query string and all. */
1899 constr = xstrdup (base);
1901 else if (*link == '?')
1903 /* LINK points to the same location, but changes the query
1904 string. Examples: */
1905 /* uri_merge("path", "?new") -> "path?new" */
1906 /* uri_merge("path?foo", "?new") -> "path?new" */
1907 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1908 /* uri_merge("path#foo", "?new") -> "path?new" */
1909 int baselength = end - base;
1910 constr = xmalloc (baselength + linklength + 1);
1911 memcpy (constr, base, baselength);
1912 memcpy (constr + baselength, link, linklength);
1913 constr[baselength + linklength] = '\0';
1915 else if (*link == '#')
1917 /* uri_merge("path", "#new") -> "path#new" */
1918 /* uri_merge("path#foo", "#new") -> "path#new" */
1919 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1920 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to an existing fragment (or whole BASE if none),
   then append the new fragment. */
1922 const char *end1 = strchr (base, '#');
1924 end1 = base + strlen (base);
1925 baselength = end1 - base;
1926 constr = xmalloc (baselength + linklength + 1);
1927 memcpy (constr, base, baselength);
1928 memcpy (constr + baselength, link, linklength);
1929 constr[baselength + linklength] = '\0';
1931 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1933 /* LINK begins with "//" and so is a net path: we need to
1934 replace everything after (and including) the double slash
1937 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1938 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1939 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1943 const char *start_insert;
1945 /* Look for first slash. */
1946 slash = memchr (base, '/', end - base);
1947 /* If found slash and it is a double slash, then replace
1948 from this point, else default to replacing from the
1950 if (slash && *(slash + 1) == '/')
1951 start_insert = slash;
1953 start_insert = base;
1955 span = start_insert - base;
1956 constr = (char *)xmalloc (span + linklength + 1);
1958 memcpy (constr, base, span);
1959 memcpy (constr + span, link, linklength);
1960 constr[span + linklength] = '\0';
1962 else if (*link == '/')
1964 /* LINK is an absolute path: we need to replace everything
1965 after (and including) the FIRST slash with LINK.
1967 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1968 "/qux/xyzzy", our result should be
1969 "http://host/qux/xyzzy". */
1972 const char *start_insert = NULL; /* for gcc to shut up. */
1973 const char *pos = base;
1974 int seen_slash_slash = 0;
1975 /* We're looking for the first slash, but want to ignore
1978 slash = memchr (pos, '/', end - pos);
1979 if (slash && !seen_slash_slash)
1980 if (*(slash + 1) == '/')
/* The "//" of "scheme://" is skipped; looping logic is elided here. */
1983 seen_slash_slash = 1;
1987 /* At this point, SLASH is the location of the first / after
1988 "//", or the first slash altogether. START_INSERT is the
1989 pointer to the location where LINK will be inserted. When
1990 examining the last two examples, keep in mind that LINK
1993 if (!slash && !seen_slash_slash)
1994 /* example: "foo" */
1996 start_insert = base;
1997 else if (!slash && seen_slash_slash)
1998 /* example: "http://foo" */
2001 else if (slash && !seen_slash_slash)
2002 /* example: "foo/bar" */
2004 start_insert = base;
2005 else if (slash && seen_slash_slash)
2006 /* example: "http://something/" */
2008 start_insert = slash;
2010 span = start_insert - base;
2011 constr = (char *)xmalloc (span + linklength + 1);
2013 memcpy (constr, base, span);
2015 memcpy (constr + span, link, linklength);
2016 constr[span + linklength] = '\0';
2020 /* LINK is a relative URL: we need to replace everything
2021 after last slash (possibly empty) with LINK.
2023 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2024 our result should be "whatever/foo/qux/xyzzy". */
2025 int need_explicit_slash = 0;
2027 const char *start_insert;
2028 const char *last_slash = find_last_char (base, end, '/');
2031 /* No slash found at all. Append LINK to what we have,
2032 but we'll need a slash as a separator.
2034 Example: if base == "foo" and link == "qux/xyzzy", then
2035 we cannot just append link to base, because we'd get
2036 "fooqux/xyzzy", whereas what we want is
2039 To make sure the / gets inserted, we set
2040 need_explicit_slash to 1. We also set start_insert
2041 to end + 1, so that the length calculations work out
2042 correctly for one more (slash) character. Accessing
2043 that character is fine, since it will be the
2044 delimiter, '\0' or '?'. */
2045 /* example: "foo?..." */
2046 /* ^ ('?' gets changed to '/') */
2047 start_insert = end + 1;
2048 need_explicit_slash = 1;
2050 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2052 /* example: http://host" */
2054 start_insert = end + 1;
2055 need_explicit_slash = 1;
2059 /* example: "whatever/foo/bar" */
2061 start_insert = last_slash + 1;
2064 span = start_insert - base;
2065 constr = (char *)xmalloc (span + linklength + 1);
2067 memcpy (constr, base, span);
/* The extra slot reserved via end + 1 is overwritten with '/'. */
2068 if (need_explicit_slash)
2069 constr[span - 1] = '/';
2071 memcpy (constr + span, link, linklength);
2072 constr[span + linklength] = '\0';
2075 else /* !no_scheme */
/* LINK already carries a scheme: it is absolute and used as-is. */
2077 constr = strdupdelim (link, link + linklength);
2082 /* Merge BASE with LINK and return the resulting URI. This is an
2083 interface to uri_merge_1 that assumes that LINK is a
2084 zero-terminated string. */
/* Thin wrapper: computes LINK's length and whether it carries a scheme,
   then delegates to uri_merge_1. */
2086 uri_merge (const char *base, const char *link)
2088 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at pointer P.  NOTE(review): the macro body appears
   truncated in this view — the line advancing P by LEN and the closing
   "} while (0)" are not visible; confirm against the full source. */
2091 #define APPEND(p, s) do { \
2092 int len = strlen (s); \
2093 memcpy (p, s, len); \
2097 /* Use this instead of password when the actual password is supposed
2098 to be hidden. We intentionally use a generic string without giving
2099 away the number of characters in the password, like previous
2101 #define HIDDEN_PASSWORD "*password*"
2103 /* Recreate the URL string from the data in URL.
2105 If HIDE is non-zero (as it is when we're calling this on a URL we
2106 plan to print, but not when calling it to canonicalize a URL for
2107 use within the program), password will be hidden. Unsafe
2108 characters in the URL will be quoted. */
/* NOTE(review): fragment — parts of the size computation, the '['/']'
   emission around the host, and the final return are elided. */
2111 url_string (const struct url *url, int hide_password)
2115 char *quoted_user = NULL, *quoted_passwd = NULL;
2117 int scheme_port = supported_schemes[url->scheme].default_port;
2118 char *scheme_str = supported_schemes[url->scheme].leading_string;
2119 int fplen = full_path_length (url);
2121 int brackets_around_host = 0;
2123 assert (scheme_str != NULL);
2125 /* Make sure the user name and password are quoted. */
2128 quoted_user = url_escape_allow_passthrough (url->user);
/* When hiding, substitute the fixed placeholder so the output never
   reveals the password's length. */
2132 quoted_passwd = HIDDEN_PASSWORD;
2134 quoted_passwd = url_escape_allow_passthrough (url->passwd);
/* A ':' in the host indicates an IPv6 literal, which must be printed
   in brackets to disambiguate from the port separator. */
2138 if (strchr (url->host, ':'))
2139 brackets_around_host = 1;
/* Pre-compute the exact output size so a single xmalloc suffices. */
2141 size = (strlen (scheme_str)
2142 + strlen (url->host)
2143 + (brackets_around_host ? 2 : 0)
2146 if (url->port != scheme_port)
2147 size += 1 + numdigit (url->port);
2150 size += 1 + strlen (quoted_user);
2152 size += 1 + strlen (quoted_passwd);
2155 p = result = xmalloc (size);
2157 APPEND (p, scheme_str);
2160 APPEND (p, quoted_user);
2164 APPEND (p, quoted_passwd);
2169 if (brackets_around_host)
2171 APPEND (p, url->host);
2172 if (brackets_around_host)
2174 if (url->port != scheme_port)
2177 p = number_to_string (p, url->port);
2180 full_path_write (url, p);
/* Verify the write cursor landed exactly where the size computation
   predicted. */
2184 assert (p - result == size);
/* Free the escaped copies only when url_escape_allow_passthrough
   actually allocated (it may return its input unchanged), and never
   free the static HIDDEN_PASSWORD literal. */
2186 if (quoted_user && quoted_user != url->user)
2187 xfree (quoted_user);
2188 if (quoted_passwd && !hide_password
2189 && quoted_passwd != url->passwd)
2190 xfree (quoted_passwd);
2195 /* Return the URL of the proxy appropriate for url U. */
/* NOTE(review): fragment — the switch header on u->scheme and the
   final return of PROXY are elided. */
2197 getproxy (struct url *u)
2200 char *rewritten_url;
2201 static char rewritten_storage[1024];
/* Respect the no_proxy list: hosts matching it get no proxy at all. */
2205 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Command-line/config options take precedence over the conventional
   environment variables. */
2211 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2215 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2219 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2221 case SCHEME_INVALID:
2224 if (!proxy || !*proxy)
2227 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2228 getproxy() to return static storage. */
2229 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may not NUL-terminate on truncation; the next line forces
   termination, so the copy is safe (though long URLs are truncated). */
2232 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2233 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2234 proxy = rewritten_storage;
2240 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should use the proxy, i.e. when it does
   NOT suffix-match any entry of the no_proxy list. */
2242 no_proxy_match (const char *host, const char **no_proxy)
2247 return !sufmatch (no_proxy, host);
2250 /* Support for converting links for local viewing in downloaded HTML
2251 files. This should be moved to another file, because it has
2252 nothing to do with processing URLs. */
2254 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2255 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2257 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2258 const char *, int));
2259 static char *local_quote_string PARAMS ((const char *));
2261 /* Change the links in one HTML file. LINKS is a list of links in the
2262 document, along with their positions and the desired direction of
/* NOTE(review): fragment — variable declarations (p, fp), several loop
   braces, early returns, and the counter increments for
   to_file_count/to_url_count are elided in this view. */
2265 convert_links (const char *file, struct urlpos *links)
2267 struct file_memory *fm;
2270 downloaded_file_t downloaded_file_return;
2272 struct urlpos *link;
2273 int to_url_count = 0, to_file_count = 0;
2275 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2278 /* First we do a "dry run": go through the list L and see whether
2279 any URL needs to be converted in the first place. If not, just
2280 leave the file alone. */
2282 struct urlpos *dry = links;
2283 for (dry = links; dry; dry = dry->next)
2284 if (dry->convert != CO_NOCONVERT)
2288 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Read the whole file into memory (possibly mmaped) for rewriting. */
2293 fm = read_file (file);
2296 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2297 file, strerror (errno));
2301 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2302 if (opt.backup_converted && downloaded_file_return)
2303 write_backup_file (file, downloaded_file_return);
2305 /* Before opening the file for writing, unlink the file. This is
2306 important if the data in FM is mmaped. In such case, nulling the
2307 file, which is what fopen() below does, would make us read all
2308 zeroes from the mmaped region. */
2309 if (unlink (file) < 0 && errno != ENOENT)
2311 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2312 file, strerror (errno));
2313 read_file_free (fm);
2316 /* Now open the file for writing. */
2317 fp = fopen (file, "wb");
2320 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2321 file, strerror (errno));
2322 read_file_free (fm);
2326 /* Here we loop through all the URLs in file, replacing those of
2327 them that are downloaded with relative references. */
2329 for (link = links; link; link = link->next)
2331 char *url_start = fm->content + link->pos;
/* Defensive check: a link position past the file contents indicates
   stale position data from the parse phase. */
2333 if (link->pos >= fm->length)
2335 DEBUGP (("Something strange is going on. Please investigate."));
2338 /* If the URL is not to be converted, skip it. */
2339 if (link->convert == CO_NOCONVERT)
2341 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2345 /* Echo the file contents, up to the offending URL's opening
2346 quote, to the outfile. */
2347 fwrite (p, 1, url_start - p, fp);
2350 switch (link->convert)
2352 case CO_CONVERT_TO_RELATIVE:
2353 /* Convert absolute URL to relative. */
2355 char *newname = construct_relative (file, link->local_name);
2356 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs special treatment because its
   attribute embeds a timeout before the URL. */
2358 if (!link->link_refresh_p)
2359 p = replace_attr (p, link->size, fp, quoted_newname);
2361 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2362 link->refresh_timeout);
2364 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2365 link->url->url, newname, link->pos, file));
2367 xfree (quoted_newname);
2371 case CO_CONVERT_TO_COMPLETE:
2372 /* Convert the link to absolute URL. */
2374 char *newlink = link->url->url;
2375 char *quoted_newlink = html_quote_string (newlink);
2377 if (!link->link_refresh_p)
2378 p = replace_attr (p, link->size, fp, quoted_newlink);
2380 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2381 link->refresh_timeout);
2383 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2384 newlink, link->pos, file));
2385 xfree (quoted_newlink);
2389 case CO_NULLIFY_BASE:
2390 /* Change the base href to "". */
2391 p = replace_attr (p, link->size, fp, "");
2399 /* Output the rest of the file. */
2400 if (p - fm->content < fm->length)
2401 fwrite (p, 1, fm->length - (p - fm->content), fp);
2403 read_file_free (fm);
2405 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2408 /* Construct and return a malloced copy of the relative link from two
2409 pieces of information: local name S1 of the referring file and
2410 local name S2 of the referred file.
2412 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2413 "jagor.srce.hr/images/news.gif", the function will return
2416 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2417 "fly.cc.fer.hr/images/fly.gif", the function will return
2418 "../images/fly.gif".
2420 Caveats: S1 should not begin with `/', unless S2 also begins with
2421 '/'. S1 should not contain things like ".." and such --
2422 construct_relative ("fly/ioccc/../index.html",
2423 "fly/images/fly.gif") will fail. (A workaround is to call
2424 something like path_simplify() on S1). */
/* NOTE(review): fragment — declarations of RES, the absolute-S2 check,
   loop bodies, and the final return are elided. */
2426 construct_relative (const char *s1, const char *s2)
2428 int i, cnt, sepdirs1;
/* If S2 is absolute it is returned verbatim (condition elided above). */
2432 return xstrdup (s2);
2433 /* S1 should *not* be absolute, if S2 wasn't. */
2434 assert (*s1 != '/');
2436 /* Skip the directories common to both strings. */
/* Advance I while the prefixes agree; CNT remembers the position just
   past the last common '/'. */
2439 while (s1[i] && s2[i]
2444 if (s1[i] == '/' && s2[i] == '/')
/* Count the remaining directory separators in S1: each one needs a
   "../" in the result. */
2449 for (sepdirs1 = 0; s1[i]; i++)
2452 /* Now, construct the file as of:
2453 - ../ repeated sepdirs1 time
2454 - all the non-mutual directories of S2. */
2455 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2456 for (i = 0; i < sepdirs1; i++)
2457 memcpy (res + 3 * i, "../", 3);
2458 strcpy (res + 3 * i, s2 + cnt);
/* Save the original FILE to FILE.orig (or FILE with "html" replaced by
   "orig" under -E) before convert_links overwrites it.  Keeps a static
   list of files already backed up so a second conversion pass does not
   clobber the true original.  NOTE(review): fragment — some braces and
   the loop structure over converted_files are elided. */
2463 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2465 /* Rather than just writing over the original .html file with the
2466 converted version, save the former to *.orig. Note we only do
2467 this for files we've _successfully_ downloaded, so we don't
2468 clobber .orig files sitting around from previous invocations. */
2470 /* Construct the backup filename as the original name plus ".orig". */
2471 size_t filename_len = strlen(file);
2472 char* filename_plus_orig_suffix;
2473 boolean already_wrote_backup_file = FALSE;
2474 slist* converted_file_ptr;
2475 static slist* converted_files = NULL;
2477 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2479 /* Just write "orig" over "html". We need to do it this way
2480 because when we're checking to see if we've downloaded the
2481 file before (to see if we can skip downloading it), we don't
2482 know if it's a text/html file. Therefore we don't know yet
2483 at that stage that -E is going to cause us to tack on
2484 ".html", so we need to compare vs. the original URL plus
2485 ".orig", not the original URL plus ".html.orig". */
/* Assumes FILE ends in ".html" here (guaranteed by the -E branch);
   the last four bytes "html" are overwritten with "orig". */
2486 filename_plus_orig_suffix = alloca (filename_len + 1);
2487 strcpy(filename_plus_orig_suffix, file);
2488 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2490 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2492 /* Append ".orig" to the name. */
2493 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2494 strcpy(filename_plus_orig_suffix, file);
2495 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2498 /* We can get called twice on the same URL thanks to the
2499 convert_all_links() call in main(). If we write the .orig file
2500 each time in such a case, it'll end up containing the first-pass
2501 conversion, not the original file. So, see if we've already been
2502 called on this file. */
2503 converted_file_ptr = converted_files;
2504 while (converted_file_ptr != NULL)
2505 if (strcmp(converted_file_ptr->string, file) == 0)
2507 already_wrote_backup_file = TRUE;
2511 converted_file_ptr = converted_file_ptr->next;
2513 if (!already_wrote_backup_file)
2515 /* Rename <file> to <file>.orig before former gets written over. */
2516 if (rename(file, filename_plus_orig_suffix) != 0)
2517 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2518 file, filename_plus_orig_suffix, strerror (errno));
2520 /* Remember that we've already written a .orig backup for this file.
2521 Note that we never free this memory since we need it till the
2522 convert_all_links() call, which is one of the last things the
2523 program does before terminating. BTW, I'm not sure if it would be
2524 safe to just set 'converted_file_ptr->string' to 'file' below,
2525 rather than making a copy of the string... Another note is that I
2526 thought I could just add a field to the urlpos structure saying
2527 that we'd written a .orig file for this URL, but that didn't work,
2528 so I had to make this separate list.
2529 -- Dan Harkless <wget@harkless.org>
2531 This [adding a field to the urlpos structure] didn't work
2532 because convert_file() is called from convert_all_links at
2533 the end of the retrieval with a freshly built new urlpos
2535 -- Hrvoje Niksic <hniksic@arsdigita.com>
2537 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2538 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2539 converted_file_ptr->next = converted_files;
2540 converted_files = converted_file_ptr;
2544 static int find_fragment PARAMS ((const char *, int, const char **,
2547 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (possibly including quotes) in the
   input buffer; SIZE is its length.  The replacement (quote, NEW_TEXT,
   preserved #fragment, quote) is written to FP.  NOTE(review): the
   return value/advancement of P is elided in this view. */
2550 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2553 char quote_char = '\"'; /* use "..." for quoting, unless the
2554 original value is quoted, in which
2555 case reuse its quoting char. */
2556 const char *frag_beg, *frag_end;
2558 /* Structure of our string is:
2559 "...old-contents..."
2560 <--- size ---> (with quotes)
2563 <--- size --> (no quotes) */
2565 if (*p == '\"' || *p == '\'')
2570 size -= 2; /* disregard opening and closing quote */
2572 putc (quote_char, fp);
2573 fputs (new_text, fp);
2575 /* Look for fragment identifier, if any. */
/* Preserve the original #fragment by copying it after NEW_TEXT. */
2576 if (find_fragment (p, size, &frag_beg, &frag_end))
2577 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2581 putc (quote_char, fp);
2586 /* The same as REPLACE_ATTR, but used when replacing
2587 <meta http-equiv=refresh content="new_text"> because we need to
2588 append "timeout_value; URL=" before the next_text. */
2591 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2592 const char *new_text, int timeout)
/* Build "<timeout>; URL=<new_text>" on the stack; the elided part of
   the alloca size expression presumably accounts for the "; URL="
   literal, new_text and the NUL — confirm against full source. */
2595 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2599 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2601 return replace_attr (p, size, fp, new_with_timeout);
2604 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2605 preceded by '&'. If the character is not found, return zero. If
2606 the character is found, return 1 and set BP and EP to point to the
2607 beginning and end of the region.
2609 This is used for finding the fragment indentifiers in URLs. */
/* NOTE(review): the loop body (the '#'/'&' tests and BP/EP stores) is
   elided in this view; only the scan skeleton is visible. */
2612 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2614 const char *end = beg + size;
2616 for (; beg < end; beg++)
2638 /* Quote FILE for use as local reference to an HTML file.
2640 We quote ? as %3F to avoid passing part of the file name as the
2641 parameter when browsing the converted file through HTTP. However,
2642 it is safe to do this only when `--html-extension' is turned on.
2643 This is because converting "index.html?foo=bar" to
2644 "index.html%3Ffoo=bar" would break local browsing, as the latter
2645 isn't even recognized as an HTML file! However, converting
2646 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2647 safe for both local and HTTP-served browsing. */
/* NOTE(review): fragment — the loop body that writes "%3F" vs. the
   plain character is elided. */
2650 local_quote_string (const char *file)
2652 const char *file_sans_qmark;
/* Without -E, question marks are left alone; only HTML-quote. */
2655 if (!opt.html_extension)
2656 return html_quote_string (file);
2658 qm = count_char (file, '?');
2662 const char *from = file;
2665 /* qm * 2 because we replace each question mark with "%3F",
2666 i.e. replace one char with three, hence two more. */
2667 int fsqlen = strlen (file) + qm * 2;
2669 to = newname = (char *)alloca (fsqlen + 1);
2670 for (; *from; from++)
2681 assert (to - newname == fsqlen);
2684 file_sans_qmark = newname;
2687 file_sans_qmark = file;
2689 return html_quote_string (file_sans_qmark);
2692 /* We're storing "modes" of type downloaded_file_t in the hash table.
2693 However, our hash tables only accept pointers for keys and values.
2694 So when we need a pointer, we use the address of a
2695 downloaded_file_t variable of static storage. */
/* Map each enum value to a stable address via static variables; the
   switch arms returning &v1..&v4 are elided in this view. */
2697 static downloaded_file_t *
2698 downloaded_mode_to_ptr (downloaded_file_t mode)
2700 static downloaded_file_t
2701 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2702 v2 = FILE_DOWNLOADED_NORMALLY,
2703 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2704 v4 = CHECK_FOR_FILE;
2708 case FILE_NOT_ALREADY_DOWNLOADED:
2710 case FILE_DOWNLOADED_NORMALLY:
2712 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2714 case CHECK_FOR_FILE:
2720 /* This should really be merged with dl_file_url_map and
2721 downloaded_html_files in recur.c. This was originally a list, but
2722 I changed it to a hash table beause it was actually taking a lot of
2723 time to find things in it. */
/* Maps local file name (xstrdup'ed key) -> downloaded_file_t pointer. */
2725 static struct hash_table *downloaded_files_hash;
2727 /* Remembers which files have been downloaded. In the standard case, should be
2728 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2729 download successfully (i.e. not for ones we have failures on or that we skip
2732 When we've downloaded a file and tacked on a ".html" extension due to -E,
2733 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2734 FILE_DOWNLOADED_NORMALLY.
2736 If you just want to check if a file has been previously added without adding
2737 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2738 with local filenames, not remote URLs. */
/* NOTE(review): fragment — the returns of *ptr for the found cases are
   elided; visible returns cover only the not-found paths. */
2740 downloaded_file (downloaded_file_t mode, const char *file)
2742 downloaded_file_t *ptr;
2744 if (mode == CHECK_FOR_FILE)
/* Query-only path: a missing hash means nothing was ever recorded. */
2746 if (!downloaded_files_hash)
2747 return FILE_NOT_ALREADY_DOWNLOADED;
2748 ptr = hash_table_get (downloaded_files_hash, file);
2750 return FILE_NOT_ALREADY_DOWNLOADED;
/* Recording path: create the hash lazily on first use. */
2754 if (!downloaded_files_hash)
2755 downloaded_files_hash = make_string_hash_table (0);
2757 ptr = hash_table_get (downloaded_files_hash, file);
2761 ptr = downloaded_mode_to_ptr (mode);
2762 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2764 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-map callback that frees one key/value pair; its body is elided
   in this view. */
2768 df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files hash and all its entries. */
2775 downloaded_files_free (void)
2777 if (downloaded_files_hash)
2779 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2780 hash_table_destroy (downloaded_files_hash);
/* Reset so a later downloaded_file() call can lazily recreate it. */
2781 downloaded_files_hash = NULL;
2785 /* Return non-zero if scheme a is similar to scheme b.
2787 Schemes are similar if they are equal. If SSL is supported, schemes
2788 are also similar if one is http (SCHEME_HTTP) and the other is https
/* NOTE(review): the equality check and the returns are elided; this
   fragment shows only the http/https cross-match condition (presumably
   inside an SSL-only #ifdef — confirm against full source). */
2791 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2796 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2797 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2804 /* Debugging and testing support for path_simplify. */
2806 /* Debug: run path_simplify on PATH and return the result in a new
2807 string. Useful for calling from the debugger. */
/* NOTE(review): the function's name line and return are elided; only
   the copy-and-simplify body is visible. */
2811 char *copy = xstrdup (path);
2812 path_simplify (copy);
/* Run path_simplify on TEST (on a scratch copy) and report mismatches
   against EXPECTED_RESULT and EXPECTED_CHANGE on stdout. */
2817 run_test (char *test, char *expected_result, int expected_change)
2819 char *test_copy = xstrdup (test);
2820 int modified = path_simplify (test_copy);
2822 if (0 != strcmp (test_copy, expected_result))
2824 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2825 test, expected_result, test_copy);
2827 if (modified != expected_change)
2829 if (expected_change == 1)
2830 printf ("Expected no modification with path_simplify(\"%s\").\n",
2833 printf ("Expected modification with path_simplify(\"%s\").\n",
/* Table-driven self-test for path_simplify: each entry gives an input
   path, the expected simplified form, and whether a modification is
   expected.  Every case is run twice, once as-is and once with a
   leading '/' to verify the leading slash is preserved. */
2840 test_path_simplify (void)
2843 char *test, *result;
2849 { "foo", "foo", 0 },
2850 { "foo/bar", "foo/bar", 0 },
2851 { "foo///bar", "foo/bar", 1 },
2852 { "foo/.", "foo/", 1 },
2853 { "foo/./", "foo/", 1 },
2854 { "foo./", "foo./", 0 },
2855 { "foo/../bar", "bar", 1 },
2856 { "foo/../bar/", "bar/", 1 },
2857 { "foo/bar/..", "foo/", 1 },
2858 { "foo/bar/../x", "foo/x", 1 },
2859 { "foo/bar/../x/", "foo/x/", 1 },
2860 { "foo/..", "", 1 },
2861 { "foo/../..", "", 1 },
2862 { "a/b/../../c", "c", 1 },
2863 { "./a/../b", "b", 1 }
2867 for (i = 0; i < countof (tests); i++)
2869 char *test = tests[i].test;
2870 char *expected_result = tests[i].result;
2871 int expected_change = tests[i].should_modify;
2872 run_test (test, expected_result, expected_change);
2875 /* Now run all the tests with a leading slash before the test case,
2876 to prove that the slash is being preserved. */
2877 for (i = 0; i < countof (tests); i++)
2879 char *test, *expected_result;
2880 int expected_change = tests[i].should_modify;
/* Build "/<test>" and "/<result>" copies for the slash-prefixed run. */
2882 test = xmalloc (1 + strlen (tests[i].test) + 1);
2883 sprintf (test, "/%s", tests[i].test);
2885 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2886 sprintf (expected_result, "/%s", tests[i].result);
2888 run_test (test, expected_result, expected_change);
2891 xfree (expected_result);