2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
#include <sys/types.h>
#include <ctype.h>
#include <string.h>
/* True if the NUL-terminated string X is exactly ".".  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* True if the NUL-terminated string X is exactly "..".  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Network address component sizes (as in <arpa/nameser.h>), used by
   the IPv4/IPv6 address validators below.  */
static const int NS_INADDRSZ = 4;    /* bytes in a binary IPv4 address */
static const int NS_IN6ADDRSZ = 16;  /* bytes in a binary IPv6 address */
static const int NS_INT16SZ = 2;     /* bytes in one 16-bit IPv6 group */
74 /* Supported schemes: */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
/* Forward declarations: */
/* Build a relative link from one path to another; defined later in
   this file.  */
static char *construct_relative PARAMS ((const char *, const char *));
/* Canonicalize a path in place; returns non-zero if the path was
   modified (used by url_parse to decide whether to rebuild u->url).
   NOTE(review): definition not visible in this listing -- confirm
   semantics at the definition site.  */
static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings. We determine
95 whether a character is unsafe through static table lookup. This
96 code assumes ASCII character set and 8-bit chars. */
/* Bit masks stored in urlchr_table.  The enum constants were elided
   from this listing but are required by the macros below; 1 and 2 are
   the only values consistent with the R/U/RU table entries.  */
enum {
  /* rfc1738 reserved chars, preserved from encoding.  */
  urlchr_reserved = 1,
  /* rfc1738 unsafe chars, plus some more.  */
  urlchr_unsafe   = 2
};

/* Test character C against MASK via table lookup; the cast to
   unsigned char avoids negative indexing when plain char is signed.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU (R|U)                /* both reserved and unsafe */

static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,   U,  0,  /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* All characters in the 128-255 range are unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is copied through literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

/* Return the numeric value of hex digit C; C must satisfy isxdigit.
   Local replacement for the project's XCHAR_TO_XDIGIT macro.  */
static int
hex_digit_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  /* Lower-case the letter; works for both 'a'-'f' and 'A'-'F'.  */
  return (c | 0x20) - 'a' + 10;
}

void
url_unescape (char *s)
{
  char *t = s;                  /* t - tortoise */
  char *h = s;                  /* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
        *t = *h;
      /* Do nothing if '%' is not followed by two hex digits; the '%'
         is copied literally and the following chars are handled by
         subsequent iterations.  */
      else if (!*(h + 1) || !*(h + 2)
               || !(isxdigit ((unsigned char) *(h + 1))
                    && isxdigit ((unsigned char) *(h + 2))))
        *t = *h;
      else
        {
          *t = (hex_digit_value ((unsigned char) *(h + 1)) << 4)
               + hex_digit_value ((unsigned char) *(h + 2));
          h += 2;               /* skip the two hex digits */
        }
    }
  *t = '\0';                    /* result is never longer than input */
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.

   Caller owns the result unless it equals S (passthrough case).  */
static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  /* First pass: count how much room the escapes need.  */
  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;            /* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  newlen = (p1 - s) + addition; /* p1 now points at the NUL */
  newstr = (char *)xmalloc (newlen + 1);

  /* Second pass: copy, quoting as we go.  */
  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XDIGIT_TO_XCHAR (c >> 4);
          *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}
229 /* URL-escape the unsafe characters (see urlchr_table) in a given
230 string, returning a freshly allocated string. */
233 url_escape (const char *s)
235 return url_escape_1 (s, urlchr_unsafe, 0);
238 /* URL-escape the unsafe characters (see urlchr_table) in a given
239 string. If no characters are unsafe, S is returned. */
242 url_escape_allow_passthrough (const char *s)
244 return url_escape_1 (s, urlchr_unsafe, 1);
/* What to do with one input character in reencode_escapes.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */
static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
        {
          /* %xx sequence: decode it, unless it would decode to an
             unsafe or a reserved char; in that case, leave it as
             is. */
          char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
            XCHAR_TO_XDIGIT (*(p + 2));

          if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
            return CM_PASSTHROUGH;
          else
            return CM_DECODE;
        }
      else
        /* Garbled %.. sequence: encode `%'. */
        return CM_ENCODE;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return CM_ENCODE;
  else
    return CM_PASSTHROUGH;
}
279 /* Translate a %-escaped (but possibly non-conformant) input string S
280 into a %-escaped (and conformant) output string. If no characters
281 are encoded or decoded, return the same string S; otherwise, return
282 a freshly allocated string with the new contents.
284 After a URL has been run through this function, the protocols that
285 use `%' as the quote character can use the resulting string as-is,
286 while those that don't call url_unescape() to get to the intended
287 data. This function is also stable: after an input string is
288 transformed the first time, all further transformations of the
289 result yield the same result string.
291 Let's discuss why this function is needed.
293 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
294 space character would mess up the HTTP request, it needs to be
297 GET /abc%20def HTTP/1.0
299 It appears that the unsafe chars need to be quoted, for example
300 with url_escape. But what if we're requested to download
301 `abc%20def'? url_escape transforms "%" to "%25", which would leave
302 us with `abc%2520def'. This is incorrect -- since %-escapes are
303 part of URL syntax, "%20" is the correct way to denote a literal
304 space on the Wget command line. This leads us to the conclusion
305 that in that case Wget should not call url_escape, but leave the
308 And what if the requested URI is `abc%20 def'? If we call
309 url_escape, we end up with `/abc%2520%20def', which is almost
310 certainly not intended. If we don't call url_escape, we are left
311 with the embedded space and cannot complete the request. What the
312 user meant was for Wget to request `/abc%20%20def', and this is
313 where reencode_escapes kicks in.
315 Wget used to solve this by first decoding %-quotes, and then
316 encoding all the "unsafe" characters found in the resulting string.
317 This was wrong because it didn't preserve certain URL special
318 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
319 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
320 whether we considered `+' reserved (it is). One of these results
321 is inevitable because by the second step we would lose information
322 on whether the `+' was originally encoded or not. Both results
323 were wrong because in CGI parameters + means space, while %2B means
324 literal plus. reencode_escapes correctly translates the above to
325 "a%2B+b", i.e. returns the original string.
327 This function uses an algorithm proposed by Anon Sricharoenchai:
329 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
332 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
335 ...except that this code conflates the two steps, and decides
336 whether to encode, decode, or pass through each character in turn.
337 The function still uses two passes, but their logic is the same --
338 the first pass exists merely for the sake of allocation. Another
339 small difference is that we include `+' to URL_RESERVED.
343 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
345 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
349 "foo bar" -> "foo%20bar"
350 "foo%20bar" -> "foo%20bar"
351 "foo %20bar" -> "foo%20%20bar"
352 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
353 "foo%25%20bar" -> "foo%25%20bar"
354 "foo%2%20bar" -> "foo%252%20bar"
355 "foo+bar" -> "foo+bar" (plus is reserved!)
356 "foo%2b+bar" -> "foo%2b+bar" */
359 reencode_escapes (const char *s)
365 int encode_count = 0;
366 int decode_count = 0;
368 /* First, pass through the string to see if there's anything to do,
369 and to calculate the new length. */
370 for (p1 = s; *p1; p1++)
372 switch (decide_copy_method (p1))
385 if (!encode_count && !decode_count)
386 /* The string is good as it is. */
387 return (char *)s; /* C const model sucks. */
390 /* Each encoding adds two characters (hex digits), while each
391 decoding removes two characters. */
392 newlen = oldlen + 2 * (encode_count - decode_count);
393 newstr = xmalloc (newlen + 1);
400 switch (decide_copy_method (p1))
404 unsigned char c = *p1++;
406 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
407 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
411 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
412 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
413 p1 += 3; /* skip %xx */
420 assert (p2 - newstr == newlen);
424 /* Returns the scheme type if the scheme is supported, or
425 SCHEME_INVALID if not. */
427 url_scheme (const char *url)
431 for (i = 0; supported_schemes[i].leading_string; i++)
432 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433 strlen (supported_schemes[i].leading_string)))
435 if (supported_schemes[i].enabled)
436 return (enum url_scheme) i;
438 return SCHEME_INVALID;
441 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
int
url_skip_scheme (const char *url)
{
  const char *p = url;

  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  (unsigned char cast avoids UB with negative chars.)  */
  while (isalnum ((unsigned char) *p) || *p == '-' || *p == '+')
    ++p;
  if (*p != ':')
    return 0;

  /* Skip ':'.  */
  ++p;

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
    p += 2;

  return p - url;
}
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  A scheme is a run of alphanumerics
   (plus '-' and '+') immediately followed by ':'.  */
int
url_has_scheme (const char *url)
{
  const char *p = url;
  while (isalnum ((unsigned char) *p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
479 scheme_default_port (enum url_scheme scheme)
481 return supported_schemes[scheme].default_port;
485 scheme_disable (enum url_scheme scheme)
487 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0; otherwise the
   number of characters occupied by "user[:pass]@".  */
static int
url_skip_uname (const char *url)
{
  const char *p;

  /* Look for '@' that comes before '/' or '?'.  A '/' or '?' hit
     first means the '@', if any, belongs to the path or query.  */
  p = (const char *)strpbrk (url, "/?@");
  if (!p || *p != '@')
    return 0;

  /* +1 to also skip the '@' itself.  */
  return p - url + 1;
}
/* Split the "user[:password]" region STR (LEN bytes, not
   NUL-terminated) into freshly allocated, URL-unescaped *USER and
   *PASSWD.  NOTE(review): this listing is elided -- declarations,
   the empty-name checks, and return statements are missing from the
   fragment below; consult the full source before relying on it.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */
  colon = memchr (str, ':', len);
  /* Empty user name again. */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      /* NUL-terminate the copied password. */
      (*passwd)[pwlen] = '\0';
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
  /* Both components are stored in unescaped form. */
  url_unescape (*user);
    url_unescape (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port] -> http://www.foo.com[:port]
   FTP shorthands look like this:
   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
   If the URL needs not or cannot be rewritten, return NULL.
   NOTE(review): this listing is elided -- the return type, several
   branches and returns are missing from the fragment below.  */
rewrite_shorthand_url (const char *url)
  /* Already has a scheme: nothing to rewrite. */
  if (url_has_scheme (url))
  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
  for (p = url; *p && *p != ':' && *p != '/'; p++)
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP. */
      for (pp = p + 1; ISDIGIT (*pp); pp++)
      if (digits > 0 && (*pp == '/' || *pp == '\0'))
	  /* Prepend "ftp://" to the entire URL... */
	  res = xmalloc (6 + strlen (url) + 1);
	  sprintf (res, "ftp://%s", url);
	  /* ...and replace ':' with '/'. */
	  res[6 + (p - url)] = '/';
  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
/* Forward declaration: parse_path splits a URL path into directory
   and file components (defined below).  */
static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, with the exception that it returns the pointer to the
   terminating zero (end-of-string aka "eos") if no matching character
   is found.

   Although I normally balk at Gcc-specific optimizations, it probably
   makes sense here: glibc has optimizations that detect strpbrk being
   called with literal string as ACCEPT and inline the search.  That
   optimization is defeated if strpbrk is hidden within the call to
   another function.  (And no, making strpbrk_or_eos inline doesn't
   help because the check for literal accept is in the
   NOTE(review): listing elided -- the #ifdef __GNUC__ guard, the
   null-check lines and macro/function closers are missing below.  */
#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_p = strpbrk (s, accept);		\
  SOE_p = (char *)s + strlen (s);		\
#else /* not __GNUC__ */
/* Portable fallback with identical semantics. */
strpbrk_or_eos (const char *s, const char *accept)
  char *p = strpbrk (s, accept);
  p = (char *)s + strlen (s);
/* Turn STR into lowercase in place; return non-zero if a character
   was actually changed, 0 if STR was already all-lowercase.  */
static int
lowercase_str (char *str)
{
  int changed = 0;
  for (; *str; str++)
    {
      unsigned char c = (unsigned char) *str;  /* avoid UB on negative char */
      if (isupper (c))
        {
          changed = 1;
          *str = tolower (c);
        }
    }
  return changed;
}
/* Human-readable messages for url_parse failures, indexed by the PE_*
   codes defined inline (keep each string directly under its code).
   NOTE(review): four message strings were elided from the listing and
   have been restored from upstream -- confirm exact wording.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  "No error",
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
  "Empty host",
#define PE_BAD_PORT_NUMBER 3
  "Bad port number",
#define PE_INVALID_USER_NAME 4
  "Invalid user name",
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED 6
  "IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS 7
  "Invalid IPv6 numeric address"
};
/* Store error code V through pointer P, but only if the caller
   supplied a non-NULL P (callers of url_parse may pass NULL when they
   don't care about the reason).  do/while(0) keeps it
   statement-safe.  */
#define SETERR(p, v) do {			\
  if (p)					\
    *(p) = (v);					\
} while (0)
/* The following two functions were adapted from glibc. */

/* Return 1 if [STR, END) is a well-formed dotted-quad IPv4 address,
   0 otherwise.  NOTE(review): this listing is elided -- the return
   type, loop framing, octet-count/range checks and final return are
   missing from the fragment below.  */
is_valid_ipv4_address (const char *str, const char *end)
  int saw_digit, octets;
      if (ch >= '0' && ch <= '9') {
	/* Accumulate the current octet's value. */
	val = val * 10 + (ch - '0');
	if (saw_digit == 0) {
      } else if (ch == '.' && saw_digit == 1) {
/* Return 1 if [STR, END) is a well-formed textual IPv6 address
   (including "::" compression and trailing embedded IPv4),
   0 otherwise; adapted from glibc's inet_pton.  NOTE(review): this
   listing is heavily elided -- declarations, the main loop and many
   branches are missing from the fragment below.  */
is_valid_ipv6_address (const char *str, const char *end)
  static const char xdigits[] = "0123456789abcdef";
  /* Leading :: requires some special handling. */
    if (str == end || *str != ':')
      /* if ch is a number, add it to val. */
      pch = strchr(xdigits, ch);
	val |= (pch - xdigits);
      /* if ch is a colon ... */
	if (saw_xdigit == 0) {
	} else if (str == end) {
	/* A group must fit before the end of the 16-byte address. */
	if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
      /* if ch is a dot ... */
      if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
	  is_valid_ipv4_address(curtok, end) == 1) {
  if (saw_xdigit == 1) {
    if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
  /* "::" must expand to at least one zero group. */
  if (colonp != NULL) {
    if (tp == NS_IN6ADDRSZ)
  if (tp != NS_IN6ADDRSZ)
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   PE_* code from parse_errors.
   NOTE(review): this listing is heavily elided -- braces, error
   returns, several declarations and the #ifdef ENABLE_IPV6 guards
   are missing from the fragment below.  */
url_parse (const char *url, int *error)
  int path_modified, host_modified;
  enum url_scheme scheme;
  /* Begin/end pointers delimiting each URL component inside
     url_encoded; NULL pairs mean "component absent".  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;
  char *user = NULL, *passwd = NULL;
  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNSUPPORTED_SCHEME);
  /* Normalize %-escapes once; url_encoded may alias url if nothing
     needed changing. */
  url_encoded = reencode_escapes (url);
  p += strlen (supported_schemes[scheme].leading_string);
  p += url_skip_uname (p);
  /* scheme://user:pass@host[:port]... */
  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:
     scheme://host[:port][/path][;params][?query][#fragment] */
  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;
      /* Handle IPv6 address inside square brackets.  Ideally we'd
	 just look for the terminating ']', but rfc2732 mandates
	 rejecting invalid IPv6 addresses. */
      /* The address begins after '['. */
      host_e = strchr (host_b, ']');
	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
	  SETERR (error, PE_INVALID_IPV6_ADDRESS);
      /* Continue parsing after the closing ']'. */
      SETERR (error, PE_IPV6_NOT_SUPPORTED);
  p = strpbrk_or_eos (p, ":/;?#");
  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);
  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;
      /* scheme://host:port/tralala */
      p = strpbrk_or_eos (p, "/;?#");
      if (port_b == port_e)
	  /* http://host:/whatever */
	  SETERR (error, PE_BAD_PORT_NUMBER);
      /* Manual decimal conversion so non-digits are rejected. */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */
	      SETERR (error, PE_BAD_PORT_NUMBER);
	  port = 10 * port + (*pp - '0');
  p = strpbrk_or_eos (p, ";?#");
  /* Path is not allowed not to exist. */
  p = strpbrk_or_eos (p, "?#");
  p = strpbrk_or_eos (p, "#");
  /* Hack that allows users to use '?' (a wildcard character) in
     FTP URLs without it being interpreted as a query string
     delimiter. */
  if (scheme == SCHEME_FTP)
      query_b = query_e = NULL;
  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*        uname_b uname_e */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));
  u->host = strdupdelim (host_b, host_e);
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  parse_path (u->path, &u->dir, &u->file);
  host_modified = lowercase_str (u->host);
  u->params = strdupdelim (params_b, params_e);
  u->query = strdupdelim (query_b, query_e);
  u->fragment = strdupdelim (fragment_b, fragment_e);
  if (path_modified || u->fragment || host_modified || path_b == path_e)
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string. */
      u->url = url_string (u, 0);
      if (url_encoded != url)
	xfree ((char *) url_encoded);
      if (url_encoded == url)
	u->url = xstrdup (url);
	u->url = url_encoded;
1044 url_error (int error_code)
1046 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047 return parse_errors[error_code];
/* Parse PATH into dir and file.  PATH is extracted from the URL and
   is URL-escaped.  The function returns unescaped DIR and FILE, both
   freshly allocated (caller frees).  A path with no '/' yields an
   empty DIR and FILE == PATH.  */
static void
parse_path (const char *path, char **dir, char **file)
{
  const char *last_slash = strrchr (path, '/');

  if (!last_slash)
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      /* DIR is everything up to (excluding) the last slash.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  url_unescape (*dir);
  url_unescape (*file);
}
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   NOTE(review): listing elided -- the function body (the leading-'/'
   accounting, FROB invocations for params/query and #undef) is
   missing below.  */
full_path_length (const struct url *url)
/* +1 accounts for the separator char (';' or '?') in front of EL. */
#define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path. */
/* Writes path[;params][?query] into WHERE, which the caller must
   have sized with full_path_length.  NOTE(review): listing elided --
   the function body, the macro's separator/NULL handling and closing
   lines are missing below.  */
full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
      int l = strlen (f_el);			\
      memcpy (where, f_el, l);			\
/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value".  Returns a freshly allocated string the
   caller must free.  */
char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc(length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';

  return full_path;
}
/* Escape unsafe and reserved characters, except for the slash
   characters, which separate directory components and must stay
   literal.  NOTE(review): listing elided -- the passthrough
   early-return, the copy/skip logic inside the loop and the final
   return are missing below.  */
url_escape_dir (const char *dir)
  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
  /* Unescape slashes in NEWDIR. */
  h = newdir;			/* hare */
  t = newdir;			/* tortoise */
  for (; *h; h++, t++)
      /* url_escape_1 emits uppercase hex, so "%2F" is the only
	 encoding of '/' that can appear here. */
      if (*h == '%' && h[1] == '2' && h[2] == 'F')
/* Sync u->path and u->url with u->dir and u->file.  Called after
   u->file or u->dir have been changed, typically by the FTP code.
   NOTE(review): listing elided -- braces, the dir-empty branch, the
   '/' separator write, frees of the old path/url and the escaped
   temporaries are missing below.  */
sync_path (struct url *u)
  char *newpath, *efile, *edir;
  /* u->dir and u->file are not escaped.  URL-escape them before
     reassembling them into u->path.  That way, if they contain
     separators like '?' or even if u->file contains slashes, the
     path will be correctly assembled.  (u->file can contain slashes
     if the URL specifies it with %2f, or if an FTP server returns
  edir = url_escape_dir (u->dir);
  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
      /* Empty dir: the path is just the file. */
      newpath = xstrdup (efile);
      int dirlen = strlen (edir);
      int filelen = strlen (efile);
      /* Copy "DIR/FILE" to newpath. */
      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (p, edir, dirlen);
      memcpy (p, efile, filelen);
  /* url_escape_1 may have passed the originals through; free only
     fresh allocations. */
  if (efile != u->file)
  /* Regenerate u->url as well. */
  u->url = url_string (u, 0);
1215 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1216 This way we can sync u->path and u->url when they get changed. */
1219 url_set_dir (struct url *url, const char *newdir)
1222 url->dir = xstrdup (newdir);
1227 url_set_file (struct url *url, const char *newfile)
1230 url->file = xstrdup (newfile);
/* Free a struct url allocated by url_parse, including all owned
   string members.  NOTE(review): listing elided -- the unconditional
   xfree calls (host, path, url, dir, file, and the struct itself)
   are missing from the fragment below; only the optional members
   remain visible.  */
url_free (struct url *url)
  /* FREE_MAYBE: these members may be NULL when absent from the URL. */
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line; each line is optionally merged with opt.base_href
   and validated with url_parse.  NOTE(review): listing elided --
   the return type, braces, list linking, error `return NULL`, and
   several declarations are missing from the fragment below.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;
  /* Load the file. */
  fm = read_file (file);
    logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
	/* Last line may lack a newline. */
	line_end = text_end;
      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && ISSPACE (*line_beg))
      while (line_end > line_beg && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
	  /* URL is in the [line_beg, line_end) region. */
	  struct urlpos *entry;
	  /* We must copy the URL to a zero-terminated string, and we
	     can't use alloca because we're in a loop.  *sigh*.  */
	  url_text = strdupdelim (line_beg, line_end);
	      /* Merge opt.base_href with URL. */
	      char *merged = uri_merge (opt.base_href, url_text);
	  url = url_parse (url_text, &up_error_code);
	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
			 file, url_text, url_error (up_error_code));
	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
	  memset (entry, 0, sizeof (*entry));
  read_file_free (fm);
/* Free the linked list of urlpos. */
/* NOTE(review): listing elided -- the loop framing, url_free call and
   xfree of the node itself are missing below.  */
free_urlpos (struct urlpos *l)
  /* Save the successor before the node is destroyed. */
  struct urlpos *next = l->next;
  FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */
/* Shifts FNAME.1 -> FNAME.2 -> ... and finally FNAME -> FNAME.1,
   keeping at most opt.backups generations.  NOTE(review): listing
   elided -- the rename() calls, stat declaration and the early
   return for non-regular files are missing below.  */
rotate_backups(const char *fname)
  /* room for name + '.' + number + NUL */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  /* Rotate from the oldest backup downwards. */
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */
/* NOTE(review): listing elided -- the return type, declarations,
   several returns, the unlink call and the free of T are missing
   from the fragment below.  */
mkalldirs (const char *path)
  /* Scan backwards for the last '/' to isolate the directory part. */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--)
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	  DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1427 /* Functions for constructing the file name out of URL components. */
1429 /* A growable string structure, used by url_file_name and friends.
1430 This should perhaps be moved to utils.c.
1432 The idea is to have a convenient and efficient way to construct a
1433 string by having various functions append data to it. Instead of
1434 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435 functions in questions, we pass the pointer to this struct. */
/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing.
   NOTE(review): listing elided -- the macro's closing `} while (0)`
   is missing below.  */
#define GROW(g, append_size) do {					\
  struct growable *G_ = g;						\
  DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);	\

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
/* Append the string STR to DEST.  NOTICE: the string in DEST is not
   NUL-terminated by this function; call append_char (0, dest) when
   the string is complete.  */
static void
append_string (const char *str, struct growable *dest)
{
  int l = strlen (str);
  GROW (dest, l);
  memcpy (TAIL (dest), str, l);
  TAIL_INCR (dest, l);
}
1470 /* Append CH to DEST. For example, append_char (0, DEST)
1471 zero-terminates DEST. */
1474 append_char (char ch, struct growable *dest)
1478 TAIL_INCR (dest, 1);
/* Classification bits for file-name characters.  FIX: the listing had
   filechr_unsafe_windows = 2, colliding with filechr_unsafe_shell;
   these are bit flags OR-ed together in file_unsafe_char, and the
   table below distinguishes S from W, so windows must be its own
   bit (4).  */
enum {
  filechr_unsafe_always  = 1,	/* always unsafe, e.g. / or \0 */
  filechr_unsafe_shell   = 2,	/* unsafe for shell use, e.g. control chars */
  filechr_unsafe_windows = 4	/* disallowed on Windows file system */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define A filechr_unsafe_always
#define S filechr_unsafe_shell
#define W filechr_unsafe_windows

/* Forbidden chars:

   always: '\0', '/'
   Unix shell: 0-31, 128-159
   Windows: \, |, /, <, >, ?, :

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   NOTE(review): the comment above lists `|' as Windows-forbidden but
   the table row for `|' below is 0, while `"' and `*' are marked W
   though unlisted -- confirm against upstream before changing.  */

static const unsigned char filechr_table[256] =
{
  A, S, S, S,  S, S, S, S,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  S, S, S, S,  S, S, S, S,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  S, S, S, S,  S, S, S, S,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  S, S, S, S,  S, S, S, S,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0, 0, W, 0,  0, 0, 0, 0,   /* SP  !   "   #    $   %   &   '   */
  0, 0, W, 0,  0, 0, 0, A,   /* (   )   *   +    ,   -   .   /   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* 0   1   2   3    4   5   6   7   */
  0, 0, W, 0,  W, 0, W, W,   /* 8   9   :   ;    <   =   >   ?   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* @   A   B   C    D   E   F   G   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* H   I   J   K    L   M   N   O   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* P   Q   R   S    T   U   V   W   */
  0, 0, 0, 0,  W, 0, 0, 0,   /* X   Y   Z   [    \   ]   ^   _   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* `   a   b   c    d   e   f   g   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* h   i   j   k    l   m   n   o   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* p   q   r   s    t   u   v   w   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* x   y   z   {    |   }   ~   DEL */

  S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
  S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
1535 /* Return non-zero if character CH is unsafe for use in file or
1536 directory name. Called by append_uri_pathel. */
1539 file_unsafe_char (char ch, int restrict)
1541 int mask = filechr_unsafe_always;
1542 if (restrict == restrict_shell)
1543 mask |= filechr_unsafe_shell;
1544 else if (restrict == restrict_windows)
1545 mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
1546 return FILE_CHAR_TEST (ch, mask);
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
   the quoted string to DEST.  Each character is quoted as per
   file_unsafe_char and the corresponding table.
   NOTE(review): listing elided -- the return type, declarations
   (pathel, p, quoted, outlen), braces and the '%' write in the
   quoting branch are missing below.  Also note the local variable
   named `restrict' -- a C99 keyword; rename when touching this
   function.  */
append_uri_pathel (const char *b, const char *e, struct growable *dest)
  /* Currently restrict_for_windows is determined at compile time
     only.  But some users download files to Windows partitions; they
     should be able to say --windows-file-names so Wget escapes
     characters invalid on Windows.  Similar run-time restrictions for
     other file systems can be implemented. */
  const int restrict = opt.restrict_file_names;
  /* Copy [b, e) to PATHEL and URL-unescape it. */
  BOUNDED_TO_ALLOCA (b, e, pathel);
  url_unescape (pathel);
  pathlen = strlen (pathel);
  /* Go through PATHEL and check how many characters we'll need to
     add for file quoting. */
  for (p = pathel; *p; p++)
    if (file_unsafe_char (*p, restrict))
  /* p - pathel is the string length.  Each quoted char means two
     additional characters in the string, hence 2*quoted. */
  outlen = (p - pathel) + (2 * quoted);
  GROW (dest, outlen);
      /* If there's nothing to quote, we don't need to go through the
	 string the second time. */
      memcpy (TAIL (dest), pathel, outlen);
      char *q = TAIL (dest);
      for (p = pathel; *p; p++)
	  if (!file_unsafe_char (*p, restrict))
	      unsigned char ch = *p;
	      *q++ = XDIGIT_TO_XCHAR (ch >> 4);
	      *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
      assert (q - TAIL (dest) == outlen);
  TAIL_INCR (dest, outlen);
1623 /* Append to DEST the directory structure that corresponds the
1624 directory part of URL's path. For example, if the URL is
1625 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1627 Each path element ("dir1" and "dir2" in the above example) is
1628 examined, url-unescaped, and re-escaped as file name element.
1630 Additionally, it cuts as many directories from the path as
1631 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1632 will produce "bar" for the above example. For 2 or more, it will
1635 Each component of the path is quoted for use as file name. */
1638 append_dir_structure (const struct url *u, struct growable *dest)
1640 char *pathel, *next;
1641 int cut = opt.cut_dirs;
1643 /* Go through the path components, de-URL-quote them, and quote them
1644 (if necessary) as file names. */
1647 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1652 /* Ignore empty pathels. path_simplify should remove
1653 occurrences of "//" from the path, but it has special cases
1654 for starting / which generates an empty pathel here. */
1658 append_char ('/', dest);
1659 append_uri_pathel (pathel, next, dest);
1663 /* Return a unique file name that matches the given URL as good as
1664 possible. Does not create directories on the file system. */
1667 url_file_name (const struct url *u)
1669 struct growable fnres;
1671 char *u_file, *u_query;
1672 char *fname, *unique;
1678 /* Start with the directory prefix, if specified. */
1679 if (!DOTP (opt.dir_prefix))
1680 append_string (opt.dir_prefix, &fnres);
1682 /* If "dirstruct" is turned on (typically the case with -r), add
1683 the host and port (unless those have been turned off) and
1684 directory structure. */
1687 if (opt.add_hostdir)
1690 append_char ('/', &fnres);
1691 append_string (u->host, &fnres);
1692 if (u->port != scheme_default_port (u->scheme))
1695 number_to_string (portstr, u->port);
1696 append_char (FN_PORT_SEP, &fnres);
1697 append_string (portstr, &fnres);
1701 append_dir_structure (u, &fnres);
1704 /* Add the file name. */
1706 append_char ('/', &fnres);
1707 u_file = *u->file ? u->file : "index.html";
1708 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1710 /* Append "?query" to the file name. */
1711 u_query = u->query && *u->query ? u->query : NULL;
1714 append_char (FN_QUERY_SEP, &fnres);
1715 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1718 /* Zero-terminate the file name. */
1719 append_char ('\0', &fnres);
1723 /* Check the cases in which the unique extensions are not used:
1724 1) Clobbering is turned off (-nc).
1725 2) Retrieval with regetting.
1726 3) Timestamping is used.
1727 4) Hierarchy is built.
1729 The exception is the case when file does exist and is a
1730 directory (see `mkalldirs' for explanation). */
1732 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1733 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1736 unique = unique_name (fname, 1);
1737 if (unique != fname)
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  */

static int
path_length (const char *url)
{
  const char *q = strpbrk_or_eos (url, "?;#");
  return q - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Walk backwards from one-past-the-end; E is decremented before
     each dereference, so only [b, e) is ever read.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Leading and
   trailing slashes are preserved.  Runs of empty elements ("//") are
   collapsed, since supporting them properly is hard and they don't
   seem useful.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below (test_path_simplify).  If you
   change anything in this function, run test_path_simplify to make
   sure you haven't broken a test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */

static int
path_simplify (char *path)
{
  char *p, *end;
  int change = 0;

  if (path[0] == '/')
    ++path;			/* preserve the leading '/'. */

  p = path;
  end = p + strlen (p) + 1;	/* position past the terminating zero. */

  while (*p)
    {
      /* P should point to the beginning of a path element. */

      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
	{
	  /* Handle "./foo" by moving "foo" two characters to the
	     left; a trailing "." is simply removed.  */
	  if (*(p + 1) == '/')
	    {
	      memmove (p, p + 2, end - (p + 2));
	      end -= 2;
	    }
	  else
	    {
	      *p = '\0';
	      end = p + 1;
	    }
	  change = 1;
	}
      else if (*p == '.' && *(p + 1) == '.'
	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
	{
	  /* Handle "../foo" by moving "foo" one path element to the
	     left; ".." also removes the preceding element.  */
	  char *b = p;		/* not p-1 because P can equal PATH */

	  /* Backtrack by one path element, but not past the
	     beginning of PATH.  */
	  if (b > path)
	    /* Move backwards until B hits the beginning of the
	       previous path element or the beginning of path. */
	    for (--b; b > path && *(b - 1) != '/'; b--)
	      ;

	  if (*(p + 2) == '/')
	    {
	      memmove (b, p + 3, end - (p + 3));
	      end -= (p + 3) - b;
	    }
	  else
	    {
	      *b = '\0';
	      end = b + 1;
	    }
	  p = b;
	  change = 1;
	}
      else if (*p == '/')
	{
	  /* Remove empty path elements.  Not mandated by rfc1808 et
	     al, but it seems like a good idea to get rid of them.
	     Supporting them properly is hard (in which directory do
	     you save http://x.com///y.html?) and they don't seem
	     useful.  */
	  char *q = p;
	  while (*q == '/')
	    ++q;
	  change = 1;
	  if (*q == '\0')
	    {
	      *p = '\0';
	      end = p + 1;
	    }
	  else
	    {
	      memmove (p, q, end - q);
	      end -= q - p;
	    }
	}
      else
	{
	  /* Skip to the next path element. */
	  while (*p && *p != '/')
	    ++p;
	  /* Make sure P points to the beginning of the next path
	     element, which is the location after the slash.  */
	  if (*p == '/')
	    ++p;
	}
    }

  return change;
}
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   Returns a freshly allocated string which the caller must free.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */

static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      const char *end = base + path_length (base);

      if (!linklength)
	{
	  /* Empty LINK points back to BASE, query string and all. */
	  constr = xstrdup (base);
	}
      else if (*link == '?')
	{
	  /* LINK points to the same location, but changes the query
	     string.  Examples: */
	  /* uri_merge("path",         "?new") -> "path?new"     */
	  /* uri_merge("path?foo",     "?new") -> "path?new"     */
	  /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
	  /* uri_merge("path#foo",     "?new") -> "path?new"     */
	  int baselength = end - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (*link == '#')
	{
	  /* uri_merge("path",         "#new") -> "path#new"     */
	  /* uri_merge("path#foo",     "#new") -> "path#new"     */
	  /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
	  int baselength;
	  const char *end1 = strchr (base, '#');
	  if (!end1)
	    end1 = base + strlen (base);
	  baselength = end1 - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
	{
	  /* LINK begins with "//" and so is a net path: we need to
	     replace everything after (and including) the double slash
	     with LINK.

	     uri_merge("foo", "//new/bar")            -> "//new/bar"
	     uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
	     uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

	  int span;
	  const char *slash;
	  const char *start_insert;

	  /* Look for first slash. */
	  slash = memchr (base, '/', end - base);
	  /* If found slash and it is a double slash, then replace
	     from this point, else default to replacing from the
	     beginning.  */
	  if (slash && *(slash + 1) == '/')
	    start_insert = slash;
	  else
	    start_insert = base;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else if (*link == '/')
	{
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */

	  int span;
	  const char *slash;
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	     the double slash that introduces the authority part.  */
	again:
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
	      {
		pos = slash + 2;
		seen_slash_slash = 1;
		goto again;
	      }

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	     begins with '/'. */

	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    /*           ^    */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	    /*                     ^ */
	    start_insert = end;
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    /*           ^        */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    /*                            ^ */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else
	{
	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  int span;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	  if (!last_slash)
	    {
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	    {
	      /* example: http://host"  */
	      /*                      ^ */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else
	    {
	      /* example: "whatever/foo/bar" */
	      /*                        ^    */
	      start_insert = last_slash + 1;
	    }

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
    }
  else /* !no_scheme */
    {
      /* LINK is already absolute (has a scheme); just copy it.  */
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  The result is heap-allocated; the caller
   must free it.  */

char *
uri_merge (const char *base, const char *link)
{
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
/* Append the string S at the location pointed to by P and advance P
   past the copied bytes.  Multi-statement macro wrapped in
   do/while(0) so it behaves as a single statement.  (The `p += len;'
   advance is what makes successive APPENDs write consecutively.)  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
  p += len;					\
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
2116 /* Recreate the URL string from the data in URL.
2118 If HIDE is non-zero (as it is when we're calling this on a URL we
2119 plan to print, but not when calling it to canonicalize a URL for
2120 use within the program), password will be hidden. Unsafe
2121 characters in the URL will be quoted. */
2124 url_string (const struct url *url, int hide_password)
2128 char *quoted_user = NULL, *quoted_passwd = NULL;
2130 int scheme_port = supported_schemes[url->scheme].default_port;
2131 char *scheme_str = supported_schemes[url->scheme].leading_string;
2132 int fplen = full_path_length (url);
2134 int brackets_around_host = 0;
2136 assert (scheme_str != NULL);
2138 /* Make sure the user name and password are quoted. */
2141 quoted_user = url_escape_allow_passthrough (url->user);
2145 quoted_passwd = HIDDEN_PASSWORD;
2147 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2151 if (strchr (url->host, ':'))
2152 brackets_around_host = 1;
2154 size = (strlen (scheme_str)
2155 + strlen (url->host)
2156 + (brackets_around_host ? 2 : 0)
2159 if (url->port != scheme_port)
2160 size += 1 + numdigit (url->port);
2163 size += 1 + strlen (quoted_user);
2165 size += 1 + strlen (quoted_passwd);
2168 p = result = xmalloc (size);
2170 APPEND (p, scheme_str);
2173 APPEND (p, quoted_user);
2177 APPEND (p, quoted_passwd);
2182 if (brackets_around_host)
2184 APPEND (p, url->host);
2185 if (brackets_around_host)
2187 if (url->port != scheme_port)
2190 p = number_to_string (p, url->port);
2193 full_path_write (url, p);
2197 assert (p - result == size);
2199 if (quoted_user && quoted_user != url->user)
2200 xfree (quoted_user);
2201 if (quoted_passwd && !hide_password
2202 && quoted_passwd != url->passwd)
2203 xfree (quoted_passwd);
2208 /* Return the URL of the proxy appropriate for url U. */
2210 getproxy (struct url *u)
2213 char *rewritten_url;
2214 static char rewritten_storage[1024];
2218 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2224 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2228 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2232 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2234 case SCHEME_INVALID:
2237 if (!proxy || !*proxy)
2240 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2241 getproxy() to return static storage. */
2242 rewritten_url = rewrite_shorthand_url (proxy);
2245 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2246 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2247 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST is NOT matched by the NO_PROXY suffix
   list (i.e. the proxy should be used).  A NULL list matches
   nothing.  */

int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
2263 /* Support for converting links for local viewing in downloaded HTML
2264 files. This should be moved to another file, because it has
2265 nothing to do with processing URLs. */
2267 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2268 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2270 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2271 const char *, int));
2272 static char *local_quote_string PARAMS ((const char *));
2274 /* Change the links in one HTML file. LINKS is a list of links in the
2275 document, along with their positions and the desired direction of
2278 convert_links (const char *file, struct urlpos *links)
2280 struct file_memory *fm;
2283 downloaded_file_t downloaded_file_return;
2285 struct urlpos *link;
2286 int to_url_count = 0, to_file_count = 0;
2288 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2291 /* First we do a "dry run": go through the list L and see whether
2292 any URL needs to be converted in the first place. If not, just
2293 leave the file alone. */
2295 struct urlpos *dry = links;
2296 for (dry = links; dry; dry = dry->next)
2297 if (dry->convert != CO_NOCONVERT)
2301 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2306 fm = read_file (file);
2309 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2310 file, strerror (errno));
2314 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2315 if (opt.backup_converted && downloaded_file_return)
2316 write_backup_file (file, downloaded_file_return);
2318 /* Before opening the file for writing, unlink the file. This is
2319 important if the data in FM is mmaped. In such case, nulling the
2320 file, which is what fopen() below does, would make us read all
2321 zeroes from the mmaped region. */
2322 if (unlink (file) < 0 && errno != ENOENT)
2324 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2325 file, strerror (errno));
2326 read_file_free (fm);
2329 /* Now open the file for writing. */
2330 fp = fopen (file, "wb");
2333 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2334 file, strerror (errno));
2335 read_file_free (fm);
2339 /* Here we loop through all the URLs in file, replacing those of
2340 them that are downloaded with relative references. */
2342 for (link = links; link; link = link->next)
2344 char *url_start = fm->content + link->pos;
2346 if (link->pos >= fm->length)
2348 DEBUGP (("Something strange is going on. Please investigate."));
2351 /* If the URL is not to be converted, skip it. */
2352 if (link->convert == CO_NOCONVERT)
2354 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2358 /* Echo the file contents, up to the offending URL's opening
2359 quote, to the outfile. */
2360 fwrite (p, 1, url_start - p, fp);
2363 switch (link->convert)
2365 case CO_CONVERT_TO_RELATIVE:
2366 /* Convert absolute URL to relative. */
2368 char *newname = construct_relative (file, link->local_name);
2369 char *quoted_newname = local_quote_string (newname);
2371 if (!link->link_refresh_p)
2372 p = replace_attr (p, link->size, fp, quoted_newname);
2374 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2375 link->refresh_timeout);
2377 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2378 link->url->url, newname, link->pos, file));
2380 xfree (quoted_newname);
2384 case CO_CONVERT_TO_COMPLETE:
2385 /* Convert the link to absolute URL. */
2387 char *newlink = link->url->url;
2388 char *quoted_newlink = html_quote_string (newlink);
2390 if (!link->link_refresh_p)
2391 p = replace_attr (p, link->size, fp, quoted_newlink);
2393 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2394 link->refresh_timeout);
2396 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2397 newlink, link->pos, file));
2398 xfree (quoted_newlink);
2402 case CO_NULLIFY_BASE:
2403 /* Change the base href to "". */
2404 p = replace_attr (p, link->size, fp, "");
2412 /* Output the rest of the file. */
2413 if (p - fm->content < fm->length)
2414 fwrite (p, 1, fm->length - (p - fm->content), fp);
2416 read_file_free (fm);
2418 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;
      else
	break;
    }
  /* Count the remaining directory separators in S1; each one needs a
     "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
2476 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2478 /* Rather than just writing over the original .html file with the
2479 converted version, save the former to *.orig. Note we only do
2480 this for files we've _successfully_ downloaded, so we don't
2481 clobber .orig files sitting around from previous invocations. */
2483 /* Construct the backup filename as the original name plus ".orig". */
2484 size_t filename_len = strlen(file);
2485 char* filename_plus_orig_suffix;
2486 boolean already_wrote_backup_file = FALSE;
2487 slist* converted_file_ptr;
2488 static slist* converted_files = NULL;
2490 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2492 /* Just write "orig" over "html". We need to do it this way
2493 because when we're checking to see if we've downloaded the
2494 file before (to see if we can skip downloading it), we don't
2495 know if it's a text/html file. Therefore we don't know yet
2496 at that stage that -E is going to cause us to tack on
2497 ".html", so we need to compare vs. the original URL plus
2498 ".orig", not the original URL plus ".html.orig". */
2499 filename_plus_orig_suffix = alloca (filename_len + 1);
2500 strcpy(filename_plus_orig_suffix, file);
2501 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2503 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2505 /* Append ".orig" to the name. */
2506 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2507 strcpy(filename_plus_orig_suffix, file);
2508 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2511 /* We can get called twice on the same URL thanks to the
2512 convert_all_links() call in main(). If we write the .orig file
2513 each time in such a case, it'll end up containing the first-pass
2514 conversion, not the original file. So, see if we've already been
2515 called on this file. */
2516 converted_file_ptr = converted_files;
2517 while (converted_file_ptr != NULL)
2518 if (strcmp(converted_file_ptr->string, file) == 0)
2520 already_wrote_backup_file = TRUE;
2524 converted_file_ptr = converted_file_ptr->next;
2526 if (!already_wrote_backup_file)
2528 /* Rename <file> to <file>.orig before former gets written over. */
2529 if (rename(file, filename_plus_orig_suffix) != 0)
2530 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2531 file, filename_plus_orig_suffix, strerror (errno));
2533 /* Remember that we've already written a .orig backup for this file.
2534 Note that we never free this memory since we need it till the
2535 convert_all_links() call, which is one of the last things the
2536 program does before terminating. BTW, I'm not sure if it would be
2537 safe to just set 'converted_file_ptr->string' to 'file' below,
2538 rather than making a copy of the string... Another note is that I
2539 thought I could just add a field to the urlpos structure saying
2540 that we'd written a .orig file for this URL, but that didn't work,
2541 so I had to make this separate list.
2542 -- Dan Harkless <wget@harkless.org>
2544 This [adding a field to the urlpos structure] didn't work
2545 because convert_file() is called from convert_all_links at
2546 the end of the retrieval with a freshly built new urlpos
2548 -- Hrvoje Niksic <hniksic@arsdigita.com>
2550 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2551 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2552 converted_file_ptr->next = converted_files;
2553 converted_files = converted_file_ptr;
2557 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT.  P points at
   the original value (possibly quoted), SIZE is its length including
   any quotes.  Writes the replacement to FP, preserving any fragment
   identifier from the original value, and returns the position in the
   input just past the original value.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <--- size -->         (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;			/* skip the closing quote */
  putc (quote_char, fp);

  return p;
}
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "TIMEOUT; URL=" to NEW_TEXT.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Worst-case length: digits of TIMEOUT + "; URL=" + NEW_TEXT + NUL. */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
					   + 6 /* "; URL=" */
					   + strlen (new_text)
					   + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  (The
   '&'-guard avoids mistaking numeric character references such as
   "&#38;" for fragments.)  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fallthrough */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
2651 /* Quote FILE for use as local reference to an HTML file.
2653 We quote ? as %3F to avoid passing part of the file name as the
2654 parameter when browsing the converted file through HTTP. However,
2655 it is safe to do this only when `--html-extension' is turned on.
2656 This is because converting "index.html?foo=bar" to
2657 "index.html%3Ffoo=bar" would break local browsing, as the latter
2658 isn't even recognized as an HTML file! However, converting
2659 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2660 safe for both local and HTTP-served browsing. */
2663 local_quote_string (const char *file)
2665 const char *file_sans_qmark;
2668 if (!opt.html_extension)
2669 return html_quote_string (file);
2671 qm = count_char (file, '?');
2675 const char *from = file;
2678 /* qm * 2 because we replace each question mark with "%3F",
2679 i.e. replace one char with three, hence two more. */
2680 int fsqlen = strlen (file) + qm * 2;
2682 to = newname = (char *)alloca (fsqlen + 1);
2683 for (; *from; from++)
2694 assert (to - newname == fsqlen);
2697 file_sans_qmark = newname;
2700 file_sans_qmark = file;
2702 return html_quote_string (file_sans_qmark);
2705 /* We're storing "modes" of type downloaded_file_t in the hash table.
2706 However, our hash tables only accept pointers for keys and values.
2707 So when we need a pointer, we use the address of a
2708 downloaded_file_t variable of static storage. */
2710 static downloaded_file_t *
2711 downloaded_mode_to_ptr (downloaded_file_t mode)
2713 static downloaded_file_t
2714 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2715 v2 = FILE_DOWNLOADED_NORMALLY,
2716 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2717 v4 = CHECK_FOR_FILE;
2721 case FILE_NOT_ALREADY_DOWNLOADED:
2723 case FILE_DOWNLOADED_NORMALLY:
2725 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2727 case CHECK_FOR_FILE:
2733 /* This should really be merged with dl_file_url_map and
2734 downloaded_html_files in recur.c. This was originally a list, but
2735 I changed it to a hash table beause it was actually taking a lot of
2736 time to find things in it. */
2738 static struct hash_table *downloaded_files_hash;
2740 /* Remembers which files have been downloaded. In the standard case, should be
2741 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2742 download successfully (i.e. not for ones we have failures on or that we skip
2745 When we've downloaded a file and tacked on a ".html" extension due to -E,
2746 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2747 FILE_DOWNLOADED_NORMALLY.
2749 If you just want to check if a file has been previously added without adding
2750 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2751 with local filenames, not remote URLs. */
2753 downloaded_file (downloaded_file_t mode, const char *file)
2755 downloaded_file_t *ptr;
2757 if (mode == CHECK_FOR_FILE)
2759 if (!downloaded_files_hash)
2760 return FILE_NOT_ALREADY_DOWNLOADED;
2761 ptr = hash_table_get (downloaded_files_hash, file);
2763 return FILE_NOT_ALREADY_DOWNLOADED;
2767 if (!downloaded_files_hash)
2768 downloaded_files_hash = make_string_hash_table (0);
2770 ptr = hash_table_get (downloaded_files_hash, file);
2774 ptr = downloaded_mode_to_ptr (mode);
2775 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2777 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper: free the xstrdup'ed KEY of each entry.  The
   VALUE points at static storage (see downloaded_mode_to_ptr) and is
   not freed.  Returns 0 so the traversal continues.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}
2788 downloaded_files_free (void)
2790 if (downloaded_files_hash)
2792 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2793 hash_table_destroy (downloaded_files_hash);
2794 downloaded_files_hash = NULL;
2798 /* Return non-zero if scheme a is similar to scheme b.
2800 Schemes are similar if they are equal. If SSL is supported, schemes
2801 are also similar if one is http (SCHEME_HTTP) and the other is https
2804 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2809 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2810 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify. */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  NOTE(review): the
   function name was lost in extraction; reconstructed as `ps' --
   confirm against upstream before relying on it.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on TEST (a copy is modified) and report to stdout
   if the result differs from EXPECTED_RESULT, or if the
   modified-or-not return value differs from EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
	/* Modification was expected but not reported.  (These two
	   messages were previously swapped.)  */
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify against a table of known inputs and
   expected outputs, once as-is and once with a leading slash to prove
   the slash is preserved.  Failures are reported by run_test.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "foo",		"foo",		0 },
    { "foo/bar",	"foo/bar",	0 },
    { "foo///bar",	"foo/bar",	1 },
    { "foo/.",		"foo/",		1 },
    { "foo/./",		"foo/",		1 },
    { "foo./",		"foo./",	0 },
    { "foo/../bar",	"bar",		1 },
    { "foo/../bar/",	"bar/",		1 },
    { "foo/bar/..",	"foo/",		1 },
    { "foo/bar/../x",	"foo/x",	1 },
    { "foo/bar/../x/",	"foo/x/",	1 },
    { "foo/..",		"",		1 },
    { "foo/../..",	"",		1 },
    { "a/b/../../c",	"c",		1 },
    { "./a/../b",	"b",		1 }
  };
  int i;

  for (i = 0; i < ARRAY_SIZE (tests); i++)
    {
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
    }

  /* Now run all the tests with a leading slash before the test case,
     to prove that the slash is being preserved.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
    {
      char *test, *expected_result;
      int expected_change = tests[i].should_modify;

      test = xmalloc (1 + strlen (tests[i].test) + 1);
      sprintf (test, "/%s", tests[i].test);

      expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
      sprintf (expected_result, "/%s", tests[i].result);

      run_test (test, expected_result, expected_change);

      xfree (test);
      xfree (expected_result);
    }
}