sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* Static description of one URL scheme handled by this file.  One
   entry per scheme lives in the supported_schemes table.  */
struct scheme_data
{
  /* Bare scheme name, e.g. "http".  */
  const char *name;
  /* Prefix that url_scheme matches (case-insensitively) at the start
     of a URL, e.g. "http://".  */
  const char *leading_string;
  /* Port returned by scheme_default_port for this scheme.  */
  int default_port;
  /* Non-zero while the scheme is usable; scheme_disable clears it and
     url_scheme then reports SCHEME_INVALID for matching URLs.  */
  int enabled;
};
  62
/* Supported schemes.

   url_scheme returns the index of the matching entry cast to enum
   url_scheme, and scheme_default_port/scheme_disable index this table
   with an enum url_scheme value, so the entry order presumably mirrors
   that enum (declared outside this file -- confirm before reordering).
   The final entry has a NULL leading_string, which is what stops the
   scan in url_scheme.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  Wget
  91    uses the rfc1738 set of reserved characters, plus "$" and ",", as
  92    recommended by rfc2396.
  93
   An unsafe character is one that should be encoded when URLs
  95    are placed in foreign environments.  E.g. space and newline are
  96    unsafe in HTTP contexts because HTTP uses them as separator and
  97    terminator, so they must be encoded to %20 and %0A respectively.
  98    "*" is unsafe in shell context, etc.
  99
 100    We determine whether a character is unsafe through static table
 101    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 102
 103 enum {
 104   /* rfc1738 reserved chars + "$" and ",".  */
 105   urlchr_reserved = 1,
 106
 107   /* rfc1738 unsafe chars, plus non-printables.  */
 108   urlchr_unsafe   = 2
 109 };
 110
 111 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 112 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 113 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 114
 115 /* Shorthands for the table: */
 116 #define R  urlchr_reserved
 117 #define U  urlchr_unsafe
 118 #define RU R|U
 119
 120 const static unsigned char urlchr_table[256] =
 121 {
 122   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 126   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 127   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 128   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 129   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 130  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 133   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 134   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 137   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 138
 139   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143
 144   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148 };
 149 #undef R
 150 #undef U
 151 #undef RU
 152
 153 /* URL-unescape the string S.
 154
 155    This is done by transforming the sequences "%HH" to the character
 156    represented by the hexadecimal digits HH.  If % is not followed by
 157    two hexadecimal digits, it is inserted literally.
 158
 159    The transformation is done in place.  If you need the original
 160    string intact, make a copy before calling this function.  */
 161
 162 static void
 163 url_unescape (char *s)
 164 {
 165   char *t = s;                  /* t - tortoise */
 166   char *h = s;                  /* h - hare     */
 167
 168   for (; *h; h++, t++)
 169     {
 170       if (*h != '%')
 171         {
 172         copychar:
 173           *t = *h;
 174         }
 175       else
 176         {
 177           /* Do nothing if '%' is not followed by two hex digits. */
 178           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 179             goto copychar;
 180           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 181           h += 2;
 182         }
 183     }
 184   *t = '\0';
 185 }
 186
 187 /* The core of url_escape_* functions.  Escapes the characters that
 188    match the provided mask in urlchr_table.
 189
 190    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 191    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 192    freshly allocated string will be returned in all cases.  */
 193
 194 static char *
 195 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 196 {
 197   const char *p1;
 198   char *p2, *newstr;
 199   int newlen;
 200   int addition = 0;
 201
 202   for (p1 = s; *p1; p1++)
 203     if (urlchr_test (*p1, mask))
 204       addition += 2;            /* Two more characters (hex digits) */
 205
 206   if (!addition)
 207     return allow_passthrough ? (char *)s : xstrdup (s);
 208
 209   newlen = (p1 - s) + addition;
 210   newstr = (char *)xmalloc (newlen + 1);
 211
 212   p1 = s;
 213   p2 = newstr;
 214   while (*p1)
 215     {
 216       /* Quote the characters that match the test mask. */
 217       if (urlchr_test (*p1, mask))
 218         {
 219           unsigned char c = *p1++;
 220           *p2++ = '%';
 221           *p2++ = XNUM_TO_DIGIT (c >> 4);
 222           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 223         }
 224       else
 225         *p2++ = *p1++;
 226     }
 227   assert (p2 - newstr == newlen);
 228   *p2 = '\0';
 229
 230   return newstr;
 231 }
 232
 233 /* URL-escape the unsafe characters (see urlchr_table) in a given
 234    string, returning a freshly allocated string.  */
 235
 236 char *
 237 url_escape (const char *s)
 238 {
 239   return url_escape_1 (s, urlchr_unsafe, 0);
 240 }
 241
 242 /* URL-escape the unsafe characters (see urlchr_table) in a given
 243    string.  If no characters are unsafe, S is returned.  */
 244
 245 static char *
 246 url_escape_allow_passthrough (const char *s)
 247 {
 248   return url_escape_1 (s, urlchr_unsafe, 1);
 249 }
 250 \f
/* What to do with one input character: the verdict produced by
   decide_copy_method and acted upon by reencode_escapes.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 252
 253 /* Decide whether to encode, decode, or pass through the char at P.
 254    This used to be a macro, but it got a little too convoluted.  */
 255 static inline enum copy_method
 256 decide_copy_method (const char *p)
 257 {
 258   if (*p == '%')
 259     {
 260       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 261         {
 262           /* %xx sequence: decode it, unless it would decode to an
 263              unsafe or a reserved char; in that case, leave it as
 264              is. */
 265           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 266           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 267             return CM_PASSTHROUGH;
 268           else
 269             return CM_DECODE;
 270         }
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return CM_ENCODE;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return CM_ENCODE;
 277   else
 278     return CM_PASSTHROUGH;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't call url_unescape() to get to the intended
 289    data.  This function is also stable: after an input string is
 290    transformed the first time, all further transformations of the
 291    result yield the same result string.
 292
 293    Let's discuss why this function is needed.
 294
 295    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 296    space character would mess up the HTTP request, it needs to be
 297    quoted, like this:
 298
 299        GET /abc%20def HTTP/1.0
 300
 301    It appears that the unsafe chars need to be quoted, for example
 302    with url_escape.  But what if we're requested to download
 303    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 304    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 305    part of URL syntax, "%20" is the correct way to denote a literal
 306    space on the Wget command line.  This leaves us in the conclusion
 307    that in that case Wget should not call url_escape, but leave the
 308    `%20' as is.
 309
 310    And what if the requested URI is `abc%20 def'?  If we call
 311    url_escape, we end up with `/abc%2520%20def', which is almost
 312    certainly not intended.  If we don't call url_escape, we are left
 313    with the embedded space and cannot complete the request.  What the
 314    user meant was for Wget to request `/abc%20%20def', and this is
 315    where reencode_escapes kicks in.
 316
 317    Wget used to solve this by first decoding %-quotes, and then
 318    encoding all the "unsafe" characters found in the resulting string.
 319    This was wrong because it didn't preserve certain URL special
 320    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 321    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 322    whether we considered `+' reserved (it is).  One of these results
 323    is inevitable because by the second step we would lose information
 324    on whether the `+' was originally encoded or not.  Both results
 325    were wrong because in CGI parameters + means space, while %2B means
 326    literal plus.  reencode_escapes correctly translates the above to
 327    "a%2B+b", i.e. returns the original string.
 328
 329    This function uses an algorithm proposed by Anon Sricharoenchai:
 330
 331    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 332       hexdigits.
 333
 334    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 335       "+".
 336
 337    ...except that this code conflates the two steps, and decides
 338    whether to encode, decode, or pass through each character in turn.
 339    The function still uses two passes, but their logic is the same --
 340    the first pass exists merely for the sake of allocation.  Another
 341    small difference is that we include `+' to URL_RESERVED.
 342
 343    Anon's test case:
 344
 345    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 346    ->
 347    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 348
 349    Simpler test cases:
 350
 351    "foo bar"         -> "foo%20bar"
 352    "foo%20bar"       -> "foo%20bar"
 353    "foo %20bar"      -> "foo%20%20bar"
 354    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 355    "foo%25%20bar"    -> "foo%25%20bar"
 356    "foo%2%20bar"     -> "foo%252%20bar"
 357    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 358    "foo%2b+bar"      -> "foo%2b+bar"  */
 359
 360 static char *
 361 reencode_escapes (const char *s)
 362 {
 363   const char *p1;
 364   char *newstr, *p2;
 365   int oldlen, newlen;
 366
 367   int encode_count = 0;
 368   int decode_count = 0;
 369
 370   /* First, pass through the string to see if there's anything to do,
 371      and to calculate the new length.  */
 372   for (p1 = s; *p1; p1++)
 373     {
 374       switch (decide_copy_method (p1))
 375         {
 376         case CM_ENCODE:
 377           ++encode_count;
 378           break;
 379         case CM_DECODE:
 380           ++decode_count;
 381           break;
 382         case CM_PASSTHROUGH:
 383           break;
 384         }
 385     }
 386
 387   if (!encode_count && !decode_count)
 388     /* The string is good as it is. */
 389     return (char *)s;           /* C const model sucks. */
 390
 391   oldlen = p1 - s;
 392   /* Each encoding adds two characters (hex digits), while each
 393      decoding removes two characters.  */
 394   newlen = oldlen + 2 * (encode_count - decode_count);
 395   newstr = xmalloc (newlen + 1);
 396
 397   p1 = s;
 398   p2 = newstr;
 399
 400   while (*p1)
 401     {
 402       switch (decide_copy_method (p1))
 403         {
 404         case CM_ENCODE:
 405           {
 406             unsigned char c = *p1++;
 407             *p2++ = '%';
 408             *p2++ = XNUM_TO_DIGIT (c >> 4);
 409             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 410           }
 411           break;
 412         case CM_DECODE:
 413           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 414           p1 += 3;              /* skip %xx */
 415           break;
 416         case CM_PASSTHROUGH:
 417           *p2++ = *p1++;
 418         }
 419     }
 420   *p2 = '\0';
 421   assert (p2 - newstr == newlen);
 422   return newstr;
 423 }
 424 \f
 425 /* Returns the scheme type if the scheme is supported, or
 426    SCHEME_INVALID if not.  */
 427
 428 enum url_scheme
 429 url_scheme (const char *url)
 430 {
 431   int i;
 432
 433   for (i = 0; supported_schemes[i].leading_string; i++)
 434     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 435                           strlen (supported_schemes[i].leading_string)))
 436       {
 437         if (supported_schemes[i].enabled)
 438           return (enum url_scheme) i;
 439         else
 440           return SCHEME_INVALID;
 441       }
 442
 443   return SCHEME_INVALID;
 444 }
 445
 446 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 447
 448 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 449    currently implemented, it returns true if URL begins with
 450    [-+a-zA-Z0-9]+: .  */
 451
 452 int
 453 url_has_scheme (const char *url)
 454 {
 455   const char *p = url;
 456
 457   /* The first char must be a scheme char. */
 458   if (!*p || !SCHEME_CHAR (*p))
 459     return 0;
 460   ++p;
 461   /* Followed by 0 or more scheme chars. */
 462   while (*p && SCHEME_CHAR (*p))
 463     ++p;
 464   /* Terminated by ':'. */
 465   return *p == ':';
 466 }
 467
 468 int
 469 scheme_default_port (enum url_scheme scheme)
 470 {
 471   return supported_schemes[scheme].default_port;
 472 }
 473
 474 void
 475 scheme_disable (enum url_scheme scheme)
 476 {
 477   supported_schemes[scheme].enabled = 0;
 478 }
 479
 480 /* Skip the username and password, if present in the URL.  The
 481    function should *not* be called with the complete URL, but with the
 482    portion after the scheme.
 483
 484    If no username and password are found, return URL.  */
 485
 486 static const char *
 487 url_skip_credentials (const char *url)
 488 {
 489   /* Look for '@' that comes before terminators, such as '/', '?',
 490      '#', or ';'.  */
 491   const char *p = (const char *)strpbrk (url, "@/?#;");
 492   if (!p || *p != '@')
 493     return url;
 494   return p + 1;
 495 }
 496
 497 /* Parse credentials contained in [BEG, END).  The region is expected
 498    to have come from a URL and is unescaped.  */
 499
 500 static int
 501 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 502 {
 503   char *colon;
 504   const char *userend;
 505
 506   if (beg == end)
 507     return 0;                   /* empty user name */
 508
 509   colon = memchr (beg, ':', end - beg);
 510   if (colon == beg)
 511     return 0;                   /* again empty user name */
 512
 513   if (colon)
 514     {
 515       *passwd = strdupdelim (colon + 1, end);
 516       userend = colon;
 517       url_unescape (*passwd);
 518     }
 519   else
 520     {
 521       *passwd = NULL;
 522       userend = end;
 523     }
 524   *user = strdupdelim (beg, userend);
 525   url_unescape (*user);
 526   return 1;
 527 }
 528
 529 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 530    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 531
 532    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 533    www.foo.com[:port]            -> http://www.foo.com[:port]
 534
 535    FTP shorthands look like this:
 536
 537    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 538    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 539
 540    If the URL needs not or cannot be rewritten, return NULL.  */
 541
 542 char *
 543 rewrite_shorthand_url (const char *url)
 544 {
 545   const char *p;
 546
 547   if (url_has_scheme (url))
 548     return NULL;
 549
 550   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 551      latter Netscape.  */
 552   for (p = url; *p && *p != ':' && *p != '/'; p++)
 553     ;
 554
 555   if (p == url)
 556     return NULL;
 557
 558   if (*p == ':')
 559     {
 560       const char *pp;
 561       char *res;
 562       /* If the characters after the colon and before the next slash
 563          or end of string are all digits, it's HTTP.  */
 564       int digits = 0;
 565       for (pp = p + 1; ISDIGIT (*pp); pp++)
 566         ++digits;
 567       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 568         goto http;
 569
 570       /* Prepend "ftp://" to the entire URL... */
 571       res = xmalloc (6 + strlen (url) + 1);
 572       sprintf (res, "ftp://%s", url);
 573       /* ...and replace ':' with '/'. */
 574       res[6 + (p - url)] = '/';
 575       return res;
 576     }
 577   else
 578     {
 579       char *res;
 580     http:
 581       /* Just prepend "http://" to what we have. */
 582       res = xmalloc (7 + strlen (url) + 1);
 583       sprintf (res, "http://%s", url);
 584       return res;
 585     }
 586 }
 587 \f
 588 static void split_path PARAMS ((const char *, char **, char **));
 589
 590 /* Like strpbrk, with the exception that it returns the pointer to the
 591    terminating zero (end-of-string aka "eos") if no matching character
 592    is found.
 593
 594    Although I normally balk at Gcc-specific optimizations, it probably
 595    makes sense here: glibc has optimizations that detect strpbrk being
 596    called with literal string as ACCEPT and inline the search.  That
 597    optimization is defeated if strpbrk is hidden within the call to
 598    another function.  (And no, making strpbrk_or_eos inline doesn't
 599    help because the check for literal accept is in the
 600    preprocessor.)  */
 601
 602 #ifdef __GNUC__
 603
 604 #define strpbrk_or_eos(s, accept) ({            \
 605   char *SOE_p = strpbrk (s, accept);            \
 606   if (!SOE_p)                                   \
 607     SOE_p = (char *)s + strlen (s);             \
 608   SOE_p;                                        \
 609 })
 610
 611 #else  /* not __GNUC__ */
 612
 613 static char *
 614 strpbrk_or_eos (const char *s, const char *accept)
 615 {
 616   char *p = strpbrk (s, accept);
 617   if (!p)
 618     p = (char *)s + strlen (s);
 619   return p;
 620 }
 621 #endif
 622
 623 /* Turn STR into lowercase; return non-zero if a character was
 624    actually changed. */
 625
 626 static int
 627 lowercase_str (char *str)
 628 {
 629   int change = 0;
 630   for (; *str; str++)
 631     if (ISUPPER (*str))
 632       {
 633         change = 1;
 634         *str = TOLOWER (*str);
 635       }
 636   return change;
 637 }
 638
/* Messages for the error codes url_parse reports through its ERROR
   argument.  Each PE_* code is defined directly above the message it
   selects, so code N must remain the index of its string: add new
   codes at the end and keep each #define paired with its message.
   The strings are wrapped in N_() here and passed through _() by
   url_error when they are looked up.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_EMPTY_HOST                   2
  N_("Empty host"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 657
 658 #ifdef ENABLE_IPV6
 659 /* The following two functions were adapted from glibc. */
 660
 661 static int
 662 is_valid_ipv4_address (const char *str, const char *end)
 663 {
 664   int saw_digit = 0;
 665   int octets = 0;
 666   int val = 0;
 667
 668   while (str < end)
 669     {
 670       int ch = *str++;
 671
 672       if (ch >= '0' && ch <= '9')
 673         {
 674           val = val * 10 + (ch - '0');
 675
 676           if (val > 255)
 677             return 0;
 678           if (saw_digit == 0)
 679             {
 680               if (++octets > 4)
 681                 return 0;
 682               saw_digit = 1;
 683             }
 684         }
 685       else if (ch == '.' && saw_digit == 1)
 686         {
 687           if (octets == 4)
 688             return 0;
 689           val = 0;
 690           saw_digit = 0;
 691         }
 692       else
 693         return 0;
 694     }
 695   if (octets < 4)
 696     return 0;
 697
 698   return 1;
 699 }
 700
 701 static int
 702 is_valid_ipv6_address (const char *str, const char *end)
 703 {
 704   enum {
 705     NS_INADDRSZ  = 4,
 706     NS_IN6ADDRSZ = 16,
 707     NS_INT16SZ   = 2
 708   };
 709
 710   const char *curtok;
 711   int tp;
 712   const char *colonp;
 713   int saw_xdigit;
 714   unsigned int val;
 715
 716   tp = 0;
 717   colonp = NULL;
 718
 719   if (str == end)
 720     return 0;
 721
 722   /* Leading :: requires some special handling. */
 723   if (*str == ':')
 724     {
 725       ++str;
 726       if (str == end || *str != ':')
 727         return 0;
 728     }
 729
 730   curtok = str;
 731   saw_xdigit = 0;
 732   val = 0;
 733
 734   while (str < end)
 735     {
 736       int ch = *str++;
 737
 738       /* if ch is a number, add it to val. */
 739       if (ISXDIGIT (ch))
 740         {
 741           val <<= 4;
 742           val |= XDIGIT_TO_NUM (ch);
 743           if (val > 0xffff)
 744             return 0;
 745           saw_xdigit = 1;
 746           continue;
 747         }
 748
 749       /* if ch is a colon ... */
 750       if (ch == ':')
 751         {
 752           curtok = str;
 753           if (saw_xdigit == 0)
 754             {
 755               if (colonp != NULL)
 756                 return 0;
 757               colonp = str + tp;
 758               continue;
 759             }
 760           else if (str == end)
 761             return 0;
 762           if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 763             return 0;
 764           tp += NS_INT16SZ;
 765           saw_xdigit = 0;
 766           val = 0;
 767           continue;
 768         }
 769
 770       /* if ch is a dot ... */
 771       if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
 772           && is_valid_ipv4_address (curtok, end) == 1)
 773         {
 774           tp += NS_INADDRSZ;
 775           saw_xdigit = 0;
 776           break;
 777         }
 778
 779       return 0;
 780     }
 781
 782   if (saw_xdigit == 1)
 783     {
 784       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 785         return 0;
 786       tp += NS_INT16SZ;
 787     }
 788
 789   if (colonp != NULL)
 790     {
 791       if (tp == NS_IN6ADDRSZ)
 792         return 0;
 793       tp = NS_IN6ADDRSZ;
 794     }
 795
 796   if (tp != NS_IN6ADDRSZ)
 797     return 0;
 798
 799   return 1;
 800 }
 801 #endif
 802
 803 /* Parse a URL.
 804
 805    Return a new struct url if successful, NULL on error.  In case of
 806    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 807    error code. */
 808 struct url *
 809 url_parse (const char *url, int *error)
 810 {
 811   struct url *u;
 812   const char *p;
 813   int path_modified, host_modified;
 814
 815   enum url_scheme scheme;
 816
 817   const char *uname_b,     *uname_e;
 818   const char *host_b,      *host_e;
 819   const char *path_b,      *path_e;
 820   const char *params_b,    *params_e;
 821   const char *query_b,     *query_e;
 822   const char *fragment_b,  *fragment_e;
 823
 824   int port;
 825   char *user = NULL, *passwd = NULL;
 826
 827   char *url_encoded = NULL;
 828
 829   int error_code;
 830
 831   scheme = url_scheme (url);
 832   if (scheme == SCHEME_INVALID)
 833     {
 834       error_code = PE_UNSUPPORTED_SCHEME;
 835       goto error;
 836     }
 837
 838   url_encoded = reencode_escapes (url);
 839   p = url_encoded;
 840
 841   p += strlen (supported_schemes[scheme].leading_string);
 842   uname_b = p;
 843   p = url_skip_credentials (p);
 844   uname_e = p;
 845
 846   /* scheme://user:pass@host[:port]... */
 847   /*                    ^              */
 848
 849   /* We attempt to break down the URL into the components path,
 850      params, query, and fragment.  They are ordered like this:
 851
 852        scheme://host[:port][/path][;params][?query][#fragment]  */
 853
 854   params_b   = params_e   = NULL;
 855   query_b    = query_e    = NULL;
 856   fragment_b = fragment_e = NULL;
 857
 858   host_b = p;
 859
 860   if (*p == '[')
 861     {
 862       /* Handle IPv6 address inside square brackets.  Ideally we'd
 863          just look for the terminating ']', but rfc2732 mandates
 864          rejecting invalid IPv6 addresses.  */
 865
 866       /* The address begins after '['. */
 867       host_b = p + 1;
 868       host_e = strchr (host_b, ']');
 869
 870       if (!host_e)
 871         {
 872           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 873           goto error;
 874         }
 875
 876 #ifdef ENABLE_IPV6
 877       /* Check if the IPv6 address is valid. */
 878       if (!is_valid_ipv6_address(host_b, host_e))
 879         {
 880           error_code = PE_INVALID_IPV6_ADDRESS;
 881           goto error;
 882         }
 883
 884       /* Continue parsing after the closing ']'. */
 885       p = host_e + 1;
 886 #else
 887       error_code = PE_IPV6_NOT_SUPPORTED;
 888       goto error;
 889 #endif
 890     }
 891   else
 892     {
 893       p = strpbrk_or_eos (p, ":/;?#");
 894       host_e = p;
 895     }
 896
 897   if (host_b == host_e)
 898     {
 899       error_code = PE_EMPTY_HOST;
 900       goto error;
 901     }
 902
 903   port = scheme_default_port (scheme);
 904   if (*p == ':')
 905     {
 906       const char *port_b, *port_e, *pp;
 907
 908       /* scheme://host:port/tralala */
 909       /*              ^             */
 910       ++p;
 911       port_b = p;
 912       p = strpbrk_or_eos (p, "/;?#");
 913       port_e = p;
 914
 915       /* Allow empty port, as per rfc2396. */
 916       if (port_b != port_e)
 917         {
 918           for (port = 0, pp = port_b; pp < port_e; pp++)
 919             {
 920               if (!ISDIGIT (*pp))
 921                 {
 922                   /* http://host:12randomgarbage/blah */
 923                   /*               ^                  */
 924                   error_code = PE_BAD_PORT_NUMBER;
 925                   goto error;
 926                 }
 927               port = 10 * port + (*pp - '0');
 928             }
 929         }
 930     }
 931
 932   if (*p == '/')
 933     {
 934       ++p;
 935       path_b = p;
 936       p = strpbrk_or_eos (p, ";?#");
 937       path_e = p;
 938     }
 939   else
 940     {
 941       /* Path is not allowed not to exist. */
 942       path_b = path_e = p;
 943     }
 944
 945   if (*p == ';')
 946     {
 947       ++p;
 948       params_b = p;
 949       p = strpbrk_or_eos (p, "?#");
 950       params_e = p;
 951     }
 952   if (*p == '?')
 953     {
 954       ++p;
 955       query_b = p;
 956       p = strpbrk_or_eos (p, "#");
 957       query_e = p;
 958
 959       /* Hack that allows users to use '?' (a wildcard character) in
 960          FTP URLs without it being interpreted as a query string
 961          delimiter.  */
 962       if (scheme == SCHEME_FTP)
 963         {
 964           query_b = query_e = NULL;
 965           path_e = p;
 966         }
 967     }
 968   if (*p == '#')
 969     {
 970       ++p;
 971       fragment_b = p;
 972       p += strlen (p);
 973       fragment_e = p;
 974     }
 975   assert (*p == 0);
 976
 977   if (uname_b != uname_e)
 978     {
 979       /* http://user:pass@host */
 980       /*        ^         ^    */
 981       /*     uname_b   uname_e */
 982       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 983         {
 984           error_code = PE_INVALID_USER_NAME;
 985           goto error;
 986         }
 987     }
 988
 989   u = xnew0 (struct url);
 990   u->scheme = scheme;
 991   u->host   = strdupdelim (host_b, host_e);
 992   u->port   = port;
 993   u->user   = user;
 994   u->passwd = passwd;
 995
 996   u->path = strdupdelim (path_b, path_e);
 997   path_modified = path_simplify (u->path);
 998   split_path (u->path, &u->dir, &u->file);
 999
1000   host_modified = lowercase_str (u->host);
1001
1002   /* Decode %HH sequences in host name.  This is important not so much
1003      to support %HH sequences, but to support binary characters (which
1004      will have been converted to %HH by reencode_escapes).  */
1005   if (strchr (u->host, '%'))
1006     {
1007       url_unescape (u->host);
1008       host_modified = 1;
1009     }
1010
1011   if (params_b)
1012     u->params = strdupdelim (params_b, params_e);
1013   if (query_b)
1014     u->query = strdupdelim (query_b, query_e);
1015   if (fragment_b)
1016     u->fragment = strdupdelim (fragment_b, fragment_e);
1017
1018   if (path_modified || u->fragment || host_modified || path_b == path_e)
1019     {
1020       /* If we suspect that a transformation has rendered what
1021          url_string might return different from URL_ENCODED, rebuild
1022          u->url using url_string.  */
1023       u->url = url_string (u, 0);
1024
1025       if (url_encoded != url)
1026         xfree ((char *) url_encoded);
1027     }
1028   else
1029     {
1030       if (url_encoded == url)
1031         u->url = xstrdup (url);
1032       else
1033         u->url = url_encoded;
1034     }
1035   url_encoded = NULL;
1036
1037   return u;
1038
1039  error:
1040   /* Cleanup in case of error: */
1041   if (url_encoded && url_encoded != url)
1042     xfree (url_encoded);
1043
1044   /* Transmit the error code to the caller, if the caller wants to
1045      know.  */
1046   if (error)
1047     *error = error_code;
1048   return NULL;
1049 }
1050
1051 /* Return the error message string from ERROR_CODE, which should have
1052    been retrieved from url_parse.  The error message is translated.  */
1053
1054 const char *
1055 url_error (int error_code)
1056 {
1057   assert (error_code >= 0 && error_code < countof (parse_errors));
1058   return _(parse_errors[error_code]);
1059 }
1060
/* Break the URL-escaped PATH at its last slash and store the two
   halves, URL-unescaped, in *DIR and *FILE.

   DIR receives everything before the last slash, FILE everything
   after it.  When PATH has no slash at all, DIR is the empty string
   and FILE is the whole of PATH.  Because unescaping happens after
   the split, an escaped slash ends up inside FILE:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   Both results are freshly allocated.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *sep = strrchr (path, '/');

  if (sep != NULL)
    {
      /* Everything before the separator is the directory part.  */
      *dir = strdupdelim (path, sep);
      *file = xstrdup (sep + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* The copies are private, so they can be unescaped in place.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1093
1094 /* Note: URL's "full path" is the path with the query string and
1095    params appended.  The "fragment" (#foo) is intentionally ignored,
1096    but that might be changed.  For example, if the original URL was
1097    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1098    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1099
1100 /* Return the length of the full path, without the terminating
1101    zero.  */
1102
1103 static int
1104 full_path_length (const struct url *url)
1105 {
1106   int len = 0;
1107
1108 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1109
1110   FROB (path);
1111   FROB (params);
1112   FROB (query);
1113
1114 #undef FROB
1115
1116   return len;
1117 }
1118
1119 /* Write out the full path. */
1120
1121 static void
1122 full_path_write (const struct url *url, char *where)
1123 {
1124 #define FROB(el, chr) do {                      \
1125   char *f_el = url->el;                         \
1126   if (f_el) {                                   \
1127     int l = strlen (f_el);                      \
1128     *where++ = chr;                             \
1129     memcpy (where, f_el, l);                    \
1130     where += l;                                 \
1131   }                                             \
1132 } while (0)
1133
1134   FROB (path, '/');
1135   FROB (params, ';');
1136   FROB (query, '?');
1137
1138 #undef FROB
1139 }
1140
/* Build and return the "full path" of URL as a newly allocated,
   zero-terminated string.  For instance, with u->path "foo/bar" and
   u->query "param=value" the result is "/foo/bar?param=value".  The
   caller owns the returned buffer.  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  /* full_path_write does not terminate the string; do it here.  */
  full_path_write (url, buf);
  buf[len] = '\0';

  return buf;
}
1156
1157 /* Escape unsafe and reserved characters, except for the slash
1158    characters.  */
1159
1160 static char *
1161 url_escape_dir (const char *dir)
1162 {
1163   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1164   char *h, *t;
1165   if (newdir == dir)
1166     return (char *)dir;
1167
1168   /* Unescape slashes in NEWDIR. */
1169
1170   h = newdir;                   /* hare */
1171   t = newdir;                   /* tortoise */
1172
1173   for (; *h; h++, t++)
1174     {
1175       /* url_escape_1 having converted '/' to "%2F" exactly. */
1176       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1177         {
1178           *t = '/';
1179           h += 2;
1180         }
1181       else
1182         *t = *h;
1183     }
1184   *t = '\0';
1185
1186   return newdir;
1187 }
1188
1189 /* Sync u->path and u->url with u->dir and u->file.  Called after
1190    u->file or u->dir have been changed, typically by the FTP code.  */
1191
1192 static void
1193 sync_path (struct url *u)
1194 {
1195   char *newpath, *efile, *edir;
1196
1197   xfree (u->path);
1198
1199   /* u->dir and u->file are not escaped.  URL-escape them before
1200      reassembling them into u->path.  That way, if they contain
1201      separators like '?' or even if u->file contains slashes, the
1202      path will be correctly assembled.  (u->file can contain slashes
1203      if the URL specifies it with %2f, or if an FTP server returns
1204      it.)  */
1205   edir = url_escape_dir (u->dir);
1206   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1207
1208   if (!*edir)
1209     newpath = xstrdup (efile);
1210   else
1211     {
1212       int dirlen = strlen (edir);
1213       int filelen = strlen (efile);
1214
1215       /* Copy "DIR/FILE" to newpath. */
1216       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1217       memcpy (p, edir, dirlen);
1218       p += dirlen;
1219       *p++ = '/';
1220       memcpy (p, efile, filelen);
1221       p += filelen;
1222       *p++ = '\0';
1223     }
1224
1225   u->path = newpath;
1226
1227   if (edir != u->dir)
1228     xfree (edir);
1229   if (efile != u->file)
1230     xfree (efile);
1231
1232   /* Regenerate u->url as well.  */
1233   xfree (u->url);
1234   u->url = url_string (u, 0);
1235 }
1236
1237 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1238    This way we can sync u->path and u->url when they get changed.  */
1239
1240 void
1241 url_set_dir (struct url *url, const char *newdir)
1242 {
1243   xfree (url->dir);
1244   url->dir = xstrdup (newdir);
1245   sync_path (url);
1246 }
1247
1248 void
1249 url_set_file (struct url *url, const char *newfile)
1250 {
1251   xfree (url->file);
1252   url->file = xstrdup (newfile);
1253   sync_path (url);
1254 }
1255
1256 void
1257 url_free (struct url *url)
1258 {
1259   xfree (url->host);
1260   xfree (url->path);
1261   xfree (url->url);
1262
1263   xfree_null (url->params);
1264   xfree_null (url->query);
1265   xfree_null (url->fragment);
1266   xfree_null (url->user);
1267   xfree_null (url->passwd);
1268
1269   xfree (url->dir);
1270   xfree (url->file);
1271
1272   xfree (url);
1273 }
1274 \f
1275 /* Create all the necessary directories for PATH (a file).  Calls
1276    mkdirhier() internally.  */
1277 int
1278 mkalldirs (const char *path)
1279 {
1280   const char *p;
1281   char *t;
1282   struct_stat st;
1283   int res;
1284
1285   p = path + strlen (path);
1286   for (; *p != '/' && p != path; p--)
1287     ;
1288
1289   /* Don't create if it's just a file.  */
1290   if ((p == path) && (*p != '/'))
1291     return 0;
1292   t = strdupdelim (path, p);
1293
1294   /* Check whether the directory exists.  */
1295   if ((stat (t, &st) == 0))
1296     {
1297       if (S_ISDIR (st.st_mode))
1298         {
1299           xfree (t);
1300           return 0;
1301         }
1302       else
1303         {
1304           /* If the dir exists as a file name, remove it first.  This
1305              is *only* for Wget to work with buggy old CERN http
1306              servers.  Here is the scenario: When Wget tries to
1307              retrieve a directory without a slash, e.g.
1308              http://foo/bar (bar being a directory), CERN server will
1309              not redirect it too http://foo/bar/ -- it will generate a
1310              directory listing containing links to bar/file1,
1311              bar/file2, etc.  Wget will lose because it saves this
1312              HTML listing to a file `bar', so it cannot create the
1313              directory.  To work around this, if the file of the same
1314              name exists, we just remove it and create the directory
1315              anyway.  */
1316           DEBUGP (("Removing %s because of directory danger!\n", t));
1317           unlink (t);
1318         }
1319     }
1320   res = make_directory (t);
1321   if (res != 0)
1322     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1323   xfree (t);
1324   return res;
1325 }
1326 \f
1327 /* Functions for constructing the file name out of URL components.  */
1328
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* size tracked for DO_REALLOC */
  int tail;                     /* offset at which the next append goes */
};

/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing.
   The arguments are parenthesized so that arbitrary expressions can
   be passed safely.  */
#define GROW(g, append_count) do {                                      \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_count), char);     \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1357
1358 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1359    terminated.  */
1360
1361 static void
1362 append_string (const char *str, struct growable *dest)
1363 {
1364   int l = strlen (str);
1365   GROW (dest, l);
1366   memcpy (TAIL (dest), str, l);
1367   TAIL_INCR (dest, l);
1368 }
1369
1370 /* Append CH to DEST.  For example, append_char (0, DEST)
1371    zero-terminates DEST.  */
1372
1373 static void
1374 append_char (char ch, struct growable *dest)
1375 {
1376   GROW (dest, 1);
1377   *TAIL (dest) = ch;
1378   TAIL_INCR (dest, 1);
1379 }
1380
/* Bit flags describing why a character is unsuitable in a file name.
   They are combined in filechr_table and selected with the MASK
   argument of FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the flags in MASK.  C is
   converted to unsigned char so that high-bit characters index the
   table correctly even where plain char is signed.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above).

   '|' is flagged as unusable on Windows, in agreement with the list
   given for filechr_not_windows, and DEL is flagged as a control
   character like 0-31 and 128-159.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1438
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' unless file names are restricted for Windows, which
   can't handle ':' in file names; then it is '+'.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless file names are restricted for Windows, which cannot
   handle '?' as part of a file name; then it is '@'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1449
1450 /* Quote path element, characters in [b, e), as file name, and append
1451    the quoted string to DEST.  Each character is quoted as per
1452    file_unsafe_char and the corresponding table.
1453
1454    If ESCAPED_P is non-zero, the path element is considered to be
1455    URL-escaped and will be unescaped prior to inspection.  */
1456
1457 static void
1458 append_uri_pathel (const char *b, const char *e, int escaped_p,
1459                    struct growable *dest)
1460 {
1461   const char *p;
1462   int quoted, outlen;
1463
1464   int mask;
1465   if (opt.restrict_files_os == restrict_unix)
1466     mask = filechr_not_unix;
1467   else
1468     mask = filechr_not_windows;
1469   if (opt.restrict_files_ctrl)
1470     mask |= filechr_control;
1471
1472   /* Copy [b, e) to PATHEL and URL-unescape it. */
1473   if (escaped_p)
1474     {
1475       char *unescaped;
1476       BOUNDED_TO_ALLOCA (b, e, unescaped);
1477       url_unescape (unescaped);
1478       b = unescaped;
1479       e = unescaped + strlen (unescaped);
1480     }
1481
1482   /* Defang ".." when found as component of path.  Remember that path
1483      comes from the URL and might contain malicious input.  */
1484   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1485     {
1486       b = "%2E%2E";
1487       e = b + 6;
1488     }
1489
1490   /* Walk the PATHEL string and check how many characters we'll need
1491      to quote.  */
1492   quoted = 0;
1493   for (p = b; p < e; p++)
1494     if (FILE_CHAR_TEST (*p, mask))
1495       ++quoted;
1496
1497   /* Calculate the length of the output string.  e-b is the input
1498      string length.  Each quoted char introduces two additional
1499      characters in the string, hence 2*quoted.  */
1500   outlen = (e - b) + (2 * quoted);
1501   GROW (dest, outlen);
1502
1503   if (!quoted)
1504     {
1505       /* If there's nothing to quote, we can simply append the string
1506          without processing it again.  */
1507       memcpy (TAIL (dest), b, outlen);
1508     }
1509   else
1510     {
1511       char *q = TAIL (dest);
1512       for (p = b; p < e; p++)
1513         {
1514           if (!FILE_CHAR_TEST (*p, mask))
1515             *q++ = *p;
1516           else
1517             {
1518               unsigned char ch = *p;
1519               *q++ = '%';
1520               *q++ = XNUM_TO_DIGIT (ch >> 4);
1521               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1522             }
1523         }
1524       assert (q - TAIL (dest) == outlen);
1525     }
1526   TAIL_INCR (dest, outlen);
1527 }
1528
1529 /* Append to DEST the directory structure that corresponds the
1530    directory part of URL's path.  For example, if the URL is
1531    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1532
1533    Each path element ("dir1" and "dir2" in the above example) is
1534    examined, url-unescaped, and re-escaped as file name element.
1535
1536    Additionally, it cuts as many directories from the path as
1537    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1538    will produce "bar" for the above example.  For 2 or more, it will
1539    produce "".
1540
1541    Each component of the path is quoted for use as file name.  */
1542
1543 static void
1544 append_dir_structure (const struct url *u, struct growable *dest)
1545 {
1546   char *pathel, *next;
1547   int cut = opt.cut_dirs;
1548
1549   /* Go through the path components, de-URL-quote them, and quote them
1550      (if necessary) as file names.  */
1551
1552   pathel = u->path;
1553   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1554     {
1555       if (cut-- > 0)
1556         continue;
1557       if (pathel == next)
1558         /* Ignore empty pathels.  */
1559         continue;
1560
1561       if (dest->tail)
1562         append_char ('/', dest);
1563       append_uri_pathel (pathel, next, 1, dest);
1564     }
1565 }
1566
1567 /* Return a unique file name that matches the given URL as good as
1568    possible.  Does not create directories on the file system.  */
1569
1570 char *
1571 url_file_name (const struct url *u)
1572 {
1573   struct growable fnres;
1574
1575   const char *u_file, *u_query;
1576   char *fname, *unique;
1577
1578   fnres.base = NULL;
1579   fnres.size = 0;
1580   fnres.tail = 0;
1581
1582   /* Start with the directory prefix, if specified. */
1583   if (opt.dir_prefix)
1584     append_string (opt.dir_prefix, &fnres);
1585
1586   /* If "dirstruct" is turned on (typically the case with -r), add
1587      the host and port (unless those have been turned off) and
1588      directory structure.  */
1589   if (opt.dirstruct)
1590     {
1591       if (opt.protocol_directories)
1592         {
1593           if (fnres.tail)
1594             append_char ('/', &fnres);
1595           append_string (supported_schemes[u->scheme].name, &fnres);
1596         }
1597       if (opt.add_hostdir)
1598         {
1599           if (fnres.tail)
1600             append_char ('/', &fnres);
1601           append_string (u->host, &fnres);
1602           if (u->port != scheme_default_port (u->scheme))
1603             {
1604               char portstr[24];
1605               number_to_string (portstr, u->port);
1606               append_char (FN_PORT_SEP, &fnres);
1607               append_string (portstr, &fnres);
1608             }
1609         }
1610
1611       append_dir_structure (u, &fnres);
1612     }
1613
1614   /* Add the file name. */
1615   if (fnres.tail)
1616     append_char ('/', &fnres);
1617   u_file = *u->file ? u->file : "index.html";
1618   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1619
1620   /* Append "?query" to the file name. */
1621   u_query = u->query && *u->query ? u->query : NULL;
1622   if (u_query)
1623     {
1624       append_char (FN_QUERY_SEP, &fnres);
1625       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1626     }
1627
1628   /* Zero-terminate the file name. */
1629   append_char ('\0', &fnres);
1630
1631   fname = fnres.base;
1632
1633   /* Check the cases in which the unique extensions are not used:
1634      1) Clobbering is turned off (-nc).
1635      2) Retrieval with regetting.
1636      3) Timestamping is used.
1637      4) Hierarchy is built.
1638
1639      The exception is the case when file does exist and is a
1640      directory (see `mkalldirs' for explanation).  */
1641
1642   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1643       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1644     return fname;
1645
1646   unique = unique_name (fname, 1);
1647   if (unique != fname)
1648     xfree (fname);
1649   return unique;
1650 }
1651 \f
/* Resolve "." and ".." elements of PATH in place.  Returns non-zero
   if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   but done in a single pass with a read pointer and a write pointer.
   Elements consisting only of "." are dropped, and ".." backs up over
   the previously written element.  A ".." that has nothing to back up
   over is kept literally and can not be removed by a later "..".

   URL escapes are not interpreted here: "%2e" and "%2E" must already
   have been unquoted to "." for the dots to be recognized.

   For example, "a/b/c/./../d/.." yields "a/b/", "foo/../.." yields
   "..", and "foo///bar" is left untouched.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position, start of an element */
  char *dst = path;             /* write position */
  char *floor = path;           /* DST is never backed up below this */
  char *end = path + strlen (path);
  int modified;

  while (src < end)
    {
      int is_dot = (src[0] == '.' && (src[1] == '/' || src[1] == '\0'));
      int is_dotdot = (src[0] == '.' && src[1] == '.'
                       && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (is_dotdot && dst > floor)
        {
          /* Undo the last written element: step over its trailing
             slash, then back to just after the previous slash or to
             the floor.  */
          --dst;
          while (dst > floor && dst[-1] != '/')
            --dst;
          src += 3;
          continue;
        }

      if (is_dotdot)
        {
          /* Nothing to back up over.  Keep this "../" literally and
             raise the floor past it so that later ".." elements
             can't remove it.  */
          floor = dst + 3;
        }

      /* Emit the element up to and including its slash.  While
         nothing has been removed yet the two positions coincide and
         it is enough to advance them without copying.  */
      if (dst == src)
        {
          while (src < end && *src != '/')
            src++;
          if (src < end)
            src++;
          dst = src;
        }
      else
        {
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  modified = (dst != src);
  if (modified)
    *dst = '\0';

  return modified;
}
1739 \f
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.

   This is exactly the length of the initial segment of URL that
   contains none of those characters, which is what the standard
   strcspn computes.  */

static int
path_length (const char *url)
{
  return (int) strcspn (url, "?;#");
}
1750
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  B itself is examined, the character at
   E is not.  We might want to use memrchr (a GNU extension) under
   GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so that the scan covers e-1 down to b
     inclusive and never looks at *e.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1763
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN characters of BASE followed by
   the LINKLENGTH characters of LINK.  */

static char *
merge_span_and_link (const char *base, int span,
                     const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   path_simplify is deliberately not called on the result: it would
   complicate the code, and url_parse has to simplify the path
   anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;
  int linklength;

  /* A LINK with a scheme stands on its own.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* END marks the end of BASE's path; the branches below that deal
     with BASE's path do not examine BASE past it.  */
  end = base + path_length (base);
  linklength = strlen (link);

  /* Empty LINK points back to BASE, query string and all.  */
  if (link[0] == '\0')
    return xstrdup (base);

  if (link[0] == '?')
    {
      /* Same location, new query string:
         uri_merge("path",         "?new") -> "path?new"
         uri_merge("path?foo",     "?new") -> "path?new"
         uri_merge("path?foo#bar", "?new") -> "path?new"
         uri_merge("path#foo",     "?new") -> "path?new"  */
      return merge_span_and_link (base, end - base, link, linklength);
    }

  if (link[0] == '#')
    {
      /* Same location, new fragment:
         uri_merge("path",         "#new") -> "path#new"
         uri_merge("path#foo",     "#new") -> "path#new"
         uri_merge("path?foo",     "#new") -> "path?foo#new"
         uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *frag = strchr (base, '#');
      if (frag == NULL)
        frag = base + strlen (base);
      return merge_span_and_link (base, frag - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: everything from BASE's double slash on
         is replaced with LINK.  Without a double slash at BASE's
         first slash, the whole of BASE is replaced.
         uri_merge("foo", "//new/bar")            -> "//new/bar"
         uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
         uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = base;

      if (slash && slash[1] == '/')
        start_insert = slash;

      return merge_span_and_link (base, start_insert - base,
                                  link, linklength);
    }

  if (link[0] == '/')
    {
      /* LINK is an absolute path: everything from the first slash
         after "//" on is replaced with LINK.  So, if BASE is
         "http://host/whatever/foo/bar" and LINK is "/qux/xyzzy", the
         result is "http://host/qux/xyzzy".  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert;
      int seen_slash_slash = 0;

      /* If the first slash opens a "//", look for the next slash
         after it instead.  */
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      if (!seen_slash_slash)
        /* "foo" or "foo/bar": replace all of BASE.  */
        start_insert = base;
      else if (slash)
        /* "http://something/": insert at the slash after the host.  */
        start_insert = slash;
      else
        /* "http://foo": append at the end of BASE's path.  */
        start_insert = end;

      return merge_span_and_link (base, start_insert - base,
                                  link, linklength);
    }

  /* LINK is a relative URL: everything after BASE's last slash
     (possibly nothing) is replaced with LINK.  So, if BASE is
     "whatever/foo/bar" and LINK is "qux/xyzzy", the result is
     "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    const char *start_insert;
    int need_explicit_slash = 0;
    int span;
    char *merge;

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK.  */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The last slash belongs to "://", as in "http://host": BASE
           has no path.  Keep BASE's path part plus one more
           character, which is turned into a slash below.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* "whatever/foo/bar": insert right after the last slash.  */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = merge_span_and_link (base, span, link, linklength);
    if (need_explicit_slash)
      merge[span - 1] = '/';
    return merge;
  }
}
1954 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating NUL is written;
   the caller terminates the buffer after the last piece has been
   appended.  P must be a modifiable char pointer with enough room
   behind it.

   S is evaluated exactly once, and the temporaries carry names that
   are unlikely to clash with the caller's variables, so passing an
   argument that happens to be called `len' or that has side effects
   is safe.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Placeholder to emit in place of the real password when the actual
   password is supposed to stay hidden.  A fixed, generic string is
   used on purpose: unlike the output of previous versions, it does
   not give away the number of characters in the password.  */
#define HIDDEN_PASSWORD "*password*"
1966
1967 /* Recreate the URL string from the data in URL.
1968
1969    If HIDE is non-zero (as it is when we're calling this on a URL we
1970    plan to print, but not when calling it to canonicalize a URL for
1971    use within the program), password will be hidden.  Unsafe
1972    characters in the URL will be quoted.  */
1973
1974 char *
1975 url_string (const struct url *url, int hide_password)
1976 {
1977   int size;
1978   char *result, *p;
1979   char *quoted_user = NULL, *quoted_passwd = NULL;
1980
1981   int scheme_port  = supported_schemes[url->scheme].default_port;
1982   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1983   int fplen = full_path_length (url);
1984
1985   int brackets_around_host;
1986
1987   assert (scheme_str != NULL);
1988
1989   /* Make sure the user name and password are quoted. */
1990   if (url->user)
1991     {
1992       quoted_user = url_escape_allow_passthrough (url->user);
1993       if (url->passwd)
1994         {
1995           if (hide_password)
1996             quoted_passwd = HIDDEN_PASSWORD;
1997           else
1998             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1999         }
2000     }
2001
2002   /* Numeric IPv6 addresses can contain ':' and need to be quoted with
2003      brackets.  */
2004   brackets_around_host = strchr (url->host, ':') != NULL;
2005
2006   size = (strlen (scheme_str)
2007           + strlen (url->host)
2008           + (brackets_around_host ? 2 : 0)
2009           + fplen
2010           + 1);
2011   if (url->port != scheme_port)
2012     size += 1 + numdigit (url->port);
2013   if (quoted_user)
2014     {
2015       size += 1 + strlen (quoted_user);
2016       if (quoted_passwd)
2017         size += 1 + strlen (quoted_passwd);
2018     }
2019
2020   p = result = xmalloc (size);
2021
2022   APPEND (p, scheme_str);
2023   if (quoted_user)
2024     {
2025       APPEND (p, quoted_user);
2026       if (quoted_passwd)
2027         {
2028           *p++ = ':';
2029           APPEND (p, quoted_passwd);
2030         }
2031       *p++ = '@';
2032     }
2033
2034   if (brackets_around_host)
2035     *p++ = '[';
2036   APPEND (p, url->host);
2037   if (brackets_around_host)
2038     *p++ = ']';
2039   if (url->port != scheme_port)
2040     {
2041       *p++ = ':';
2042       p = number_to_string (p, url->port);
2043     }
2044
2045   full_path_write (url, p);
2046   p += fplen;
2047   *p++ = '\0';
2048
2049   assert (p - result == size);
2050
2051   if (quoted_user && quoted_user != url->user)
2052     xfree (quoted_user);
2053   if (quoted_passwd && !hide_password
2054       && quoted_passwd != url->passwd)
2055     xfree (quoted_passwd);
2056
2057   return result;
2058 }
2059 \f
2060 /* Return non-zero if scheme a is similar to scheme b.
2061
2062    Schemes are similar if they are equal.  If SSL is supported, schemes
2063    are also similar if one is http (SCHEME_HTTP) and the other is https
2064    (SCHEME_HTTPS).  */
2065 int
2066 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2067 {
2068   if (a == b)
2069     return 1;
2070 #ifdef HAVE_SSL
2071   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2072       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2073     return 1;
2074 #endif
2075   return 0;
2076 }
2077 \f
2078 #if 0
2079 /* Debugging and testing support for path_simplify. */
2080
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and return the duplicate.  PATH
   itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
2090
/* Run path_simplify on a private copy of TEST and report mismatches
   on standard output.  A message is printed when the simplified
   string differs from EXPECTED_RESULT, and another one when the value
   returned by path_simplify differs from EXPECTED_CHANGE.  Nothing is
   printed when both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (work);
}
2113
/* Feed a fixed table of paths to run_test.  Each entry lists the
   input path, the string expected after path_simplify, and the value
   path_simplify is expected to return (1 when the path should have
   been modified, 0 when it should not).  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int modifies;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int i;

  for (i = 0; i < countof (cases); i++)
    run_test (cases[i].input, cases[i].expected, cases[i].modifies);
}
2154 #endif