sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
  55 struct scheme_data
  56 {
  57   char *leading_string;
  58   int default_port;
  59   int enabled;
  60 };
  61
  62 /* Supported schemes: */
  63 static struct scheme_data supported_schemes[] =
  64 {
  65   { "http://",  DEFAULT_HTTP_PORT,  1 },
  66 #ifdef HAVE_SSL
  67   { "https://", DEFAULT_HTTPS_PORT, 1 },
  68 #endif
  69   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  70
  71   /* SCHEME_INVALID */
  72   { NULL,       -1,                 0 }
  73 };
  74
  75 /* Forward declarations: */
  76
  77 static int path_simplify PARAMS ((char *));
  78 \f
  79 /* Support for encoding and decoding of URL strings.  We determine
  80    whether a character is unsafe through static table lookup.  This
  81    code assumes ASCII character set and 8-bit chars.  */
  82
  83 enum {
  84   /* rfc1738 reserved chars, preserved from encoding.  */
  85   urlchr_reserved = 1,
  86
  87   /* rfc1738 unsafe chars, plus some more.  */
  88   urlchr_unsafe   = 2
  89 };
  90
  91 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  92 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  94
  95 /* Shorthands for the table: */
  96 #define R  urlchr_reserved
  97 #define U  urlchr_unsafe
  98 #define RU R|U
  99
 100 const static unsigned char urlchr_table[256] =
 101 {
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 104   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 105   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 106   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 107   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 108   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 109   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 110  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 111   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 112   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 113   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 114   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 115   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 116   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 117   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 118
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 127   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 128 };
 129 #undef R
 130 #undef U
 131 #undef RU
 132
 133 /* URL-unescape the string S.
 134
 135    This is done by transforming the sequences "%HH" to the character
 136    represented by the hexadecimal digits HH.  If % is not followed by
 137    two hexadecimal digits, it is inserted literally.
 138
 139    The transformation is done in place.  If you need the original
 140    string intact, make a copy before calling this function.  */
 141
 142 static void
 143 url_unescape (char *s)
 144 {
 145   char *t = s;                  /* t - tortoise */
 146   char *h = s;                  /* h - hare     */
 147
 148   for (; *h; h++, t++)
 149     {
 150       if (*h != '%')
 151         {
 152         copychar:
 153           *t = *h;
 154         }
 155       else
 156         {
 157           /* Do nothing if '%' is not followed by two hex digits. */
 158           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 159             goto copychar;
 160           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 161           h += 2;
 162         }
 163     }
 164   *t = '\0';
 165 }
 166
 167 /* The core of url_escape_* functions.  Escapes the characters that
 168    match the provided mask in urlchr_table.
 169
 170    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 171    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 172    freshly allocated string will be returned in all cases.  */
 173
 174 static char *
 175 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 176 {
 177   const char *p1;
 178   char *p2, *newstr;
 179   int newlen;
 180   int addition = 0;
 181
 182   for (p1 = s; *p1; p1++)
 183     if (urlchr_test (*p1, mask))
 184       addition += 2;            /* Two more characters (hex digits) */
 185
 186   if (!addition)
 187     return allow_passthrough ? (char *)s : xstrdup (s);
 188
 189   newlen = (p1 - s) + addition;
 190   newstr = (char *)xmalloc (newlen + 1);
 191
 192   p1 = s;
 193   p2 = newstr;
 194   while (*p1)
 195     {
 196       /* Quote the characters that match the test mask. */
 197       if (urlchr_test (*p1, mask))
 198         {
 199           unsigned char c = *p1++;
 200           *p2++ = '%';
 201           *p2++ = XNUM_TO_DIGIT (c >> 4);
 202           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 203         }
 204       else
 205         *p2++ = *p1++;
 206     }
 207   assert (p2 - newstr == newlen);
 208   *p2 = '\0';
 209
 210   return newstr;
 211 }
 212
 213 /* URL-escape the unsafe characters (see urlchr_table) in a given
 214    string, returning a freshly allocated string.  */
 215
 216 char *
 217 url_escape (const char *s)
 218 {
 219   return url_escape_1 (s, urlchr_unsafe, 0);
 220 }
 221
 222 /* URL-escape the unsafe characters (see urlchr_table) in a given
 223    string.  If no characters are unsafe, S is returned.  */
 224
 225 static char *
 226 url_escape_allow_passthrough (const char *s)
 227 {
 228   return url_escape_1 (s, urlchr_unsafe, 1);
 229 }
 230 \f
 231 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 232
 233 /* Decide whether to encode, decode, or pass through the char at P.
 234    This used to be a macro, but it got a little too convoluted.  */
 235 static inline enum copy_method
 236 decide_copy_method (const char *p)
 237 {
 238   if (*p == '%')
 239     {
 240       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 241         {
 242           /* %xx sequence: decode it, unless it would decode to an
 243              unsafe or a reserved char; in that case, leave it as
 244              is. */
 245           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 246           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 247             return CM_PASSTHROUGH;
 248           else
 249             return CM_DECODE;
 250         }
 251       else
 252         /* Garbled %.. sequence: encode `%'. */
 253         return CM_ENCODE;
 254     }
 255   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 256     return CM_ENCODE;
 257   else
 258     return CM_PASSTHROUGH;
 259 }
 260
 261 /* Translate a %-escaped (but possibly non-conformant) input string S
 262    into a %-escaped (and conformant) output string.  If no characters
 263    are encoded or decoded, return the same string S; otherwise, return
 264    a freshly allocated string with the new contents.
 265
 266    After a URL has been run through this function, the protocols that
 267    use `%' as the quote character can use the resulting string as-is,
 268    while those that don't call url_unescape() to get to the intended
 269    data.  This function is also stable: after an input string is
 270    transformed the first time, all further transformations of the
 271    result yield the same result string.
 272
 273    Let's discuss why this function is needed.
 274
 275    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 276    space character would mess up the HTTP request, it needs to be
 277    quoted, like this:
 278
 279        GET /abc%20def HTTP/1.0
 280
 281    It appears that the unsafe chars need to be quoted, for example
 282    with url_escape.  But what if we're requested to download
 283    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 284    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 285    part of URL syntax, "%20" is the correct way to denote a literal
   space on the Wget command line.  This leads us to the conclusion
   that in that case Wget should not call url_escape, but leave the
   `%20' as is.
 289
 290    And what if the requested URI is `abc%20 def'?  If we call
 291    url_escape, we end up with `/abc%2520%20def', which is almost
 292    certainly not intended.  If we don't call url_escape, we are left
 293    with the embedded space and cannot complete the request.  What the
 294    user meant was for Wget to request `/abc%20%20def', and this is
 295    where reencode_escapes kicks in.
 296
 297    Wget used to solve this by first decoding %-quotes, and then
 298    encoding all the "unsafe" characters found in the resulting string.
 299    This was wrong because it didn't preserve certain URL special
 300    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 301    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 302    whether we considered `+' reserved (it is).  One of these results
 303    is inevitable because by the second step we would lose information
 304    on whether the `+' was originally encoded or not.  Both results
 305    were wrong because in CGI parameters + means space, while %2B means
 306    literal plus.  reencode_escapes correctly translates the above to
 307    "a%2B+b", i.e. returns the original string.
 308
 309    This function uses an algorithm proposed by Anon Sricharoenchai:
 310
 311    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 312       hexdigits.
 313
 314    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 315       "+".
 316
 317    ...except that this code conflates the two steps, and decides
 318    whether to encode, decode, or pass through each character in turn.
 319    The function still uses two passes, but their logic is the same --
 320    the first pass exists merely for the sake of allocation.  Another
 321    small difference is that we include `+' to URL_RESERVED.
 322
 323    Anon's test case:
 324
 325    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 326    ->
 327    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 328
 329    Simpler test cases:
 330
 331    "foo bar"         -> "foo%20bar"
 332    "foo%20bar"       -> "foo%20bar"
 333    "foo %20bar"      -> "foo%20%20bar"
 334    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 335    "foo%25%20bar"    -> "foo%25%20bar"
 336    "foo%2%20bar"     -> "foo%252%20bar"
 337    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 338    "foo%2b+bar"      -> "foo%2b+bar"  */
 339
 340 static char *
 341 reencode_escapes (const char *s)
 342 {
 343   const char *p1;
 344   char *newstr, *p2;
 345   int oldlen, newlen;
 346
 347   int encode_count = 0;
 348   int decode_count = 0;
 349
 350   /* First, pass through the string to see if there's anything to do,
 351      and to calculate the new length.  */
 352   for (p1 = s; *p1; p1++)
 353     {
 354       switch (decide_copy_method (p1))
 355         {
 356         case CM_ENCODE:
 357           ++encode_count;
 358           break;
 359         case CM_DECODE:
 360           ++decode_count;
 361           break;
 362         case CM_PASSTHROUGH:
 363           break;
 364         }
 365     }
 366
 367   if (!encode_count && !decode_count)
 368     /* The string is good as it is. */
 369     return (char *)s;           /* C const model sucks. */
 370
 371   oldlen = p1 - s;
 372   /* Each encoding adds two characters (hex digits), while each
 373      decoding removes two characters.  */
 374   newlen = oldlen + 2 * (encode_count - decode_count);
 375   newstr = xmalloc (newlen + 1);
 376
 377   p1 = s;
 378   p2 = newstr;
 379
 380   while (*p1)
 381     {
 382       switch (decide_copy_method (p1))
 383         {
 384         case CM_ENCODE:
 385           {
 386             unsigned char c = *p1++;
 387             *p2++ = '%';
 388             *p2++ = XNUM_TO_DIGIT (c >> 4);
 389             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 390           }
 391           break;
 392         case CM_DECODE:
 393           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 394           p1 += 3;              /* skip %xx */
 395           break;
 396         case CM_PASSTHROUGH:
 397           *p2++ = *p1++;
 398         }
 399     }
 400   *p2 = '\0';
 401   assert (p2 - newstr == newlen);
 402   return newstr;
 403 }
 404 \f
 405 /* Returns the scheme type if the scheme is supported, or
 406    SCHEME_INVALID if not.  */
 407
 408 enum url_scheme
 409 url_scheme (const char *url)
 410 {
 411   int i;
 412
 413   for (i = 0; supported_schemes[i].leading_string; i++)
 414     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 415                           strlen (supported_schemes[i].leading_string)))
 416       {
 417         if (supported_schemes[i].enabled)
 418           return (enum url_scheme) i;
 419         else
 420           return SCHEME_INVALID;
 421       }
 422
 423   return SCHEME_INVALID;
 424 }
 425
 426 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 427
 428 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 429    currently implemented, it returns true if URL begins with
 430    [-+a-zA-Z0-9]+: .  */
 431
 432 int
 433 url_has_scheme (const char *url)
 434 {
 435   const char *p = url;
 436
 437   /* The first char must be a scheme char. */
 438   if (!*p || !SCHEME_CHAR (*p))
 439     return 0;
 440   ++p;
 441   /* Followed by 0 or more scheme chars. */
 442   while (*p && SCHEME_CHAR (*p))
 443     ++p;
 444   /* Terminated by ':'. */
 445   return *p == ':';
 446 }
 447
 448 int
 449 scheme_default_port (enum url_scheme scheme)
 450 {
 451   return supported_schemes[scheme].default_port;
 452 }
 453
 454 void
 455 scheme_disable (enum url_scheme scheme)
 456 {
 457   supported_schemes[scheme].enabled = 0;
 458 }
 459
 460 /* Skip the username and password, if present here.  The function
 461    should *not* be called with the complete URL, but with the part
 462    right after the scheme.
 463
 464    If no username and password are found, return 0.  */
 465
 466 static int
 467 url_skip_credentials (const char *url)
 468 {
 469   /* Look for '@' that comes before terminators, such as '/', '?',
 470      '#', or ';'.  */
 471   const char *p = (const char *)strpbrk (url, "@/?#;");
 472   if (!p || *p != '@')
 473     return 0;
 474   return p + 1 - url;
 475 }
 476
 477 /* Parse credentials contained in [BEG, END).  The region is expected
 478    to have come from a URL and is unescaped.  */
 479
 480 static int
 481 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 482 {
 483   char *colon;
 484   const char *userend;
 485
 486   if (beg == end)
 487     return 0;                   /* empty user name */
 488
 489   colon = memchr (beg, ':', end - beg);
 490   if (colon == beg)
 491     return 0;                   /* again empty user name */
 492
 493   if (colon)
 494     {
 495       *passwd = strdupdelim (colon + 1, end);
 496       userend = colon;
 497       url_unescape (*passwd);
 498     }
 499   else
 500     {
 501       *passwd = NULL;
 502       userend = end;
 503     }
 504   *user = strdupdelim (beg, userend);
 505   url_unescape (*user);
 506   return 1;
 507 }
 508
 509 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 510    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 511
 512    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 513    www.foo.com[:port]            -> http://www.foo.com[:port]
 514
 515    FTP shorthands look like this:
 516
 517    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 518    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 519
 520    If the URL needs not or cannot be rewritten, return NULL.  */
 521
 522 char *
 523 rewrite_shorthand_url (const char *url)
 524 {
 525   const char *p;
 526
 527   if (url_has_scheme (url))
 528     return NULL;
 529
 530   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 531      latter Netscape.  */
 532   for (p = url; *p && *p != ':' && *p != '/'; p++)
 533     ;
 534
 535   if (p == url)
 536     return NULL;
 537
 538   if (*p == ':')
 539     {
 540       const char *pp;
 541       char *res;
 542       /* If the characters after the colon and before the next slash
 543          or end of string are all digits, it's HTTP.  */
 544       int digits = 0;
 545       for (pp = p + 1; ISDIGIT (*pp); pp++)
 546         ++digits;
 547       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 548         goto http;
 549
 550       /* Prepend "ftp://" to the entire URL... */
 551       res = xmalloc (6 + strlen (url) + 1);
 552       sprintf (res, "ftp://%s", url);
 553       /* ...and replace ':' with '/'. */
 554       res[6 + (p - url)] = '/';
 555       return res;
 556     }
 557   else
 558     {
 559       char *res;
 560     http:
 561       /* Just prepend "http://" to what we have. */
 562       res = xmalloc (7 + strlen (url) + 1);
 563       sprintf (res, "http://%s", url);
 564       return res;
 565     }
 566 }
 567 \f
 568 static void split_path PARAMS ((const char *, char **, char **));
 569
 570 /* Like strpbrk, with the exception that it returns the pointer to the
 571    terminating zero (end-of-string aka "eos") if no matching character
 572    is found.
 573
 574    Although I normally balk at Gcc-specific optimizations, it probably
 575    makes sense here: glibc has optimizations that detect strpbrk being
 576    called with literal string as ACCEPT and inline the search.  That
 577    optimization is defeated if strpbrk is hidden within the call to
 578    another function.  (And no, making strpbrk_or_eos inline doesn't
 579    help because the check for literal accept is in the
 580    preprocessor.)  */
 581
 582 #ifdef __GNUC__
 583
 584 #define strpbrk_or_eos(s, accept) ({            \
 585   char *SOE_p = strpbrk (s, accept);            \
 586   if (!SOE_p)                                   \
 587     SOE_p = (char *)s + strlen (s);             \
 588   SOE_p;                                        \
 589 })
 590
 591 #else  /* not __GNUC__ */
 592
 593 static char *
 594 strpbrk_or_eos (const char *s, const char *accept)
 595 {
 596   char *p = strpbrk (s, accept);
 597   if (!p)
 598     p = (char *)s + strlen (s);
 599   return p;
 600 }
 601 #endif
 602
 603 /* Turn STR into lowercase; return non-zero if a character was
 604    actually changed. */
 605
 606 static int
 607 lowercase_str (char *str)
 608 {
 609   int change = 0;
 610   for (; *str; str++)
 611     if (ISUPPER (*str))
 612       {
 613         change = 1;
 614         *str = TOLOWER (*str);
 615       }
 616   return change;
 617 }
 618
 619 static char *parse_errors[] = {
 620 #define PE_NO_ERROR                     0
 621   N_("No error"),
 622 #define PE_UNSUPPORTED_SCHEME           1
 623   N_("Unsupported scheme"),
 624 #define PE_EMPTY_HOST                   2
 625   N_("Empty host"),
 626 #define PE_BAD_PORT_NUMBER              3
 627   N_("Bad port number"),
 628 #define PE_INVALID_USER_NAME            4
 629   N_("Invalid user name"),
 630 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 631   N_("Unterminated IPv6 numeric address"),
 632 #define PE_IPV6_NOT_SUPPORTED           6
 633   N_("IPv6 addresses not supported"),
 634 #define PE_INVALID_IPV6_ADDRESS         7
 635   N_("Invalid IPv6 numeric address")
 636 };
 637
 638 #ifdef ENABLE_IPV6
 639 /* The following two functions were adapted from glibc. */
 640
 641 static int
 642 is_valid_ipv4_address (const char *str, const char *end)
 643 {
 644   int saw_digit = 0;
 645   int octets = 0;
 646   int val = 0;
 647
 648   while (str < end)
 649     {
 650       int ch = *str++;
 651
 652       if (ch >= '0' && ch <= '9')
 653         {
 654           val = val * 10 + (ch - '0');
 655
 656           if (val > 255)
 657             return 0;
 658           if (saw_digit == 0)
 659             {
 660               if (++octets > 4)
 661                 return 0;
 662               saw_digit = 1;
 663             }
 664         }
 665       else if (ch == '.' && saw_digit == 1)
 666         {
 667           if (octets == 4)
 668             return 0;
 669           val = 0;
 670           saw_digit = 0;
 671         }
 672       else
 673         return 0;
 674     }
 675   if (octets < 4)
 676     return 0;
 677
 678   return 1;
 679 }
 680
 681 static int
 682 is_valid_ipv6_address (const char *str, const char *end)
 683 {
 684   enum {
 685     NS_INADDRSZ  = 4,
 686     NS_IN6ADDRSZ = 16,
 687     NS_INT16SZ   = 2
 688   };
 689
 690   const char *curtok;
 691   int tp;
 692   const char *colonp;
 693   int saw_xdigit;
 694   unsigned int val;
 695
 696   tp = 0;
 697   colonp = NULL;
 698
 699   if (str == end)
 700     return 0;
 701
 702   /* Leading :: requires some special handling. */
 703   if (*str == ':')
 704     {
 705       ++str;
 706       if (str == end || *str != ':')
 707         return 0;
 708     }
 709
 710   curtok = str;
 711   saw_xdigit = 0;
 712   val = 0;
 713
 714   while (str < end)
 715     {
 716       int ch = *str++;
 717
 718       /* if ch is a number, add it to val. */
 719       if (ISXDIGIT (ch))
 720         {
 721           val <<= 4;
 722           val |= XDIGIT_TO_NUM (ch);
 723           if (val > 0xffff)
 724             return 0;
 725           saw_xdigit = 1;
 726           continue;
 727         }
 728
 729       /* if ch is a colon ... */
 730       if (ch == ':')
 731         {
 732           curtok = str;
 733           if (saw_xdigit == 0)
 734             {
 735               if (colonp != NULL)
 736                 return 0;
 737               colonp = str + tp;
 738               continue;
 739             }
 740           else if (str == end)
 741             return 0;
 742           if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 743             return 0;
 744           tp += NS_INT16SZ;
 745           saw_xdigit = 0;
 746           val = 0;
 747           continue;
 748         }
 749
 750       /* if ch is a dot ... */
 751       if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
 752           && is_valid_ipv4_address (curtok, end) == 1)
 753         {
 754           tp += NS_INADDRSZ;
 755           saw_xdigit = 0;
 756           break;
 757         }
 758
 759       return 0;
 760     }
 761
 762   if (saw_xdigit == 1)
 763     {
 764       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 765         return 0;
 766       tp += NS_INT16SZ;
 767     }
 768
 769   if (colonp != NULL)
 770     {
 771       if (tp == NS_IN6ADDRSZ)
 772         return 0;
 773       tp = NS_IN6ADDRSZ;
 774     }
 775
 776   if (tp != NS_IN6ADDRSZ)
 777     return 0;
 778
 779   return 1;
 780 }
 781 #endif
 782
 783 /* Parse a URL.
 784
 785    Return a new struct url if successful, NULL on error.  In case of
 786    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 787    error code. */
 788 struct url *
 789 url_parse (const char *url, int *error)
 790 {
 791   struct url *u;
 792   const char *p;
 793   int path_modified, host_modified;
 794
 795   enum url_scheme scheme;
 796
 797   const char *uname_b,     *uname_e;
 798   const char *host_b,      *host_e;
 799   const char *path_b,      *path_e;
 800   const char *params_b,    *params_e;
 801   const char *query_b,     *query_e;
 802   const char *fragment_b,  *fragment_e;
 803
 804   int port;
 805   char *user = NULL, *passwd = NULL;
 806
 807   char *url_encoded = NULL;
 808
 809   int error_code;
 810
 811   scheme = url_scheme (url);
 812   if (scheme == SCHEME_INVALID)
 813     {
 814       error_code = PE_UNSUPPORTED_SCHEME;
 815       goto error;
 816     }
 817
 818   url_encoded = reencode_escapes (url);
 819   p = url_encoded;
 820
 821   p += strlen (supported_schemes[scheme].leading_string);
 822   uname_b = p;
 823   p += url_skip_credentials (p);
 824   uname_e = p;
 825
 826   /* scheme://user:pass@host[:port]... */
 827   /*                    ^              */
 828
 829   /* We attempt to break down the URL into the components path,
 830      params, query, and fragment.  They are ordered like this:
 831
 832        scheme://host[:port][/path][;params][?query][#fragment]  */
 833
 834   params_b   = params_e   = NULL;
 835   query_b    = query_e    = NULL;
 836   fragment_b = fragment_e = NULL;
 837
 838   host_b = p;
 839
 840   if (*p == '[')
 841     {
 842       /* Handle IPv6 address inside square brackets.  Ideally we'd
 843          just look for the terminating ']', but rfc2732 mandates
 844          rejecting invalid IPv6 addresses.  */
 845
 846       /* The address begins after '['. */
 847       host_b = p + 1;
 848       host_e = strchr (host_b, ']');
 849
 850       if (!host_e)
 851         {
 852           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 853           goto error;
 854         }
 855
 856 #ifdef ENABLE_IPV6
 857       /* Check if the IPv6 address is valid. */
 858       if (!is_valid_ipv6_address(host_b, host_e))
 859         {
 860           error_code = PE_INVALID_IPV6_ADDRESS;
 861           goto error;
 862         }
 863
 864       /* Continue parsing after the closing ']'. */
 865       p = host_e + 1;
 866 #else
 867       error_code = PE_IPV6_NOT_SUPPORTED;
 868       goto error;
 869 #endif
 870     }
 871   else
 872     {
 873       p = strpbrk_or_eos (p, ":/;?#");
 874       host_e = p;
 875     }
 876
 877   if (host_b == host_e)
 878     {
 879       error_code = PE_EMPTY_HOST;
 880       goto error;
 881     }
 882
 883   port = scheme_default_port (scheme);
 884   if (*p == ':')
 885     {
 886       const char *port_b, *port_e, *pp;
 887
 888       /* scheme://host:port/tralala */
 889       /*              ^             */
 890       ++p;
 891       port_b = p;
 892       p = strpbrk_or_eos (p, "/;?#");
 893       port_e = p;
 894
 895       if (port_b == port_e)
 896         {
 897           /* http://host:/whatever */
 898           /*             ^         */
 899           error_code = PE_BAD_PORT_NUMBER;
 900           goto error;
 901         }
 902
 903       for (port = 0, pp = port_b; pp < port_e; pp++)
 904         {
 905           if (!ISDIGIT (*pp))
 906             {
 907               /* http://host:12randomgarbage/blah */
 908               /*               ^                  */
 909               error_code = PE_BAD_PORT_NUMBER;
 910               goto error;
 911             }
 912
 913           port = 10 * port + (*pp - '0');
 914         }
 915     }
 916
 917   if (*p == '/')
 918     {
 919       ++p;
 920       path_b = p;
 921       p = strpbrk_or_eos (p, ";?#");
 922       path_e = p;
 923     }
 924   else
 925     {
 926       /* Path is not allowed not to exist. */
 927       path_b = path_e = p;
 928     }
 929
 930   if (*p == ';')
 931     {
 932       ++p;
 933       params_b = p;
 934       p = strpbrk_or_eos (p, "?#");
 935       params_e = p;
 936     }
 937   if (*p == '?')
 938     {
 939       ++p;
 940       query_b = p;
 941       p = strpbrk_or_eos (p, "#");
 942       query_e = p;
 943
 944       /* Hack that allows users to use '?' (a wildcard character) in
 945          FTP URLs without it being interpreted as a query string
 946          delimiter.  */
 947       if (scheme == SCHEME_FTP)
 948         {
 949           query_b = query_e = NULL;
 950           path_e = p;
 951         }
 952     }
 953   if (*p == '#')
 954     {
 955       ++p;
 956       fragment_b = p;
 957       p += strlen (p);
 958       fragment_e = p;
 959     }
 960   assert (*p == 0);
 961
 962   if (uname_b != uname_e)
 963     {
 964       /* http://user:pass@host */
 965       /*        ^         ^    */
 966       /*     uname_b   uname_e */
 967       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 968         {
 969           error_code = PE_INVALID_USER_NAME;
 970           goto error;
 971         }
 972     }
 973
 974   u = (struct url *)xmalloc (sizeof (struct url));
 975   memset (u, 0, sizeof (*u));
 976
 977   u->scheme = scheme;
 978   u->host   = strdupdelim (host_b, host_e);
 979   u->port   = port;
 980   u->user   = user;
 981   u->passwd = passwd;
 982
 983   u->path = strdupdelim (path_b, path_e);
 984   path_modified = path_simplify (u->path);
 985   split_path (u->path, &u->dir, &u->file);
 986
 987   host_modified = lowercase_str (u->host);
 988
 989   if (params_b)
 990     u->params = strdupdelim (params_b, params_e);
 991   if (query_b)
 992     u->query = strdupdelim (query_b, query_e);
 993   if (fragment_b)
 994     u->fragment = strdupdelim (fragment_b, fragment_e);
 995
 996   if (path_modified || u->fragment || host_modified || path_b == path_e)
 997     {
 998       /* If we suspect that a transformation has rendered what
 999          url_string might return different from URL_ENCODED, rebuild
1000          u->url using url_string.  */
1001       u->url = url_string (u, 0);
1002
1003       if (url_encoded != url)
1004         xfree ((char *) url_encoded);
1005     }
1006   else
1007     {
1008       if (url_encoded == url)
1009         u->url = xstrdup (url);
1010       else
1011         u->url = url_encoded;
1012     }
1013   url_encoded = NULL;
1014
1015   return u;
1016
1017  error:
1018   /* Cleanup in case of error: */
1019   if (url_encoded && url_encoded != url)
1020     xfree (url_encoded);
1021
1022   /* Transmit the error code to the caller, if the caller wants to
1023      know.  */
1024   if (error)
1025     *error = error_code;
1026   return NULL;
1027 }
1028
1029 /* Return the error message string from ERROR_CODE, which should have
1030    been retrieved from url_parse.  The error message is translated.  */
1031
1032 const char *
1033 url_error (int error_code)
1034 {
1035   assert (error_code >= 0 && error_code < countof (parse_errors));
1036   return _(parse_errors[error_code]);
1037 }
1038
/* Break PATH, a URL-escaped path taken from a URL, into its directory
   and file parts.

   Everything before the last slash becomes DIR and everything after
   it becomes FILE; without any slash, DIR is empty and FILE is the
   whole PATH.  Both parts are URL-unescaped after the split, so an
   escaped slash can end up inside FILE.  Examples:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   *DIR and *FILE receive freshly allocated strings.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *slash = strrchr (path, '/');

  if (slash != NULL)
    {
      *dir = strdupdelim (path, slash);
      *file = xstrdup (slash + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescape only after splitting, so that "%2f" never acts as a
     separator.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1071
1072 /* Note: URL's "full path" is the path with the query string and
1073    params appended.  The "fragment" (#foo) is intentionally ignored,
1074    but that might be changed.  For example, if the original URL was
1075    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1076    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1077
1078 /* Return the length of the full path, without the terminating
1079    zero.  */
1080
1081 static int
1082 full_path_length (const struct url *url)
1083 {
1084   int len = 0;
1085
1086 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1087
1088   FROB (path);
1089   FROB (params);
1090   FROB (query);
1091
1092 #undef FROB
1093
1094   return len;
1095 }
1096
1097 /* Write out the full path. */
1098
1099 static void
1100 full_path_write (const struct url *url, char *where)
1101 {
1102 #define FROB(el, chr) do {                      \
1103   char *f_el = url->el;                         \
1104   if (f_el) {                                   \
1105     int l = strlen (f_el);                      \
1106     *where++ = chr;                             \
1107     memcpy (where, f_el, l);                    \
1108     where += l;                                 \
1109   }                                             \
1110 } while (0)
1111
1112   FROB (path, '/');
1113   FROB (params, ';');
1114   FROB (query, '?');
1115
1116 #undef FROB
1117 }
1118
/* Public entry point for obtaining the "full path" of URL as a
   zero-terminated string allocated with xmalloc.  For instance, with
   u->path "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *buf = (char *) xmalloc (len + 1);

  full_path_write (url, buf);
  buf[len] = '\0';              /* full_path_write does not terminate */

  return buf;
}
1134
1135 /* Escape unsafe and reserved characters, except for the slash
1136    characters.  */
1137
1138 static char *
1139 url_escape_dir (const char *dir)
1140 {
1141   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1142   char *h, *t;
1143   if (newdir == dir)
1144     return (char *)dir;
1145
1146   /* Unescape slashes in NEWDIR. */
1147
1148   h = newdir;                   /* hare */
1149   t = newdir;                   /* tortoise */
1150
1151   for (; *h; h++, t++)
1152     {
1153       /* url_escape_1 having converted '/' to "%2F" exactly. */
1154       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1155         {
1156           *t = '/';
1157           h += 2;
1158         }
1159       else
1160         *t = *h;
1161     }
1162   *t = '\0';
1163
1164   return newdir;
1165 }
1166
1167 /* Sync u->path and u->url with u->dir and u->file.  Called after
1168    u->file or u->dir have been changed, typically by the FTP code.  */
1169
1170 static void
1171 sync_path (struct url *u)
1172 {
1173   char *newpath, *efile, *edir;
1174
1175   xfree (u->path);
1176
1177   /* u->dir and u->file are not escaped.  URL-escape them before
1178      reassembling them into u->path.  That way, if they contain
1179      separators like '?' or even if u->file contains slashes, the
1180      path will be correctly assembled.  (u->file can contain slashes
1181      if the URL specifies it with %2f, or if an FTP server returns
1182      it.)  */
1183   edir = url_escape_dir (u->dir);
1184   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1185
1186   if (!*edir)
1187     newpath = xstrdup (efile);
1188   else
1189     {
1190       int dirlen = strlen (edir);
1191       int filelen = strlen (efile);
1192
1193       /* Copy "DIR/FILE" to newpath. */
1194       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1195       memcpy (p, edir, dirlen);
1196       p += dirlen;
1197       *p++ = '/';
1198       memcpy (p, efile, filelen);
1199       p += filelen;
1200       *p++ = '\0';
1201     }
1202
1203   u->path = newpath;
1204
1205   if (edir != u->dir)
1206     xfree (edir);
1207   if (efile != u->file)
1208     xfree (efile);
1209
1210   /* Regenerate u->url as well.  */
1211   xfree (u->url);
1212   u->url = url_string (u, 0);
1213 }
1214
1215 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1216    This way we can sync u->path and u->url when they get changed.  */
1217
1218 void
1219 url_set_dir (struct url *url, const char *newdir)
1220 {
1221   xfree (url->dir);
1222   url->dir = xstrdup (newdir);
1223   sync_path (url);
1224 }
1225
1226 void
1227 url_set_file (struct url *url, const char *newfile)
1228 {
1229   xfree (url->file);
1230   url->file = xstrdup (newfile);
1231   sync_path (url);
1232 }
1233
1234 void
1235 url_free (struct url *url)
1236 {
1237   xfree (url->host);
1238   xfree (url->path);
1239   xfree (url->url);
1240
1241   FREE_MAYBE (url->params);
1242   FREE_MAYBE (url->query);
1243   FREE_MAYBE (url->fragment);
1244   FREE_MAYBE (url->user);
1245   FREE_MAYBE (url->passwd);
1246
1247   xfree (url->dir);
1248   xfree (url->file);
1249
1250   xfree (url);
1251 }
1252 \f
1253 /* Create all the necessary directories for PATH (a file).  Calls
1254    mkdirhier() internally.  */
1255 int
1256 mkalldirs (const char *path)
1257 {
1258   const char *p;
1259   char *t;
1260   struct stat st;
1261   int res;
1262
1263   p = path + strlen (path);
1264   for (; *p != '/' && p != path; p--)
1265     ;
1266
1267   /* Don't create if it's just a file.  */
1268   if ((p == path) && (*p != '/'))
1269     return 0;
1270   t = strdupdelim (path, p);
1271
1272   /* Check whether the directory exists.  */
1273   if ((stat (t, &st) == 0))
1274     {
1275       if (S_ISDIR (st.st_mode))
1276         {
1277           xfree (t);
1278           return 0;
1279         }
1280       else
1281         {
1282           /* If the dir exists as a file name, remove it first.  This
1283              is *only* for Wget to work with buggy old CERN http
1284              servers.  Here is the scenario: When Wget tries to
1285              retrieve a directory without a slash, e.g.
1286              http://foo/bar (bar being a directory), CERN server will
1287              not redirect it too http://foo/bar/ -- it will generate a
1288              directory listing containing links to bar/file1,
1289              bar/file2, etc.  Wget will lose because it saves this
1290              HTML listing to a file `bar', so it cannot create the
1291              directory.  To work around this, if the file of the same
1292              name exists, we just remove it and create the directory
1293              anyway.  */
1294           DEBUGP (("Removing %s because of directory danger!\n", t));
1295           unlink (t);
1296         }
1297     }
1298   res = make_directory (t);
1299   if (res != 0)
1300     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1301   xfree (t);
1302   return res;
1303 }
1304 \f
1305 /* Functions for constructing the file name out of URL components.  */
1306
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   The contents are not zero-terminated unless a '\0' is appended
   explicitly.  */

struct growable {
  char *base;                   /* the buffer; url_file_name starts with NULL */
  int size;                     /* size variable that GROW hands to DO_REALLOC */
  int tail;                     /* offset from BASE where the next append lands */
};
1320
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   The actual (re)allocation is delegated to DO_REALLOC.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = g;                                              \
  DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);        \
} while (0)

/* Return the tail position of the string, i.e. the address at which
   the next appended character will be stored. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters.  This only
   adjusts the offset; it neither grows nor writes to the string. */
#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1335
1336 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1337    terminated.  */
1338
1339 static void
1340 append_string (const char *str, struct growable *dest)
1341 {
1342   int l = strlen (str);
1343   GROW (dest, l);
1344   memcpy (TAIL (dest), str, l);
1345   TAIL_INCR (dest, l);
1346 }
1347
1348 /* Append CH to DEST.  For example, append_char (0, DEST)
1349    zero-terminates DEST.  */
1350
1351 static void
1352 append_char (char ch, struct growable *dest)
1353 {
1354   GROW (dest, 1);
1355   *TAIL (dest) = ch;
1356   TAIL_INCR (dest, 1);
1357 }
1358
/* Classes of characters that cannot be used verbatim in file names.
   The values are bit flags and are combined in filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C belongs to any of the classes in MASK.  C
   is converted to unsigned char, so plain (possibly negative) chars
   are safe to pass.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   '|' is flagged W to agree with the list of Windows-unusable
   characters given with filechr_not_windows, and DEL (127) is flagged
   C like the other control characters (0-31 and 128-159).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1416
/* FN_PORT_SEP separates the host from the port in file names created
   for non-standard port numbers.  It is ':' (as in
   "www.xemacs.org:4001/index.html") unless file names are restricted
   for Windows, which cannot handle ':' in file names; then it is
   '+'.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is '?'
   unless file names are restricted for Windows, which cannot handle
   '?' in file names; then it is '@'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1427
1428 /* Quote path element, characters in [b, e), as file name, and append
1429    the quoted string to DEST.  Each character is quoted as per
1430    file_unsafe_char and the corresponding table.
1431
1432    If ESCAPED_P is non-zero, the path element is considered to be
1433    URL-escaped and will be unescaped prior to inspection.  */
1434
1435 static void
1436 append_uri_pathel (const char *b, const char *e, int escaped_p,
1437                    struct growable *dest)
1438 {
1439   const char *p;
1440   int quoted, outlen;
1441
1442   int mask;
1443   if (opt.restrict_files_os == restrict_unix)
1444     mask = filechr_not_unix;
1445   else
1446     mask = filechr_not_windows;
1447   if (opt.restrict_files_ctrl)
1448     mask |= filechr_control;
1449
1450   /* Copy [b, e) to PATHEL and URL-unescape it. */
1451   if (escaped_p)
1452     {
1453       char *unescaped;
1454       BOUNDED_TO_ALLOCA (b, e, unescaped);
1455       url_unescape (unescaped);
1456       b = unescaped;
1457       e = unescaped + strlen (unescaped);
1458     }
1459
1460   /* Walk the PATHEL string and check how many characters we'll need
1461      to add for file quoting.  */
1462   quoted = 0;
1463   for (p = b; p < e; p++)
1464     if (FILE_CHAR_TEST (*p, mask))
1465       ++quoted;
1466
1467   /* e-b is the string length.  Each quoted char means two additional
1468      characters in the string, hence 2*quoted.  */
1469   outlen = (e - b) + (2 * quoted);
1470   GROW (dest, outlen);
1471
1472   if (!quoted)
1473     {
1474       /* If there's nothing to quote, we don't need to go through the
1475          string the second time.  */
1476       memcpy (TAIL (dest), b, outlen);
1477     }
1478   else
1479     {
1480       char *q = TAIL (dest);
1481       for (p = b; p < e; p++)
1482         {
1483           if (!FILE_CHAR_TEST (*p, mask))
1484             *q++ = *p;
1485           else
1486             {
1487               unsigned char ch = *p;
1488               *q++ = '%';
1489               *q++ = XNUM_TO_DIGIT (ch >> 4);
1490               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1491             }
1492         }
1493       assert (q - TAIL (dest) == outlen);
1494     }
1495   TAIL_INCR (dest, outlen);
1496 }
1497
1498 /* Append to DEST the directory structure that corresponds the
1499    directory part of URL's path.  For example, if the URL is
1500    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1501
1502    Each path element ("dir1" and "dir2" in the above example) is
1503    examined, url-unescaped, and re-escaped as file name element.
1504
1505    Additionally, it cuts as many directories from the path as
1506    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1507    will produce "bar" for the above example.  For 2 or more, it will
1508    produce "".
1509
1510    Each component of the path is quoted for use as file name.  */
1511
1512 static void
1513 append_dir_structure (const struct url *u, struct growable *dest)
1514 {
1515   char *pathel, *next;
1516   int cut = opt.cut_dirs;
1517
1518   /* Go through the path components, de-URL-quote them, and quote them
1519      (if necessary) as file names.  */
1520
1521   pathel = u->path;
1522   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1523     {
1524       if (cut-- > 0)
1525         continue;
1526       if (pathel == next)
1527         /* Ignore empty pathels.  */
1528         continue;
1529
1530       if (dest->tail)
1531         append_char ('/', dest);
1532       append_uri_pathel (pathel, next, 1, dest);
1533     }
1534 }
1535
1536 /* Return a unique file name that matches the given URL as good as
1537    possible.  Does not create directories on the file system.  */
1538
1539 char *
1540 url_file_name (const struct url *u)
1541 {
1542   struct growable fnres;
1543
1544   char *u_file, *u_query;
1545   char *fname, *unique;
1546
1547   fnres.base = NULL;
1548   fnres.size = 0;
1549   fnres.tail = 0;
1550
1551   /* Start with the directory prefix, if specified. */
1552   if (opt.dir_prefix)
1553     append_string (opt.dir_prefix, &fnres);
1554
1555   /* If "dirstruct" is turned on (typically the case with -r), add
1556      the host and port (unless those have been turned off) and
1557      directory structure.  */
1558   if (opt.dirstruct)
1559     {
1560       if (opt.add_hostdir)
1561         {
1562           if (fnres.tail)
1563             append_char ('/', &fnres);
1564           append_string (u->host, &fnres);
1565           if (u->port != scheme_default_port (u->scheme))
1566             {
1567               char portstr[24];
1568               number_to_string (portstr, u->port);
1569               append_char (FN_PORT_SEP, &fnres);
1570               append_string (portstr, &fnres);
1571             }
1572         }
1573
1574       append_dir_structure (u, &fnres);
1575     }
1576
1577   /* Add the file name. */
1578   if (fnres.tail)
1579     append_char ('/', &fnres);
1580   u_file = *u->file ? u->file : "index.html";
1581   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1582
1583   /* Append "?query" to the file name. */
1584   u_query = u->query && *u->query ? u->query : NULL;
1585   if (u_query)
1586     {
1587       append_char (FN_QUERY_SEP, &fnres);
1588       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1589     }
1590
1591   /* Zero-terminate the file name. */
1592   append_char ('\0', &fnres);
1593
1594   fname = fnres.base;
1595
1596   /* Check the cases in which the unique extensions are not used:
1597      1) Clobbering is turned off (-nc).
1598      2) Retrieval with regetting.
1599      3) Timestamping is used.
1600      4) Hierarchy is built.
1601
1602      The exception is the case when file does exist and is a
1603      directory (see `mkalldirs' for explanation).  */
1604
1605   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1606       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1607     return fname;
1608
1609   unique = unique_name (fname, 1);
1610   if (unique != fname)
1611     xfree (fname);
1612   return unique;
1613 }
1614
1615 /* Return the length of URL's path.  Path is considered to be
1616    terminated by one of '?', ';', '#', or by the end of the
1617    string.  */
1618 static int
1619 path_length (const char *url)
1620 {
1621   const char *q = strpbrk_or_eos (url, "?;#");
1622   return q - url;
1623 }
1624
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?

   The character at E itself is outside the range and is never
   examined; the character at B is inside the range and is.  An empty
   range yields NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so that exactly the positions
     e-1, e-2, ..., b are looked at.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1637 \f
/* Resolve the "." and ".." elements of PATH, modifying PATH in place.
   Returns non-zero if PATH was changed and zero if it was left as it
   was.

   In spirit this follows the algorithm described in rfc1808, but it
   is implemented differently, in a single pass: elements that consist
   only of "." are dropped, ".." backs up over the element before it
   (never past the beginning of PATH), and empty elements are dropped
   as well.  A single leading slash is preserved.

   URL escapes are not interpreted here.  When passing paths that come
   from URLs, make sure "%2e" and "%2E" have already been unquoted to
   ".", otherwise the dots will not be found.  (Wget's URL parser
   calls reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure no
   test case has been broken.  */

static int
path_simplify (char *path)
{
  char *src, *dst, *limit;

  /* A leading '/' stays where it is; work on what follows.  */
  if (*path == '/')
    path++;

  src = dst = path;             /* SRC reads ahead, DST trails writing */
  limit = path + strlen (path);

  while (src < limit)
    {
      /* SRC is at the start of a path element.  Classify it: 1 for
         ".", 2 for "..", 0 for anything else.  */
      int dots = 0;
      if (src[0] == '.')
        {
          if (src[1] == '/' || src[1] == '\0')
            dots = 1;
          else if (src[1] == '.' && (src[2] == '/' || src[2] == '\0'))
            dots = 2;
        }

      if (dots == 1)
        {
          /* "./" is simply dropped.  */
          src += 2;
        }
      else if (dots == 2)
        {
          /* "../" takes back the element written last, if there is
             one: retreat DST to where that element began.  */
          if (dst > path)
            {
              do
                --dst;
              while (dst > path && dst[-1] != '/');
            }
          src += 3;
        }
      else if (*src == '/')
        {
          /* An empty element.  These are dropped: supporting them
             well is hard (where do you save
             "http://x.com///y.html"?), they bring no practical gain,
             and they break filesystem-influenced assumptions, since
             keeping them would make "x/y//../z" simplify to "x/y/z"
             where most people would expect "x/z".  */
          ++src;
        }
      else
        {
          /* A regular element, taken over together with the slash
             that ends it.  As long as nothing has been dropped so far
             the text is already in place and is only stepped over.  */
          int in_place = (dst == src);

          while (src < limit && *src != '/')
            {
              if (!in_place)
                *dst = *src;
              dst++, src++;
            }
          if (src < limit)
            {
              if (!in_place)
                *dst = *src;
              dst++, src++;
            }
        }
    }

  /* DST catching up with SRC means nothing was dropped.  */
  if (dst == src)
    return 0;

  *dst = '\0';
  return 1;
}
1731 \f
/* Merge BASE with LINK and return the resulting URI.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The cases are told apart by how LINK begins: a scheme, nothing at
   all, '?', '#', "//", '/', or anything else (a relative path).  The
   result is always a freshly allocated string; neither BASE nor LINK
   is modified.  "." and ".." elements are not resolved here.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  /* A LINK that carries its own scheme stands on its own.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END.  END points at the character
     that terminates BASE's path: '?', ';', '#' or the final '\0'.  */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      int baselength = end - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '#')
    {
      /* LINK only changes the fragment: keep everything in BASE up to
         its own fragment, if any.  Examples: */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      int baselength;
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      baselength = end1 - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      int span;
      const char *slash;
      const char *start_insert;

      /* Look for first slash. */
      slash = memchr (base, '/', end - base);
      /* If found slash and it is a double slash, then replace
         from this point, else default to replacing from the
         beginning.  */
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      /* SPAN is the number of leading characters of BASE that are
         kept in front of LINK.  */
      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      int span;
      const char *slash;
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = base;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
         double slash.  The search is restarted at most once, right
         after the first "//" encountered. */
    again:
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
          {
            pos = slash + 2;
            seen_slash_slash = 1;
            goto again;
          }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  When
         examining the last two examples, keep in mind that LINK
         begins with '/'. */

      if (!slash && !seen_slash_slash)
        /* example: "foo" */
        /*           ^    */
        start_insert = base;
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
        /*           ^        */
        start_insert = base;
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Append LINK to what we have,
             but we'll need a slash as a separator.

             Example: if base == "foo" and link == "qux/xyzzy", then
             we cannot just append link to base, because we'd get
             "fooqux/xyzzy", whereas what we want is
             "foo/qux/xyzzy".

             To make sure the / gets inserted, we set
             need_explicit_slash to 1.  We also set start_insert
             to end + 1, so that the length calculations work out
             correctly for one more (slash) character.  Accessing
             that character is fine, since it will be the
             delimiter, '\0' or '?'.  */
          /* example: "foo?..." */
          /*               ^    ('?' gets changed to '/') */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else if (last_slash && last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* The last slash is the second one of "://", so BASE has
             no path at all and a separating slash is needed here as
             well.  */
          /* example: "http://host"  */
          /*                      ^  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      /* The last of the SPAN characters copied is BASE's delimiter;
         overwrite it with the separating slash.  */
      if (need_explicit_slash)
        merge[span - 1] = '/';
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }

  return merge;
}
1936 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating '\0' is written;
   the caller must have reserved enough room and terminates the
   result itself.

   P must be a modifiable char pointer lvalue without side effects
   (it is named twice).  S is evaluated exactly once, so expressions
   such as `list[i++]' are safe.  The temporaries carry unusual names
   so that they cannot capture identifiers used in the arguments.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1942
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string that does not
   give away the number of characters in the password, unlike
   previous versions, which did.  */
1947 #define HIDDEN_PASSWORD "*password*"
1948
1949 /* Recreate the URL string from the data in URL.
1950
1951    If HIDE is non-zero (as it is when we're calling this on a URL we
1952    plan to print, but not when calling it to canonicalize a URL for
1953    use within the program), password will be hidden.  Unsafe
1954    characters in the URL will be quoted.  */
1955
1956 char *
1957 url_string (const struct url *url, int hide_password)
1958 {
1959   int size;
1960   char *result, *p;
1961   char *quoted_user = NULL, *quoted_passwd = NULL;
1962
1963   int scheme_port  = supported_schemes[url->scheme].default_port;
1964   char *scheme_str = supported_schemes[url->scheme].leading_string;
1965   int fplen = full_path_length (url);
1966
1967   int brackets_around_host = 0;
1968
1969   assert (scheme_str != NULL);
1970
1971   /* Make sure the user name and password are quoted. */
1972   if (url->user)
1973     {
1974       quoted_user = url_escape_allow_passthrough (url->user);
1975       if (url->passwd)
1976         {
1977           if (hide_password)
1978             quoted_passwd = HIDDEN_PASSWORD;
1979           else
1980             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1981         }
1982     }
1983
1984   if (strchr (url->host, ':'))
1985     brackets_around_host = 1;
1986
1987   size = (strlen (scheme_str)
1988           + strlen (url->host)
1989           + (brackets_around_host ? 2 : 0)
1990           + fplen
1991           + 1);
1992   if (url->port != scheme_port)
1993     size += 1 + numdigit (url->port);
1994   if (quoted_user)
1995     {
1996       size += 1 + strlen (quoted_user);
1997       if (quoted_passwd)
1998         size += 1 + strlen (quoted_passwd);
1999     }
2000
2001   p = result = xmalloc (size);
2002
2003   APPEND (p, scheme_str);
2004   if (quoted_user)
2005     {
2006       APPEND (p, quoted_user);
2007       if (quoted_passwd)
2008         {
2009           *p++ = ':';
2010           APPEND (p, quoted_passwd);
2011         }
2012       *p++ = '@';
2013     }
2014
2015   if (brackets_around_host)
2016     *p++ = '[';
2017   APPEND (p, url->host);
2018   if (brackets_around_host)
2019     *p++ = ']';
2020   if (url->port != scheme_port)
2021     {
2022       *p++ = ':';
2023       p = number_to_string (p, url->port);
2024     }
2025
2026   full_path_write (url, p);
2027   p += fplen;
2028   *p++ = '\0';
2029
2030   assert (p - result == size);
2031
2032   if (quoted_user && quoted_user != url->user)
2033     xfree (quoted_user);
2034   if (quoted_passwd && !hide_password
2035       && quoted_passwd != url->passwd)
2036     xfree (quoted_passwd);
2037
2038   return result;
2039 }
2040 \f
2041 /* Return non-zero if scheme a is similar to scheme b.
2042
2043    Schemes are similar if they are equal.  If SSL is supported, schemes
2044    are also similar if one is http (SCHEME_HTTP) and the other is https
2045    (SCHEME_HTTPS).  */
2046 int
2047 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2048 {
2049   if (a == b)
2050     return 1;
2051 #ifdef HAVE_SSL
2052   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2053       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2054     return 1;
2055 #endif
2056   return 0;
2057 }
2058 \f
2059 #if 0
2060 /* Debugging and testing support for path_simplify. */
2061
/* Debugger convenience: duplicate PATH with xstrdup, apply
   path_simplify to the duplicate, and return it.  PATH itself is
   not modified; the value path_simplify returns is ignored.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);

  return simplified;
}
2071
/* Run path_simplify on a private copy of TEST and report any
   discrepancy on stdout.

   EXPECTED_RESULT is the string the copy must hold afterwards.
   EXPECTED_CHANGE is the value path_simplify is expected to return:
   non-zero when the path should have been modified, zero when it
   should have been left alone.  Nothing is printed when both checks
   pass.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Report what was EXPECTED; the two messages used to be
         swapped, which made the diagnostics say the opposite of
         what the test table asked for.  */
      if (expected_change)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2094
/* Feed a fixed table of paths to run_test.  Every entry gives the
   input path, the string expected after path_simplify, and whether
   path_simplify is expected to report a modification.  The table is
   run twice: once verbatim, and once with a '/' prepended to both
   the input and the expected result.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",               "",             0 },
    { ".",              "",             1 },
    { "..",             "",             1 },
    { "foo",            "foo",          0 },
    { "foo/bar",        "foo/bar",      0 },
    { "foo///bar",      "foo/bar",      1 },
    { "foo/.",          "foo/",         1 },
    { "foo/./",         "foo/",         1 },
    { "foo./",          "foo./",        0 },
    { "foo/../bar",     "bar",          1 },
    { "foo/../bar/",    "bar/",         1 },
    { "foo/bar/..",     "foo/",         1 },
    { "foo/bar/../x",   "foo/x",        1 },
    { "foo/bar/../x/",  "foo/x/",       1 },
    { "foo/..",         "",             1 },
    { "foo/../..",      "",             1 },
    { "a/b/../../c",    "c",            1 },
    { "./a/../b",       "b",            1 }
  };
  int idx;

  /* First pass: the table entries exactly as written.  */
  for (idx = 0; idx < countof (tests); idx++)
    run_test (tests[idx].test, tests[idx].result, tests[idx].should_modify);

  /* Second pass: the same entries with a leading slash on both the
     input and the expected output, to show that the slash is kept.  */
  for (idx = 0; idx < countof (tests); idx++)
    {
      char *slashed_test, *slashed_result;

      /* Room for '/', the original string, and the terminator.  */
      slashed_test = xmalloc (strlen (tests[idx].test) + 2);
      slashed_test[0] = '/';
      strcpy (slashed_test + 1, tests[idx].test);

      slashed_result = xmalloc (strlen (tests[idx].result) + 2);
      slashed_result[0] = '/';
      strcpy (slashed_result + 1, tests[idx].result);

      run_test (slashed_test, slashed_result, tests[idx].should_modify);

      xfree (slashed_test);
      xfree (slashed_result);
    }
}
2150 #endif