sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50 #include "host.h"  /* for is_valid_ipv6_address */
  51
  52 #ifndef errno
  53 extern int errno;
  54 #endif
  55
  56 struct scheme_data
  57 {
  58   const char *name;
  59   const char *leading_string;
  60   int default_port;
  61   int enabled;
  62 };
  63
  64 /* Supported schemes: */
  65 static struct scheme_data supported_schemes[] =
  66 {
  67   { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
  68 #ifdef HAVE_SSL
  69   { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
  70 #endif
  71   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },
  72
  73   /* SCHEME_INVALID */
  74   { NULL,       NULL,       -1,                 0 }
  75 };
  76
  77 /* Forward declarations: */
  78
  79 static int path_simplify PARAMS ((char *));
  80 \f
  81 /* Support for escaping and unescaping of URL strings.  */
  82
  83 /* Table of "reserved" and "unsafe" characters.  Those terms are
  84    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  85    specs, but the general idea remains.
  86
  87    A reserved character is the one that you can't decode without
  88    changing the meaning of the URL.  For example, you can't decode
  89    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  90    path components is different.  Non-reserved characters can be
  91    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  Wget
  92    uses the rfc1738 set of reserved characters, plus "$" and ",", as
  93    recommended by rfc2396.
  94
   95    An unsafe character is one that should be encoded when URLs
  96    are placed in foreign environments.  E.g. space and newline are
  97    unsafe in HTTP contexts because HTTP uses them as separator and
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
 104 enum {
 105   /* rfc1738 reserved chars + "$" and ",".  */
 106   urlchr_reserved = 1,
 107
 108   /* rfc1738 unsafe chars, plus non-printables.  */
 109   urlchr_unsafe   = 2
 110 };
 111
 112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
 116 /* Shorthands for the table: */
 117 #define R  urlchr_reserved
 118 #define U  urlchr_unsafe
 119 #define RU R|U
 120
 121 const static unsigned char urlchr_table[256] =
 122 {
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 127   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 128   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 129   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 130   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 131  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 133   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 134   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 135   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 138   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149 };
 150 #undef R
 151 #undef U
 152 #undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           /* Do nothing if '%' is not followed by two hex digits. */
 179           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 180             goto copychar;
 181           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 182           h += 2;
 183         }
 184     }
 185   *t = '\0';
 186 }
 187
 188 /* The core of url_escape_* functions.  Escapes the characters that
 189    match the provided mask in urlchr_table.
 190
 191    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 192    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 193    freshly allocated string will be returned in all cases.  */
 194
 195 static char *
 196 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 197 {
 198   const char *p1;
 199   char *p2, *newstr;
 200   int newlen;
 201   int addition = 0;
 202
 203   for (p1 = s; *p1; p1++)
 204     if (urlchr_test (*p1, mask))
 205       addition += 2;            /* Two more characters (hex digits) */
 206
 207   if (!addition)
 208     return allow_passthrough ? (char *)s : xstrdup (s);
 209
 210   newlen = (p1 - s) + addition;
 211   newstr = (char *)xmalloc (newlen + 1);
 212
 213   p1 = s;
 214   p2 = newstr;
 215   while (*p1)
 216     {
 217       /* Quote the characters that match the test mask. */
 218       if (urlchr_test (*p1, mask))
 219         {
 220           unsigned char c = *p1++;
 221           *p2++ = '%';
 222           *p2++ = XNUM_TO_DIGIT (c >> 4);
 223           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 224         }
 225       else
 226         *p2++ = *p1++;
 227     }
 228   assert (p2 - newstr == newlen);
 229   *p2 = '\0';
 230
 231   return newstr;
 232 }
 233
 234 /* URL-escape the unsafe characters (see urlchr_table) in a given
 235    string, returning a freshly allocated string.  */
 236
 237 char *
 238 url_escape (const char *s)
 239 {
 240   return url_escape_1 (s, urlchr_unsafe, 0);
 241 }
 242
 243 /* URL-escape the unsafe characters (see urlchr_table) in a given
 244    string.  If no characters are unsafe, S is returned.  */
 245
 246 static char *
 247 url_escape_allow_passthrough (const char *s)
 248 {
 249   return url_escape_1 (s, urlchr_unsafe, 1);
 250 }
 251 \f
 252 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 253
 254 /* Decide whether to encode, decode, or pass through the char at P.
 255    This used to be a macro, but it got a little too convoluted.  */
 256 static inline enum copy_method
 257 decide_copy_method (const char *p)
 258 {
 259   if (*p == '%')
 260     {
 261       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 262         {
 263           /* %xx sequence: decode it, unless it would decode to an
 264              unsafe or a reserved char; in that case, leave it as
 265              is. */
 266           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 267           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 268             return CM_PASSTHROUGH;
 269           else
 270             return CM_DECODE;
 271         }
 272       else
 273         /* Garbled %.. sequence: encode `%'. */
 274         return CM_ENCODE;
 275     }
 276   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 277     return CM_ENCODE;
 278   else
 279     return CM_PASSTHROUGH;
 280 }
 281
 282 /* Translate a %-escaped (but possibly non-conformant) input string S
 283    into a %-escaped (and conformant) output string.  If no characters
 284    are encoded or decoded, return the same string S; otherwise, return
 285    a freshly allocated string with the new contents.
 286
 287    After a URL has been run through this function, the protocols that
 288    use `%' as the quote character can use the resulting string as-is,
 289    while those that don't call url_unescape() to get to the intended
 290    data.  This function is also stable: after an input string is
 291    transformed the first time, all further transformations of the
 292    result yield the same result string.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 297    space character would mess up the HTTP request, it needs to be
 298    quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It appears that the unsafe chars need to be quoted, for example
 303    with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
  307    space on the Wget command line.  This leads us to the conclusion
 308    that in that case Wget should not call url_escape, but leave the
 309    `%20' as is.
 310
 311    And what if the requested URI is `abc%20 def'?  If we call
 312    url_escape, we end up with `/abc%2520%20def', which is almost
 313    certainly not intended.  If we don't call url_escape, we are left
 314    with the embedded space and cannot complete the request.  What the
 315    user meant was for Wget to request `/abc%20%20def', and this is
 316    where reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses an algorithm proposed by Anon Sricharoenchai:
 331
 332    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 333       hexdigits.
 334
 335    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 336       "+".
 337
 338    ...except that this code conflates the two steps, and decides
 339    whether to encode, decode, or pass through each character in turn.
 340    The function still uses two passes, but their logic is the same --
 341    the first pass exists merely for the sake of allocation.  Another
 342    small difference is that we include `+' to URL_RESERVED.
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369   int decode_count = 0;
 370
 371   /* First, pass through the string to see if there's anything to do,
 372      and to calculate the new length.  */
 373   for (p1 = s; *p1; p1++)
 374     {
 375       switch (decide_copy_method (p1))
 376         {
 377         case CM_ENCODE:
 378           ++encode_count;
 379           break;
 380         case CM_DECODE:
 381           ++decode_count;
 382           break;
 383         case CM_PASSTHROUGH:
 384           break;
 385         }
 386     }
 387
 388   if (!encode_count && !decode_count)
 389     /* The string is good as it is. */
 390     return (char *)s;           /* C const model sucks. */
 391
 392   oldlen = p1 - s;
 393   /* Each encoding adds two characters (hex digits), while each
 394      decoding removes two characters.  */
 395   newlen = oldlen + 2 * (encode_count - decode_count);
 396   newstr = xmalloc (newlen + 1);
 397
 398   p1 = s;
 399   p2 = newstr;
 400
 401   while (*p1)
 402     {
 403       switch (decide_copy_method (p1))
 404         {
 405         case CM_ENCODE:
 406           {
 407             unsigned char c = *p1++;
 408             *p2++ = '%';
 409             *p2++ = XNUM_TO_DIGIT (c >> 4);
 410             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 411           }
 412           break;
 413         case CM_DECODE:
 414           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 415           p1 += 3;              /* skip %xx */
 416           break;
 417         case CM_PASSTHROUGH:
 418           *p2++ = *p1++;
 419         }
 420     }
 421   *p2 = '\0';
 422   assert (p2 - newstr == newlen);
 423   return newstr;
 424 }
 425 \f
 426 /* Returns the scheme type if the scheme is supported, or
 427    SCHEME_INVALID if not.  */
 428
 429 enum url_scheme
 430 url_scheme (const char *url)
 431 {
 432   int i;
 433
 434   for (i = 0; supported_schemes[i].leading_string; i++)
 435     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 436                           strlen (supported_schemes[i].leading_string)))
 437       {
 438         if (supported_schemes[i].enabled)
 439           return (enum url_scheme) i;
 440         else
 441           return SCHEME_INVALID;
 442       }
 443
 444   return SCHEME_INVALID;
 445 }
 446
 447 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 448
 449 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 450    currently implemented, it returns true if URL begins with
 451    [-+a-zA-Z0-9]+: .  */
 452
 453 int
 454 url_has_scheme (const char *url)
 455 {
 456   const char *p = url;
 457
 458   /* The first char must be a scheme char. */
 459   if (!*p || !SCHEME_CHAR (*p))
 460     return 0;
 461   ++p;
 462   /* Followed by 0 or more scheme chars. */
 463   while (*p && SCHEME_CHAR (*p))
 464     ++p;
 465   /* Terminated by ':'. */
 466   return *p == ':';
 467 }
 468
 469 int
 470 scheme_default_port (enum url_scheme scheme)
 471 {
 472   return supported_schemes[scheme].default_port;
 473 }
 474
 475 void
 476 scheme_disable (enum url_scheme scheme)
 477 {
 478   supported_schemes[scheme].enabled = 0;
 479 }
 480
 481 /* Skip the username and password, if present in the URL.  The
 482    function should *not* be called with the complete URL, but with the
 483    portion after the scheme.
 484
 485    If no username and password are found, return URL.  */
 486
 487 static const char *
 488 url_skip_credentials (const char *url)
 489 {
 490   /* Look for '@' that comes before terminators, such as '/', '?',
 491      '#', or ';'.  */
 492   const char *p = (const char *)strpbrk (url, "@/?#;");
 493   if (!p || *p != '@')
 494     return url;
 495   return p + 1;
 496 }
 497
 498 /* Parse credentials contained in [BEG, END).  The region is expected
 499    to have come from a URL and is unescaped.  */
 500
 501 static int
 502 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 503 {
 504   char *colon;
 505   const char *userend;
 506
 507   if (beg == end)
 508     return 0;                   /* empty user name */
 509
 510   colon = memchr (beg, ':', end - beg);
 511   if (colon == beg)
 512     return 0;                   /* again empty user name */
 513
 514   if (colon)
 515     {
 516       *passwd = strdupdelim (colon + 1, end);
 517       userend = colon;
 518       url_unescape (*passwd);
 519     }
 520   else
 521     {
 522       *passwd = NULL;
 523       userend = end;
 524     }
 525   *user = strdupdelim (beg, userend);
 526   url_unescape (*user);
 527   return 1;
 528 }
 529
 530 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 531    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 532
 533    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 534    www.foo.com[:port]            -> http://www.foo.com[:port]
 535
 536    FTP shorthands look like this:
 537
 538    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 539    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 540
 541    If the URL needs not or cannot be rewritten, return NULL.  */
 542
 543 char *
 544 rewrite_shorthand_url (const char *url)
 545 {
 546   const char *p;
 547
 548   if (url_has_scheme (url))
 549     return NULL;
 550
 551   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 552      latter Netscape.  */
 553   for (p = url; *p && *p != ':' && *p != '/'; p++)
 554     ;
 555
 556   if (p == url)
 557     return NULL;
 558
 559   if (*p == ':')
 560     {
 561       const char *pp;
 562       char *res;
 563       /* If the characters after the colon and before the next slash
 564          or end of string are all digits, it's HTTP.  */
 565       int digits = 0;
 566       for (pp = p + 1; ISDIGIT (*pp); pp++)
 567         ++digits;
 568       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 569         goto http;
 570
 571       /* Prepend "ftp://" to the entire URL... */
 572       res = xmalloc (6 + strlen (url) + 1);
 573       sprintf (res, "ftp://%s", url);
 574       /* ...and replace ':' with '/'. */
 575       res[6 + (p - url)] = '/';
 576       return res;
 577     }
 578   else
 579     {
 580       char *res;
 581     http:
 582       /* Just prepend "http://" to what we have. */
 583       res = xmalloc (7 + strlen (url) + 1);
 584       sprintf (res, "http://%s", url);
 585       return res;
 586     }
 587 }
 588 \f
 589 static void split_path PARAMS ((const char *, char **, char **));
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.
 594
 595    Although I normally balk at Gcc-specific optimizations, it probably
 596    makes sense here: glibc has optimizations that detect strpbrk being
 597    called with literal string as ACCEPT and inline the search.  That
 598    optimization is defeated if strpbrk is hidden within the call to
 599    another function.  (And no, making strpbrk_or_eos inline doesn't
 600    help because the check for literal accept is in the
 601    preprocessor.)  */
 602
 603 #ifdef __GNUC__
 604
 605 #define strpbrk_or_eos(s, accept) ({            \
 606   char *SOE_p = strpbrk (s, accept);            \
 607   if (!SOE_p)                                   \
 608     SOE_p = (char *)s + strlen (s);             \
 609   SOE_p;                                        \
 610 })
 611
 612 #else  /* not __GNUC__ */
 613
 614 static char *
 615 strpbrk_or_eos (const char *s, const char *accept)
 616 {
 617   char *p = strpbrk (s, accept);
 618   if (!p)
 619     p = (char *)s + strlen (s);
 620   return p;
 621 }
 622 #endif
 623
 624 /* Turn STR into lowercase; return non-zero if a character was
 625    actually changed. */
 626
 627 static int
 628 lowercase_str (char *str)
 629 {
 630   int change = 0;
 631   for (; *str; str++)
 632     if (ISUPPER (*str))
 633       {
 634         change = 1;
 635         *str = TOLOWER (*str);
 636       }
 637   return change;
 638 }
 639
 640 static const char *parse_errors[] = {
 641 #define PE_NO_ERROR                     0
 642   N_("No error"),
 643 #define PE_UNSUPPORTED_SCHEME           1
 644   N_("Unsupported scheme"),
 645 #define PE_EMPTY_HOST                   2
 646   N_("Empty host"),
 647 #define PE_BAD_PORT_NUMBER              3
 648   N_("Bad port number"),
 649 #define PE_INVALID_USER_NAME            4
 650   N_("Invalid user name"),
 651 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 652   N_("Unterminated IPv6 numeric address"),
 653 #define PE_IPV6_NOT_SUPPORTED           6
 654   N_("IPv6 addresses not supported"),
 655 #define PE_INVALID_IPV6_ADDRESS         7
 656   N_("Invalid IPv6 numeric address")
 657 };
 658
 659 /* Parse a URL.
 660
 661    Return a new struct url if successful, NULL on error.  In case of
 662    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 663    error code. */
 664 struct url *
 665 url_parse (const char *url, int *error)
 666 {
 667   struct url *u;
 668   const char *p;
 669   int path_modified, host_modified;
 670
 671   enum url_scheme scheme;
 672
 673   const char *uname_b,     *uname_e;
 674   const char *host_b,      *host_e;
 675   const char *path_b,      *path_e;
 676   const char *params_b,    *params_e;
 677   const char *query_b,     *query_e;
 678   const char *fragment_b,  *fragment_e;
 679
 680   int port;
 681   char *user = NULL, *passwd = NULL;
 682
 683   char *url_encoded = NULL;
 684
 685   int error_code;
 686
 687   scheme = url_scheme (url);
 688   if (scheme == SCHEME_INVALID)
 689     {
 690       error_code = PE_UNSUPPORTED_SCHEME;
 691       goto error;
 692     }
 693
 694   url_encoded = reencode_escapes (url);
 695   p = url_encoded;
 696
 697   p += strlen (supported_schemes[scheme].leading_string);
 698   uname_b = p;
 699   p = url_skip_credentials (p);
 700   uname_e = p;
 701
 702   /* scheme://user:pass@host[:port]... */
 703   /*                    ^              */
 704
 705   /* We attempt to break down the URL into the components path,
 706      params, query, and fragment.  They are ordered like this:
 707
 708        scheme://host[:port][/path][;params][?query][#fragment]  */
 709
 710   params_b   = params_e   = NULL;
 711   query_b    = query_e    = NULL;
 712   fragment_b = fragment_e = NULL;
 713
 714   host_b = p;
 715
 716   if (*p == '[')
 717     {
 718       /* Handle IPv6 address inside square brackets.  Ideally we'd
 719          just look for the terminating ']', but rfc2732 mandates
 720          rejecting invalid IPv6 addresses.  */
 721
 722       /* The address begins after '['. */
 723       host_b = p + 1;
 724       host_e = strchr (host_b, ']');
 725
 726       if (!host_e)
 727         {
 728           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 729           goto error;
 730         }
 731
 732 #ifdef ENABLE_IPV6
 733       /* Check if the IPv6 address is valid. */
 734       if (!is_valid_ipv6_address(host_b, host_e))
 735         {
 736           error_code = PE_INVALID_IPV6_ADDRESS;
 737           goto error;
 738         }
 739
 740       /* Continue parsing after the closing ']'. */
 741       p = host_e + 1;
 742 #else
 743       error_code = PE_IPV6_NOT_SUPPORTED;
 744       goto error;
 745 #endif
 746     }
 747   else
 748     {
 749       p = strpbrk_or_eos (p, ":/;?#");
 750       host_e = p;
 751     }
 752
 753   if (host_b == host_e)
 754     {
 755       error_code = PE_EMPTY_HOST;
 756       goto error;
 757     }
 758
 759   port = scheme_default_port (scheme);
 760   if (*p == ':')
 761     {
 762       const char *port_b, *port_e, *pp;
 763
 764       /* scheme://host:port/tralala */
 765       /*              ^             */
 766       ++p;
 767       port_b = p;
 768       p = strpbrk_or_eos (p, "/;?#");
 769       port_e = p;
 770
 771       /* Allow empty port, as per rfc2396. */
 772       if (port_b != port_e)
 773         {
 774           for (port = 0, pp = port_b; pp < port_e; pp++)
 775             {
 776               if (!ISDIGIT (*pp))
 777                 {
 778                   /* http://host:12randomgarbage/blah */
 779                   /*               ^                  */
 780                   error_code = PE_BAD_PORT_NUMBER;
 781                   goto error;
 782                 }
 783               port = 10 * port + (*pp - '0');
 784             }
 785         }
 786     }
 787
 788   if (*p == '/')
 789     {
 790       ++p;
 791       path_b = p;
 792       p = strpbrk_or_eos (p, ";?#");
 793       path_e = p;
 794     }
 795   else
 796     {
 797       /* Path is not allowed not to exist. */
 798       path_b = path_e = p;
 799     }
 800
 801   if (*p == ';')
 802     {
 803       ++p;
 804       params_b = p;
 805       p = strpbrk_or_eos (p, "?#");
 806       params_e = p;
 807     }
 808   if (*p == '?')
 809     {
 810       ++p;
 811       query_b = p;
 812       p = strpbrk_or_eos (p, "#");
 813       query_e = p;
 814
 815       /* Hack that allows users to use '?' (a wildcard character) in
 816          FTP URLs without it being interpreted as a query string
 817          delimiter.  */
 818       if (scheme == SCHEME_FTP)
 819         {
 820           query_b = query_e = NULL;
 821           path_e = p;
 822         }
 823     }
 824   if (*p == '#')
 825     {
 826       ++p;
 827       fragment_b = p;
 828       p += strlen (p);
 829       fragment_e = p;
 830     }
 831   assert (*p == 0);
 832
 833   if (uname_b != uname_e)
 834     {
 835       /* http://user:pass@host */
 836       /*        ^         ^    */
 837       /*     uname_b   uname_e */
 838       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 839         {
 840           error_code = PE_INVALID_USER_NAME;
 841           goto error;
 842         }
 843     }
 844
 845   u = xnew0 (struct url);
 846   u->scheme = scheme;
 847   u->host   = strdupdelim (host_b, host_e);
 848   u->port   = port;
 849   u->user   = user;
 850   u->passwd = passwd;
 851
 852   u->path = strdupdelim (path_b, path_e);
 853   path_modified = path_simplify (u->path);
 854   split_path (u->path, &u->dir, &u->file);
 855
 856   host_modified = lowercase_str (u->host);
 857
 858   /* Decode %HH sequences in host name.  This is important not so much
 859      to support %HH sequences, but to support binary characters (which
 860      will have been converted to %HH by reencode_escapes).  */
 861   if (strchr (u->host, '%'))
 862     {
 863       url_unescape (u->host);
 864       host_modified = 1;
 865     }
 866
 867   if (params_b)
 868     u->params = strdupdelim (params_b, params_e);
 869   if (query_b)
 870     u->query = strdupdelim (query_b, query_e);
 871   if (fragment_b)
 872     u->fragment = strdupdelim (fragment_b, fragment_e);
 873
 874   if (path_modified || u->fragment || host_modified || path_b == path_e)
 875     {
 876       /* If we suspect that a transformation has rendered what
 877          url_string might return different from URL_ENCODED, rebuild
 878          u->url using url_string.  */
 879       u->url = url_string (u, 0);
 880
 881       if (url_encoded != url)
 882         xfree ((char *) url_encoded);
 883     }
 884   else
 885     {
 886       if (url_encoded == url)
 887         u->url = xstrdup (url);
 888       else
 889         u->url = url_encoded;
 890     }
 891   url_encoded = NULL;
 892
 893   return u;
 894
 895  error:
 896   /* Cleanup in case of error: */
 897   if (url_encoded && url_encoded != url)
 898     xfree (url_encoded);
 899
 900   /* Transmit the error code to the caller, if the caller wants to
 901      know.  */
 902   if (error)
 903     *error = error_code;
 904   return NULL;
 905 }
 906
 907 /* Return the error message string from ERROR_CODE, which should have
 908    been retrieved from url_parse.  The error message is translated.  */
 909
 910 const char *
 911 url_error (int error_code)
 912 {
 913   assert (error_code >= 0 && error_code < countof (parse_errors));
 914   return _(parse_errors[error_code]);
 915 }
 916
 917 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 918    expected to be URL-escaped.
 919
 920    The path is split into directory (the part up to the last slash)
 921    and file (the part after the last slash), which are subsequently
 922    unescaped.  Examples:
 923
 924    PATH                 DIR           FILE
 925    "foo/bar/baz"        "foo/bar"     "baz"
 926    "foo/bar/"           "foo/bar"     ""
 927    "foo"                ""            "foo"
 928    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 929
 930    DIR and FILE are freshly allocated.  */
 931
 932 static void
 933 split_path (const char *path, char **dir, char **file)
 934 {
 935   char *last_slash = strrchr (path, '/');
 936   if (!last_slash)
 937     {
 938       *dir = xstrdup ("");
 939       *file = xstrdup (path);
 940     }
 941   else
 942     {
 943       *dir = strdupdelim (path, last_slash);
 944       *file = xstrdup (last_slash + 1);
 945     }
 946   url_unescape (*dir);
 947   url_unescape (*file);
 948 }
 949
 950 /* Note: URL's "full path" is the path with the query string and
 951    params appended.  The "fragment" (#foo) is intentionally ignored,
 952    but that might be changed.  For example, if the original URL was
 953    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 954    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 955
 956 /* Return the length of the full path, without the terminating
 957    zero.  */
 958
 959 static int
 960 full_path_length (const struct url *url)
 961 {
 962   int len = 0;
 963
 964 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 965
 966   FROB (path);
 967   FROB (params);
 968   FROB (query);
 969
 970 #undef FROB
 971
 972   return len;
 973 }
 974
 975 /* Write out the full path. */
 976
 977 static void
 978 full_path_write (const struct url *url, char *where)
 979 {
 980 #define FROB(el, chr) do {                      \
 981   char *f_el = url->el;                         \
 982   if (f_el) {                                   \
 983     int l = strlen (f_el);                      \
 984     *where++ = chr;                             \
 985     memcpy (where, f_el, l);                    \
 986     where += l;                                 \
 987   }                                             \
 988 } while (0)
 989
 990   FROB (path, '/');
 991   FROB (params, ';');
 992   FROB (query, '?');
 993
 994 #undef FROB
 995 }
 996
 997 /* Public function for getting the "full path".  E.g. if u->path is
 998    "foo/bar" and u->query is "param=value", full_path will be
 999    "/foo/bar?param=value". */
1000
1001 char *
1002 url_full_path (const struct url *url)
1003 {
1004   int length = full_path_length (url);
1005   char *full_path = (char *)xmalloc(length + 1);
1006
1007   full_path_write (url, full_path);
1008   full_path[length] = '\0';
1009
1010   return full_path;
1011 }
1012
1013 /* Escape unsafe and reserved characters, except for the slash
1014    characters.  */
1015
1016 static char *
1017 url_escape_dir (const char *dir)
1018 {
1019   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1020   char *h, *t;
1021   if (newdir == dir)
1022     return (char *)dir;
1023
1024   /* Unescape slashes in NEWDIR. */
1025
1026   h = newdir;                   /* hare */
1027   t = newdir;                   /* tortoise */
1028
1029   for (; *h; h++, t++)
1030     {
1031       /* url_escape_1 having converted '/' to "%2F" exactly. */
1032       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1033         {
1034           *t = '/';
1035           h += 2;
1036         }
1037       else
1038         *t = *h;
1039     }
1040   *t = '\0';
1041
1042   return newdir;
1043 }
1044
1045 /* Sync u->path and u->url with u->dir and u->file.  Called after
1046    u->file or u->dir have been changed, typically by the FTP code.  */
1047
1048 static void
1049 sync_path (struct url *u)
1050 {
1051   char *newpath, *efile, *edir;
1052
1053   xfree (u->path);
1054
1055   /* u->dir and u->file are not escaped.  URL-escape them before
1056      reassembling them into u->path.  That way, if they contain
1057      separators like '?' or even if u->file contains slashes, the
1058      path will be correctly assembled.  (u->file can contain slashes
1059      if the URL specifies it with %2f, or if an FTP server returns
1060      it.)  */
1061   edir = url_escape_dir (u->dir);
1062   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1063
1064   if (!*edir)
1065     newpath = xstrdup (efile);
1066   else
1067     {
1068       int dirlen = strlen (edir);
1069       int filelen = strlen (efile);
1070
1071       /* Copy "DIR/FILE" to newpath. */
1072       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1073       memcpy (p, edir, dirlen);
1074       p += dirlen;
1075       *p++ = '/';
1076       memcpy (p, efile, filelen);
1077       p += filelen;
1078       *p++ = '\0';
1079     }
1080
1081   u->path = newpath;
1082
1083   if (edir != u->dir)
1084     xfree (edir);
1085   if (efile != u->file)
1086     xfree (efile);
1087
1088   /* Regenerate u->url as well.  */
1089   xfree (u->url);
1090   u->url = url_string (u, 0);
1091 }
1092
1093 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1094    This way we can sync u->path and u->url when they get changed.  */
1095
1096 void
1097 url_set_dir (struct url *url, const char *newdir)
1098 {
1099   xfree (url->dir);
1100   url->dir = xstrdup (newdir);
1101   sync_path (url);
1102 }
1103
1104 void
1105 url_set_file (struct url *url, const char *newfile)
1106 {
1107   xfree (url->file);
1108   url->file = xstrdup (newfile);
1109   sync_path (url);
1110 }
1111
1112 void
1113 url_free (struct url *url)
1114 {
1115   xfree (url->host);
1116   xfree (url->path);
1117   xfree (url->url);
1118
1119   xfree_null (url->params);
1120   xfree_null (url->query);
1121   xfree_null (url->fragment);
1122   xfree_null (url->user);
1123   xfree_null (url->passwd);
1124
1125   xfree (url->dir);
1126   xfree (url->file);
1127
1128   xfree (url);
1129 }
1130 \f
1131 /* Create all the necessary directories for PATH (a file).  Calls
1132    mkdirhier() internally.  */
1133 int
1134 mkalldirs (const char *path)
1135 {
1136   const char *p;
1137   char *t;
1138   struct_stat st;
1139   int res;
1140
1141   p = path + strlen (path);
1142   for (; *p != '/' && p != path; p--)
1143     ;
1144
1145   /* Don't create if it's just a file.  */
1146   if ((p == path) && (*p != '/'))
1147     return 0;
1148   t = strdupdelim (path, p);
1149
1150   /* Check whether the directory exists.  */
1151   if ((stat (t, &st) == 0))
1152     {
1153       if (S_ISDIR (st.st_mode))
1154         {
1155           xfree (t);
1156           return 0;
1157         }
1158       else
1159         {
1160           /* If the dir exists as a file name, remove it first.  This
1161              is *only* for Wget to work with buggy old CERN http
1162              servers.  Here is the scenario: When Wget tries to
1163              retrieve a directory without a slash, e.g.
1164              http://foo/bar (bar being a directory), CERN server will
1165              not redirect it too http://foo/bar/ -- it will generate a
1166              directory listing containing links to bar/file1,
1167              bar/file2, etc.  Wget will lose because it saves this
1168              HTML listing to a file `bar', so it cannot create the
1169              directory.  To work around this, if the file of the same
1170              name exists, we just remove it and create the directory
1171              anyway.  */
1172           DEBUGP (("Removing %s because of directory danger!\n", t));
1173           unlink (t);
1174         }
1175     }
1176   res = make_directory (t);
1177   if (res != 0)
1178     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1179   xfree (t);
1180   return res;
1181 }
1182 \f
1183 /* Functions for constructing the file name out of URL components.  */
1184
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, maintained via GROW */
  int tail;                     /* offset in BASE where appending continues */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   Both arguments are parenthesized so that any expression may be
   passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1213
/* Append the characters of STR (without its terminating zero) to
   DEST, growing DEST as needed.  NOTICE: DEST is left unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  int count = strlen (str);
  char *out;

  GROW (dest, count);
  out = TAIL (dest);
  memcpy (out, str, count);
  TAIL_INCR (dest, count);
}
1225
/* Append the single character CH to DEST, growing DEST as needed.
   Appending 0 this way zero-terminates DEST.  */

static void
append_char (char ch, struct growable *dest)
{
  char *out;

  GROW (dest, 1);
  out = TAIL (dest);
  *out = ch;
  TAIL_INCR (dest, 1);
}
1236
/* Classes of characters that may not appear verbatim in a file
   name.  The values are bit flags and can be OR-ed into a mask.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C belongs to any of the classes in MASK.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   Note that `|' is marked as unusable on Windows and DEL (127) as a
   control character, in line with the class descriptions above.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1294
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  It is ':' (as in
   "www.xemacs.org:4001/index.html") unless opt.restrict_files_os is
   restrict_windows, in which case it is '+' because Windows can't
   handle ':' in file names.  The option is consulted each time the
   macro is expanded.  */
#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, '@' is used instead when opt.restrict_files_os is
   restrict_windows.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1305
1306 /* Quote path element, characters in [b, e), as file name, and append
1307    the quoted string to DEST.  Each character is quoted as per
1308    file_unsafe_char and the corresponding table.
1309
1310    If ESCAPED_P is non-zero, the path element is considered to be
1311    URL-escaped and will be unescaped prior to inspection.  */
1312
1313 static void
1314 append_uri_pathel (const char *b, const char *e, int escaped_p,
1315                    struct growable *dest)
1316 {
1317   const char *p;
1318   int quoted, outlen;
1319
1320   int mask;
1321   if (opt.restrict_files_os == restrict_unix)
1322     mask = filechr_not_unix;
1323   else
1324     mask = filechr_not_windows;
1325   if (opt.restrict_files_ctrl)
1326     mask |= filechr_control;
1327
1328   /* Copy [b, e) to PATHEL and URL-unescape it. */
1329   if (escaped_p)
1330     {
1331       char *unescaped;
1332       BOUNDED_TO_ALLOCA (b, e, unescaped);
1333       url_unescape (unescaped);
1334       b = unescaped;
1335       e = unescaped + strlen (unescaped);
1336     }
1337
1338   /* Defang ".." when found as component of path.  Remember that path
1339      comes from the URL and might contain malicious input.  */
1340   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1341     {
1342       b = "%2E%2E";
1343       e = b + 6;
1344     }
1345
1346   /* Walk the PATHEL string and check how many characters we'll need
1347      to quote.  */
1348   quoted = 0;
1349   for (p = b; p < e; p++)
1350     if (FILE_CHAR_TEST (*p, mask))
1351       ++quoted;
1352
1353   /* Calculate the length of the output string.  e-b is the input
1354      string length.  Each quoted char introduces two additional
1355      characters in the string, hence 2*quoted.  */
1356   outlen = (e - b) + (2 * quoted);
1357   GROW (dest, outlen);
1358
1359   if (!quoted)
1360     {
1361       /* If there's nothing to quote, we can simply append the string
1362          without processing it again.  */
1363       memcpy (TAIL (dest), b, outlen);
1364     }
1365   else
1366     {
1367       char *q = TAIL (dest);
1368       for (p = b; p < e; p++)
1369         {
1370           if (!FILE_CHAR_TEST (*p, mask))
1371             *q++ = *p;
1372           else
1373             {
1374               unsigned char ch = *p;
1375               *q++ = '%';
1376               *q++ = XNUM_TO_DIGIT (ch >> 4);
1377               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1378             }
1379         }
1380       assert (q - TAIL (dest) == outlen);
1381     }
1382   TAIL_INCR (dest, outlen);
1383 }
1384
1385 /* Append to DEST the directory structure that corresponds the
1386    directory part of URL's path.  For example, if the URL is
1387    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1388
1389    Each path element ("dir1" and "dir2" in the above example) is
1390    examined, url-unescaped, and re-escaped as file name element.
1391
1392    Additionally, it cuts as many directories from the path as
1393    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1394    will produce "bar" for the above example.  For 2 or more, it will
1395    produce "".
1396
1397    Each component of the path is quoted for use as file name.  */
1398
1399 static void
1400 append_dir_structure (const struct url *u, struct growable *dest)
1401 {
1402   char *pathel, *next;
1403   int cut = opt.cut_dirs;
1404
1405   /* Go through the path components, de-URL-quote them, and quote them
1406      (if necessary) as file names.  */
1407
1408   pathel = u->path;
1409   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1410     {
1411       if (cut-- > 0)
1412         continue;
1413       if (pathel == next)
1414         /* Ignore empty pathels.  */
1415         continue;
1416
1417       if (dest->tail)
1418         append_char ('/', dest);
1419       append_uri_pathel (pathel, next, 1, dest);
1420     }
1421 }
1422
1423 /* Return a unique file name that matches the given URL as good as
1424    possible.  Does not create directories on the file system.  */
1425
1426 char *
1427 url_file_name (const struct url *u)
1428 {
1429   struct growable fnres;
1430
1431   const char *u_file, *u_query;
1432   char *fname, *unique;
1433
1434   fnres.base = NULL;
1435   fnres.size = 0;
1436   fnres.tail = 0;
1437
1438   /* Start with the directory prefix, if specified. */
1439   if (opt.dir_prefix)
1440     append_string (opt.dir_prefix, &fnres);
1441
1442   /* If "dirstruct" is turned on (typically the case with -r), add
1443      the host and port (unless those have been turned off) and
1444      directory structure.  */
1445   if (opt.dirstruct)
1446     {
1447       if (opt.protocol_directories)
1448         {
1449           if (fnres.tail)
1450             append_char ('/', &fnres);
1451           append_string (supported_schemes[u->scheme].name, &fnres);
1452         }
1453       if (opt.add_hostdir)
1454         {
1455           if (fnres.tail)
1456             append_char ('/', &fnres);
1457           append_string (u->host, &fnres);
1458           if (u->port != scheme_default_port (u->scheme))
1459             {
1460               char portstr[24];
1461               number_to_string (portstr, u->port);
1462               append_char (FN_PORT_SEP, &fnres);
1463               append_string (portstr, &fnres);
1464             }
1465         }
1466
1467       append_dir_structure (u, &fnres);
1468     }
1469
1470   /* Add the file name. */
1471   if (fnres.tail)
1472     append_char ('/', &fnres);
1473   u_file = *u->file ? u->file : "index.html";
1474   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1475
1476   /* Append "?query" to the file name. */
1477   u_query = u->query && *u->query ? u->query : NULL;
1478   if (u_query)
1479     {
1480       append_char (FN_QUERY_SEP, &fnres);
1481       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1482     }
1483
1484   /* Zero-terminate the file name. */
1485   append_char ('\0', &fnres);
1486
1487   fname = fnres.base;
1488
1489   /* Check the cases in which the unique extensions are not used:
1490      1) Clobbering is turned off (-nc).
1491      2) Retrieval with regetting.
1492      3) Timestamping is used.
1493      4) Hierarchy is built.
1494
1495      The exception is the case when file does exist and is a
1496      directory (see `mkalldirs' for explanation).  */
1497
1498   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1499       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1500     return fname;
1501
1502   unique = unique_name (fname, 1);
1503   if (unique != fname)
1504     xfree (fname);
1505   return unique;
1506 }
1507 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   This function does not handle URL escapes explicitly.  If you're
   passing paths from URLs, make sure to unquote "%2e" and "%2E" to
   ".", so that this function can find the dots.  (Wget's URL parser
   calls reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.

   Implementation: two pointers walk PATH.  The hare H reads the
   input one path element at a time, the tortoise T marks where the
   simplified output ends.  T never gets ahead of H, so the rewrite
   can be done in place.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare */
  char *t = path;               /* tortoise */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./".  When the "." is the last element, this
             moves H one past END, which terminates the loop.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  */
              beg = t + 3;
              goto regular;
            }
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T and H differ exactly when something was dropped; only then does
     the shortened path need a new terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1595 \f
/* Return the length of the path portion of URL, i.e. the number of
   characters before the first '?', ';' or '#', or the whole string
   length if none of them is present.  */

static int
path_length (const char *url)
{
  const char *stop = strpbrk_or_eos (url, "?;#");

  return stop - url;
}
1606
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is never
   examined, while the one at B is.  We might want to use memrchr (a
   GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so that the scan covers e[-1] down to
     b[0] and an empty range yields NULL.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1619
/* Helper for uri_merge: return a freshly allocated string made of
   the first SPAN characters of BASE followed by the LINKLENGTH
   characters of LINK, zero-terminated.  */

static char *
uri_merge_splice (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = (char *) xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is deliberately not passed through path_simplify, even
   though rfc1738 seems to suggest it: that would complicate the code,
   and url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK that carries its own scheme stands on its own.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_splice (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK only changes the fragment; everything in BASE up to its
         own fragment is kept.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *frag = strchr (base, '#');
      if (!frag)
        frag = base + strlen (base);
      return uri_merge_splice (base, frag - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      const char *slash;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore a
         double slash: if the first slash found starts one, search
         once more from right behind it.  */
      slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  Keep in mind that LINK
         begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace everything.  */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash.  */
        start_insert = slash;
      else
        /* example: "http://foo" -- append at the end.  */
        start_insert = end;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything
     after last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    int need_explicit_slash = 0;
    int span;
    char *merge;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host"  */
        /*                      ^  */
        /* The only slashes belong to "://".  Keep all of BASE and
           reserve one extra position for the slash that has to
           separate the host from LINK.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = uri_merge_splice (base, span, link, linklength);
    if (need_explicit_slash)
      /* Overwrite the character copied from *END with the slash.  */
      merge[span - 1] = '/';
    return merge;
  }
}
1810 \f
/* Copy the string S (without its terminating zero) to P and advance
   P past the copied characters.  P must be a modifiable pointer
   lvalue.  S is evaluated exactly once, and the temporaries carry
   distinctive names so that an argument mentioning a caller variable
   such as `len' is not captured by the macro's own locals.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1822
1823 /* Recreate the URL string from the data in URL.
1824
1825    If HIDE is non-zero (as it is when we're calling this on a URL we
1826    plan to print, but not when calling it to canonicalize a URL for
1827    use within the program), password will be hidden.  Unsafe
1828    characters in the URL will be quoted.  */
1829
1830 char *
1831 url_string (const struct url *url, int hide_password)
1832 {
1833   int size;
1834   char *result, *p;
1835   char *quoted_user = NULL, *quoted_passwd = NULL;
1836
1837   int scheme_port  = supported_schemes[url->scheme].default_port;
1838   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1839   int fplen = full_path_length (url);
1840
1841   int brackets_around_host;
1842
1843   assert (scheme_str != NULL);
1844
1845   /* Make sure the user name and password are quoted. */
1846   if (url->user)
1847     {
1848       quoted_user = url_escape_allow_passthrough (url->user);
1849       if (url->passwd)
1850         {
1851           if (hide_password)
1852             quoted_passwd = HIDDEN_PASSWORD;
1853           else
1854             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1855         }
1856     }
1857
1858   /* Numeric IPv6 addresses can contain ':' and need to be quoted with
1859      brackets.  */
1860   brackets_around_host = strchr (url->host, ':') != NULL;
1861
1862   size = (strlen (scheme_str)
1863           + strlen (url->host)
1864           + (brackets_around_host ? 2 : 0)
1865           + fplen
1866           + 1);
1867   if (url->port != scheme_port)
1868     size += 1 + numdigit (url->port);
1869   if (quoted_user)
1870     {
1871       size += 1 + strlen (quoted_user);
1872       if (quoted_passwd)
1873         size += 1 + strlen (quoted_passwd);
1874     }
1875
1876   p = result = xmalloc (size);
1877
1878   APPEND (p, scheme_str);
1879   if (quoted_user)
1880     {
1881       APPEND (p, quoted_user);
1882       if (quoted_passwd)
1883         {
1884           *p++ = ':';
1885           APPEND (p, quoted_passwd);
1886         }
1887       *p++ = '@';
1888     }
1889
1890   if (brackets_around_host)
1891     *p++ = '[';
1892   APPEND (p, url->host);
1893   if (brackets_around_host)
1894     *p++ = ']';
1895   if (url->port != scheme_port)
1896     {
1897       *p++ = ':';
1898       p = number_to_string (p, url->port);
1899     }
1900
1901   full_path_write (url, p);
1902   p += fplen;
1903   *p++ = '\0';
1904
1905   assert (p - result == size);
1906
1907   if (quoted_user && quoted_user != url->user)
1908     xfree (quoted_user);
1909   if (quoted_passwd && !hide_password
1910       && quoted_passwd != url->passwd)
1911     xfree (quoted_passwd);
1912
1913   return result;
1914 }
1915 \f
1916 /* Return non-zero if scheme a is similar to scheme b.
1917
1918    Schemes are similar if they are equal.  If SSL is supported, schemes
1919    are also similar if one is http (SCHEME_HTTP) and the other is https
1920    (SCHEME_HTTPS).  */
1921 int
1922 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1923 {
1924   if (a == b)
1925     return 1;
1926 #ifdef HAVE_SSL
1927   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1928       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1929     return 1;
1930 #endif
1931   return 0;
1932 }
1933 \f
1934 #if 0
1935 /* Debugging and testing support for path_simplify. */
1936
/* Debugger helper: duplicate PATH with xstrdup, apply path_simplify
   to the duplicate, and return the duplicate.  PATH itself is left
   untouched.  */
static char *
ps (char *path)
{
  char *simplified;

  simplified = xstrdup (path);
  (void) path_simplify (simplified);
  return simplified;
}
1946
/* Run path_simplify on a private copy of TEST and report to stdout
   whenever the outcome is not the expected one: either the resulting
   text differs from EXPECTED_RESULT, or the value returned by
   path_simplify differs from EXPECTED_CHANGE.  Nothing is printed
   when both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *scratch = xstrdup (test);
  int changed = path_simplify (scratch);
  int same_text = (strcmp (scratch, expected_result) == 0);

  if (!same_text)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, scratch);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (scratch);
}
1969
/* Feed a fixed table of paths through run_test.  Each table entry
   holds the input path, the text expected after path_simplify, and
   the value path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int changes;
  } cases[] = {
    { "", "", 0 },
    { ".", "", 1 },
    { "./", "", 1 },
    { "..", "..", 0 },
    { "../", "../", 0 },
    { "foo", "foo", 0 },
    { "foo/bar", "foo/bar", 0 },
    { "foo///bar", "foo///bar", 0 },
    { "foo/.", "foo/", 1 },
    { "foo/./", "foo/", 1 },
    { "foo./", "foo./", 0 },
    { "foo/../bar", "bar", 1 },
    { "foo/../bar/", "bar/", 1 },
    { "foo/bar/..", "foo/", 1 },
    { "foo/bar/../x", "foo/x", 1 },
    { "foo/bar/../x/", "foo/x/", 1 },
    { "foo/..", "", 1 },
    { "foo/../..", "..", 1 },
    { "foo/../../..", "../..", 1 },
    { "foo/../../bar/../../baz", "../../baz", 1 },
    { "a/b/../../c", "c", 1 },
    { "./a/../b", "b", 1 }
  };
  int idx = 0;

  while (idx < countof (cases))
    {
      run_test (cases[idx].input, cases[idx].expected, cases[idx].changes);
      ++idx;
    }
}
2010 #endif