sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50 #include "host.h"  /* for is_valid_ipv6_address */
  51
  52 #ifndef errno
  53 extern int errno;
  54 #endif
  55
/* Static description of one URL scheme handled by this file.  */
struct scheme_data
{
  const char *name;             /* bare scheme name, e.g. "http" */
  const char *leading_string;   /* URL prefix matched by url_scheme, e.g. "http://" */
  int default_port;             /* value returned by scheme_default_port */
  int enabled;                  /* zeroed by scheme_disable; url_scheme
                                   reports disabled schemes as invalid */
};
  63
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, so the order of the entries
   presumably has to mirror the enum declared in url.h -- confirm
   before reordering or inserting entries.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID: sentinel entry.  Its NULL leading_string is what
     terminates the scan in url_scheme.  */
  { NULL,       NULL,       -1,                 0 }
};
  76
/* Forward declarations: */

/* Called by url_parse on the freshly copied path; url_parse treats a
   non-zero return value as "the path was modified".  */
static int path_simplify PARAMS ((char *));
  80 \f
  81 /* Support for escaping and unescaping of URL strings.  */
  82
  83 /* Table of "reserved" and "unsafe" characters.  Those terms are
  84    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  85    specs, but the general idea remains.
  86
  87    A reserved character is the one that you can't decode without
  88    changing the meaning of the URL.  For example, you can't decode
  89    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  90    path components is different.  Non-reserved characters can be
  91    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  Wget
  92    uses the rfc1738 set of reserved characters, plus "$" and ",", as
  93    recommended by rfc2396.
  94
  95    An unsafe characters is the one that should be encoded when URLs
  96    are placed in foreign environments.  E.g. space and newline are
  97    unsafe in HTTP contexts because HTTP uses them as separator and
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
 104 enum {
 105   /* rfc1738 reserved chars + "$" and ",".  */
 106   urlchr_reserved = 1,
 107
 108   /* rfc1738 unsafe chars, plus non-printables.  */
 109   urlchr_unsafe   = 2
 110 };
 111
 112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
 116 /* Shorthands for the table: */
 117 #define R  urlchr_reserved
 118 #define U  urlchr_unsafe
 119 #define RU R|U
 120
 121 const static unsigned char urlchr_table[256] =
 122 {
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 127   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 128   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 129   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 130   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 131  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 133   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 134   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 135   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 138   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149 };
 150 #undef R
 151 #undef U
 152 #undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           /* Do nothing if '%' is not followed by two hex digits. */
 179           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 180             goto copychar;
 181           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 182           h += 2;
 183         }
 184     }
 185   *t = '\0';
 186 }
 187
 188 /* The core of url_escape_* functions.  Escapes the characters that
 189    match the provided mask in urlchr_table.
 190
 191    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 192    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 193    freshly allocated string will be returned in all cases.  */
 194
 195 static char *
 196 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 197 {
 198   const char *p1;
 199   char *p2, *newstr;
 200   int newlen;
 201   int addition = 0;
 202
 203   for (p1 = s; *p1; p1++)
 204     if (urlchr_test (*p1, mask))
 205       addition += 2;            /* Two more characters (hex digits) */
 206
 207   if (!addition)
 208     return allow_passthrough ? (char *)s : xstrdup (s);
 209
 210   newlen = (p1 - s) + addition;
 211   newstr = (char *)xmalloc (newlen + 1);
 212
 213   p1 = s;
 214   p2 = newstr;
 215   while (*p1)
 216     {
 217       /* Quote the characters that match the test mask. */
 218       if (urlchr_test (*p1, mask))
 219         {
 220           unsigned char c = *p1++;
 221           *p2++ = '%';
 222           *p2++ = XNUM_TO_DIGIT (c >> 4);
 223           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 224         }
 225       else
 226         *p2++ = *p1++;
 227     }
 228   assert (p2 - newstr == newlen);
 229   *p2 = '\0';
 230
 231   return newstr;
 232 }
 233
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  Passthrough is
   disabled (third argument 0), so the result is a fresh copy even
   when S contains nothing that needs escaping.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 0);
}
 242
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S itself is returned
   (passthrough enabled, third argument 1); otherwise the result is
   freshly allocated.  A caller can tell the two cases apart by
   comparing the returned pointer with S.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 1);
}
 251 \f
 252 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 253
 254 /* Decide whether to encode, decode, or pass through the char at P.
 255    This used to be a macro, but it got a little too convoluted.  */
 256 static inline enum copy_method
 257 decide_copy_method (const char *p)
 258 {
 259   if (*p == '%')
 260     {
 261       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 262         {
 263           /* %xx sequence: decode it, unless it would decode to an
 264              unsafe or a reserved char; in that case, leave it as
 265              is. */
 266           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 267           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 268             return CM_PASSTHROUGH;
 269           else
 270             return CM_DECODE;
 271         }
 272       else
 273         /* Garbled %.. sequence: encode `%'. */
 274         return CM_ENCODE;
 275     }
 276   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 277     return CM_ENCODE;
 278   else
 279     return CM_PASSTHROUGH;
 280 }
 281
 282 /* Translate a %-escaped (but possibly non-conformant) input string S
 283    into a %-escaped (and conformant) output string.  If no characters
 284    are encoded or decoded, return the same string S; otherwise, return
 285    a freshly allocated string with the new contents.
 286
 287    After a URL has been run through this function, the protocols that
 288    use `%' as the quote character can use the resulting string as-is,
 289    while those that don't call url_unescape() to get to the intended
 290    data.  This function is also stable: after an input string is
 291    transformed the first time, all further transformations of the
 292    result yield the same result string.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 297    space character would mess up the HTTP request, it needs to be
 298    quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It appears that the unsafe chars need to be quoted, for example
 303    with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leaves us in the conclusion
 308    that in that case Wget should not call url_escape, but leave the
 309    `%20' as is.
 310
 311    And what if the requested URI is `abc%20 def'?  If we call
 312    url_escape, we end up with `/abc%2520%20def', which is almost
 313    certainly not intended.  If we don't call url_escape, we are left
 314    with the embedded space and cannot complete the request.  What the
 315    user meant was for Wget to request `/abc%20%20def', and this is
 316    where reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses an algorithm proposed by Anon Sricharoenchai:
 331
 332    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 333       hexdigits.
 334
 335    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 336       "+".
 337
 338    ...except that this code conflates the two steps, and decides
 339    whether to encode, decode, or pass through each character in turn.
 340    The function still uses two passes, but their logic is the same --
 341    the first pass exists merely for the sake of allocation.  Another
 342    small difference is that we include `+' to URL_RESERVED.
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369   int decode_count = 0;
 370
 371   /* First, pass through the string to see if there's anything to do,
 372      and to calculate the new length.  */
 373   for (p1 = s; *p1; p1++)
 374     {
 375       switch (decide_copy_method (p1))
 376         {
 377         case CM_ENCODE:
 378           ++encode_count;
 379           break;
 380         case CM_DECODE:
 381           ++decode_count;
 382           break;
 383         case CM_PASSTHROUGH:
 384           break;
 385         }
 386     }
 387
 388   if (!encode_count && !decode_count)
 389     /* The string is good as it is. */
 390     return (char *)s;           /* C const model sucks. */
 391
 392   oldlen = p1 - s;
 393   /* Each encoding adds two characters (hex digits), while each
 394      decoding removes two characters.  */
 395   newlen = oldlen + 2 * (encode_count - decode_count);
 396   newstr = xmalloc (newlen + 1);
 397
 398   p1 = s;
 399   p2 = newstr;
 400
 401   while (*p1)
 402     {
 403       switch (decide_copy_method (p1))
 404         {
 405         case CM_ENCODE:
 406           {
 407             unsigned char c = *p1++;
 408             *p2++ = '%';
 409             *p2++ = XNUM_TO_DIGIT (c >> 4);
 410             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 411           }
 412           break;
 413         case CM_DECODE:
 414           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 415           p1 += 3;              /* skip %xx */
 416           break;
 417         case CM_PASSTHROUGH:
 418           *p2++ = *p1++;
 419         }
 420     }
 421   *p2 = '\0';
 422   assert (p2 - newstr == newlen);
 423   return newstr;
 424 }
 425 \f
 426 /* Returns the scheme type if the scheme is supported, or
 427    SCHEME_INVALID if not.  */
 428
 429 enum url_scheme
 430 url_scheme (const char *url)
 431 {
 432   int i;
 433
 434   for (i = 0; supported_schemes[i].leading_string; i++)
 435     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 436                           strlen (supported_schemes[i].leading_string)))
 437       {
 438         if (supported_schemes[i].enabled)
 439           return (enum url_scheme) i;
 440         else
 441           return SCHEME_INVALID;
 442       }
 443
 444   return SCHEME_INVALID;
 445 }
 446
 447 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 448
 449 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 450    currently implemented, it returns true if URL begins with
 451    [-+a-zA-Z0-9]+: .  */
 452
 453 int
 454 url_has_scheme (const char *url)
 455 {
 456   const char *p = url;
 457
 458   /* The first char must be a scheme char. */
 459   if (!*p || !SCHEME_CHAR (*p))
 460     return 0;
 461   ++p;
 462   /* Followed by 0 or more scheme chars. */
 463   while (*p && SCHEME_CHAR (*p))
 464     ++p;
 465   /* Terminated by ':'. */
 466   return *p == ':';
 467 }
 468
/* Return the default port recorded in supported_schemes for SCHEME.
   SCHEME is used directly as the array index, with no range check.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
 474
/* Mark SCHEME as disabled in supported_schemes.  From then on
   url_scheme returns SCHEME_INVALID for URLs that start with that
   scheme's prefix.  SCHEME is used directly as the array index.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = 0;
}
 480
 481 /* Skip the username and password, if present in the URL.  The
 482    function should *not* be called with the complete URL, but with the
 483    portion after the scheme.
 484
 485    If no username and password are found, return URL.  */
 486
 487 static const char *
 488 url_skip_credentials (const char *url)
 489 {
 490   /* Look for '@' that comes before terminators, such as '/', '?',
 491      '#', or ';'.  */
 492   const char *p = (const char *)strpbrk (url, "@/?#;");
 493   if (!p || *p != '@')
 494     return url;
 495   return p + 1;
 496 }
 497
 498 /* Parse credentials contained in [BEG, END).  The region is expected
 499    to have come from a URL and is unescaped.  */
 500
 501 static int
 502 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 503 {
 504   char *colon;
 505   const char *userend;
 506
 507   if (beg == end)
 508     return 0;                   /* empty user name */
 509
 510   colon = memchr (beg, ':', end - beg);
 511   if (colon == beg)
 512     return 0;                   /* again empty user name */
 513
 514   if (colon)
 515     {
 516       *passwd = strdupdelim (colon + 1, end);
 517       userend = colon;
 518       url_unescape (*passwd);
 519     }
 520   else
 521     {
 522       *passwd = NULL;
 523       userend = end;
 524     }
 525   *user = strdupdelim (beg, userend);
 526   url_unescape (*user);
 527   return 1;
 528 }
 529
 530 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 531    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 532
 533    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 534    www.foo.com[:port]            -> http://www.foo.com[:port]
 535
 536    FTP shorthands look like this:
 537
 538    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 539    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 540
 541    If the URL needs not or cannot be rewritten, return NULL.  */
 542
 543 char *
 544 rewrite_shorthand_url (const char *url)
 545 {
 546   const char *p;
 547
 548   if (url_has_scheme (url))
 549     return NULL;
 550
 551   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 552      latter Netscape.  */
 553   for (p = url; *p && *p != ':' && *p != '/'; p++)
 554     ;
 555
 556   if (p == url)
 557     return NULL;
 558
 559   if (*p == ':')
 560     {
 561       const char *pp;
 562       char *res;
 563       /* If the characters after the colon and before the next slash
 564          or end of string are all digits, it's HTTP.  */
 565       int digits = 0;
 566       for (pp = p + 1; ISDIGIT (*pp); pp++)
 567         ++digits;
 568       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 569         goto http;
 570
 571       /* Prepend "ftp://" to the entire URL... */
 572       res = xmalloc (6 + strlen (url) + 1);
 573       sprintf (res, "ftp://%s", url);
 574       /* ...and replace ':' with '/'. */
 575       res[6 + (p - url)] = '/';
 576       return res;
 577     }
 578   else
 579     {
 580       char *res;
 581     http:
 582       /* Just prepend "http://" to what we have. */
 583       res = xmalloc (7 + strlen (url) + 1);
 584       sprintf (res, "http://%s", url);
 585       return res;
 586     }
 587 }
 588 \f
/* Forward declaration: split_path is defined after url_parse, which
   calls it.  */
static void split_path PARAMS ((const char *, char **, char **));
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.
 594
 595    Although I normally balk at Gcc-specific optimizations, it probably
 596    makes sense here: glibc has optimizations that detect strpbrk being
 597    called with literal string as ACCEPT and inline the search.  That
 598    optimization is defeated if strpbrk is hidden within the call to
 599    another function.  (And no, making strpbrk_or_eos inline doesn't
 600    help because the check for literal accept is in the
 601    preprocessor.)  */
 602
 603 #ifdef __GNUC__
 604
 605 #define strpbrk_or_eos(s, accept) ({            \
 606   char *SOE_p = strpbrk (s, accept);            \
 607   if (!SOE_p)                                   \
 608     SOE_p = (char *)s + strlen (s);             \
 609   SOE_p;                                        \
 610 })
 611
 612 #else  /* not __GNUC__ */
 613
 614 static char *
 615 strpbrk_or_eos (const char *s, const char *accept)
 616 {
 617   char *p = strpbrk (s, accept);
 618   if (!p)
 619     p = (char *)s + strlen (s);
 620   return p;
 621 }
 622 #endif
 623
 624 /* Turn STR into lowercase; return non-zero if a character was
 625    actually changed. */
 626
 627 static int
 628 lowercase_str (char *str)
 629 {
 630   int change = 0;
 631   for (; *str; str++)
 632     if (ISUPPER (*str))
 633       {
 634         change = 1;
 635         *str = TOLOWER (*str);
 636       }
 637   return change;
 638 }
 639
/* Messages for the error codes that url_parse stores in *ERROR.  Each
   PE_* constant is defined directly above the message it indexes, to
   keep the code and its position in the array visibly in sync.  The
   strings are wrapped in N_() here and passed through _() by
   url_error when they are looked up.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_EMPTY_HOST                   2
  N_("Empty host"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 658
 659 /* Parse a URL.
 660
 661    Return a new struct url if successful, NULL on error.  In case of
 662    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 663    error code. */
 664 struct url *
 665 url_parse (const char *url, int *error)
 666 {
 667   struct url *u;
 668   const char *p;
 669   int path_modified, host_modified;
 670
 671   enum url_scheme scheme;
 672
 673   const char *uname_b,     *uname_e;
 674   const char *host_b,      *host_e;
 675   const char *path_b,      *path_e;
 676   const char *params_b,    *params_e;
 677   const char *query_b,     *query_e;
 678   const char *fragment_b,  *fragment_e;
 679
 680   int port;
 681   char *user = NULL, *passwd = NULL;
 682
 683   char *url_encoded = NULL;
 684
 685   int error_code;
 686
 687   scheme = url_scheme (url);
 688   if (scheme == SCHEME_INVALID)
 689     {
 690       error_code = PE_UNSUPPORTED_SCHEME;
 691       goto error;
 692     }
 693
 694   url_encoded = reencode_escapes (url);
 695   p = url_encoded;
 696
 697   p += strlen (supported_schemes[scheme].leading_string);
 698   uname_b = p;
 699   p = url_skip_credentials (p);
 700   uname_e = p;
 701
 702   /* scheme://user:pass@host[:port]... */
 703   /*                    ^              */
 704
 705   /* We attempt to break down the URL into the components path,
 706      params, query, and fragment.  They are ordered like this:
 707
 708        scheme://host[:port][/path][;params][?query][#fragment]  */
 709
 710   params_b   = params_e   = NULL;
 711   query_b    = query_e    = NULL;
 712   fragment_b = fragment_e = NULL;
 713
 714   host_b = p;
 715
 716   if (*p == '[')
 717     {
 718       /* Handle IPv6 address inside square brackets.  Ideally we'd
 719          just look for the terminating ']', but rfc2732 mandates
 720          rejecting invalid IPv6 addresses.  */
 721
 722       /* The address begins after '['. */
 723       host_b = p + 1;
 724       host_e = strchr (host_b, ']');
 725
 726       if (!host_e)
 727         {
 728           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 729           goto error;
 730         }
 731
 732 #ifdef ENABLE_IPV6
 733       /* Check if the IPv6 address is valid. */
 734       if (!is_valid_ipv6_address(host_b, host_e))
 735         {
 736           error_code = PE_INVALID_IPV6_ADDRESS;
 737           goto error;
 738         }
 739
 740       /* Continue parsing after the closing ']'. */
 741       p = host_e + 1;
 742 #else
 743       error_code = PE_IPV6_NOT_SUPPORTED;
 744       goto error;
 745 #endif
 746     }
 747   else
 748     {
 749       p = strpbrk_or_eos (p, ":/;?#");
 750       host_e = p;
 751     }
 752
 753   if (host_b == host_e)
 754     {
 755       error_code = PE_EMPTY_HOST;
 756       goto error;
 757     }
 758
 759   port = scheme_default_port (scheme);
 760   if (*p == ':')
 761     {
 762       const char *port_b, *port_e, *pp;
 763
 764       /* scheme://host:port/tralala */
 765       /*              ^             */
 766       ++p;
 767       port_b = p;
 768       p = strpbrk_or_eos (p, "/;?#");
 769       port_e = p;
 770
 771       /* Allow empty port, as per rfc2396. */
 772       if (port_b != port_e)
 773         {
 774           for (port = 0, pp = port_b; pp < port_e; pp++)
 775             {
 776               if (!ISDIGIT (*pp))
 777                 {
 778                   /* http://host:12randomgarbage/blah */
 779                   /*               ^                  */
 780                   error_code = PE_BAD_PORT_NUMBER;
 781                   goto error;
 782                 }
 783               port = 10 * port + (*pp - '0');
 784             }
 785         }
 786     }
 787
 788   if (*p == '/')
 789     {
 790       ++p;
 791       path_b = p;
 792       p = strpbrk_or_eos (p, ";?#");
 793       path_e = p;
 794     }
 795   else
 796     {
 797       /* Path is not allowed not to exist. */
 798       path_b = path_e = p;
 799     }
 800
 801   if (*p == ';')
 802     {
 803       ++p;
 804       params_b = p;
 805       p = strpbrk_or_eos (p, "?#");
 806       params_e = p;
 807     }
 808   if (*p == '?')
 809     {
 810       ++p;
 811       query_b = p;
 812       p = strpbrk_or_eos (p, "#");
 813       query_e = p;
 814
 815       /* Hack that allows users to use '?' (a wildcard character) in
 816          FTP URLs without it being interpreted as a query string
 817          delimiter.  */
 818       if (scheme == SCHEME_FTP)
 819         {
 820           query_b = query_e = NULL;
 821           path_e = p;
 822         }
 823     }
 824   if (*p == '#')
 825     {
 826       ++p;
 827       fragment_b = p;
 828       p += strlen (p);
 829       fragment_e = p;
 830     }
 831   assert (*p == 0);
 832
 833   if (uname_b != uname_e)
 834     {
 835       /* http://user:pass@host */
 836       /*        ^         ^    */
 837       /*     uname_b   uname_e */
 838       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 839         {
 840           error_code = PE_INVALID_USER_NAME;
 841           goto error;
 842         }
 843     }
 844
 845   u = xnew0 (struct url);
 846   u->scheme = scheme;
 847   u->host   = strdupdelim (host_b, host_e);
 848   u->port   = port;
 849   u->user   = user;
 850   u->passwd = passwd;
 851
 852   u->path = strdupdelim (path_b, path_e);
 853   path_modified = path_simplify (u->path);
 854   split_path (u->path, &u->dir, &u->file);
 855
 856   host_modified = lowercase_str (u->host);
 857
 858   /* Decode %HH sequences in host name.  This is important not so much
 859      to support %HH sequences, but to support binary characters (which
 860      will have been converted to %HH by reencode_escapes).  */
 861   if (strchr (u->host, '%'))
 862     {
 863       url_unescape (u->host);
 864       host_modified = 1;
 865     }
 866
 867   if (params_b)
 868     u->params = strdupdelim (params_b, params_e);
 869   if (query_b)
 870     u->query = strdupdelim (query_b, query_e);
 871   if (fragment_b)
 872     u->fragment = strdupdelim (fragment_b, fragment_e);
 873
 874   if (path_modified || u->fragment || host_modified || path_b == path_e)
 875     {
 876       /* If we suspect that a transformation has rendered what
 877          url_string might return different from URL_ENCODED, rebuild
 878          u->url using url_string.  */
 879       u->url = url_string (u, 0);
 880
 881       if (url_encoded != url)
 882         xfree ((char *) url_encoded);
 883     }
 884   else
 885     {
 886       if (url_encoded == url)
 887         u->url = xstrdup (url);
 888       else
 889         u->url = url_encoded;
 890     }
 891   url_encoded = NULL;
 892
 893   return u;
 894
 895  error:
 896   /* Cleanup in case of error: */
 897   if (url_encoded && url_encoded != url)
 898     xfree (url_encoded);
 899
 900   /* Transmit the error code to the caller, if the caller wants to
 901      know.  */
 902   if (error)
 903     *error = error_code;
 904   return NULL;
 905 }
 906
/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated
   (the table entry is passed through _()).  ERROR_CODE must be a
   valid index into parse_errors; this is only checked by assert.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}
 916
 917 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 918    expected to be URL-escaped.
 919
 920    The path is split into directory (the part up to the last slash)
 921    and file (the part after the last slash), which are subsequently
 922    unescaped.  Examples:
 923
 924    PATH                 DIR           FILE
 925    "foo/bar/baz"        "foo/bar"     "baz"
 926    "foo/bar/"           "foo/bar"     ""
 927    "foo"                ""            "foo"
 928    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 929
 930    DIR and FILE are freshly allocated.  */
 931
 932 static void
 933 split_path (const char *path, char **dir, char **file)
 934 {
 935   char *last_slash = strrchr (path, '/');
 936   if (!last_slash)
 937     {
 938       *dir = xstrdup ("");
 939       *file = xstrdup (path);
 940     }
 941   else
 942     {
 943       *dir = strdupdelim (path, last_slash);
 944       *file = xstrdup (last_slash + 1);
 945     }
 946   url_unescape (*dir);
 947   url_unescape (*file);
 948 }
 949
 950 /* Note: URL's "full path" is the path with the query string and
 951    params appended.  The "fragment" (#foo) is intentionally ignored,
 952    but that might be changed.  For example, if the original URL was
 953    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 954    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 955
 956 /* Return the length of the full path, without the terminating
 957    zero.  */
 958
 959 static int
 960 full_path_length (const struct url *url)
 961 {
 962   int len = 0;
 963
 964 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 965
 966   FROB (path);
 967   FROB (params);
 968   FROB (query);
 969
 970 #undef FROB
 971
 972   return len;
 973 }
 974
 975 /* Write out the full path. */
 976
 977 static void
 978 full_path_write (const struct url *url, char *where)
 979 {
 980 #define FROB(el, chr) do {                      \
 981   char *f_el = url->el;                         \
 982   if (f_el) {                                   \
 983     int l = strlen (f_el);                      \
 984     *where++ = chr;                             \
 985     memcpy (where, f_el, l);                    \
 986     where += l;                                 \
 987   }                                             \
 988 } while (0)
 989
 990   FROB (path, '/');
 991   FROB (params, ';');
 992   FROB (query, '?');
 993
 994 #undef FROB
 995 }
 996
 997 /* Public function for getting the "full path".  E.g. if u->path is
 998    "foo/bar" and u->query is "param=value", full_path will be
 999    "/foo/bar?param=value". */
1000
1001 char *
1002 url_full_path (const struct url *url)
1003 {
1004   int length = full_path_length (url);
1005   char *full_path = (char *)xmalloc(length + 1);
1006
1007   full_path_write (url, full_path);
1008   full_path[length] = '\0';
1009
1010   return full_path;
1011 }
1012
1013 /* Escape unsafe and reserved characters, except for the slash
1014    characters.  */
1015
1016 static char *
1017 url_escape_dir (const char *dir)
1018 {
1019   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1020   char *h, *t;
1021   if (newdir == dir)
1022     return (char *)dir;
1023
1024   /* Unescape slashes in NEWDIR. */
1025
1026   h = newdir;                   /* hare */
1027   t = newdir;                   /* tortoise */
1028
1029   for (; *h; h++, t++)
1030     {
1031       /* url_escape_1 having converted '/' to "%2F" exactly. */
1032       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1033         {
1034           *t = '/';
1035           h += 2;
1036         }
1037       else
1038         *t = *h;
1039     }
1040   *t = '\0';
1041
1042   return newdir;
1043 }
1044
1045 /* Sync u->path and u->url with u->dir and u->file.  Called after
1046    u->file or u->dir have been changed, typically by the FTP code.  */
1047
1048 static void
1049 sync_path (struct url *u)
1050 {
1051   char *newpath, *efile, *edir;
1052
1053   xfree (u->path);
1054
1055   /* u->dir and u->file are not escaped.  URL-escape them before
1056      reassembling them into u->path.  That way, if they contain
1057      separators like '?' or even if u->file contains slashes, the
1058      path will be correctly assembled.  (u->file can contain slashes
1059      if the URL specifies it with %2f, or if an FTP server returns
1060      it.)  */
1061   edir = url_escape_dir (u->dir);
1062   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1063
1064   if (!*edir)
1065     newpath = xstrdup (efile);
1066   else
1067     {
1068       int dirlen = strlen (edir);
1069       int filelen = strlen (efile);
1070
1071       /* Copy "DIR/FILE" to newpath. */
1072       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1073       memcpy (p, edir, dirlen);
1074       p += dirlen;
1075       *p++ = '/';
1076       memcpy (p, efile, filelen);
1077       p += filelen;
1078       *p++ = '\0';
1079     }
1080
1081   u->path = newpath;
1082
1083   if (edir != u->dir)
1084     xfree (edir);
1085   if (efile != u->file)
1086     xfree (efile);
1087
1088   /* Regenerate u->url as well.  */
1089   xfree (u->url);
1090   u->url = url_string (u, 0);
1091 }
1092
1093 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1094    This way we can sync u->path and u->url when they get changed.  */
1095
1096 void
1097 url_set_dir (struct url *url, const char *newdir)
1098 {
1099   xfree (url->dir);
1100   url->dir = xstrdup (newdir);
1101   sync_path (url);
1102 }
1103
1104 void
1105 url_set_file (struct url *url, const char *newfile)
1106 {
1107   xfree (url->file);
1108   url->file = xstrdup (newfile);
1109   sync_path (url);
1110 }
1111
1112 void
1113 url_free (struct url *url)
1114 {
1115   xfree (url->host);
1116   xfree (url->path);
1117   xfree (url->url);
1118
1119   xfree_null (url->params);
1120   xfree_null (url->query);
1121   xfree_null (url->fragment);
1122   xfree_null (url->user);
1123   xfree_null (url->passwd);
1124
1125   xfree (url->dir);
1126   xfree (url->file);
1127
1128   xfree (url);
1129 }
1130 \f
1131 /* Create all the necessary directories for PATH (a file).  Calls
1132    mkdirhier() internally.  */
1133 int
1134 mkalldirs (const char *path)
1135 {
1136   const char *p;
1137   char *t;
1138   struct_stat st;
1139   int res;
1140
1141   p = path + strlen (path);
1142   for (; *p != '/' && p != path; p--)
1143     ;
1144
1145   /* Don't create if it's just a file.  */
1146   if ((p == path) && (*p != '/'))
1147     return 0;
1148   t = strdupdelim (path, p);
1149
1150   /* Check whether the directory exists.  */
1151   if ((stat (t, &st) == 0))
1152     {
1153       if (S_ISDIR (st.st_mode))
1154         {
1155           xfree (t);
1156           return 0;
1157         }
1158       else
1159         {
1160           /* If the dir exists as a file name, remove it first.  This
1161              is *only* for Wget to work with buggy old CERN http
1162              servers.  Here is the scenario: When Wget tries to
1163              retrieve a directory without a slash, e.g.
1164              http://foo/bar (bar being a directory), CERN server will
1165              not redirect it too http://foo/bar/ -- it will generate a
1166              directory listing containing links to bar/file1,
1167              bar/file2, etc.  Wget will lose because it saves this
1168              HTML listing to a file `bar', so it cannot create the
1169              directory.  To work around this, if the file of the same
1170              name exists, we just remove it and create the directory
1171              anyway.  */
1172           DEBUGP (("Removing %s because of directory danger!\n", t));
1173           unlink (t);
1174         }
1175     }
1176   res = make_directory (t);
1177   if (res != 0)
1178     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1179   xfree (t);
1180   return res;
1181 }
1182 \f
1183 /* Functions for constructing the file name out of URL components.  */
1184
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset in BASE where the next
                                   character will be appended */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that arbitrary
   expressions (e.g. a conditional) can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1213
1214 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1215    terminated.  */
1216
1217 static void
1218 append_string (const char *str, struct growable *dest)
1219 {
1220   int l = strlen (str);
1221   GROW (dest, l);
1222   memcpy (TAIL (dest), str, l);
1223   TAIL_INCR (dest, l);
1224 }
1225
1226 /* Append CH to DEST.  For example, append_char (0, DEST)
1227    zero-terminates DEST.  */
1228
1229 static void
1230 append_char (char ch, struct growable *dest)
1231 {
1232   GROW (dest, 1);
1233   *TAIL (dest) = ch;
1234   TAIL_INCR (dest, 1);
1235 }
1236
/* Bits stored in filechr_table, describing why a character may not be
   used verbatim in a file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character: 0-31, 127 (DEL)
                                   and 128-159 */
};

/* Non-zero if character C has any of the bits in MASK set in
   filechr_table.  C is converted to unsigned char before indexing, so
   plain (possibly signed) chars are safe to pass.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   Note that `|' carries the W bit, in agreement with the list of
   Windows-unsafe characters given for filechr_not_windows, and that
   DEL is flagged as a control character like 0-31 and 128-159.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1294
/* FN_PORT_SEP separates host and port in file names when the port is
   not the scheme's default, as in "www.xemacs.org:4001/index.html".
   It is '+' when file names are restricted for Windows, which cannot
   handle ':' in file names, and ':' otherwise.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is '@'
   when file names are restricted for Windows, which cannot handle '?'
   in file names, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1305
1306 /* Quote path element, characters in [b, e), as file name, and append
1307    the quoted string to DEST.  Each character is quoted as per
1308    file_unsafe_char and the corresponding table.
1309
1310    If ESCAPED_P is non-zero, the path element is considered to be
1311    URL-escaped and will be unescaped prior to inspection.  */
1312
1313 static void
1314 append_uri_pathel (const char *b, const char *e, int escaped_p,
1315                    struct growable *dest)
1316 {
1317   const char *p;
1318   int quoted, outlen;
1319
1320   int mask;
1321   if (opt.restrict_files_os == restrict_unix)
1322     mask = filechr_not_unix;
1323   else
1324     mask = filechr_not_windows;
1325   if (opt.restrict_files_ctrl)
1326     mask |= filechr_control;
1327
1328   /* Copy [b, e) to PATHEL and URL-unescape it. */
1329   if (escaped_p)
1330     {
1331       char *unescaped;
1332       BOUNDED_TO_ALLOCA (b, e, unescaped);
1333       url_unescape (unescaped);
1334       b = unescaped;
1335       e = unescaped + strlen (unescaped);
1336     }
1337
1338   /* Defang ".." when found as component of path.  Remember that path
1339      comes from the URL and might contain malicious input.  */
1340   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1341     {
1342       b = "%2E%2E";
1343       e = b + 6;
1344     }
1345
1346   /* Walk the PATHEL string and check how many characters we'll need
1347      to quote.  */
1348   quoted = 0;
1349   for (p = b; p < e; p++)
1350     if (FILE_CHAR_TEST (*p, mask))
1351       ++quoted;
1352
1353   /* Calculate the length of the output string.  e-b is the input
1354      string length.  Each quoted char introduces two additional
1355      characters in the string, hence 2*quoted.  */
1356   outlen = (e - b) + (2 * quoted);
1357   GROW (dest, outlen);
1358
1359   if (!quoted)
1360     {
1361       /* If there's nothing to quote, we can simply append the string
1362          without processing it again.  */
1363       memcpy (TAIL (dest), b, outlen);
1364     }
1365   else
1366     {
1367       char *q = TAIL (dest);
1368       for (p = b; p < e; p++)
1369         {
1370           if (!FILE_CHAR_TEST (*p, mask))
1371             *q++ = *p;
1372           else
1373             {
1374               unsigned char ch = *p;
1375               *q++ = '%';
1376               *q++ = XNUM_TO_DIGIT (ch >> 4);
1377               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1378             }
1379         }
1380       assert (q - TAIL (dest) == outlen);
1381     }
1382   TAIL_INCR (dest, outlen);
1383 }
1384
1385 /* Append to DEST the directory structure that corresponds the
1386    directory part of URL's path.  For example, if the URL is
1387    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1388
1389    Each path element ("dir1" and "dir2" in the above example) is
1390    examined, url-unescaped, and re-escaped as file name element.
1391
1392    Additionally, it cuts as many directories from the path as
1393    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1394    will produce "bar" for the above example.  For 2 or more, it will
1395    produce "".
1396
1397    Each component of the path is quoted for use as file name.  */
1398
1399 static void
1400 append_dir_structure (const struct url *u, struct growable *dest)
1401 {
1402   char *pathel, *next;
1403   int cut = opt.cut_dirs;
1404
1405   /* Go through the path components, de-URL-quote them, and quote them
1406      (if necessary) as file names.  */
1407
1408   pathel = u->path;
1409   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1410     {
1411       if (cut-- > 0)
1412         continue;
1413       if (pathel == next)
1414         /* Ignore empty pathels.  */
1415         continue;
1416
1417       if (dest->tail)
1418         append_char ('/', dest);
1419       append_uri_pathel (pathel, next, 1, dest);
1420     }
1421 }
1422
1423 /* Return a unique file name that matches the given URL as good as
1424    possible.  Does not create directories on the file system.  */
1425
1426 char *
1427 url_file_name (const struct url *u)
1428 {
1429   struct growable fnres;
1430
1431   const char *u_file, *u_query;
1432   char *fname, *unique;
1433
1434   fnres.base = NULL;
1435   fnres.size = 0;
1436   fnres.tail = 0;
1437
1438   /* Start with the directory prefix, if specified. */
1439   if (opt.dir_prefix)
1440     append_string (opt.dir_prefix, &fnres);
1441
1442   /* If "dirstruct" is turned on (typically the case with -r), add
1443      the host and port (unless those have been turned off) and
1444      directory structure.  */
1445   if (opt.dirstruct)
1446     {
1447       if (opt.protocol_directories)
1448         {
1449           if (fnres.tail)
1450             append_char ('/', &fnres);
1451           append_string (supported_schemes[u->scheme].name, &fnres);
1452         }
1453       if (opt.add_hostdir)
1454         {
1455           if (fnres.tail)
1456             append_char ('/', &fnres);
1457           if (0 != strcmp (u->host, ".."))
1458             append_string (u->host, &fnres);
1459           else
1460             /* Host name can come from the network; malicious DNS may
1461                allow ".." to be resolved, causing us to write to
1462                "../<file>".  Defang such host names.  */
1463             append_string ("%2E%2E", &fnres);
1464           if (u->port != scheme_default_port (u->scheme))
1465             {
1466               char portstr[24];
1467               number_to_string (portstr, u->port);
1468               append_char (FN_PORT_SEP, &fnres);
1469               append_string (portstr, &fnres);
1470             }
1471         }
1472
1473       append_dir_structure (u, &fnres);
1474     }
1475
1476   /* Add the file name. */
1477   if (fnres.tail)
1478     append_char ('/', &fnres);
1479   u_file = *u->file ? u->file : "index.html";
1480   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1481
1482   /* Append "?query" to the file name. */
1483   u_query = u->query && *u->query ? u->query : NULL;
1484   if (u_query)
1485     {
1486       append_char (FN_QUERY_SEP, &fnres);
1487       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1488     }
1489
1490   /* Zero-terminate the file name. */
1491   append_char ('\0', &fnres);
1492
1493   fname = fnres.base;
1494
1495   /* Check the cases in which the unique extensions are not used:
1496      1) Clobbering is turned off (-nc).
1497      2) Retrieval with regetting.
1498      3) Timestamping is used.
1499      4) Hierarchy is built.
1500
1501      The exception is the case when file does exist and is a
1502      directory (see `mkalldirs' for explanation).  */
1503
1504   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1505       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1506     return fname;
1507
1508   unique = unique_name (fname, 1);
1509   if (unique != fname)
1510     xfree (fname);
1511   return unique;
1512 }
1513 \f
/* Resolve "." and ".." elements of PATH in place.  Returns non-zero
   if PATH was modified and zero if it was left untouched.

   The algorithm is similar in spirit to the one in rfc1808 but works
   in a single pass: elements consisting only of "." are dropped and
   ".." removes the element before it.  A ".." that has nothing left
   to remove is kept literally.  Single leading and trailing slashes
   are preserved.

   URL escapes are not interpreted.  Callers passing paths taken from
   URLs must make sure "%2e" and "%2E" have already been turned into
   "." so the dots can be found.  (Wget's URL parser calls
   reencode_escapes, which see.)

   For example, "a/b/c/./../d/.." becomes "a/b/".  If you change
   anything here, run test_path_simplify to make sure no test case
   has been broken.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position */
  char *dst = path;             /* write position, never ahead of SRC */
  char *limit = path;           /* DST may not retreat below this */
  char *end = path + strlen (path);
  int changed;

  /* At the top of each iteration SRC is at the start of an element.  */
  while (src < end)
    {
      int is_dot = (src[0] == '.' && (src[1] == '/' || src[1] == '\0'));
      int is_dotdot = (src[0] == '.' && src[1] == '.'
                       && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (is_dotdot && dst > limit)
        {
          /* Back DST up to the start of the previous element, but
             never beyond LIMIT, then drop the "../".  */
          --dst;
          while (dst > limit && dst[-1] != '/')
            --dst;
          src += 3;
          continue;
        }

      if (is_dotdot)
        /* Nothing to back over: the "../" is kept literally, and
           LIMIT moves past it so that a later ".." cannot remove
           it.  */
        limit = dst + 3;

      /* Keep this element, up to and including its slash.  While DST
         has not fallen behind SRC there is nothing to copy; just
         advance both.  */
      if (dst == src)
        {
          while (src < end && *src != '/')
            ++src;
          if (src < end)
            ++src;
          dst = src;
        }
      else
        {
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  /* The path changed exactly when DST ended up behind SRC; only then
     does it need a new terminator.  */
  changed = (dst != src);
  if (changed)
    *dst = '\0';

  return changed;
}
1601 \f
/* Return the number of characters at the start of URL that belong to
   its path, i.e. the distance from URL to the position reported by
   strpbrk_or_eos for the terminators '?', ';' and '#'.  */

static int
path_length (const char *url)
{
  const char *stop = strpbrk_or_eos (url, "?;#");
  int len = stop - url;

  return len;
}
1612
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is outside
   the range and is never examined; the character at B is.  An empty
   range (e <= b) yields NULL.  We might want to use memrchr (a GNU
   extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step E back before looking, so the scan covers e-1 down to b.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1625
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_splice (const char *base, int span,
                  const char *link, int linklength)
{
  char *merged = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merged, base, span);
  memcpy (merged + span, link, linklength);
  merged[span + linklength] = '\0';

  return merged;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either URI may be absolute or relative, with or without a host
   name.  Only minimal URL parsing is done, with no knowledge of the
   specifics of individual schemes.  Depending on how LINK starts, a
   different amount of BASE is retained in front of it:

     has a scheme   nothing; LINK is returned as is
     empty          all of BASE
     '?'            BASE's path (up to its first '?', ';' or '#')
     '#'            BASE up to its '#', if any
     "//"           BASE up to its first "//", if any
     '/'            BASE up to the first slash after "//"
     anything else  BASE up to and including its last slash

   path_simplify is deliberately not called on the result: it would
   complicate the code, and url_parse has to simplify the path
   anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;
  int linklength;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* BASE is not examined past END, the end of its path.  */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all.  */
    return xstrdup (base);

  if (*link == '?')
    {
      /* Same location, new query string:
         uri_merge("path",         "?new") -> "path?new"
         uri_merge("path?foo",     "?new") -> "path?new"
         uri_merge("path?foo#bar", "?new") -> "path?new"
         uri_merge("path#foo",     "?new") -> "path?new"  */
      return uri_merge_splice (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Same location and query, new fragment:
         uri_merge("path",         "#new") -> "path#new"
         uri_merge("path#foo",     "#new") -> "path#new"
         uri_merge("path?foo",     "#new") -> "path?foo#new"
         uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *frag = strchr (base, '#');
      if (!frag)
        frag = base + strlen (base);
      return uri_merge_splice (base, frag - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path; it replaces everything from BASE's double
         slash onwards, or all of BASE if the first slash of BASE is
         not doubled:
         uri_merge("foo", "//new/bar")            -> "//new/bar"
         uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
         uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  if (link[0] == '/')
    {
      /* LINK is an absolute path; it replaces everything from the
         first slash that follows BASE's "//".  With BASE
         "http://host/whatever/foo/bar" and LINK "/qux/xyzzy" the
         result is "http://host/qux/xyzzy".  */
      int seen_slash_slash = 0;
      const char *start_insert;
      const char *slash = memchr (base, '/', end - base);

      /* If the first slash is doubled, skip the pair and search once
         more for the slash that starts the path.  */
      if (slash && slash[1] == '/')
        {
          seen_slash_slash = 1;
          slash = memchr (slash + 2, '/', end - (slash + 2));
        }

      if (!seen_slash_slash)
        /* "foo" or "foo/bar": LINK replaces BASE entirely.  */
        start_insert = base;
      else if (slash)
        /* "http://something/": insert at the path's first slash.  */
        start_insert = slash;
      else
        /* "http://foo": no path at all, insert at the end.  */
        start_insert = end;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  {
    /* LINK is a relative path; it replaces whatever follows the last
       slash of BASE's path.  With BASE "whatever/foo/bar" and LINK
       "qux/xyzzy" the result is "whatever/foo/qux/xyzzy".  */
    const char *last_slash = find_last_char (base, end, '/');
    char *merged;
    int span;

    if (last_slash && last_slash >= base + 2
        && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The last slash belongs to "://", as in "http://host": BASE
           has no path.  Keep BASE through END plus one extra byte,
           and turn that extra byte into the missing slash.  */
        span = end + 1 - base;
        merged = uri_merge_splice (base, span, link, linklength);
        merged[span - 1] = '/';
        return merged;
      }

    /* Without any slash LINK replaces all of BASE; otherwise BASE is
       kept up to and including the last slash.  */
    span = last_slash ? last_slash + 1 - base : 0;
    return uri_merge_splice (base, span, link, linklength);
  }
}
1816 \f
/* Shown in place of the real password when it has to stay hidden.
   The string is deliberately generic so that it does not reveal the
   number of characters in the password, as previous versions did.  */
#define HIDDEN_PASSWORD "*password*"

/* Copy the characters of string S (without the terminating zero) to
   P and advance P past them.  P must be a modifiable pointer
   variable; S is evaluated twice.  */
#define APPEND(p, s) do {                       \
  int append_len_ = strlen (s);                 \
  memcpy (p, s, append_len_);                   \
  p += append_len_;                             \
} while (0)
1828
1829 /* Recreate the URL string from the data in URL.
1830
1831    If HIDE is non-zero (as it is when we're calling this on a URL we
1832    plan to print, but not when calling it to canonicalize a URL for
1833    use within the program), password will be hidden.  Unsafe
1834    characters in the URL will be quoted.  */
1835
1836 char *
1837 url_string (const struct url *url, int hide_password)
1838 {
1839   int size;
1840   char *result, *p;
1841   char *quoted_user = NULL, *quoted_passwd = NULL;
1842
1843   int scheme_port  = supported_schemes[url->scheme].default_port;
1844   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1845   int fplen = full_path_length (url);
1846
1847   int brackets_around_host;
1848
1849   assert (scheme_str != NULL);
1850
1851   /* Make sure the user name and password are quoted. */
1852   if (url->user)
1853     {
1854       quoted_user = url_escape_allow_passthrough (url->user);
1855       if (url->passwd)
1856         {
1857           if (hide_password)
1858             quoted_passwd = HIDDEN_PASSWORD;
1859           else
1860             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1861         }
1862     }
1863
1864   /* Numeric IPv6 addresses can contain ':' and need to be quoted with
1865      brackets.  */
1866   brackets_around_host = strchr (url->host, ':') != NULL;
1867
1868   size = (strlen (scheme_str)
1869           + strlen (url->host)
1870           + (brackets_around_host ? 2 : 0)
1871           + fplen
1872           + 1);
1873   if (url->port != scheme_port)
1874     size += 1 + numdigit (url->port);
1875   if (quoted_user)
1876     {
1877       size += 1 + strlen (quoted_user);
1878       if (quoted_passwd)
1879         size += 1 + strlen (quoted_passwd);
1880     }
1881
1882   p = result = xmalloc (size);
1883
1884   APPEND (p, scheme_str);
1885   if (quoted_user)
1886     {
1887       APPEND (p, quoted_user);
1888       if (quoted_passwd)
1889         {
1890           *p++ = ':';
1891           APPEND (p, quoted_passwd);
1892         }
1893       *p++ = '@';
1894     }
1895
1896   if (brackets_around_host)
1897     *p++ = '[';
1898   APPEND (p, url->host);
1899   if (brackets_around_host)
1900     *p++ = ']';
1901   if (url->port != scheme_port)
1902     {
1903       *p++ = ':';
1904       p = number_to_string (p, url->port);
1905     }
1906
1907   full_path_write (url, p);
1908   p += fplen;
1909   *p++ = '\0';
1910
1911   assert (p - result == size);
1912
1913   if (quoted_user && quoted_user != url->user)
1914     xfree (quoted_user);
1915   if (quoted_passwd && !hide_password
1916       && quoted_passwd != url->passwd)
1917     xfree (quoted_passwd);
1918
1919   return result;
1920 }
1921 \f
1922 /* Return non-zero if scheme a is similar to scheme b.
1923
1924    Schemes are similar if they are equal.  If SSL is supported, schemes
1925    are also similar if one is http (SCHEME_HTTP) and the other is https
1926    (SCHEME_HTTPS).  */
1927 int
1928 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1929 {
1930   if (a == b)
1931     return 1;
1932 #ifdef HAVE_SSL
1933   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1934       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1935     return 1;
1936 #endif
1937   return 0;
1938 }
1939 \f
1940 #if 0
1941 /* Debugging and testing support for path_simplify. */
1942
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and return the duplicate.  PATH
   itself is left alone.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1952
/* Run path_simplify on a private copy of TEST and report mismatches
   on standard output.

   A message is printed when the simplified string differs from
   EXPECTED_RESULT, and another one when the value returned by
   path_simplify differs from EXPECTED_CHANGE.  Nothing is printed
   when both agree.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *scratch = xstrdup (test);
  int changed = path_simplify (scratch);

  if (strcmp (scratch, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, scratch);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (scratch);
}
1975
/* Feed a fixed table of paths to run_test.  Each entry lists the
   input path, the string expected after path_simplify, and the value
   path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int changes;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int i;

  for (i = 0; i < countof (cases); i++)
    run_test (cases[i].input, cases[i].expected, cases[i].changes);
}
2016 #endif