sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
  55 struct scheme_data                /* One row of the supported_schemes table.  */
  56 {
  57   const char *name;               /* bare scheme name, e.g. "http" */
  58   const char *leading_string;     /* URL prefix that url_scheme matches case-insensitively, e.g. "http://" */
  59   int default_port;               /* value handed out by scheme_default_port */
  60   int enabled;                    /* zeroed by scheme_disable; url_scheme reports a disabled scheme as SCHEME_INVALID */
  61 };
  62
  63 /* Supported schemes: */
  64 static struct scheme_data supported_schemes[] =   /* indexed by enum url_scheme: url_scheme returns the matching row's index */
  65 {
  66   { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
  67 #ifdef HAVE_SSL   /* the https row exists only in SSL-enabled builds */
  68   { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
  69 #endif
  70   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },
  71
  72   /* SCHEME_INVALID */
  73   { NULL,       NULL,       -1,                 0 }   /* sentinel: the NULL leading_string stops the scan in url_scheme */
  74 };
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  91    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  92    as recommended by rfc2396, and minus "~", which is very frequently
  93    used (and sometimes unrecognized as %7E by broken servers).
  94
  95    An unsafe character is the one that should be encoded when URLs are
  96    placed in foreign environments.  E.g. space and newline are unsafe
  97    in HTTP contexts because HTTP uses them as separator and line
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
 104 enum {   /* Bit flags OR-ed together in the entries of urlchr_table.  */
 105   /* rfc1738 reserved chars + "$" and ",".  */
 106   urlchr_reserved = 1,
 107
 108   /* rfc1738 unsafe chars, plus non-printables.  */
 109   urlchr_unsafe   = 2
 110 };
 111
 112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))   /* non-zero iff C's table entry has a MASK bit set; the cast keeps the index in 0..255 */
 113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)   /* non-zero iff C is flagged reserved */
 114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)   /* non-zero iff C is flagged unsafe */
 115
 116 /* Shorthands for the table: */
 117 #define R  urlchr_reserved
 118 #define U  urlchr_unsafe
 119 #define RU R|U   /* not parenthesized: only meant to be used as a bare initializer element below */
 120
 121 static const unsigned char urlchr_table[256] =   /* one flag byte per character code; looked up through urlchr_test */
 122 {
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 127   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 128   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 129   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 130   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 131  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 133   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 134   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 135   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 138   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0x80-0x8F: every code from 0x80 up is unsafe */
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0x90-0x9F */
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xA0-0xAF */
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xB0-0xBF */
 144
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xC0-0xCF */
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xD0-0xDF */
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xE0-0xEF */
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,   /* 0xF0-0xFF */
 149 };
 150 #undef R   /* the shorthands are local to the table initializer */
 151 #undef U
 152 #undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           char c;
 179           /* Do nothing if '%' is not followed by two hex digits. */
 180           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 181             goto copychar;
 182           c = X2DIGITS_TO_NUM (h[1], h[2]);
 183           /* Don't unescape %00 because there is no way to insert it
 184              into a C string without effectively truncating it. */
 185           if (c == '\0')
 186             goto copychar;
 187           *t = c;
 188           h += 2;
 189         }
 190     }
 191   *t = '\0';
 192 }
 193
 194 /* The core of url_escape_* functions.  Escapes the characters that
 195    match the provided mask in urlchr_table.
 196
 197    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 198    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 199    freshly allocated string will be returned in all cases.  */
 200
 201 static char *
 202 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 203 {
 204   const char *p1;
 205   char *p2, *newstr;
 206   int newlen;
 207   int addition = 0;
 208
 209   for (p1 = s; *p1; p1++)
 210     if (urlchr_test (*p1, mask))
 211       addition += 2;            /* Two more characters (hex digits) */
 212
 213   if (!addition)
 214     return allow_passthrough ? (char *)s : xstrdup (s);
 215
 216   newlen = (p1 - s) + addition;
 217   newstr = (char *)xmalloc (newlen + 1);
 218
 219   p1 = s;
 220   p2 = newstr;
 221   while (*p1)
 222     {
 223       /* Quote the characters that match the test mask. */
 224       if (urlchr_test (*p1, mask))
 225         {
 226           unsigned char c = *p1++;
 227           *p2++ = '%';
 228           *p2++ = XNUM_TO_DIGIT (c >> 4);
 229           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 230         }
 231       else
 232         *p2++ = *p1++;
 233     }
 234   assert (p2 - newstr == newlen);
 235   *p2 = '\0';
 236
 237   return newstr;
 238 }
 239
 240 /* URL-escape the unsafe characters (see urlchr_table) in a given
 241    string, returning a freshly allocated string.  */
 242
 243 char *
 244 url_escape (const char *s)
 245 {
 246   return url_escape_1 (s, urlchr_unsafe, 0);
 247 }
 248
 249 /* URL-escape the unsafe characters (see urlchr_table) in a given
 250    string.  If no characters are unsafe, S is returned.  */
 251
 252 static char *
 253 url_escape_allow_passthrough (const char *s)
 254 {
 255   return url_escape_1 (s, urlchr_unsafe, 1);
 256 }
 257 \f
 258 /* Decide whether the char at position P needs to be encoded.  (It is
 259    not enough to pass a single char *P because the function may need
 260    to inspect the surrounding context.)
 261
 262    Return 1 if the char should be escaped as %XX, 0 otherwise.  */
 263
 264 static inline int
 265 char_needs_escaping (const char *p)
 266 {
 267   if (*p == '%')
 268     {
 269       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 270         return 0;
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return 1;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return 1;
 277   else
 278     return 0;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't can use url_unescape to get to the intended
 289    data.  This function is stable: once the input is transformed,
 290    further transformations of the result yield the same output.
 291
 292    Let's discuss why this function is needed.
 293
 294    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 295    a raw space character would mess up the HTTP request, it needs to
 296    be quoted, like this:
 297
 298        GET /abc%20def HTTP/1.0
 299
 300    It would appear that the unsafe chars need to be quoted, for
 301    example with url_escape.  But what if we're requested to download
 302    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 303    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 304    part of URL syntax, "%20" is the correct way to denote a literal
 305    space on the Wget command line.  This leads to the conclusion that
 306    in that case Wget should not call url_escape, but leave the `%20'
 307    as is.  This is clearly contradictory, but it only gets worse.
 308
 309    What if the requested URI is `abc%20 def'?  If we call url_escape,
 310    we end up with `/abc%2520%20def', which is almost certainly not
 311    intended.  If we don't call url_escape, we are left with the
 312    embedded space and cannot complete the request.  What the user
 313    meant was for Wget to request `/abc%20%20def', and this is where
 314    reencode_escapes kicks in.
 315
 316    Wget used to solve this by first decoding %-quotes, and then
 317    encoding all the "unsafe" characters found in the resulting string.
 318    This was wrong because it didn't preserve certain URL special
 319    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 320    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 321    whether we considered `+' reserved (it is).  One of these results
 322    is inevitable because by the second step we would lose information
 323    on whether the `+' was originally encoded or not.  Both results
 324    were wrong because in CGI parameters + means space, while %2B means
 325    literal plus.  reencode_escapes correctly translates the above to
 326    "a%2B+b", i.e. returns the original string.
 327
 328    This function uses a modified version of the algorithm originally
 329    proposed by Anon Sricharoenchai:
 330
 331    * Encode all "unsafe" characters, except those that are also
 332      "reserved", to %XX.  See urlchr_table for which characters are
 333      unsafe and reserved.
 334
 335    * Encode the "%" characters not followed by two hex digits to
 336      "%25".
 337
 338    * Pass through all other characters and %XX escapes as-is.  (Up to
 339      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 340      characters, but that was obtrusive and broke some servers.)
 341
 342    Anon's test case:
 343
 344    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 345    ->
 346    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 347
 348    Simpler test cases:
 349
 350    "foo bar"         -> "foo%20bar"
 351    "foo%20bar"       -> "foo%20bar"
 352    "foo %20bar"      -> "foo%20%20bar"
 353    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 354    "foo%25%20bar"    -> "foo%25%20bar"
 355    "foo%2%20bar"     -> "foo%252%20bar"
 356    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 357    "foo%2b+bar"      -> "foo%2b+bar"  */
 358
 359 static char *
 360 reencode_escapes (const char *s)
 361 {
 362   const char *p1;
 363   char *newstr, *p2;
 364   int oldlen, newlen;
 365
 366   int encode_count = 0;
 367
 368   /* First pass: inspect the string to see if there's anything to do,
 369      and to calculate the new length.  */
 370   for (p1 = s; *p1; p1++)
 371     if (char_needs_escaping (p1))
 372       ++encode_count;
 373
 374   if (!encode_count)
 375     /* The string is good as it is. */
 376     return (char *) s;          /* C const model sucks. */
 377
 378   oldlen = p1 - s;
 379   /* Each encoding adds two characters (hex digits).  */
 380   newlen = oldlen + 2 * encode_count;
 381   newstr = xmalloc (newlen + 1);
 382
 383   /* Second pass: copy the string to the destination address, encoding
 384      chars when needed.  */
 385   p1 = s;
 386   p2 = newstr;
 387
 388   while (*p1)
 389     if (char_needs_escaping (p1))
 390       {
 391         unsigned char c = *p1++;
 392         *p2++ = '%';
 393         *p2++ = XNUM_TO_DIGIT (c >> 4);
 394         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 395       }
 396     else
 397       *p2++ = *p1++;
 398
 399   *p2 = '\0';
 400   assert (p2 - newstr == newlen);
 401   return newstr;
 402 }
 403 \f
 404 /* Returns the scheme type if the scheme is supported, or
 405    SCHEME_INVALID if not.  */
 406
 407 enum url_scheme
 408 url_scheme (const char *url)
 409 {
 410   int i;
 411
 412   for (i = 0; supported_schemes[i].leading_string; i++)
 413     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 414                           strlen (supported_schemes[i].leading_string)))
 415       {
 416         if (supported_schemes[i].enabled)
 417           return (enum url_scheme) i;
 418         else
 419           return SCHEME_INVALID;
 420       }
 421
 422   return SCHEME_INVALID;
 423 }
 424
 425 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 426
 427 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 428    currently implemented, it returns true if URL begins with
 429    [-+a-zA-Z0-9]+: .  */
 430
 431 int
 432 url_has_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* The first char must be a scheme char. */
 437   if (!*p || !SCHEME_CHAR (*p))
 438     return 0;
 439   ++p;
 440   /* Followed by 0 or more scheme chars. */
 441   while (*p && SCHEME_CHAR (*p))
 442     ++p;
 443   /* Terminated by ':'. */
 444   return *p == ':';
 445 }
 446
 447 int
 448 scheme_default_port (enum url_scheme scheme)
 449 {
 450   return supported_schemes[scheme].default_port;
 451 }
 452
 453 void
 454 scheme_disable (enum url_scheme scheme)
 455 {
 456   supported_schemes[scheme].enabled = 0;
 457 }
 458
 459 /* Skip the username and password, if present in the URL.  The
 460    function should *not* be called with the complete URL, but with the
 461    portion after the scheme.
 462
 463    If no username and password are found, return URL.  */
 464
 465 static const char *
 466 url_skip_credentials (const char *url)
 467 {
 468   /* Look for '@' that comes before terminators, such as '/', '?',
 469      '#', or ';'.  */
 470   const char *p = (const char *)strpbrk (url, "@/?#;");
 471   if (!p || *p != '@')
 472     return url;
 473   return p + 1;
 474 }
 475
 476 /* Parse credentials contained in [BEG, END).  The region is expected
 477    to have come from a URL and is unescaped.  */
 478
 479 static int
 480 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 481 {
 482   char *colon;
 483   const char *userend;
 484
 485   if (beg == end)
 486     return 0;                   /* empty user name */
 487
 488   colon = memchr (beg, ':', end - beg);
 489   if (colon == beg)
 490     return 0;                   /* again empty user name */
 491
 492   if (colon)
 493     {
 494       *passwd = strdupdelim (colon + 1, end);
 495       userend = colon;
 496       url_unescape (*passwd);
 497     }
 498   else
 499     {
 500       *passwd = NULL;
 501       userend = end;
 502     }
 503   *user = strdupdelim (beg, userend);
 504   url_unescape (*user);
 505   return 1;
 506 }
 507
 508 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 509    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 510
 511    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 512    www.foo.com[:port]            -> http://www.foo.com[:port]
 513
 514    FTP shorthands look like this:
 515
 516    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 517    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 518
 519    If the URL needs not or cannot be rewritten, return NULL.  */
 520
 521 char *
 522 rewrite_shorthand_url (const char *url)
 523 {
 524   const char *p;
 525
 526   if (url_scheme (url) != SCHEME_INVALID)
 527     return NULL;
 528
 529   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 530      latter Netscape.  */
 531   for (p = url; *p && *p != ':' && *p != '/'; p++)
 532     ;
 533
 534   if (p == url)
 535     return NULL;
 536
 537   if (*p == ':')
 538     {
 539       const char *pp;
 540       char *res;
 541       /* If the characters after the colon and before the next slash
 542          or end of string are all digits, it's HTTP.  */
 543       int digits = 0;
 544       for (pp = p + 1; ISDIGIT (*pp); pp++)
 545         ++digits;
 546       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 547         goto http;
 548
 549       /* Prepend "ftp://" to the entire URL... */
 550       res = xmalloc (6 + strlen (url) + 1);
 551       sprintf (res, "ftp://%s", url);
 552       /* ...and replace ':' with '/'. */
 553       res[6 + (p - url)] = '/';
 554       return res;
 555     }
 556   else
 557     {
 558       char *res;
 559     http:
 560       /* Just prepend "http://" to what we have. */
 561       res = xmalloc (7 + strlen (url) + 1);
 562       sprintf (res, "http://%s", url);
 563       return res;
 564     }
 565 }
 566 \f
 567 static void split_path PARAMS ((const char *, char **, char **));
 568
 569 /* Like strpbrk, with the exception that it returns the pointer to the
 570    terminating zero (end-of-string aka "eos") if no matching character
 571    is found.
 572
 573    Although I normally balk at Gcc-specific optimizations, it probably
 574    makes sense here: glibc has optimizations that detect strpbrk being
 575    called with literal string as ACCEPT and inline the search.  That
 576    optimization is defeated if strpbrk is hidden within the call to
 577    another function.  (And no, making strpbrk_or_eos inline doesn't
 578    help because the check for literal accept is in the
 579    preprocessor.)  */
 580
 581 #ifdef __GNUC__
 582
 583 #define strpbrk_or_eos(s, accept) ({            \
 584   char *SOE_p = strpbrk (s, accept);            \
 585   if (!SOE_p)                                   \
 586     SOE_p = strchr (s, '\0');                   \
 587   SOE_p;                                        \
 588 })
 589
 590 #else  /* not __GNUC__ */
 591
 592 static inline char *
 593 strpbrk_or_eos (const char *s, const char *accept)
 594 {
 595   char *p = strpbrk (s, accept);
 596   if (!p)
 597     p = strchr (s, '\0');
 598   return p;
 599 }
 600 #endif /* not __GNUC__ */
 601
 602 /* Turn STR into lowercase; return non-zero if a character was
 603    actually changed. */
 604
 605 static int
 606 lowercase_str (char *str)
 607 {
 608   int change = 0;
 609   for (; *str; str++)
 610     if (ISUPPER (*str))
 611       {
 612         change = 1;
 613         *str = TOLOWER (*str);
 614       }
 615   return change;
 616 }
 617
 618 static const char *parse_errors[] = {
 619 #define PE_NO_ERROR                     0
 620   N_("No error"),
 621 #define PE_UNSUPPORTED_SCHEME           1
 622   N_("Unsupported scheme"),
 623 #define PE_EMPTY_HOST                   2
 624   N_("Empty host"),
 625 #define PE_BAD_PORT_NUMBER              3
 626   N_("Bad port number"),
 627 #define PE_INVALID_USER_NAME            4
 628   N_("Invalid user name"),
 629 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 630   N_("Unterminated IPv6 numeric address"),
 631 #define PE_IPV6_NOT_SUPPORTED           6
 632   N_("IPv6 addresses not supported"),
 633 #define PE_INVALID_IPV6_ADDRESS         7
 634   N_("Invalid IPv6 numeric address")
 635 };
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error)
 644 {
 645   struct url *u;
 646   const char *p;
 647   int path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650
 651   const char *uname_b,     *uname_e;
 652   const char *host_b,      *host_e;
 653   const char *path_b,      *path_e;
 654   const char *params_b,    *params_e;
 655   const char *query_b,     *query_e;
 656   const char *fragment_b,  *fragment_e;
 657
 658   int port;
 659   char *user = NULL, *passwd = NULL;
 660
 661   char *url_encoded = NULL;
 662
 663   int error_code;
 664
 665   scheme = url_scheme (url);
 666   if (scheme == SCHEME_INVALID)
 667     {
 668       error_code = PE_UNSUPPORTED_SCHEME;
 669       goto err;
 670     }
 671
 672   url_encoded = reencode_escapes (url);
 673   p = url_encoded;
 674
 675   p += strlen (supported_schemes[scheme].leading_string);
 676   uname_b = p;
 677   p = url_skip_credentials (p);
 678   uname_e = p;
 679
 680   /* scheme://user:pass@host[:port]... */
 681   /*                    ^              */
 682
 683   /* We attempt to break down the URL into the components path,
 684      params, query, and fragment.  They are ordered like this:
 685
 686        scheme://host[:port][/path][;params][?query][#fragment]  */
 687
 688   params_b   = params_e   = NULL;
 689   query_b    = query_e    = NULL;
 690   fragment_b = fragment_e = NULL;
 691
 692   host_b = p;
 693
 694   if (*p == '[')
 695     {
 696       /* Handle IPv6 address inside square brackets.  Ideally we'd
 697          just look for the terminating ']', but rfc2732 mandates
 698          rejecting invalid IPv6 addresses.  */
 699
 700       /* The address begins after '['. */
 701       host_b = p + 1;
 702       host_e = strchr (host_b, ']');
 703
 704       if (!host_e)
 705         {
 706           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 707           goto err;
 708         }
 709
 710 #ifdef ENABLE_IPV6
 711       /* Check if the IPv6 address is valid. */
 712       if (!is_valid_ipv6_address(host_b, host_e))
 713         {
 714           error_code = PE_INVALID_IPV6_ADDRESS;
 715           goto err;
 716         }
 717
 718       /* Continue parsing after the closing ']'. */
 719       p = host_e + 1;
 720 #else
 721       error_code = PE_IPV6_NOT_SUPPORTED;
 722       goto err;
 723 #endif
 724     }
 725   else
 726     {
 727       p = strpbrk_or_eos (p, ":/;?#");
 728       host_e = p;
 729     }
 730
 731   if (host_b == host_e)
 732     {
 733       error_code = PE_EMPTY_HOST;
 734       goto err;
 735     }
 736
 737   port = scheme_default_port (scheme);
 738   if (*p == ':')
 739     {
 740       const char *port_b, *port_e, *pp;
 741
 742       /* scheme://host:port/tralala */
 743       /*              ^             */
 744       ++p;
 745       port_b = p;
 746       p = strpbrk_or_eos (p, "/;?#");
 747       port_e = p;
 748
 749       /* Allow empty port, as per rfc2396. */
 750       if (port_b != port_e)
 751         {
 752           for (port = 0, pp = port_b; pp < port_e; pp++)
 753             {
 754               if (!ISDIGIT (*pp))
 755                 {
 756                   /* http://host:12randomgarbage/blah */
 757                   /*               ^                  */
 758                   error_code = PE_BAD_PORT_NUMBER;
 759                   goto err;
 760                 }
 761               port = 10 * port + (*pp - '0');
 762               /* Check for too large port numbers here, before we have
 763                  a chance to overflow on bogus port values.  */
 764               if (port > 65535)
 765                 {
 766                   error_code = PE_BAD_PORT_NUMBER;
 767                   goto err;
 768                 }
 769             }
 770         }
 771     }
 772
 773   if (*p == '/')
 774     {
 775       ++p;
 776       path_b = p;
 777       p = strpbrk_or_eos (p, ";?#");
 778       path_e = p;
 779     }
 780   else
 781     {
 782       /* Path is not allowed not to exist. */
 783       path_b = path_e = p;
 784     }
 785
 786   if (*p == ';')
 787     {
 788       ++p;
 789       params_b = p;
 790       p = strpbrk_or_eos (p, "?#");
 791       params_e = p;
 792     }
 793   if (*p == '?')
 794     {
 795       ++p;
 796       query_b = p;
 797       p = strpbrk_or_eos (p, "#");
 798       query_e = p;
 799
 800       /* Hack that allows users to use '?' (a wildcard character) in
 801          FTP URLs without it being interpreted as a query string
 802          delimiter.  */
 803       if (scheme == SCHEME_FTP)
 804         {
 805           query_b = query_e = NULL;
 806           path_e = p;
 807         }
 808     }
 809   if (*p == '#')
 810     {
 811       ++p;
 812       fragment_b = p;
 813       p += strlen (p);
 814       fragment_e = p;
 815     }
 816   assert (*p == 0);
 817
 818   if (uname_b != uname_e)
 819     {
 820       /* http://user:pass@host */
 821       /*        ^         ^    */
 822       /*     uname_b   uname_e */
 823       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 824         {
 825           error_code = PE_INVALID_USER_NAME;
 826           goto err;
 827         }
 828     }
 829
 830   u = xnew0 (struct url);
 831   u->scheme = scheme;
 832   u->host   = strdupdelim (host_b, host_e);
 833   u->port   = port;
 834   u->user   = user;
 835   u->passwd = passwd;
 836
 837   u->path = strdupdelim (path_b, path_e);
 838   path_modified = path_simplify (u->path);
 839   split_path (u->path, &u->dir, &u->file);
 840
 841   host_modified = lowercase_str (u->host);
 842
 843   /* Decode %HH sequences in host name.  This is important not so much
 844      to support %HH sequences in host names (which other browser
 845      don't), but to support binary characters (which will have been
 846      converted to %HH by reencode_escapes).  */
 847   if (strchr (u->host, '%'))
 848     {
 849       url_unescape (u->host);
 850       host_modified = 1;
 851     }
 852
 853   if (params_b)
 854     u->params = strdupdelim (params_b, params_e);
 855   if (query_b)
 856     u->query = strdupdelim (query_b, query_e);
 857   if (fragment_b)
 858     u->fragment = strdupdelim (fragment_b, fragment_e);
 859
 860   if (path_modified || u->fragment || host_modified || path_b == path_e)
 861     {
 862       /* If we suspect that a transformation has rendered what
 863          url_string might return different from URL_ENCODED, rebuild
 864          u->url using url_string.  */
 865       u->url = url_string (u, 0);
 866
 867       if (url_encoded != url)
 868         xfree ((char *) url_encoded);
 869     }
 870   else
 871     {
 872       if (url_encoded == url)
 873         u->url = xstrdup (url);
 874       else
 875         u->url = url_encoded;
 876     }
 877   url_encoded = NULL;
 878
 879   return u;
 880
 881  err:
 882   /* Cleanup in case of error: */
 883   if (url_encoded && url_encoded != url)
 884     xfree (url_encoded);
 885
 886   /* Transmit the error code to the caller, if the caller wants to
 887      know.  */
 888   if (error)
 889     *error = error_code;
 890   return NULL;
 891 }
 892
 893 /* Return the error message string from ERROR_CODE, which should have
 894    been retrieved from url_parse.  The error message is translated.  */
 895
 896 const char *
 897 url_error (int error_code)
 898 {
 899   assert (error_code >= 0 && error_code < countof (parse_errors));
 900   return _(parse_errors[error_code]);
 901 }
 902
 903 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 904    expected to be URL-escaped.
 905
 906    The path is split into directory (the part up to the last slash)
 907    and file (the part after the last slash), which are subsequently
 908    unescaped.  Examples:
 909
 910    PATH                 DIR           FILE
 911    "foo/bar/baz"        "foo/bar"     "baz"
 912    "foo/bar/"           "foo/bar"     ""
 913    "foo"                ""            "foo"
 914    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 915
 916    DIR and FILE are freshly allocated.  */
 917
 918 static void
 919 split_path (const char *path, char **dir, char **file)
 920 {
 921   char *last_slash = strrchr (path, '/');
 922   if (!last_slash)
 923     {
 924       *dir = xstrdup ("");
 925       *file = xstrdup (path);
 926     }
 927   else
 928     {
 929       *dir = strdupdelim (path, last_slash);
 930       *file = xstrdup (last_slash + 1);
 931     }
 932   url_unescape (*dir);
 933   url_unescape (*file);
 934 }
 935
 936 /* Note: URL's "full path" is the path with the query string and
 937    params appended.  The "fragment" (#foo) is intentionally ignored,
 938    but that might be changed.  For example, if the original URL was
 939    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 940    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 941
 942 /* Return the length of the full path, without the terminating
 943    zero.  */
 944
 945 static int
 946 full_path_length (const struct url *url)
 947 {
 948   int len = 0;
 949
 950 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 951
 952   FROB (path);
 953   FROB (params);
 954   FROB (query);
 955
 956 #undef FROB
 957
 958   return len;
 959 }
 960
 961 /* Write out the full path. */
 962
 963 static void
 964 full_path_write (const struct url *url, char *where)
 965 {
 966 #define FROB(el, chr) do {                      \
 967   char *f_el = url->el;                         \
 968   if (f_el) {                                   \
 969     int l = strlen (f_el);                      \
 970     *where++ = chr;                             \
 971     memcpy (where, f_el, l);                    \
 972     where += l;                                 \
 973   }                                             \
 974 } while (0)
 975
 976   FROB (path, '/');
 977   FROB (params, ';');
 978   FROB (query, '?');
 979
 980 #undef FROB
 981 }
 982
 983 /* Public function for getting the "full path".  E.g. if u->path is
 984    "foo/bar" and u->query is "param=value", full_path will be
 985    "/foo/bar?param=value". */
 986
 987 char *
 988 url_full_path (const struct url *url)
 989 {
 990   int length = full_path_length (url);
 991   char *full_path = (char *) xmalloc (length + 1);
 992
 993   full_path_write (url, full_path);
 994   full_path[length] = '\0';
 995
 996   return full_path;
 997 }
 998
 999 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
1000    escaping of certain characters, such as "/" and ":".  Returns a
1001    count of unescaped chars.  */
1002
1003 static void
1004 unescape_single_char (char *str, char chr)
1005 {
1006   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1007   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1008   char *h = str;                /* hare */
1009   char *t = str;                /* tortoise */
1010   for (; *h; h++, t++)
1011     {
1012       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1013         {
1014           *t = chr;
1015           h += 2;
1016         }
1017       else
1018         *t = *h;
1019     }
1020   *t = '\0';
1021 }
1022
1023 /* Escape unsafe and reserved characters, except for the slash
1024    characters.  */
1025
1026 static char *
1027 url_escape_dir (const char *dir)
1028 {
1029   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1030   if (newdir == dir)
1031     return (char *)dir;
1032
1033   unescape_single_char (newdir, '/');
1034   return newdir;
1035 }
1036
1037 /* Sync u->path and u->url with u->dir and u->file.  Called after
1038    u->file or u->dir have been changed, typically by the FTP code.  */
1039
1040 static void
1041 sync_path (struct url *u)
1042 {
1043   char *newpath, *efile, *edir;
1044
1045   xfree (u->path);
1046
1047   /* u->dir and u->file are not escaped.  URL-escape them before
1048      reassembling them into u->path.  That way, if they contain
1049      separators like '?' or even if u->file contains slashes, the
1050      path will be correctly assembled.  (u->file can contain slashes
1051      if the URL specifies it with %2f, or if an FTP server returns
1052      it.)  */
1053   edir = url_escape_dir (u->dir);
1054   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1055
1056   if (!*edir)
1057     newpath = xstrdup (efile);
1058   else
1059     {
1060       int dirlen = strlen (edir);
1061       int filelen = strlen (efile);
1062
1063       /* Copy "DIR/FILE" to newpath. */
1064       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1065       memcpy (p, edir, dirlen);
1066       p += dirlen;
1067       *p++ = '/';
1068       memcpy (p, efile, filelen);
1069       p += filelen;
1070       *p++ = '\0';
1071     }
1072
1073   u->path = newpath;
1074
1075   if (edir != u->dir)
1076     xfree (edir);
1077   if (efile != u->file)
1078     xfree (efile);
1079
1080   /* Regenerate u->url as well.  */
1081   xfree (u->url);
1082   u->url = url_string (u, 0);
1083 }
1084
1085 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1086    This way we can sync u->path and u->url when they get changed.  */
1087
1088 void
1089 url_set_dir (struct url *url, const char *newdir)
1090 {
1091   xfree (url->dir);
1092   url->dir = xstrdup (newdir);
1093   sync_path (url);
1094 }
1095
1096 void
1097 url_set_file (struct url *url, const char *newfile)
1098 {
1099   xfree (url->file);
1100   url->file = xstrdup (newfile);
1101   sync_path (url);
1102 }
1103
1104 void
1105 url_free (struct url *url)
1106 {
1107   xfree (url->host);
1108   xfree (url->path);
1109   xfree (url->url);
1110
1111   xfree_null (url->params);
1112   xfree_null (url->query);
1113   xfree_null (url->fragment);
1114   xfree_null (url->user);
1115   xfree_null (url->passwd);
1116
1117   xfree (url->dir);
1118   xfree (url->file);
1119
1120   xfree (url);
1121 }
1122 \f
1123 /* Create all the necessary directories for PATH (a file).  Calls
1124    mkdirhier() internally.  */
1125 int
1126 mkalldirs (const char *path)
1127 {
1128   const char *p;
1129   char *t;
1130   struct_stat st;
1131   int res;
1132
1133   p = path + strlen (path);
1134   for (; *p != '/' && p != path; p--)
1135     ;
1136
1137   /* Don't create if it's just a file.  */
1138   if ((p == path) && (*p != '/'))
1139     return 0;
1140   t = strdupdelim (path, p);
1141
1142   /* Check whether the directory exists.  */
1143   if ((stat (t, &st) == 0))
1144     {
1145       if (S_ISDIR (st.st_mode))
1146         {
1147           xfree (t);
1148           return 0;
1149         }
1150       else
1151         {
1152           /* If the dir exists as a file name, remove it first.  This
1153              is *only* for Wget to work with buggy old CERN http
1154              servers.  Here is the scenario: When Wget tries to
1155              retrieve a directory without a slash, e.g.
1156              http://foo/bar (bar being a directory), CERN server will
1157              not redirect it too http://foo/bar/ -- it will generate a
1158              directory listing containing links to bar/file1,
1159              bar/file2, etc.  Wget will lose because it saves this
1160              HTML listing to a file `bar', so it cannot create the
1161              directory.  To work around this, if the file of the same
1162              name exists, we just remove it and create the directory
1163              anyway.  */
1164           DEBUGP (("Removing %s because of directory danger!\n", t));
1165           unlink (t);
1166         }
1167     }
1168   res = make_directory (t);
1169   if (res != 0)
1170     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1171   xfree (t);
1172   return res;
1173 }
1174 \f
1175 /* Functions for constructing the file name out of URL components.  */
1176
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in questions, we pass the pointer to this struct.

   The contents are not zero-terminated unless a caller explicitly
   appends a '\0'.  */

struct growable {
  char *base;                   /* start of the buffer; reallocated by GROW */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset from BASE where the next append goes */
};
1190
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   Both arguments are parenthesized in the expansion so that any
   expression may be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string, i.e. the address at which
   the next character will be stored. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1205
/* Copy the characters of STR (without its terminating zero) onto the
   end of DEST, growing DEST as needed.  NOTICE: DEST is left
   unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  const int len = strlen (str);

  GROW (dest, len);
  memcpy (TAIL (dest), str, len);
  TAIL_INCR (dest, len);
}
1217
1218 /* Append CH to DEST.  For example, append_char (0, DEST)
1219    zero-terminates DEST.  */
1220
1221 static void
1222 append_char (char ch, struct growable *dest)
1223 {
1224   GROW (dest, 1);
1225   *TAIL (dest) = ch;
1226   TAIL_INCR (dest, 1);
1227 }
1228
/* Flags describing why a character is unsuitable for use in a file
   name.  They are combined as a bit mask in filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set.  C is
   converted to unsigned char so that chars with the high bit set
   index the table correctly.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping.

   `|' carries the Windows flag like the other characters listed for
   filechr_not_windows, and DEL (127) is flagged as a control
   character alongside 0-31 and 128-159.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1286
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names.  The choice is
   made each time the macro is evaluated, from opt.restrict_files_os.  */
#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there.  Like FN_PORT_SEP, it is
   selected from opt.restrict_files_os at each evaluation.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1297
1298 /* Quote path element, characters in [b, e), as file name, and append
1299    the quoted string to DEST.  Each character is quoted as per
1300    file_unsafe_char and the corresponding table.
1301
1302    If ESCAPED_P is non-zero, the path element is considered to be
1303    URL-escaped and will be unescaped prior to inspection.  */
1304
1305 static void
1306 append_uri_pathel (const char *b, const char *e, int escaped_p,
1307                    struct growable *dest)
1308 {
1309   const char *p;
1310   int quoted, outlen;
1311
1312   int mask;
1313   if (opt.restrict_files_os == restrict_unix)
1314     mask = filechr_not_unix;
1315   else
1316     mask = filechr_not_windows;
1317   if (opt.restrict_files_ctrl)
1318     mask |= filechr_control;
1319
1320   /* Copy [b, e) to PATHEL and URL-unescape it. */
1321   if (escaped_p)
1322     {
1323       char *unescaped;
1324       BOUNDED_TO_ALLOCA (b, e, unescaped);
1325       url_unescape (unescaped);
1326       b = unescaped;
1327       e = unescaped + strlen (unescaped);
1328     }
1329
1330   /* Defang ".." when found as component of path.  Remember that path
1331      comes from the URL and might contain malicious input.  */
1332   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1333     {
1334       b = "%2E%2E";
1335       e = b + 6;
1336     }
1337
1338   /* Walk the PATHEL string and check how many characters we'll need
1339      to quote.  */
1340   quoted = 0;
1341   for (p = b; p < e; p++)
1342     if (FILE_CHAR_TEST (*p, mask))
1343       ++quoted;
1344
1345   /* Calculate the length of the output string.  e-b is the input
1346      string length.  Each quoted char introduces two additional
1347      characters in the string, hence 2*quoted.  */
1348   outlen = (e - b) + (2 * quoted);
1349   GROW (dest, outlen);
1350
1351   if (!quoted)
1352     {
1353       /* If there's nothing to quote, we can simply append the string
1354          without processing it again.  */
1355       memcpy (TAIL (dest), b, outlen);
1356     }
1357   else
1358     {
1359       char *q = TAIL (dest);
1360       for (p = b; p < e; p++)
1361         {
1362           if (!FILE_CHAR_TEST (*p, mask))
1363             *q++ = *p;
1364           else
1365             {
1366               unsigned char ch = *p;
1367               *q++ = '%';
1368               *q++ = XNUM_TO_DIGIT (ch >> 4);
1369               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1370             }
1371         }
1372       assert (q - TAIL (dest) == outlen);
1373     }
1374   TAIL_INCR (dest, outlen);
1375 }
1376
1377 /* Append to DEST the directory structure that corresponds the
1378    directory part of URL's path.  For example, if the URL is
1379    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1380
1381    Each path element ("dir1" and "dir2" in the above example) is
1382    examined, url-unescaped, and re-escaped as file name element.
1383
1384    Additionally, it cuts as many directories from the path as
1385    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1386    will produce "bar" for the above example.  For 2 or more, it will
1387    produce "".
1388
1389    Each component of the path is quoted for use as file name.  */
1390
1391 static void
1392 append_dir_structure (const struct url *u, struct growable *dest)
1393 {
1394   char *pathel, *next;
1395   int cut = opt.cut_dirs;
1396
1397   /* Go through the path components, de-URL-quote them, and quote them
1398      (if necessary) as file names.  */
1399
1400   pathel = u->path;
1401   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1402     {
1403       if (cut-- > 0)
1404         continue;
1405       if (pathel == next)
1406         /* Ignore empty pathels.  */
1407         continue;
1408
1409       if (dest->tail)
1410         append_char ('/', dest);
1411       append_uri_pathel (pathel, next, 1, dest);
1412     }
1413 }
1414
1415 /* Return a unique file name that matches the given URL as good as
1416    possible.  Does not create directories on the file system.  */
1417
1418 char *
1419 url_file_name (const struct url *u)
1420 {
1421   struct growable fnres;        /* stands for "file name result" */
1422
1423   const char *u_file, *u_query;
1424   char *fname, *unique;
1425
1426   fnres.base = NULL;
1427   fnres.size = 0;
1428   fnres.tail = 0;
1429
1430   /* Start with the directory prefix, if specified. */
1431   if (opt.dir_prefix)
1432     append_string (opt.dir_prefix, &fnres);
1433
1434   /* If "dirstruct" is turned on (typically the case with -r), add
1435      the host and port (unless those have been turned off) and
1436      directory structure.  */
1437   if (opt.dirstruct)
1438     {
1439       if (opt.protocol_directories)
1440         {
1441           if (fnres.tail)
1442             append_char ('/', &fnres);
1443           append_string (supported_schemes[u->scheme].name, &fnres);
1444         }
1445       if (opt.add_hostdir)
1446         {
1447           if (fnres.tail)
1448             append_char ('/', &fnres);
1449           if (0 != strcmp (u->host, ".."))
1450             append_string (u->host, &fnres);
1451           else
1452             /* Host name can come from the network; malicious DNS may
1453                allow ".." to be resolved, causing us to write to
1454                "../<file>".  Defang such host names.  */
1455             append_string ("%2E%2E", &fnres);
1456           if (u->port != scheme_default_port (u->scheme))
1457             {
1458               char portstr[24];
1459               number_to_string (portstr, u->port);
1460               append_char (FN_PORT_SEP, &fnres);
1461               append_string (portstr, &fnres);
1462             }
1463         }
1464
1465       append_dir_structure (u, &fnres);
1466     }
1467
1468   /* Add the file name. */
1469   if (fnres.tail)
1470     append_char ('/', &fnres);
1471   u_file = *u->file ? u->file : "index.html";
1472   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1473
1474   /* Append "?query" to the file name. */
1475   u_query = u->query && *u->query ? u->query : NULL;
1476   if (u_query)
1477     {
1478       append_char (FN_QUERY_SEP, &fnres);
1479       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1480     }
1481
1482   /* Zero-terminate the file name. */
1483   append_char ('\0', &fnres);
1484
1485   fname = fnres.base;
1486
1487   /* Check the cases in which the unique extensions are not used:
1488      1) Clobbering is turned off (-nc).
1489      2) Retrieval with regetting.
1490      3) Timestamping is used.
1491      4) Hierarchy is built.
1492
1493      The exception is the case when file does exist and is a
1494      directory (see `mkalldirs' for explanation).  */
1495
1496   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1497       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1498     return fname;
1499
1500   unique = unique_name (fname, 1);
1501   if (unique != fname)
1502     xfree (fname);
1503   return unique;
1504 }
1505 \f
/* Resolve "." and ".." elements of PATH in place.  Returns non-zero
   if PATH was modified, zero if it was left untouched.

   The approach is close in spirit to the one in rfc1808, but done in
   a single pass with a read pointer and a write pointer.  Elements
   consisting only of "." are dropped and ".." backs up over the
   previously written element.  A ".." that has nothing to back up
   over is kept literally and protected from later ".." elements.
   Empty elements (consecutive slashes) are left alone.

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything here, run test_path_simplify to make sure no test case
   has been broken.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position (the hare) */
  char *dst = path;             /* write position (the tortoise) */
  char *limit = path;           /* DST may not retreat below this */
  char *const end = path + strlen (path);

  while (src < end)
    {
      /* SRC is at the beginning of a path element here.  */
      const int is_dot
        = (src[0] == '.' && (src[1] == '/' || src[1] == '\0'));
      const int is_dotdot
        = (!is_dot && src[0] == '.' && src[1] == '.'
           && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* "./" contributes nothing to the result.  */
          src += 2;
          continue;
        }

      if (is_dotdot)
        {
          if (dst > limit)
            {
              /* Retreat DST to the start of the element written
                 last, or to LIMIT, whichever comes first.  */
              do
                --dst;
              while (dst > limit && dst[-1] != '/');
              src += 3;
              continue;
            }

          /* Nothing to back up over.  Keep this "../" literally (it
             is handled as a regular element below) and raise LIMIT
             past it so that a later ".." cannot remove it.  */
          limit = dst + 3;
        }

      /* A regular element, up to and including its slash.  */
      if (dst == src)
        {
          /* Nothing has been dropped so far; just step over it.  */
          while (src < end && *src != '/')
            dst++, src++;
          if (src < end)
            dst++, src++;
        }
      else
        {
          /* Move the element down to the write position.  */
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  /* The pointers differ exactly when something was removed.  */
  if (dst != src)
    *dst = '\0';

  return dst != src;
}
1588 \f
/* Return the number of characters in URL that precede the first
   '?', ';' or '#', or the whole length of URL if none of those is
   present.  */

static int
path_length (const char *url)
{
  const char *path_end = strpbrk_or_eos (url, "?;#");

  return path_end - url;
}
1599
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is outside
   the range and is never examined; the character at B is.  An empty
   range (E == B) yields NULL.  We might want to use memrchr (a GNU
   extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the scan covers e[-1] down to
     b[0] inclusive.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1612
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string holding the first PREFIXLEN characters of BASE followed by
   the LINKLENGTH characters of LINK.  */

static char *
uri_merge_join (const char *base, int prefixlen,
                const char *link, int linklength)
{
  char *joined = (char *) xmalloc (prefixlen + linklength + 1);

  if (prefixlen)
    memcpy (joined, base, prefixlen);
  memcpy (joined + prefixlen, link, linklength);
  joined[prefixlen + linklength] = '\0';
  return joined;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either URI may be absolute or relative, with or without a host
   name.  Only minimal parsing is done, with no knowledge of the
   specifics of individual schemes.  The cases, in the order tested:

     LINK has a scheme     -> copy of LINK
     LINK is empty         -> copy of BASE, query string and all
     LINK starts with "?"  -> BASE up to its '?', ';' or '#' + LINK
     LINK starts with "#"  -> BASE up to its '#' + LINK
     LINK starts with "//" -> net path, replaces BASE from its "//"
     LINK starts with "/"  -> absolute path, replaces BASE's path
     anything else         -> relative path, replaces what follows
                              the last slash of BASE's path

   The result is deliberately not passed through path_simplify:
   that would complicate this code, and url_parse simplifies the
   path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    return xstrdup (base);

  if (*link == '?')
    {
      /* Same location, new query string:                    */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_join (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Same location, new fragment:                        */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *fragment = strchr (base, '#');
      if (!fragment)
        fragment = base + strlen (base);
      return uri_merge_join (base, fragment - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: everything in BASE from its double
         slash on is replaced with LINK.  If the first slash of BASE
         is not a double slash, all of BASE is replaced.  */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_join (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: everything in BASE from the first
         slash of its path on is replaced with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert;
      int seen_slash_slash = 0;

      /* If the first slash starts a "//", step over it (once only)
         and look for the next slash, which begins the path.  */
      if (slash && slash[1] == '/')
        {
          const char *after = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (after, '/', end - after);
        }

      /* Keep in mind that LINK itself begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace everything */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- insert at the path slash */
        start_insert = slash;
      else
        /* example: "http://foo" -- insert at the end */
        start_insert = end;

      return uri_merge_join (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: everything after the last slash of BASE
     (possibly nothing) is replaced with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    const char *start_insert;
    int need_explicit_slash = 0;
    int span;
    char *merge;

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the last slash belongs to "://",
           so LINK goes after END with a slash in between.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* example: "whatever/foo/bar" -- insert right after the
           last slash.  */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = uri_merge_join (base, span, link, linklength);
    if (need_explicit_slash)
      /* The character copied from END is replaced by the slash.  */
      merge[span - 1] = '/';
    return merge;
  }
}
1803 \f
/* Copy the zero-terminated string S to the location P points to and
   advance P past the copied characters.  No terminating zero is
   written.  P must be a modifiable pointer variable; S is evaluated
   twice, and the expansion declares a local named `len'.  */
#define APPEND(p, s) do {                       \
  int len = strlen (s);                         \
  memcpy (p, s, len);                           \
  p += len;                                     \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1815
1816 /* Recreate the URL string from the data in URL.
1817
1818    If HIDE is non-zero (as it is when we're calling this on a URL we
1819    plan to print, but not when calling it to canonicalize a URL for
1820    use within the program), password will be hidden.  Unsafe
1821    characters in the URL will be quoted.  */
1822
1823 char *
1824 url_string (const struct url *url, int hide_password)
1825 {
1826   int size;
1827   char *result, *p;
1828   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1829
1830   int scheme_port  = supported_schemes[url->scheme].default_port;
1831   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1832   int fplen = full_path_length (url);
1833
1834   int brackets_around_host;
1835
1836   assert (scheme_str != NULL);
1837
1838   /* Make sure the user name and password are quoted. */
1839   if (url->user)
1840     {
1841       quoted_user = url_escape_allow_passthrough (url->user);
1842       if (url->passwd)
1843         {
1844           if (hide_password)
1845             quoted_passwd = HIDDEN_PASSWORD;
1846           else
1847             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1848         }
1849     }
1850
1851   /* In the unlikely event that the host name contains non-printable
1852      characters, quote it for displaying to the user.  */
1853   quoted_host = url_escape_allow_passthrough (url->host);
1854
1855   /* Undo the quoting of colons that URL escaping performs.  IPv6
1856      addresses may legally contain colons, and in that case must be
1857      placed in square brackets.  */
1858   if (quoted_host != url->host)
1859     unescape_single_char (quoted_host, ':');
1860   brackets_around_host = strchr (quoted_host, ':') != NULL;
1861
1862   size = (strlen (scheme_str)
1863           + strlen (quoted_host)
1864           + (brackets_around_host ? 2 : 0)
1865           + fplen
1866           + 1);
1867   if (url->port != scheme_port)
1868     size += 1 + numdigit (url->port);
1869   if (quoted_user)
1870     {
1871       size += 1 + strlen (quoted_user);
1872       if (quoted_passwd)
1873         size += 1 + strlen (quoted_passwd);
1874     }
1875
1876   p = result = xmalloc (size);
1877
1878   APPEND (p, scheme_str);
1879   if (quoted_user)
1880     {
1881       APPEND (p, quoted_user);
1882       if (quoted_passwd)
1883         {
1884           *p++ = ':';
1885           APPEND (p, quoted_passwd);
1886         }
1887       *p++ = '@';
1888     }
1889
1890   if (brackets_around_host)
1891     *p++ = '[';
1892   APPEND (p, quoted_host);
1893   if (brackets_around_host)
1894     *p++ = ']';
1895   if (url->port != scheme_port)
1896     {
1897       *p++ = ':';
1898       p = number_to_string (p, url->port);
1899     }
1900
1901   full_path_write (url, p);
1902   p += fplen;
1903   *p++ = '\0';
1904
1905   assert (p - result == size);
1906
1907   if (quoted_user && quoted_user != url->user)
1908     xfree (quoted_user);
1909   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1910     xfree (quoted_passwd);
1911   if (quoted_host != url->host)
1912     xfree (quoted_host);
1913
1914   return result;
1915 }
1916 \f
1917 /* Return non-zero if scheme a is similar to scheme b.
1918
1919    Schemes are similar if they are equal.  If SSL is supported, schemes
1920    are also similar if one is http (SCHEME_HTTP) and the other is https
1921    (SCHEME_HTTPS).  */
1922 int
1923 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1924 {
1925   if (a == b)
1926     return 1;
1927 #ifdef HAVE_SSL
1928   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1929       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1930     return 1;
1931 #endif
1932   return 0;
1933 }
1934 \f
1935 #if 0
1936 /* Debugging and testing support for path_simplify. */
1937
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and return the duplicate.  PATH
   itself is left untouched.  The returned string is newly allocated
   and is not freed here.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1947
/* Run path_simplify on a private copy of TEST and print a diagnostic
   for every expectation that does not hold: the simplified string
   should equal EXPECTED_RESULT, and path_simplify's return value
   should equal EXPECTED_CHANGE.  Nothing is printed on success.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);
  int result_ok = (strcmp (work, expected_result) == 0);

  if (!result_ok)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    {
      /* Phrase the complaint according to what was expected.  */
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (work);
}
1970
/* Feed a fixed table of paths to run_test.  Each entry gives the
   input path, the string path_simplify is expected to leave behind,
   and the value path_simplify is expected to return (1 if the path
   should be modified, 0 if it should be left alone).  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int modifies;
  } cases[] = {
    { "",                        "",          0 },
    { ".",                       "",          1 },
    { "./",                      "",          1 },
    { "..",                      "..",        0 },
    { "../",                     "../",       0 },
    { "foo",                     "foo",       0 },
    { "foo/bar",                 "foo/bar",   0 },
    { "foo///bar",               "foo///bar", 0 },
    { "foo/.",                   "foo/",      1 },
    { "foo/./",                  "foo/",      1 },
    { "foo./",                   "foo./",     0 },
    { "foo/../bar",              "bar",       1 },
    { "foo/../bar/",             "bar/",      1 },
    { "foo/bar/..",              "foo/",      1 },
    { "foo/bar/../x",            "foo/x",     1 },
    { "foo/bar/../x/",           "foo/x/",    1 },
    { "foo/..",                  "",          1 },
    { "foo/../..",               "..",        1 },
    { "foo/../../..",            "../..",     1 },
    { "foo/../../bar/../../baz", "../../baz", 1 },
    { "a/b/../../c",             "c",         1 },
    { "./a/../b",                "b",         1 }
  };
  int k;

  for (k = 0; k < countof (cases); k++)
    run_test (cases[k].input, cases[k].expected, cases[k].modifies);
}
2011 #endif