sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* Static description of one URL scheme handled by this file.  */
struct scheme_data
{
  /* Bare scheme name, e.g. "http".  */
  const char *name;
  /* Prefix that url_scheme matches (case-insensitively) at the start
     of a URL, e.g. "http://".  */
  const char *leading_string;
  /* Port returned by scheme_default_port; url_parse uses it when the
     URL does not specify a port.  */
  int default_port;
  /* Non-zero while the scheme is usable; scheme_disable clears it and
     url_scheme then reports the scheme as SCHEME_INVALID.  */
  int enabled;
};
  62
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, so the order of the entries has to
   agree with that enum (declared outside this file -- NOTE(review):
   confirm against url.h).  The entry with a NULL leading_string
   terminates url_scheme's search loop.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  91    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  92    as recommended by rfc2396, and minus "~", which is very frequently
  93    used (and sometimes unrecognized as %7E by broken servers).
  94
  95    An unsafe character is the one that should be encoded when URLs are
  96    placed in foreign environments.  E.g. space and newline are unsafe
  97    in HTTP contexts because HTTP uses them as separator and line
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
/* Bit flags stored in urlchr_table; a character may carry both.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};
 111
/* Non-zero if character C has any of the MASK bits set in
   urlchr_table.  C is cast to unsigned char before being used as the
   table index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Non-zero if C is flagged as reserved in urlchr_table.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Non-zero if C is flagged as unsafe in urlchr_table.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
/* Shorthands for the table.  They exist only while the table is being
   defined and are #undef'ed right after it.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Flags for each of the 256 possible unsigned char values, indexed by
   the character code.  The first 128 entries are annotated with the
   ASCII character they describe; every entry in the upper half
   (128-255) is marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           char c;
 179           /* Do nothing if '%' is not followed by two hex digits. */
 180           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 181             goto copychar;
 182           c = X2DIGITS_TO_NUM (h[1], h[2]);
 183           /* Don't unescape %00 because there is no way to insert it
 184              into a C string without effectively truncating it. */
 185           if (c == '\0')
 186             goto copychar;
 187           *t = c;
 188           h += 2;
 189         }
 190     }
 191   *t = '\0';
 192 }
 193
 194 /* The core of url_escape_* functions.  Escapes the characters that
 195    match the provided mask in urlchr_table.
 196
 197    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 198    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 199    freshly allocated string will be returned in all cases.  */
 200
 201 static char *
 202 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 203 {
 204   const char *p1;
 205   char *p2, *newstr;
 206   int newlen;
 207   int addition = 0;
 208
 209   for (p1 = s; *p1; p1++)
 210     if (urlchr_test (*p1, mask))
 211       addition += 2;            /* Two more characters (hex digits) */
 212
 213   if (!addition)
 214     return allow_passthrough ? (char *)s : xstrdup (s);
 215
 216   newlen = (p1 - s) + addition;
 217   newstr = (char *)xmalloc (newlen + 1);
 218
 219   p1 = s;
 220   p2 = newstr;
 221   while (*p1)
 222     {
 223       /* Quote the characters that match the test mask. */
 224       if (urlchr_test (*p1, mask))
 225         {
 226           unsigned char c = *p1++;
 227           *p2++ = '%';
 228           *p2++ = XNUM_TO_DIGIT (c >> 4);
 229           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 230         }
 231       else
 232         *p2++ = *p1++;
 233     }
 234   assert (p2 - newstr == newlen);
 235   *p2 = '\0';
 236
 237   return newstr;
 238 }
 239
 240 /* URL-escape the unsafe characters (see urlchr_table) in a given
 241    string, returning a freshly allocated string.  */
 242
 243 char *
 244 url_escape (const char *s)
 245 {
 246   return url_escape_1 (s, urlchr_unsafe, 0);
 247 }
 248
 249 /* URL-escape the unsafe characters (see urlchr_table) in a given
 250    string.  If no characters are unsafe, S is returned.  */
 251
 252 static char *
 253 url_escape_allow_passthrough (const char *s)
 254 {
 255   return url_escape_1 (s, urlchr_unsafe, 1);
 256 }
 257 \f
 258 /* Decide whether the char at position P needs to be encoded.  (It is
 259    not enough to pass a single char *P because the function may need
 260    to inspect the surrounding context.)
 261
 262    Return 1 if the char should be escaped as %XX, 0 otherwise.  */
 263
 264 static inline int
 265 char_needs_escaping (const char *p)
 266 {
 267   if (*p == '%')
 268     {
 269       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 270         return 0;
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return 1;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return 1;
 277   else
 278     return 0;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't can use url_unescape to get to the intended
 289    data.  This function is stable: once the input is transformed,
 290    further transformations of the result yield the same output.
 291
 292    Let's discuss why this function is needed.
 293
 294    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 295    a raw space character would mess up the HTTP request, it needs to
 296    be quoted, like this:
 297
 298        GET /abc%20def HTTP/1.0
 299
 300    It appears that the unsafe chars need to be quoted, for example
 301    with url_escape.  But what if we're requested to download
 302    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 303    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 304    part of URL syntax, "%20" is the correct way to denote a literal
 305    space on the Wget command line.  This leaves us in the conclusion
 306    that in that case Wget should not call url_escape, but leave the
 307    `%20' as is.  This is clearly contradictory, but it only gets
 308    worse.
 309
 310    What if the requested URI is `abc%20 def'?  If we call url_escape,
 311    we end up with `/abc%2520%20def', which is almost certainly not
 312    intended.  If we don't call url_escape, we are left with the
 313    embedded space and cannot complete the request.  What the user
 314    meant was for Wget to request `/abc%20%20def', and this is where
 315    reencode_escapes kicks in.
 316
 317    Wget used to solve this by first decoding %-quotes, and then
 318    encoding all the "unsafe" characters found in the resulting string.
 319    This was wrong because it didn't preserve certain URL special
 320    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 321    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 322    whether we considered `+' reserved (it is).  One of these results
 323    is inevitable because by the second step we would lose information
 324    on whether the `+' was originally encoded or not.  Both results
 325    were wrong because in CGI parameters + means space, while %2B means
 326    literal plus.  reencode_escapes correctly translates the above to
 327    "a%2B+b", i.e. returns the original string.
 328
 329    This function uses a modified version of the algorithm originally
 330    proposed by Anon Sricharoenchai:
 331
 332    * Encode all "unsafe" characters, except those that are also
 333      "reserved", to %XX.  See urlchr_table for which characters are
 334      unsafe and reserved.
 335
 336    * Encode the "%" characters not followed by two hex digits to
 337      "%25".
 338
 339    * Pass through all other characters and %XX escapes as-is.  (Up to
 340      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 341      characters, but that was obtrusive and broke some servers.)
 342
 343    Anon's test case:
 344
 345    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 346    ->
 347    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 348
 349    Simpler test cases:
 350
 351    "foo bar"         -> "foo%20bar"
 352    "foo%20bar"       -> "foo%20bar"
 353    "foo %20bar"      -> "foo%20%20bar"
 354    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 355    "foo%25%20bar"    -> "foo%25%20bar"
 356    "foo%2%20bar"     -> "foo%252%20bar"
 357    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 358    "foo%2b+bar"      -> "foo%2b+bar"  */
 359
 360 static char *
 361 reencode_escapes (const char *s)
 362 {
 363   const char *p1;
 364   char *newstr, *p2;
 365   int oldlen, newlen;
 366
 367   int encode_count = 0;
 368
 369   /* First pass: inspect the string to see if there's anything to do,
 370      and to calculate the new length.  */
 371   for (p1 = s; *p1; p1++)
 372     if (char_needs_escaping (p1))
 373       ++encode_count;
 374
 375   if (!encode_count)
 376     /* The string is good as it is. */
 377     return (char *) s;          /* C const model sucks. */
 378
 379   oldlen = p1 - s;
 380   /* Each encoding adds two characters (hex digits).  */
 381   newlen = oldlen + 2 * encode_count;
 382   newstr = xmalloc (newlen + 1);
 383
 384   /* Second pass: copy the string to the destination address, encoding
 385      chars when needed.  */
 386   p1 = s;
 387   p2 = newstr;
 388
 389   while (*p1)
 390     if (char_needs_escaping (p1))
 391       {
 392         unsigned char c = *p1++;
 393         *p2++ = '%';
 394         *p2++ = XNUM_TO_DIGIT (c >> 4);
 395         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 396       }
 397     else
 398       *p2++ = *p1++;
 399
 400   *p2 = '\0';
 401   assert (p2 - newstr == newlen);
 402   return newstr;
 403 }
 404 \f
 405 /* Returns the scheme type if the scheme is supported, or
 406    SCHEME_INVALID if not.  */
 407
 408 enum url_scheme
 409 url_scheme (const char *url)
 410 {
 411   int i;
 412
 413   for (i = 0; supported_schemes[i].leading_string; i++)
 414     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 415                           strlen (supported_schemes[i].leading_string)))
 416       {
 417         if (supported_schemes[i].enabled)
 418           return (enum url_scheme) i;
 419         else
 420           return SCHEME_INVALID;
 421       }
 422
 423   return SCHEME_INVALID;
 424 }
 425
 426 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 427
 428 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 429    currently implemented, it returns true if URL begins with
 430    [-+a-zA-Z0-9]+: .  */
 431
 432 int
 433 url_has_scheme (const char *url)
 434 {
 435   const char *p = url;
 436
 437   /* The first char must be a scheme char. */
 438   if (!*p || !SCHEME_CHAR (*p))
 439     return 0;
 440   ++p;
 441   /* Followed by 0 or more scheme chars. */
 442   while (*p && SCHEME_CHAR (*p))
 443     ++p;
 444   /* Terminated by ':'. */
 445   return *p == ':';
 446 }
 447
 448 int
 449 scheme_default_port (enum url_scheme scheme)
 450 {
 451   return supported_schemes[scheme].default_port;
 452 }
 453
 454 void
 455 scheme_disable (enum url_scheme scheme)
 456 {
 457   supported_schemes[scheme].enabled = 0;
 458 }
 459
 460 /* Skip the username and password, if present in the URL.  The
 461    function should *not* be called with the complete URL, but with the
 462    portion after the scheme.
 463
 464    If no username and password are found, return URL.  */
 465
 466 static const char *
 467 url_skip_credentials (const char *url)
 468 {
 469   /* Look for '@' that comes before terminators, such as '/', '?',
 470      '#', or ';'.  */
 471   const char *p = (const char *)strpbrk (url, "@/?#;");
 472   if (!p || *p != '@')
 473     return url;
 474   return p + 1;
 475 }
 476
 477 /* Parse credentials contained in [BEG, END).  The region is expected
 478    to have come from a URL and is unescaped.  */
 479
 480 static int
 481 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 482 {
 483   char *colon;
 484   const char *userend;
 485
 486   if (beg == end)
 487     return 0;                   /* empty user name */
 488
 489   colon = memchr (beg, ':', end - beg);
 490   if (colon == beg)
 491     return 0;                   /* again empty user name */
 492
 493   if (colon)
 494     {
 495       *passwd = strdupdelim (colon + 1, end);
 496       userend = colon;
 497       url_unescape (*passwd);
 498     }
 499   else
 500     {
 501       *passwd = NULL;
 502       userend = end;
 503     }
 504   *user = strdupdelim (beg, userend);
 505   url_unescape (*user);
 506   return 1;
 507 }
 508
 509 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 510    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 511
 512    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 513    www.foo.com[:port]            -> http://www.foo.com[:port]
 514
 515    FTP shorthands look like this:
 516
 517    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 518    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 519
 520    If the URL needs not or cannot be rewritten, return NULL.  */
 521
 522 char *
 523 rewrite_shorthand_url (const char *url)
 524 {
 525   const char *p;
 526
 527   if (url_scheme (url) != SCHEME_INVALID)
 528     return NULL;
 529
 530   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 531      latter Netscape.  */
 532   for (p = url; *p && *p != ':' && *p != '/'; p++)
 533     ;
 534
 535   if (p == url)
 536     return NULL;
 537
 538   if (*p == ':')
 539     {
 540       const char *pp;
 541       char *res;
 542       /* If the characters after the colon and before the next slash
 543          or end of string are all digits, it's HTTP.  */
 544       int digits = 0;
 545       for (pp = p + 1; ISDIGIT (*pp); pp++)
 546         ++digits;
 547       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 548         goto http;
 549
 550       /* Prepend "ftp://" to the entire URL... */
 551       res = xmalloc (6 + strlen (url) + 1);
 552       sprintf (res, "ftp://%s", url);
 553       /* ...and replace ':' with '/'. */
 554       res[6 + (p - url)] = '/';
 555       return res;
 556     }
 557   else
 558     {
 559       char *res;
 560     http:
 561       /* Just prepend "http://" to what we have. */
 562       res = xmalloc (7 + strlen (url) + 1);
 563       sprintf (res, "http://%s", url);
 564       return res;
 565     }
 566 }
 567 \f
 568 static void split_path PARAMS ((const char *, char **, char **));
 569
 570 /* Like strpbrk, with the exception that it returns the pointer to the
 571    terminating zero (end-of-string aka "eos") if no matching character
 572    is found.
 573
 574    Although I normally balk at Gcc-specific optimizations, it probably
 575    makes sense here: glibc has optimizations that detect strpbrk being
 576    called with literal string as ACCEPT and inline the search.  That
 577    optimization is defeated if strpbrk is hidden within the call to
 578    another function.  (And no, making strpbrk_or_eos inline doesn't
 579    help because the check for literal accept is in the
 580    preprocessor.)  */
 581
 582 #ifdef __GNUC__
 583
 584 #define strpbrk_or_eos(s, accept) ({            \
 585   char *SOE_p = strpbrk (s, accept);            \
 586   if (!SOE_p)                                   \
 587     SOE_p = (char *)s + strlen (s);             \
 588   SOE_p;                                        \
 589 })
 590
 591 #else  /* not __GNUC__ */
 592
 593 static char *
 594 strpbrk_or_eos (const char *s, const char *accept)
 595 {
 596   char *p = strpbrk (s, accept);
 597   if (!p)
 598     p = (char *)s + strlen (s);
 599   return p;
 600 }
 601 #endif
 602
 603 /* Turn STR into lowercase; return non-zero if a character was
 604    actually changed. */
 605
 606 static int
 607 lowercase_str (char *str)
 608 {
 609   int change = 0;
 610   for (; *str; str++)
 611     if (ISUPPER (*str))
 612       {
 613         change = 1;
 614         *str = TOLOWER (*str);
 615       }
 616   return change;
 617 }
 618
/* Messages for the error codes produced by url_parse.  Each PE_*
   constant is defined next to its message and equals the message's
   index in this array, so the two must be kept in step.  The strings
   are only marked with N_ here; url_error passes them through _()
   when they are looked up.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_EMPTY_HOST                   2
  N_("Empty host"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 637
 638 /* Parse a URL.
 639
 640    Return a new struct url if successful, NULL on error.  In case of
 641    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 642    error code. */
 643 struct url *
 644 url_parse (const char *url, int *error)
 645 {
 646   struct url *u;
 647   const char *p;
 648   int path_modified, host_modified;
 649
 650   enum url_scheme scheme;
 651
 652   const char *uname_b,     *uname_e;
 653   const char *host_b,      *host_e;
 654   const char *path_b,      *path_e;
 655   const char *params_b,    *params_e;
 656   const char *query_b,     *query_e;
 657   const char *fragment_b,  *fragment_e;
 658
 659   int port;
 660   char *user = NULL, *passwd = NULL;
 661
 662   char *url_encoded = NULL;
 663
 664   int error_code;
 665
 666   scheme = url_scheme (url);
 667   if (scheme == SCHEME_INVALID)
 668     {
 669       error_code = PE_UNSUPPORTED_SCHEME;
 670       goto err;
 671     }
 672
 673   url_encoded = reencode_escapes (url);
 674   p = url_encoded;
 675
 676   p += strlen (supported_schemes[scheme].leading_string);
 677   uname_b = p;
 678   p = url_skip_credentials (p);
 679   uname_e = p;
 680
 681   /* scheme://user:pass@host[:port]... */
 682   /*                    ^              */
 683
 684   /* We attempt to break down the URL into the components path,
 685      params, query, and fragment.  They are ordered like this:
 686
 687        scheme://host[:port][/path][;params][?query][#fragment]  */
 688
 689   params_b   = params_e   = NULL;
 690   query_b    = query_e    = NULL;
 691   fragment_b = fragment_e = NULL;
 692
 693   host_b = p;
 694
 695   if (*p == '[')
 696     {
 697       /* Handle IPv6 address inside square brackets.  Ideally we'd
 698          just look for the terminating ']', but rfc2732 mandates
 699          rejecting invalid IPv6 addresses.  */
 700
 701       /* The address begins after '['. */
 702       host_b = p + 1;
 703       host_e = strchr (host_b, ']');
 704
 705       if (!host_e)
 706         {
 707           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 708           goto err;
 709         }
 710
 711 #ifdef ENABLE_IPV6
 712       /* Check if the IPv6 address is valid. */
 713       if (!is_valid_ipv6_address(host_b, host_e))
 714         {
 715           error_code = PE_INVALID_IPV6_ADDRESS;
 716           goto err;
 717         }
 718
 719       /* Continue parsing after the closing ']'. */
 720       p = host_e + 1;
 721 #else
 722       error_code = PE_IPV6_NOT_SUPPORTED;
 723       goto err;
 724 #endif
 725     }
 726   else
 727     {
 728       p = strpbrk_or_eos (p, ":/;?#");
 729       host_e = p;
 730     }
 731
 732   if (host_b == host_e)
 733     {
 734       error_code = PE_EMPTY_HOST;
 735       goto err;
 736     }
 737
 738   port = scheme_default_port (scheme);
 739   if (*p == ':')
 740     {
 741       const char *port_b, *port_e, *pp;
 742
 743       /* scheme://host:port/tralala */
 744       /*              ^             */
 745       ++p;
 746       port_b = p;
 747       p = strpbrk_or_eos (p, "/;?#");
 748       port_e = p;
 749
 750       /* Allow empty port, as per rfc2396. */
 751       if (port_b != port_e)
 752         {
 753           for (port = 0, pp = port_b; pp < port_e; pp++)
 754             {
 755               if (!ISDIGIT (*pp))
 756                 {
 757                   /* http://host:12randomgarbage/blah */
 758                   /*               ^                  */
 759                   error_code = PE_BAD_PORT_NUMBER;
 760                   goto err;
 761                 }
 762               port = 10 * port + (*pp - '0');
 763               /* Check for too large port numbers here, before we have
 764                  a chance to overflow on bogus port values.  */
 765               if (port > 65535)
 766                 {
 767                   error_code = PE_BAD_PORT_NUMBER;
 768                   goto err;
 769                 }
 770             }
 771         }
 772     }
 773
 774   if (*p == '/')
 775     {
 776       ++p;
 777       path_b = p;
 778       p = strpbrk_or_eos (p, ";?#");
 779       path_e = p;
 780     }
 781   else
 782     {
 783       /* Path is not allowed not to exist. */
 784       path_b = path_e = p;
 785     }
 786
 787   if (*p == ';')
 788     {
 789       ++p;
 790       params_b = p;
 791       p = strpbrk_or_eos (p, "?#");
 792       params_e = p;
 793     }
 794   if (*p == '?')
 795     {
 796       ++p;
 797       query_b = p;
 798       p = strpbrk_or_eos (p, "#");
 799       query_e = p;
 800
 801       /* Hack that allows users to use '?' (a wildcard character) in
 802          FTP URLs without it being interpreted as a query string
 803          delimiter.  */
 804       if (scheme == SCHEME_FTP)
 805         {
 806           query_b = query_e = NULL;
 807           path_e = p;
 808         }
 809     }
 810   if (*p == '#')
 811     {
 812       ++p;
 813       fragment_b = p;
 814       p += strlen (p);
 815       fragment_e = p;
 816     }
 817   assert (*p == 0);
 818
 819   if (uname_b != uname_e)
 820     {
 821       /* http://user:pass@host */
 822       /*        ^         ^    */
 823       /*     uname_b   uname_e */
 824       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 825         {
 826           error_code = PE_INVALID_USER_NAME;
 827           goto err;
 828         }
 829     }
 830
 831   u = xnew0 (struct url);
 832   u->scheme = scheme;
 833   u->host   = strdupdelim (host_b, host_e);
 834   u->port   = port;
 835   u->user   = user;
 836   u->passwd = passwd;
 837
 838   u->path = strdupdelim (path_b, path_e);
 839   path_modified = path_simplify (u->path);
 840   split_path (u->path, &u->dir, &u->file);
 841
 842   host_modified = lowercase_str (u->host);
 843
 844   /* Decode %HH sequences in host name.  This is important not so much
 845      to support %HH sequences in host names (which other browser
 846      don't), but to support binary characters (which will have been
 847      converted to %HH by reencode_escapes).  */
 848   if (strchr (u->host, '%'))
 849     {
 850       url_unescape (u->host);
 851       host_modified = 1;
 852     }
 853
 854   if (params_b)
 855     u->params = strdupdelim (params_b, params_e);
 856   if (query_b)
 857     u->query = strdupdelim (query_b, query_e);
 858   if (fragment_b)
 859     u->fragment = strdupdelim (fragment_b, fragment_e);
 860
 861   if (path_modified || u->fragment || host_modified || path_b == path_e)
 862     {
 863       /* If we suspect that a transformation has rendered what
 864          url_string might return different from URL_ENCODED, rebuild
 865          u->url using url_string.  */
 866       u->url = url_string (u, 0);
 867
 868       if (url_encoded != url)
 869         xfree ((char *) url_encoded);
 870     }
 871   else
 872     {
 873       if (url_encoded == url)
 874         u->url = xstrdup (url);
 875       else
 876         u->url = url_encoded;
 877     }
 878   url_encoded = NULL;
 879
 880   return u;
 881
 882  err:
 883   /* Cleanup in case of error: */
 884   if (url_encoded && url_encoded != url)
 885     xfree (url_encoded);
 886
 887   /* Transmit the error code to the caller, if the caller wants to
 888      know.  */
 889   if (error)
 890     *error = error_code;
 891   return NULL;
 892 }
 893
 894 /* Return the error message string from ERROR_CODE, which should have
 895    been retrieved from url_parse.  The error message is translated.  */
 896
 897 const char *
 898 url_error (int error_code)
 899 {
 900   assert (error_code >= 0 && error_code < countof (parse_errors));
 901   return _(parse_errors[error_code]);
 902 }
 903
 904 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 905    expected to be URL-escaped.
 906
 907    The path is split into directory (the part up to the last slash)
 908    and file (the part after the last slash), which are subsequently
 909    unescaped.  Examples:
 910
 911    PATH                 DIR           FILE
 912    "foo/bar/baz"        "foo/bar"     "baz"
 913    "foo/bar/"           "foo/bar"     ""
 914    "foo"                ""            "foo"
 915    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 916
 917    DIR and FILE are freshly allocated.  */
 918
 919 static void
 920 split_path (const char *path, char **dir, char **file)
 921 {
 922   char *last_slash = strrchr (path, '/');
 923   if (!last_slash)
 924     {
 925       *dir = xstrdup ("");
 926       *file = xstrdup (path);
 927     }
 928   else
 929     {
 930       *dir = strdupdelim (path, last_slash);
 931       *file = xstrdup (last_slash + 1);
 932     }
 933   url_unescape (*dir);
 934   url_unescape (*file);
 935 }
 936
 937 /* Note: URL's "full path" is the path with the query string and
 938    params appended.  The "fragment" (#foo) is intentionally ignored,
 939    but that might be changed.  For example, if the original URL was
 940    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 941    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 942
 943 /* Return the length of the full path, without the terminating
 944    zero.  */
 945
 946 static int
 947 full_path_length (const struct url *url)
 948 {
 949   int len = 0;
 950
 951 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 952
 953   FROB (path);
 954   FROB (params);
 955   FROB (query);
 956
 957 #undef FROB
 958
 959   return len;
 960 }
 961
 962 /* Write out the full path. */
 963
 964 static void
 965 full_path_write (const struct url *url, char *where)
 966 {
 967 #define FROB(el, chr) do {                      \
 968   char *f_el = url->el;                         \
 969   if (f_el) {                                   \
 970     int l = strlen (f_el);                      \
 971     *where++ = chr;                             \
 972     memcpy (where, f_el, l);                    \
 973     where += l;                                 \
 974   }                                             \
 975 } while (0)
 976
 977   FROB (path, '/');
 978   FROB (params, ';');
 979   FROB (query, '?');
 980
 981 #undef FROB
 982 }
 983
 984 /* Public function for getting the "full path".  E.g. if u->path is
 985    "foo/bar" and u->query is "param=value", full_path will be
 986    "/foo/bar?param=value". */
 987
 988 char *
 989 url_full_path (const struct url *url)
 990 {
 991   int length = full_path_length (url);
 992   char *full_path = (char *) xmalloc (length + 1);
 993
 994   full_path_write (url, full_path);
 995   full_path[length] = '\0';
 996
 997   return full_path;
 998 }
 999
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   Every three-character sequence "%XY" in STR, where X and Y are the
   digits XNUM_TO_DIGIT yields for the high and low nibble of CHR, is
   replaced by CHR itself.  All other characters are kept as they are.
   STR is rewritten in place (it can only get shorter) and is
   zero-terminated again at its new end.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set
     cannot turn into a negative value when shifted.  */
  const unsigned char code = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (code >> 4);
  const char c2 = XNUM_TO_DIGIT (code & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */
  for (; *h; h++, t++)
    {
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          /* Collapse the escape; skip its two digits.  */
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1023
1024 /* Escape unsafe and reserved characters, except for the slash
1025    characters.  */
1026
1027 static char *
1028 url_escape_dir (const char *dir)
1029 {
1030   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1031   if (newdir == dir)
1032     return (char *)dir;
1033
1034   unescape_single_char (newdir, '/');
1035   return newdir;
1036 }
1037
1038 /* Sync u->path and u->url with u->dir and u->file.  Called after
1039    u->file or u->dir have been changed, typically by the FTP code.  */
1040
1041 static void
1042 sync_path (struct url *u)
1043 {
1044   char *newpath, *efile, *edir;
1045
1046   xfree (u->path);
1047
1048   /* u->dir and u->file are not escaped.  URL-escape them before
1049      reassembling them into u->path.  That way, if they contain
1050      separators like '?' or even if u->file contains slashes, the
1051      path will be correctly assembled.  (u->file can contain slashes
1052      if the URL specifies it with %2f, or if an FTP server returns
1053      it.)  */
1054   edir = url_escape_dir (u->dir);
1055   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1056
1057   if (!*edir)
1058     newpath = xstrdup (efile);
1059   else
1060     {
1061       int dirlen = strlen (edir);
1062       int filelen = strlen (efile);
1063
1064       /* Copy "DIR/FILE" to newpath. */
1065       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1066       memcpy (p, edir, dirlen);
1067       p += dirlen;
1068       *p++ = '/';
1069       memcpy (p, efile, filelen);
1070       p += filelen;
1071       *p++ = '\0';
1072     }
1073
1074   u->path = newpath;
1075
1076   if (edir != u->dir)
1077     xfree (edir);
1078   if (efile != u->file)
1079     xfree (efile);
1080
1081   /* Regenerate u->url as well.  */
1082   xfree (u->url);
1083   u->url = url_string (u, 0);
1084 }
1085
1086 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1087    This way we can sync u->path and u->url when they get changed.  */
1088
1089 void
1090 url_set_dir (struct url *url, const char *newdir)
1091 {
1092   xfree (url->dir);
1093   url->dir = xstrdup (newdir);
1094   sync_path (url);
1095 }
1096
1097 void
1098 url_set_file (struct url *url, const char *newfile)
1099 {
1100   xfree (url->file);
1101   url->file = xstrdup (newfile);
1102   sync_path (url);
1103 }
1104
1105 void
1106 url_free (struct url *url)
1107 {
1108   xfree (url->host);
1109   xfree (url->path);
1110   xfree (url->url);
1111
1112   xfree_null (url->params);
1113   xfree_null (url->query);
1114   xfree_null (url->fragment);
1115   xfree_null (url->user);
1116   xfree_null (url->passwd);
1117
1118   xfree (url->dir);
1119   xfree (url->file);
1120
1121   xfree (url);
1122 }
1123 \f
1124 /* Create all the necessary directories for PATH (a file).  Calls
1125    mkdirhier() internally.  */
1126 int
1127 mkalldirs (const char *path)
1128 {
1129   const char *p;
1130   char *t;
1131   struct_stat st;
1132   int res;
1133
1134   p = path + strlen (path);
1135   for (; *p != '/' && p != path; p--)
1136     ;
1137
1138   /* Don't create if it's just a file.  */
1139   if ((p == path) && (*p != '/'))
1140     return 0;
1141   t = strdupdelim (path, p);
1142
1143   /* Check whether the directory exists.  */
1144   if ((stat (t, &st) == 0))
1145     {
1146       if (S_ISDIR (st.st_mode))
1147         {
1148           xfree (t);
1149           return 0;
1150         }
1151       else
1152         {
1153           /* If the dir exists as a file name, remove it first.  This
1154              is *only* for Wget to work with buggy old CERN http
1155              servers.  Here is the scenario: When Wget tries to
1156              retrieve a directory without a slash, e.g.
1157              http://foo/bar (bar being a directory), CERN server will
1158              not redirect it too http://foo/bar/ -- it will generate a
1159              directory listing containing links to bar/file1,
1160              bar/file2, etc.  Wget will lose because it saves this
1161              HTML listing to a file `bar', so it cannot create the
1162              directory.  To work around this, if the file of the same
1163              name exists, we just remove it and create the directory
1164              anyway.  */
1165           DEBUGP (("Removing %s because of directory danger!\n", t));
1166           unlink (t);
1167         }
1168     }
1169   res = make_directory (t);
1170   if (res != 0)
1171     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1172   xfree (t);
1173   return res;
1174 }
1175 \f
/* Functions for constructing the file name out of URL components.  */

/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in questions, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the string storage */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset at which the next append goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   The actual resizing is delegated to DO_REALLOC.  G is evaluated
   once; APPEND_SIZE is parenthesized so that any expression may be
   passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1206
1207 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1208    terminated.  */
1209
1210 static void
1211 append_string (const char *str, struct growable *dest)
1212 {
1213   int l = strlen (str);
1214   GROW (dest, l);
1215   memcpy (TAIL (dest), str, l);
1216   TAIL_INCR (dest, l);
1217 }
1218
1219 /* Append CH to DEST.  For example, append_char (0, DEST)
1220    zero-terminates DEST.  */
1221
1222 static void
1223 append_char (char ch, struct growable *dest)
1224 {
1225   GROW (dest, 1);
1226   *TAIL (dest) = ch;
1227   TAIL_INCR (dest, 1);
1228 }
1229
/* Classification bits stored in filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the classification bits in
   MASK.  C is converted to unsigned char before indexing, so plain
   (possibly signed) chars may be passed.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping.

   `|' is marked W to agree with the list of Windows-unusable
   characters given for filechr_not_windows, and DEL (127) is marked
   C because it is a control character just like 0-31.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1287
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' unless file names are restricted to what Windows
   accepts, in which case it is '+' because Windows can't handle ':'
   in file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless file names are restricted to what Windows accepts, in
   which case it is '@' because Windows cannot handle '?' as part of
   a file name.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1298
1299 /* Quote path element, characters in [b, e), as file name, and append
1300    the quoted string to DEST.  Each character is quoted as per
1301    file_unsafe_char and the corresponding table.
1302
1303    If ESCAPED_P is non-zero, the path element is considered to be
1304    URL-escaped and will be unescaped prior to inspection.  */
1305
1306 static void
1307 append_uri_pathel (const char *b, const char *e, int escaped_p,
1308                    struct growable *dest)
1309 {
1310   const char *p;
1311   int quoted, outlen;
1312
1313   int mask;
1314   if (opt.restrict_files_os == restrict_unix)
1315     mask = filechr_not_unix;
1316   else
1317     mask = filechr_not_windows;
1318   if (opt.restrict_files_ctrl)
1319     mask |= filechr_control;
1320
1321   /* Copy [b, e) to PATHEL and URL-unescape it. */
1322   if (escaped_p)
1323     {
1324       char *unescaped;
1325       BOUNDED_TO_ALLOCA (b, e, unescaped);
1326       url_unescape (unescaped);
1327       b = unescaped;
1328       e = unescaped + strlen (unescaped);
1329     }
1330
1331   /* Defang ".." when found as component of path.  Remember that path
1332      comes from the URL and might contain malicious input.  */
1333   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1334     {
1335       b = "%2E%2E";
1336       e = b + 6;
1337     }
1338
1339   /* Walk the PATHEL string and check how many characters we'll need
1340      to quote.  */
1341   quoted = 0;
1342   for (p = b; p < e; p++)
1343     if (FILE_CHAR_TEST (*p, mask))
1344       ++quoted;
1345
1346   /* Calculate the length of the output string.  e-b is the input
1347      string length.  Each quoted char introduces two additional
1348      characters in the string, hence 2*quoted.  */
1349   outlen = (e - b) + (2 * quoted);
1350   GROW (dest, outlen);
1351
1352   if (!quoted)
1353     {
1354       /* If there's nothing to quote, we can simply append the string
1355          without processing it again.  */
1356       memcpy (TAIL (dest), b, outlen);
1357     }
1358   else
1359     {
1360       char *q = TAIL (dest);
1361       for (p = b; p < e; p++)
1362         {
1363           if (!FILE_CHAR_TEST (*p, mask))
1364             *q++ = *p;
1365           else
1366             {
1367               unsigned char ch = *p;
1368               *q++ = '%';
1369               *q++ = XNUM_TO_DIGIT (ch >> 4);
1370               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1371             }
1372         }
1373       assert (q - TAIL (dest) == outlen);
1374     }
1375   TAIL_INCR (dest, outlen);
1376 }
1377
1378 /* Append to DEST the directory structure that corresponds the
1379    directory part of URL's path.  For example, if the URL is
1380    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1381
1382    Each path element ("dir1" and "dir2" in the above example) is
1383    examined, url-unescaped, and re-escaped as file name element.
1384
1385    Additionally, it cuts as many directories from the path as
1386    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1387    will produce "bar" for the above example.  For 2 or more, it will
1388    produce "".
1389
1390    Each component of the path is quoted for use as file name.  */
1391
1392 static void
1393 append_dir_structure (const struct url *u, struct growable *dest)
1394 {
1395   char *pathel, *next;
1396   int cut = opt.cut_dirs;
1397
1398   /* Go through the path components, de-URL-quote them, and quote them
1399      (if necessary) as file names.  */
1400
1401   pathel = u->path;
1402   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1403     {
1404       if (cut-- > 0)
1405         continue;
1406       if (pathel == next)
1407         /* Ignore empty pathels.  */
1408         continue;
1409
1410       if (dest->tail)
1411         append_char ('/', dest);
1412       append_uri_pathel (pathel, next, 1, dest);
1413     }
1414 }
1415
1416 /* Return a unique file name that matches the given URL as good as
1417    possible.  Does not create directories on the file system.  */
1418
1419 char *
1420 url_file_name (const struct url *u)
1421 {
1422   struct growable fnres;        /* stands for "file name result" */
1423
1424   const char *u_file, *u_query;
1425   char *fname, *unique;
1426
1427   fnres.base = NULL;
1428   fnres.size = 0;
1429   fnres.tail = 0;
1430
1431   /* Start with the directory prefix, if specified. */
1432   if (opt.dir_prefix)
1433     append_string (opt.dir_prefix, &fnres);
1434
1435   /* If "dirstruct" is turned on (typically the case with -r), add
1436      the host and port (unless those have been turned off) and
1437      directory structure.  */
1438   if (opt.dirstruct)
1439     {
1440       if (opt.protocol_directories)
1441         {
1442           if (fnres.tail)
1443             append_char ('/', &fnres);
1444           append_string (supported_schemes[u->scheme].name, &fnres);
1445         }
1446       if (opt.add_hostdir)
1447         {
1448           if (fnres.tail)
1449             append_char ('/', &fnres);
1450           if (0 != strcmp (u->host, ".."))
1451             append_string (u->host, &fnres);
1452           else
1453             /* Host name can come from the network; malicious DNS may
1454                allow ".." to be resolved, causing us to write to
1455                "../<file>".  Defang such host names.  */
1456             append_string ("%2E%2E", &fnres);
1457           if (u->port != scheme_default_port (u->scheme))
1458             {
1459               char portstr[24];
1460               number_to_string (portstr, u->port);
1461               append_char (FN_PORT_SEP, &fnres);
1462               append_string (portstr, &fnres);
1463             }
1464         }
1465
1466       append_dir_structure (u, &fnres);
1467     }
1468
1469   /* Add the file name. */
1470   if (fnres.tail)
1471     append_char ('/', &fnres);
1472   u_file = *u->file ? u->file : "index.html";
1473   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1474
1475   /* Append "?query" to the file name. */
1476   u_query = u->query && *u->query ? u->query : NULL;
1477   if (u_query)
1478     {
1479       append_char (FN_QUERY_SEP, &fnres);
1480       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1481     }
1482
1483   /* Zero-terminate the file name. */
1484   append_char ('\0', &fnres);
1485
1486   fname = fnres.base;
1487
1488   /* Check the cases in which the unique extensions are not used:
1489      1) Clobbering is turned off (-nc).
1490      2) Retrieval with regetting.
1491      3) Timestamping is used.
1492      4) Hierarchy is built.
1493
1494      The exception is the case when file does exist and is a
1495      directory (see `mkalldirs' for explanation).  */
1496
1497   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1498       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1499     return fname;
1500
1501   unique = unique_name (fname, 1);
1502   if (unique != fname)
1503     xfree (fname);
1504   return unique;
1505 }
1506 \f
/* Resolve the "." and ".." elements of PATH, rewriting PATH in
   place.  Returns non-zero if PATH was changed, zero otherwise.

   Similar in spirit to the algorithm of rfc1808, but done in a
   single pass: an element consisting only of "." is dropped, and
   ".." backs up over the element before it.  A ".." that has nothing
   left to back up over is kept literally.  Single leading and
   trailing slashes are preserved.

   For example, "a/b/c/./../d/.." becomes "a/b/".  If you change
   anything here, run test_path_simplify to make sure no test case
   has been broken.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position */
  char *dst = path;             /* write position */
  char *limit = path;           /* DST never retreats below this */
  char *const end = path + strlen (path);

  while (src < end)
    {
      /* SRC is at the start of a path element here.  */
      int dotdot;

      if (src[0] == '.' && (src[1] == '/' || src[1] == '\0'))
        {
          /* "./" (or a final "."): drop it.  */
          src += 2;
          continue;
        }

      dotdot = (src[0] == '.' && src[1] == '.'
                && (src[2] == '/' || src[2] == '\0'));

      if (dotdot && dst > limit)
        {
          /* Back DST up to the start of the previous element, or to
             LIMIT, whichever comes first.  */
          do
            --dst;
          while (dst > limit && dst[-1] != '/');
          src += 3;
          continue;
        }

      if (dotdot)
        /* Nothing to back up over: the "../" is kept literally
           below, and LIMIT is moved past it so that a later ".."
           cannot remove it.  */
        limit = dst + 3;

      /* Keep this element, including the slash that ends it.  While
         nothing has been dropped yet (DST == SRC) there is no need
         to write anything.  */
      if (dst == src)
        {
          while (src < end && *src != '/')
            ++dst, ++src;
          if (src < end)
            ++dst, ++src;
        }
      else
        {
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  if (dst == src)
    return 0;

  *dst = '\0';
  return 1;
}
1589 \f
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  In other words, this is the length of the longest prefix
   of URL that contains none of those three characters.  */

static int
path_length (const char *url)
{
  return (int) strcspn (url, "?;#");
}
1600
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is never
   examined, while the one at B is.  We might want to use memrchr
   (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that exactly the characters
     e[-1] ... b[0] are inspected.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1613
/* Return a freshly allocated, zero-terminated string made of the
   first SPAN characters of BASE followed by the LINKLENGTH characters
   of LINK.  Helper for uri_merge.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *joined = xmalloc (span + linklength + 1);

  if (span)
    memcpy (joined, base, span);
  memcpy (joined + span, link, linklength);
  joined[span + linklength] = '\0';
  return joined;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is not run through path_simplify: that would complicate
   the code, and url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;
  int linklength;

  /* A LINK with a scheme of its own stands by itself.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (*link == '\0')
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK keeps the location but replaces the query string:  */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_concat (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK replaces only the fragment:  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return uri_merge_concat (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: it replaces everything from BASE's
         double slash on, or all of BASE if BASE's first slash is
         not part of a double slash.  */
      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: it replaces everything from the
         first slash that follows "//" on.  So, if BASE is
         "http://host/whatever/foo/bar", and LINK is "/qux/xyzzy",
         the result is "http://host/qux/xyzzy".  */
      const char *start_insert;
      const char *slash = memchr (base, '/', end - base);
      int seen_slash_slash = 0;

      if (slash && slash[1] == '/')
        {
          /* Step over the double slash and look for the next one.  */
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      if (!seen_slash_slash)
        /* "foo" or "foo/bar": LINK replaces all of BASE.  */
        start_insert = base;
      else if (slash)
        /* "http://something/": insert at the slash after the host.  */
        start_insert = slash;
      else
        /* "http://foo": insert at the end of the host.  */
        start_insert = end;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: it replaces everything after the last
     slash (possibly nothing) of BASE.  So, if BASE is
     "whatever/foo/bar", and LINK is "qux/xyzzy", the result is
     "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    const char *start_insert;
    int need_explicit_slash = 0;
    int span;
    char *merge;

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      start_insert = base;
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* "http://host": the last slash belongs to "://".  Keep all
           of BASE up to END and put a slash of our own after it.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      /* "whatever/foo/bar": insert right after the last slash.  */
      start_insert = last_slash + 1;

    span = start_insert - base;
    merge = uri_merge_concat (base, span, link, linklength);
    if (need_explicit_slash)
      /* The character copied from position END gives way to the
         slash that separates host and LINK.  */
      merge[span - 1] = '/';
    return merge;
  }
}
1804 \f
/* Copy the zero-terminated string S (without its terminator) to the
   location P points to and advance P past the copied characters.  P
   must be a modifiable pointer lvalue.  S is evaluated exactly once,
   and the macro's internal names end in an underscore so that they
   do not capture variables of the caller.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  int append_len_ = strlen (append_src_);               \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1816
1817 /* Recreate the URL string from the data in URL.
1818
1819    If HIDE is non-zero (as it is when we're calling this on a URL we
1820    plan to print, but not when calling it to canonicalize a URL for
1821    use within the program), password will be hidden.  Unsafe
1822    characters in the URL will be quoted.  */
1823
1824 char *
1825 url_string (const struct url *url, int hide_password)
1826 {
1827   int size;
1828   char *result, *p;
1829   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1830
1831   int scheme_port  = supported_schemes[url->scheme].default_port;
1832   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1833   int fplen = full_path_length (url);
1834
1835   int brackets_around_host;
1836
1837   assert (scheme_str != NULL);
1838
1839   /* Make sure the user name and password are quoted. */
1840   if (url->user)
1841     {
1842       quoted_user = url_escape_allow_passthrough (url->user);
1843       if (url->passwd)
1844         {
1845           if (hide_password)
1846             quoted_passwd = HIDDEN_PASSWORD;
1847           else
1848             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1849         }
1850     }
1851
1852   /* In the unlikely event that the host name contains non-printable
1853      characters, quote it for displaying to the user.  */
1854   quoted_host = url_escape_allow_passthrough (url->host);
1855
1856   /* Undo the quoting of colons that URL escaping performs.  IPv6
1857      addresses may legally contain colons, and in that case must be
1858      placed in square brackets.  */
1859   if (quoted_host != url->host)
1860     unescape_single_char (quoted_host, ':');
1861   brackets_around_host = strchr (quoted_host, ':') != NULL;
1862
1863   size = (strlen (scheme_str)
1864           + strlen (quoted_host)
1865           + (brackets_around_host ? 2 : 0)
1866           + fplen
1867           + 1);
1868   if (url->port != scheme_port)
1869     size += 1 + numdigit (url->port);
1870   if (quoted_user)
1871     {
1872       size += 1 + strlen (quoted_user);
1873       if (quoted_passwd)
1874         size += 1 + strlen (quoted_passwd);
1875     }
1876
1877   p = result = xmalloc (size);
1878
1879   APPEND (p, scheme_str);
1880   if (quoted_user)
1881     {
1882       APPEND (p, quoted_user);
1883       if (quoted_passwd)
1884         {
1885           *p++ = ':';
1886           APPEND (p, quoted_passwd);
1887         }
1888       *p++ = '@';
1889     }
1890
1891   if (brackets_around_host)
1892     *p++ = '[';
1893   APPEND (p, quoted_host);
1894   if (brackets_around_host)
1895     *p++ = ']';
1896   if (url->port != scheme_port)
1897     {
1898       *p++ = ':';
1899       p = number_to_string (p, url->port);
1900     }
1901
1902   full_path_write (url, p);
1903   p += fplen;
1904   *p++ = '\0';
1905
1906   assert (p - result == size);
1907
1908   if (quoted_user && quoted_user != url->user)
1909     xfree (quoted_user);
1910   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1911     xfree (quoted_passwd);
1912   if (quoted_host != url->host)
1913     xfree (quoted_host);
1914
1915   return result;
1916 }
1917 \f
1918 /* Return non-zero if scheme a is similar to scheme b.
1919
1920    Schemes are similar if they are equal.  If SSL is supported, schemes
1921    are also similar if one is http (SCHEME_HTTP) and the other is https
1922    (SCHEME_HTTPS).  */
1923 int
1924 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1925 {
1926   if (a == b)
1927     return 1;
1928 #ifdef HAVE_SSL
1929   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1930       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1931     return 1;
1932 #endif
1933   return 0;
1934 }
1935 \f
1936 #if 0
1937 /* Debugging and testing support for path_simplify. */
1938
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and hand the duplicate back.  PATH
   itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified;

  simplified = xstrdup (path);
  path_simplify (simplified);

  return simplified;
}
1948
/* Run path_simplify on a private copy of TEST and report mismatches
   on standard output.  Two things are checked independently: that
   the simplified string equals EXPECTED_RESULT, and that the value
   returned by path_simplify equals EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);
  int same_text = (strcmp (work, expected_result) == 0);

  if (!same_text)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (work);
}
1971
/* Feed a fixed table of paths to run_test.  Each entry holds the
   input path, the string expected after path_simplify, and the value
   path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int changes;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int k;

  /* Run the cases in table order.  */
  for (k = 0; k < countof (cases); k++)
    run_test (cases[k].input, cases[k].expected, cases[k].changes);
}
2012 #endif