sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
/* Static description of one URL scheme.  The members are initialized
   positionally in supported_schemes, so their order here must not be
   changed without updating that table.  */
struct scheme_data
{
  const char *name;             /* bare scheme name, e.g. "http" */
  const char *leading_string;   /* URL prefix that url_scheme matches
                                   case-insensitively, e.g. "http://" */
  int default_port;             /* port url_parse uses when the URL
                                   does not specify one */
  bool enabled;                 /* cleared by scheme_disable; url_scheme
                                   reports a disabled scheme as
                                   SCHEME_INVALID */
};
  53
  54 /* Supported schemes: */
  55 static struct scheme_data supported_schemes[] =
  56 {
  57   { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
  58 #ifdef HAVE_SSL
  59   { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
  60 #endif
  61   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },
  62
  63   /* SCHEME_INVALID */
  64   { NULL,       NULL,       -1,                 0 }
  65 };
  66
  67 /* Forward declarations: */
  68
  69 static bool path_simplify (char *);
  70 \f
  71 /* Support for escaping and unescaping of URL strings.  */
  72
  73 /* Table of "reserved" and "unsafe" characters.  Those terms are
  74    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  75    specs, but the general idea remains.
  76
  77    A reserved character is the one that you can't decode without
  78    changing the meaning of the URL.  For example, you can't decode
  79    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  80    path components is different.  Non-reserved characters can be
  81    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  82    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  83    as recommended by rfc2396, and minus "~", which is very frequently
  84    used (and sometimes unrecognized as %7E by broken servers).
  85
  86    An unsafe character is the one that should be encoded when URLs are
  87    placed in foreign environments.  E.g. space and newline are unsafe
  88    in HTTP contexts because HTTP uses them as separator and line
  89    terminator, so they must be encoded to %20 and %0A respectively.
  90    "*" is unsafe in shell context, etc.
  91
  92    We determine whether a character is unsafe through static table
  93    lookup.  This code assumes ASCII character set and 8-bit chars.  */
  94
  95 enum {
  96   /* rfc1738 reserved chars + "$" and ",".  */
  97   urlchr_reserved = 1,
  98
  99   /* rfc1738 unsafe chars, plus non-printables.  */
 100   urlchr_unsafe   = 2
 101 };
 102
 103 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 104 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 105 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 106
 107 /* Shorthands for the table: */
 108 #define R  urlchr_reserved
 109 #define U  urlchr_unsafe
 110 #define RU R|U
 111
 112 static const unsigned char urlchr_table[256] =
 113 {
 114   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 115   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 116   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 117   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 118   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 119   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 120   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 121   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 122  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 123   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 124   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 125   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 126   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 127   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 128   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 129   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 130
 131   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 132   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 133   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 134   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 135
 136   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 137   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 138   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 139   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 140 };
 141 #undef R
 142 #undef U
 143 #undef RU
 144
 145 /* URL-unescape the string S.
 146
 147    This is done by transforming the sequences "%HH" to the character
 148    represented by the hexadecimal digits HH.  If % is not followed by
 149    two hexadecimal digits, it is inserted literally.
 150
 151    The transformation is done in place.  If you need the original
 152    string intact, make a copy before calling this function.  */
 153
 154 static void
 155 url_unescape (char *s)
 156 {
 157   char *t = s;                  /* t - tortoise */
 158   char *h = s;                  /* h - hare     */
 159
 160   for (; *h; h++, t++)
 161     {
 162       if (*h != '%')
 163         {
 164         copychar:
 165           *t = *h;
 166         }
 167       else
 168         {
 169           char c;
 170           /* Do nothing if '%' is not followed by two hex digits. */
 171           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 172             goto copychar;
 173           c = X2DIGITS_TO_NUM (h[1], h[2]);
 174           /* Don't unescape %00 because there is no way to insert it
 175              into a C string without effectively truncating it. */
 176           if (c == '\0')
 177             goto copychar;
 178           *t = c;
 179           h += 2;
 180         }
 181     }
 182   *t = '\0';
 183 }
 184
 185 /* The core of url_escape_* functions.  Escapes the characters that
 186    match the provided mask in urlchr_table.
 187
 188    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 189    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 190    allocated string will be returned in all cases.  */
 191
 192 static char *
 193 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 194 {
 195   const char *p1;
 196   char *p2, *newstr;
 197   int newlen;
 198   int addition = 0;
 199
 200   for (p1 = s; *p1; p1++)
 201     if (urlchr_test (*p1, mask))
 202       addition += 2;            /* Two more characters (hex digits) */
 203
 204   if (!addition)
 205     return allow_passthrough ? (char *)s : xstrdup (s);
 206
 207   newlen = (p1 - s) + addition;
 208   newstr = xmalloc (newlen + 1);
 209
 210   p1 = s;
 211   p2 = newstr;
 212   while (*p1)
 213     {
 214       /* Quote the characters that match the test mask. */
 215       if (urlchr_test (*p1, mask))
 216         {
 217           unsigned char c = *p1++;
 218           *p2++ = '%';
 219           *p2++ = XNUM_TO_DIGIT (c >> 4);
 220           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 221         }
 222       else
 223         *p2++ = *p1++;
 224     }
 225   assert (p2 - newstr == newlen);
 226   *p2 = '\0';
 227
 228   return newstr;
 229 }
 230
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  Passthrough is
   disabled in the call below, so the result is a new allocation even
   when S contains nothing that needs escaping.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, false);
}
 239
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.

   Because the result may be S itself (with const cast away), the
   caller has to compare the returned pointer with S to tell whether
   a new string was allocated.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, true);
}
 248 \f
 249 /* Decide whether the char at position P needs to be encoded.  (It is
 250    not enough to pass a single char *P because the function may need
 251    to inspect the surrounding context.)
 252
 253    Return true if the char should be escaped as %XX, false otherwise.  */
 254
 255 static inline bool
 256 char_needs_escaping (const char *p)
 257 {
 258   if (*p == '%')
 259     {
 260       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 261         return false;
 262       else
 263         /* Garbled %.. sequence: encode `%'. */
 264         return true;
 265     }
 266   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 267     return true;
 268   else
 269     return false;
 270 }
 271
 272 /* Translate a %-escaped (but possibly non-conformant) input string S
 273    into a %-escaped (and conformant) output string.  If no characters
 274    are encoded or decoded, return the same string S; otherwise, return
 275    a freshly allocated string with the new contents.
 276
 277    After a URL has been run through this function, the protocols that
 278    use `%' as the quote character can use the resulting string as-is,
 279    while those that don't can use url_unescape to get to the intended
 280    data.  This function is stable: once the input is transformed,
 281    further transformations of the result yield the same output.
 282
 283    Let's discuss why this function is needed.
 284
 285    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 286    a raw space character would mess up the HTTP request, it needs to
 287    be quoted, like this:
 288
 289        GET /abc%20def HTTP/1.0
 290
 291    It would appear that the unsafe chars need to be quoted, for
 292    example with url_escape.  But what if we're requested to download
 293    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 294    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 295    part of URL syntax, "%20" is the correct way to denote a literal
 296    space on the Wget command line.  This leads to the conclusion that
 297    in that case Wget should not call url_escape, but leave the `%20'
 298    as is.  This is clearly contradictory, but it only gets worse.
 299
 300    What if the requested URI is `abc%20 def'?  If we call url_escape,
 301    we end up with `/abc%2520%20def', which is almost certainly not
 302    intended.  If we don't call url_escape, we are left with the
 303    embedded space and cannot complete the request.  What the user
 304    meant was for Wget to request `/abc%20%20def', and this is where
 305    reencode_escapes kicks in.
 306
 307    Wget used to solve this by first decoding %-quotes, and then
 308    encoding all the "unsafe" characters found in the resulting string.
 309    This was wrong because it didn't preserve certain URL special
 310    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 311    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 312    whether we considered `+' reserved (it is).  One of these results
 313    is inevitable because by the second step we would lose information
 314    on whether the `+' was originally encoded or not.  Both results
 315    were wrong because in CGI parameters + means space, while %2B means
 316    literal plus.  reencode_escapes correctly translates the above to
 317    "a%2B+b", i.e. returns the original string.
 318
 319    This function uses a modified version of the algorithm originally
 320    proposed by Anon Sricharoenchai:
 321
 322    * Encode all "unsafe" characters, except those that are also
 323      "reserved", to %XX.  See urlchr_table for which characters are
 324      unsafe and reserved.
 325
 326    * Encode the "%" characters not followed by two hex digits to
 327      "%25".
 328
 329    * Pass through all other characters and %XX escapes as-is.  (Up to
 330      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 331      characters, but that was obtrusive and broke some servers.)
 332
 333    Anon's test case:
 334
 335    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 336    ->
 337    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 338
 339    Simpler test cases:
 340
 341    "foo bar"         -> "foo%20bar"
 342    "foo%20bar"       -> "foo%20bar"
 343    "foo %20bar"      -> "foo%20%20bar"
 344    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 345    "foo%25%20bar"    -> "foo%25%20bar"
 346    "foo%2%20bar"     -> "foo%252%20bar"
 347    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 348    "foo%2b+bar"      -> "foo%2b+bar"  */
 349
 350 static char *
 351 reencode_escapes (const char *s)
 352 {
 353   const char *p1;
 354   char *newstr, *p2;
 355   int oldlen, newlen;
 356
 357   int encode_count = 0;
 358
 359   /* First pass: inspect the string to see if there's anything to do,
 360      and to calculate the new length.  */
 361   for (p1 = s; *p1; p1++)
 362     if (char_needs_escaping (p1))
 363       ++encode_count;
 364
 365   if (!encode_count)
 366     /* The string is good as it is. */
 367     return (char *) s;          /* C const model sucks. */
 368
 369   oldlen = p1 - s;
 370   /* Each encoding adds two characters (hex digits).  */
 371   newlen = oldlen + 2 * encode_count;
 372   newstr = xmalloc (newlen + 1);
 373
 374   /* Second pass: copy the string to the destination address, encoding
 375      chars when needed.  */
 376   p1 = s;
 377   p2 = newstr;
 378
 379   while (*p1)
 380     if (char_needs_escaping (p1))
 381       {
 382         unsigned char c = *p1++;
 383         *p2++ = '%';
 384         *p2++ = XNUM_TO_DIGIT (c >> 4);
 385         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 386       }
 387     else
 388       *p2++ = *p1++;
 389
 390   *p2 = '\0';
 391   assert (p2 - newstr == newlen);
 392   return newstr;
 393 }
 394 \f
 395 /* Returns the scheme type if the scheme is supported, or
 396    SCHEME_INVALID if not.  */
 397
 398 enum url_scheme
 399 url_scheme (const char *url)
 400 {
 401   int i;
 402
 403   for (i = 0; supported_schemes[i].leading_string; i++)
 404     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 405                           strlen (supported_schemes[i].leading_string)))
 406       {
 407         if (supported_schemes[i].enabled)
 408           return (enum url_scheme) i;
 409         else
 410           return SCHEME_INVALID;
 411       }
 412
 413   return SCHEME_INVALID;
 414 }
 415
 416 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 417
 418 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 419    currently implemented, it returns true if URL begins with
 420    [-+a-zA-Z0-9]+: .  */
 421
 422 bool
 423 url_has_scheme (const char *url)
 424 {
 425   const char *p = url;
 426
 427   /* The first char must be a scheme char. */
 428   if (!*p || !SCHEME_CHAR (*p))
 429     return false;
 430   ++p;
 431   /* Followed by 0 or more scheme chars. */
 432   while (*p && SCHEME_CHAR (*p))
 433     ++p;
 434   /* Terminated by ':'. */
 435   return *p == ':';
 436 }
 437
/* Return the default_port recorded for SCHEME in supported_schemes.
   SCHEME is used directly as an index into that table and is not
   range-checked here.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
 443
/* Mark SCHEME as disabled in supported_schemes.  From then on
   url_scheme returns SCHEME_INVALID for URLs that begin with this
   scheme's leading string.  SCHEME is used directly as a table index
   and is not range-checked here.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = false;
}
 449
 450 /* Skip the username and password, if present in the URL.  The
 451    function should *not* be called with the complete URL, but with the
 452    portion after the scheme.
 453
 454    If no username and password are found, return URL.  */
 455
 456 static const char *
 457 url_skip_credentials (const char *url)
 458 {
 459   /* Look for '@' that comes before terminators, such as '/', '?',
 460      '#', or ';'.  */
 461   const char *p = (const char *)strpbrk (url, "@/?#;");
 462   if (!p || *p != '@')
 463     return url;
 464   return p + 1;
 465 }
 466
 467 /* Parse credentials contained in [BEG, END).  The region is expected
 468    to have come from a URL and is unescaped.  */
 469
 470 static bool
 471 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 472 {
 473   char *colon;
 474   const char *userend;
 475
 476   if (beg == end)
 477     return false;               /* empty user name */
 478
 479   colon = memchr (beg, ':', end - beg);
 480   if (colon == beg)
 481     return false;               /* again empty user name */
 482
 483   if (colon)
 484     {
 485       *passwd = strdupdelim (colon + 1, end);
 486       userend = colon;
 487       url_unescape (*passwd);
 488     }
 489   else
 490     {
 491       *passwd = NULL;
 492       userend = end;
 493     }
 494   *user = strdupdelim (beg, userend);
 495   url_unescape (*user);
 496   return true;
 497 }
 498
 499 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 500    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 501
 502    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 503    www.foo.com[:port]            -> http://www.foo.com[:port]
 504
 505    FTP shorthands look like this:
 506
 507    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 508    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 509
 510    If the URL needs not or cannot be rewritten, return NULL.  */
 511
 512 char *
 513 rewrite_shorthand_url (const char *url)
 514 {
 515   const char *p;
 516
 517   if (url_scheme (url) != SCHEME_INVALID)
 518     return NULL;
 519
 520   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 521      latter Netscape.  */
 522   for (p = url; *p && *p != ':' && *p != '/'; p++)
 523     ;
 524
 525   if (p == url)
 526     return NULL;
 527
 528   /* If we're looking at "://", it means the URL uses a scheme we
 529      don't support, which may include "https" when compiled without
 530      SSL support.  Don't bogusly rewrite such URLs.  */
 531   if (p[0] == ':' && p[1] == '/' && p[2] == '/')
 532     return NULL;
 533
 534   if (*p == ':')
 535     {
 536       const char *pp;
 537       char *res;
 538       /* If the characters after the colon and before the next slash
 539          or end of string are all digits, it's HTTP.  */
 540       int digits = 0;
 541       for (pp = p + 1; ISDIGIT (*pp); pp++)
 542         ++digits;
 543       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 544         goto http;
 545
 546       /* Prepend "ftp://" to the entire URL... */
 547       res = xmalloc (6 + strlen (url) + 1);
 548       sprintf (res, "ftp://%s", url);
 549       /* ...and replace ':' with '/'. */
 550       res[6 + (p - url)] = '/';
 551       return res;
 552     }
 553   else
 554     {
 555       char *res;
 556     http:
 557       /* Just prepend "http://" to what we have. */
 558       res = xmalloc (7 + strlen (url) + 1);
 559       sprintf (res, "http://%s", url);
 560       return res;
 561     }
 562 }
 563 \f
 564 static void split_path (const char *, char **, char **);
 565
 566 /* Like strpbrk, with the exception that it returns the pointer to the
 567    terminating zero (end-of-string aka "eos") if no matching character
 568    is found.
 569
 570    Although I normally balk at Gcc-specific optimizations, it probably
 571    makes sense here: glibc has optimizations that detect strpbrk being
 572    called with literal string as ACCEPT and inline the search.  That
 573    optimization is defeated if strpbrk is hidden within the call to
 574    another function.  (And no, making strpbrk_or_eos inline doesn't
 575    help because the check for literal accept is in the
 576    preprocessor.)  */
 577
 578 #if defined(__GNUC__) && __GNUC__ >= 3
 579
 580 #define strpbrk_or_eos(s, accept) ({            \
 581   char *SOE_p = strpbrk (s, accept);            \
 582   if (!SOE_p)                                   \
 583     SOE_p = strchr (s, '\0');                   \
 584   SOE_p;                                        \
 585 })
 586
 587 #else  /* not __GNUC__ or old gcc */
 588
 589 static inline char *
 590 strpbrk_or_eos (const char *s, const char *accept)
 591 {
 592   char *p = strpbrk (s, accept);
 593   if (!p)
 594     p = strchr (s, '\0');
 595   return p;
 596 }
 597 #endif /* not __GNUC__ or old gcc */
 598
 599 /* Turn STR into lowercase; return true if a character was actually
 600    changed. */
 601
 602 static bool
 603 lowercase_str (char *str)
 604 {
 605   bool changed = false;
 606   for (; *str; str++)
 607     if (ISUPPER (*str))
 608       {
 609         changed = true;
 610         *str = TOLOWER (*str);
 611       }
 612   return changed;
 613 }
 614
/* Messages for the error codes that url_parse reports.  Each PE_*
   macro is defined directly above the message it selects, so the
   value of a code must equal the position of its message in the
   array; keep the two in step when adding entries.  The strings are
   only marked with N_() here; url_error passes them through _().  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_EMPTY_HOST                   2
  N_("Empty host"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 633
 634 /* Parse a URL.
 635
 636    Return a new struct url if successful, NULL on error.  In case of
 637    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 638    error code. */
 639 struct url *
 640 url_parse (const char *url, int *error)
 641 {
 642   struct url *u;
 643   const char *p;
 644   bool path_modified, host_modified;
 645
 646   enum url_scheme scheme;
 647
 648   const char *uname_b,     *uname_e;
 649   const char *host_b,      *host_e;
 650   const char *path_b,      *path_e;
 651   const char *params_b,    *params_e;
 652   const char *query_b,     *query_e;
 653   const char *fragment_b,  *fragment_e;
 654
 655   int port;
 656   char *user = NULL, *passwd = NULL;
 657
 658   char *url_encoded = NULL;
 659
 660   int error_code;
 661
 662   scheme = url_scheme (url);
 663   if (scheme == SCHEME_INVALID)
 664     {
 665       error_code = PE_UNSUPPORTED_SCHEME;
 666       goto error;
 667     }
 668
 669   url_encoded = reencode_escapes (url);
 670   p = url_encoded;
 671
 672   p += strlen (supported_schemes[scheme].leading_string);
 673   uname_b = p;
 674   p = url_skip_credentials (p);
 675   uname_e = p;
 676
 677   /* scheme://user:pass@host[:port]... */
 678   /*                    ^              */
 679
 680   /* We attempt to break down the URL into the components path,
 681      params, query, and fragment.  They are ordered like this:
 682
 683        scheme://host[:port][/path][;params][?query][#fragment]  */
 684
 685   params_b   = params_e   = NULL;
 686   query_b    = query_e    = NULL;
 687   fragment_b = fragment_e = NULL;
 688
 689   host_b = p;
 690
 691   if (*p == '[')
 692     {
 693       /* Handle IPv6 address inside square brackets.  Ideally we'd
 694          just look for the terminating ']', but rfc2732 mandates
 695          rejecting invalid IPv6 addresses.  */
 696
 697       /* The address begins after '['. */
 698       host_b = p + 1;
 699       host_e = strchr (host_b, ']');
 700
 701       if (!host_e)
 702         {
 703           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 704           goto error;
 705         }
 706
 707 #ifdef ENABLE_IPV6
 708       /* Check if the IPv6 address is valid. */
 709       if (!is_valid_ipv6_address(host_b, host_e))
 710         {
 711           error_code = PE_INVALID_IPV6_ADDRESS;
 712           goto error;
 713         }
 714
 715       /* Continue parsing after the closing ']'. */
 716       p = host_e + 1;
 717 #else
 718       error_code = PE_IPV6_NOT_SUPPORTED;
 719       goto error;
 720 #endif
 721     }
 722   else
 723     {
 724       p = strpbrk_or_eos (p, ":/;?#");
 725       host_e = p;
 726     }
 727
 728   if (host_b == host_e)
 729     {
 730       error_code = PE_EMPTY_HOST;
 731       goto error;
 732     }
 733
 734   port = scheme_default_port (scheme);
 735   if (*p == ':')
 736     {
 737       const char *port_b, *port_e, *pp;
 738
 739       /* scheme://host:port/tralala */
 740       /*              ^             */
 741       ++p;
 742       port_b = p;
 743       p = strpbrk_or_eos (p, "/;?#");
 744       port_e = p;
 745
 746       /* Allow empty port, as per rfc2396. */
 747       if (port_b != port_e)
 748         {
 749           for (port = 0, pp = port_b; pp < port_e; pp++)
 750             {
 751               if (!ISDIGIT (*pp))
 752                 {
 753                   /* http://host:12randomgarbage/blah */
 754                   /*               ^                  */
 755                   error_code = PE_BAD_PORT_NUMBER;
 756                   goto error;
 757                 }
 758               port = 10 * port + (*pp - '0');
 759               /* Check for too large port numbers here, before we have
 760                  a chance to overflow on bogus port values.  */
 761               if (port > 65535)
 762                 {
 763                   error_code = PE_BAD_PORT_NUMBER;
 764                   goto error;
 765                 }
 766             }
 767         }
 768     }
 769
 770   if (*p == '/')
 771     {
 772       ++p;
 773       path_b = p;
 774       p = strpbrk_or_eos (p, ";?#");
 775       path_e = p;
 776     }
 777   else
 778     {
 779       /* Path is not allowed not to exist. */
 780       path_b = path_e = p;
 781     }
 782
 783   if (*p == ';')
 784     {
 785       ++p;
 786       params_b = p;
 787       p = strpbrk_or_eos (p, "?#");
 788       params_e = p;
 789     }
 790   if (*p == '?')
 791     {
 792       ++p;
 793       query_b = p;
 794       p = strpbrk_or_eos (p, "#");
 795       query_e = p;
 796
 797       /* Hack that allows users to use '?' (a wildcard character) in
 798          FTP URLs without it being interpreted as a query string
 799          delimiter.  */
 800       if (scheme == SCHEME_FTP)
 801         {
 802           query_b = query_e = NULL;
 803           path_e = p;
 804         }
 805     }
 806   if (*p == '#')
 807     {
 808       ++p;
 809       fragment_b = p;
 810       p += strlen (p);
 811       fragment_e = p;
 812     }
 813   assert (*p == 0);
 814
 815   if (uname_b != uname_e)
 816     {
 817       /* http://user:pass@host */
 818       /*        ^         ^    */
 819       /*     uname_b   uname_e */
 820       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 821         {
 822           error_code = PE_INVALID_USER_NAME;
 823           goto error;
 824         }
 825     }
 826
 827   u = xnew0 (struct url);
 828   u->scheme = scheme;
 829   u->host   = strdupdelim (host_b, host_e);
 830   u->port   = port;
 831   u->user   = user;
 832   u->passwd = passwd;
 833
 834   u->path = strdupdelim (path_b, path_e);
 835   path_modified = path_simplify (u->path);
 836   split_path (u->path, &u->dir, &u->file);
 837
 838   host_modified = lowercase_str (u->host);
 839
 840   /* Decode %HH sequences in host name.  This is important not so much
 841      to support %HH sequences in host names (which other browser
 842      don't), but to support binary characters (which will have been
 843      converted to %HH by reencode_escapes).  */
 844   if (strchr (u->host, '%'))
 845     {
 846       url_unescape (u->host);
 847       host_modified = true;
 848     }
 849
 850   if (params_b)
 851     u->params = strdupdelim (params_b, params_e);
 852   if (query_b)
 853     u->query = strdupdelim (query_b, query_e);
 854   if (fragment_b)
 855     u->fragment = strdupdelim (fragment_b, fragment_e);
 856
 857   if (path_modified || u->fragment || host_modified || path_b == path_e)
 858     {
 859       /* If we suspect that a transformation has rendered what
 860          url_string might return different from URL_ENCODED, rebuild
 861          u->url using url_string.  */
 862       u->url = url_string (u, false);
 863
 864       if (url_encoded != url)
 865         xfree ((char *) url_encoded);
 866     }
 867   else
 868     {
 869       if (url_encoded == url)
 870         u->url = xstrdup (url);
 871       else
 872         u->url = url_encoded;
 873     }
 874
 875   return u;
 876
 877  error:
 878   /* Cleanup in case of error: */
 879   if (url_encoded && url_encoded != url)
 880     xfree (url_encoded);
 881
 882   /* Transmit the error code to the caller, if the caller wants to
 883      know.  */
 884   if (error)
 885     *error = error_code;
 886   return NULL;
 887 }
 888
/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated.

   ERROR_CODE must be a valid index into parse_errors; this is only
   checked with an assert.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}
 898
 899 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 900    expected to be URL-escaped.
 901
 902    The path is split into directory (the part up to the last slash)
 903    and file (the part after the last slash), which are subsequently
 904    unescaped.  Examples:
 905
 906    PATH                 DIR           FILE
 907    "foo/bar/baz"        "foo/bar"     "baz"
 908    "foo/bar/"           "foo/bar"     ""
 909    "foo"                ""            "foo"
 910    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 911
 912    DIR and FILE are freshly allocated.  */
 913
 914 static void
 915 split_path (const char *path, char **dir, char **file)
 916 {
 917   char *last_slash = strrchr (path, '/');
 918   if (!last_slash)
 919     {
 920       *dir = xstrdup ("");
 921       *file = xstrdup (path);
 922     }
 923   else
 924     {
 925       *dir = strdupdelim (path, last_slash);
 926       *file = xstrdup (last_slash + 1);
 927     }
 928   url_unescape (*dir);
 929   url_unescape (*file);
 930 }
 931
 932 /* Note: URL's "full path" is the path with the query string and
 933    params appended.  The "fragment" (#foo) is intentionally ignored,
 934    but that might be changed.  For example, if the original URL was
 935    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 936    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 937
 938 /* Return the length of the full path, without the terminating
 939    zero.  */
 940
 941 static int
 942 full_path_length (const struct url *url)
 943 {
 944   int len = 0;
 945
 946 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 947
 948   FROB (path);
 949   FROB (params);
 950   FROB (query);
 951
 952 #undef FROB
 953
 954   return len;
 955 }
 956
 957 /* Write out the full path. */
 958
 959 static void
 960 full_path_write (const struct url *url, char *where)
 961 {
 962 #define FROB(el, chr) do {                      \
 963   char *f_el = url->el;                         \
 964   if (f_el) {                                   \
 965     int l = strlen (f_el);                      \
 966     *where++ = chr;                             \
 967     memcpy (where, f_el, l);                    \
 968     where += l;                                 \
 969   }                                             \
 970 } while (0)
 971
 972   FROB (path, '/');
 973   FROB (params, ';');
 974   FROB (query, '?');
 975
 976 #undef FROB
 977 }
 978
 979 /* Public function for getting the "full path".  E.g. if u->path is
 980    "foo/bar" and u->query is "param=value", full_path will be
 981    "/foo/bar?param=value". */
 982
 983 char *
 984 url_full_path (const struct url *url)
 985 {
 986   int length = full_path_length (url);
 987   char *full_path = xmalloc (length + 1);
 988
 989   full_path_write (url, full_path);
 990   full_path[length] = '\0';
 991
 992   return full_path;
 993 }
 994
 995 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 996    escaping of certain characters, such as "/" and ":".  Returns a
 997    count of unescaped chars.  */
 998
 999 static void
1000 unescape_single_char (char *str, char chr)
1001 {
1002   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1003   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1004   char *h = str;                /* hare */
1005   char *t = str;                /* tortoise */
1006   for (; *h; h++, t++)
1007     {
1008       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1009         {
1010           *t = chr;
1011           h += 2;
1012         }
1013       else
1014         *t = *h;
1015     }
1016   *t = '\0';
1017 }
1018
1019 /* Escape unsafe and reserved characters, except for the slash
1020    characters.  */
1021
1022 static char *
1023 url_escape_dir (const char *dir)
1024 {
1025   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1026   if (newdir == dir)
1027     return (char *)dir;
1028
1029   unescape_single_char (newdir, '/');
1030   return newdir;
1031 }
1032
1033 /* Sync u->path and u->url with u->dir and u->file.  Called after
1034    u->file or u->dir have been changed, typically by the FTP code.  */
1035
1036 static void
1037 sync_path (struct url *u)
1038 {
1039   char *newpath, *efile, *edir;
1040
1041   xfree (u->path);
1042
1043   /* u->dir and u->file are not escaped.  URL-escape them before
1044      reassembling them into u->path.  That way, if they contain
1045      separators like '?' or even if u->file contains slashes, the
1046      path will be correctly assembled.  (u->file can contain slashes
1047      if the URL specifies it with %2f, or if an FTP server returns
1048      it.)  */
1049   edir = url_escape_dir (u->dir);
1050   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1051
1052   if (!*edir)
1053     newpath = xstrdup (efile);
1054   else
1055     {
1056       int dirlen = strlen (edir);
1057       int filelen = strlen (efile);
1058
1059       /* Copy "DIR/FILE" to newpath. */
1060       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1061       memcpy (p, edir, dirlen);
1062       p += dirlen;
1063       *p++ = '/';
1064       memcpy (p, efile, filelen);
1065       p += filelen;
1066       *p = '\0';
1067     }
1068
1069   u->path = newpath;
1070
1071   if (edir != u->dir)
1072     xfree (edir);
1073   if (efile != u->file)
1074     xfree (efile);
1075
1076   /* Regenerate u->url as well.  */
1077   xfree (u->url);
1078   u->url = url_string (u, false);
1079 }
1080
1081 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1082    This way we can sync u->path and u->url when they get changed.  */
1083
1084 void
1085 url_set_dir (struct url *url, const char *newdir)
1086 {
1087   xfree (url->dir);
1088   url->dir = xstrdup (newdir);
1089   sync_path (url);
1090 }
1091
1092 void
1093 url_set_file (struct url *url, const char *newfile)
1094 {
1095   xfree (url->file);
1096   url->file = xstrdup (newfile);
1097   sync_path (url);
1098 }
1099
1100 void
1101 url_free (struct url *url)
1102 {
1103   xfree (url->host);
1104   xfree (url->path);
1105   xfree (url->url);
1106
1107   xfree_null (url->params);
1108   xfree_null (url->query);
1109   xfree_null (url->fragment);
1110   xfree_null (url->user);
1111   xfree_null (url->passwd);
1112
1113   xfree (url->dir);
1114   xfree (url->file);
1115
1116   xfree (url);
1117 }
1118 \f
1119 /* Create all the necessary directories for PATH (a file).  Calls
1120    make_directory internally.  */
1121 int
1122 mkalldirs (const char *path)
1123 {
1124   const char *p;
1125   char *t;
1126   struct_stat st;
1127   int res;
1128
1129   p = path + strlen (path);
1130   for (; *p != '/' && p != path; p--)
1131     ;
1132
1133   /* Don't create if it's just a file.  */
1134   if ((p == path) && (*p != '/'))
1135     return 0;
1136   t = strdupdelim (path, p);
1137
1138   /* Check whether the directory exists.  */
1139   if ((stat (t, &st) == 0))
1140     {
1141       if (S_ISDIR (st.st_mode))
1142         {
1143           xfree (t);
1144           return 0;
1145         }
1146       else
1147         {
1148           /* If the dir exists as a file name, remove it first.  This
1149              is *only* for Wget to work with buggy old CERN http
1150              servers.  Here is the scenario: When Wget tries to
1151              retrieve a directory without a slash, e.g.
1152              http://foo/bar (bar being a directory), CERN server will
1153              not redirect it too http://foo/bar/ -- it will generate a
1154              directory listing containing links to bar/file1,
1155              bar/file2, etc.  Wget will lose because it saves this
1156              HTML listing to a file `bar', so it cannot create the
1157              directory.  To work around this, if the file of the same
1158              name exists, we just remove it and create the directory
1159              anyway.  */
1160           DEBUGP (("Removing %s because of directory danger!\n", t));
1161           unlink (t);
1162         }
1163     }
1164   res = make_directory (t);
1165   if (res != 0)
1166     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1167   xfree (t);
1168   return res;
1169 }
1170 \f
1171 /* Functions for constructing the file name out of URL components.  */
1172
/* A growable string, used by url_file_name and friends.  This should
   perhaps be moved to utils.c.

   It offers a convenient and efficient way to build a string out of
   pieces appended by several functions.  Rather than handing the
   buffer, its allocated size and the current write position to each
   of those functions separately, a pointer to this one structure is
   passed around.  */

struct growable
{
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, as kept by GROW */
  int tail;                     /* offset in BASE where appending continues */
};
1186
/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size (both through DO_REALLOC).  If the
   string is already large enough to take TAIL+APPEND_COUNT
   characters, this does nothing.

   Both arguments are parenthesized in the expansion, so expressions
   such as `cond ? a : b' are safe to pass.  G is evaluated once.  */
#define GROW(g, append_count) do {                                      \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_count), char);     \
} while (0)

/* Return the tail position of the string, i.e. the address at which
   the next character is to be stored.  R is evaluated twice.  */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1201
1202 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1203    terminated.  */
1204
1205 static void
1206 append_string (const char *str, struct growable *dest)
1207 {
1208   int l = strlen (str);
1209   GROW (dest, l);
1210   memcpy (TAIL (dest), str, l);
1211   TAIL_INCR (dest, l);
1212 }
1213
1214 /* Append CH to DEST.  For example, append_char (0, DEST)
1215    zero-terminates DEST.  */
1216
1217 static void
1218 append_char (char ch, struct growable *dest)
1219 {
1220   GROW (dest, 1);
1221   *TAIL (dest) = ch;
1222   TAIL_INCR (dest, 1);
1223 }
1224
/* Reasons for which a character cannot be used verbatim in a file
   name.  The values are bit flags and may be OR-ed together.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set.  C is
   converted to unsigned char, so plain (possibly negative) chars may
   be passed.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is flagged W to agree with the list of characters given for
   filechr_not_windows, and DEL is flagged C because it is a control
   character just like 0-31.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1282
/* FN_PORT_SEP separates the host from a non-standard port number in
   file names.  Windows cannot handle ':' in file names, so '+' is
   used when file names are restricted for Windows; everywhere else
   the separator is ':', as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  Windows
   cannot handle '?' as part of a file name, so '@' is used when file
   names are restricted for Windows; everywhere else the separator is
   '?'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1293
1294 /* Quote path element, characters in [b, e), as file name, and append
1295    the quoted string to DEST.  Each character is quoted as per
1296    file_unsafe_char and the corresponding table.
1297
1298    If ESCAPED is true, the path element is considered to be
1299    URL-escaped and will be unescaped prior to inspection.  */
1300
1301 static void
1302 append_uri_pathel (const char *b, const char *e, bool escaped,
1303                    struct growable *dest)
1304 {
1305   const char *p;
1306   int quoted, outlen;
1307
1308   int mask;
1309   if (opt.restrict_files_os == restrict_unix)
1310     mask = filechr_not_unix;
1311   else
1312     mask = filechr_not_windows;
1313   if (opt.restrict_files_ctrl)
1314     mask |= filechr_control;
1315
1316   /* Copy [b, e) to PATHEL and URL-unescape it. */
1317   if (escaped)
1318     {
1319       char *unescaped;
1320       BOUNDED_TO_ALLOCA (b, e, unescaped);
1321       url_unescape (unescaped);
1322       b = unescaped;
1323       e = unescaped + strlen (unescaped);
1324     }
1325
1326   /* Defang ".." when found as component of path.  Remember that path
1327      comes from the URL and might contain malicious input.  */
1328   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1329     {
1330       b = "%2E%2E";
1331       e = b + 6;
1332     }
1333
1334   /* Walk the PATHEL string and check how many characters we'll need
1335      to quote.  */
1336   quoted = 0;
1337   for (p = b; p < e; p++)
1338     if (FILE_CHAR_TEST (*p, mask))
1339       ++quoted;
1340
1341   /* Calculate the length of the output string.  e-b is the input
1342      string length.  Each quoted char introduces two additional
1343      characters in the string, hence 2*quoted.  */
1344   outlen = (e - b) + (2 * quoted);
1345   GROW (dest, outlen);
1346
1347   if (!quoted)
1348     {
1349       /* If there's nothing to quote, we can simply append the string
1350          without processing it again.  */
1351       memcpy (TAIL (dest), b, outlen);
1352     }
1353   else
1354     {
1355       char *q = TAIL (dest);
1356       for (p = b; p < e; p++)
1357         {
1358           if (!FILE_CHAR_TEST (*p, mask))
1359             *q++ = *p;
1360           else
1361             {
1362               unsigned char ch = *p;
1363               *q++ = '%';
1364               *q++ = XNUM_TO_DIGIT (ch >> 4);
1365               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1366             }
1367         }
1368       assert (q - TAIL (dest) == outlen);
1369     }
1370   TAIL_INCR (dest, outlen);
1371 }
1372
1373 /* Append to DEST the directory structure that corresponds the
1374    directory part of URL's path.  For example, if the URL is
1375    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1376
1377    Each path element ("dir1" and "dir2" in the above example) is
1378    examined, url-unescaped, and re-escaped as file name element.
1379
1380    Additionally, it cuts as many directories from the path as
1381    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1382    will produce "bar" for the above example.  For 2 or more, it will
1383    produce "".
1384
1385    Each component of the path is quoted for use as file name.  */
1386
1387 static void
1388 append_dir_structure (const struct url *u, struct growable *dest)
1389 {
1390   char *pathel, *next;
1391   int cut = opt.cut_dirs;
1392
1393   /* Go through the path components, de-URL-quote them, and quote them
1394      (if necessary) as file names.  */
1395
1396   pathel = u->path;
1397   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1398     {
1399       if (cut-- > 0)
1400         continue;
1401       if (pathel == next)
1402         /* Ignore empty pathels.  */
1403         continue;
1404
1405       if (dest->tail)
1406         append_char ('/', dest);
1407       append_uri_pathel (pathel, next, true, dest);
1408     }
1409 }
1410
1411 /* Return a unique file name that matches the given URL as good as
1412    possible.  Does not create directories on the file system.  */
1413
1414 char *
1415 url_file_name (const struct url *u)
1416 {
1417   struct growable fnres;        /* stands for "file name result" */
1418
1419   const char *u_file, *u_query;
1420   char *fname, *unique;
1421
1422   fnres.base = NULL;
1423   fnres.size = 0;
1424   fnres.tail = 0;
1425
1426   /* Start with the directory prefix, if specified. */
1427   if (opt.dir_prefix)
1428     append_string (opt.dir_prefix, &fnres);
1429
1430   /* If "dirstruct" is turned on (typically the case with -r), add
1431      the host and port (unless those have been turned off) and
1432      directory structure.  */
1433   if (opt.dirstruct)
1434     {
1435       if (opt.protocol_directories)
1436         {
1437           if (fnres.tail)
1438             append_char ('/', &fnres);
1439           append_string (supported_schemes[u->scheme].name, &fnres);
1440         }
1441       if (opt.add_hostdir)
1442         {
1443           if (fnres.tail)
1444             append_char ('/', &fnres);
1445           if (0 != strcmp (u->host, ".."))
1446             append_string (u->host, &fnres);
1447           else
1448             /* Host name can come from the network; malicious DNS may
1449                allow ".." to be resolved, causing us to write to
1450                "../<file>".  Defang such host names.  */
1451             append_string ("%2E%2E", &fnres);
1452           if (u->port != scheme_default_port (u->scheme))
1453             {
1454               char portstr[24];
1455               number_to_string (portstr, u->port);
1456               append_char (FN_PORT_SEP, &fnres);
1457               append_string (portstr, &fnres);
1458             }
1459         }
1460
1461       append_dir_structure (u, &fnres);
1462     }
1463
1464   /* Add the file name. */
1465   if (fnres.tail)
1466     append_char ('/', &fnres);
1467   u_file = *u->file ? u->file : "index.html";
1468   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1469
1470   /* Append "?query" to the file name. */
1471   u_query = u->query && *u->query ? u->query : NULL;
1472   if (u_query)
1473     {
1474       append_char (FN_QUERY_SEP, &fnres);
1475       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1476     }
1477
1478   /* Zero-terminate the file name. */
1479   append_char ('\0', &fnres);
1480
1481   fname = fnres.base;
1482
1483   /* Check the cases in which the unique extensions are not used:
1484      1) Clobbering is turned off (-nc).
1485      2) Retrieval with regetting.
1486      3) Timestamping is used.
1487      4) Hierarchy is built.
1488
1489      The exception is the case when file does exist and is a
1490      directory (see `mkalldirs' for explanation).  */
1491
1492   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1493       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1494     return fname;
1495
1496   unique = unique_name (fname, true);
1497   if (unique != fname)
1498     xfree (fname);
1499   return unique;
1500 }
1501 \f
/* Resolve the "." and ".." elements of PATH.  PATH is modified in
   place; the return value is true if it was changed and false if it
   was left as it was.

   In spirit this follows the algorithm described in rfc1808, but it
   is implemented differently, in a single pass.  Elements consisting
   only of "." are removed, and ".." means "back up one element".
   Single leading and trailing slashes are preserved.  A ".." that
   has nothing to back up over is kept literally.

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.  */

static bool
path_simplify (char *path)
{
  char *src = path;             /* read position (the hare) */
  char *dst = path;             /* write position (the tortoise) */
  char *barrier = path;         /* DST never backs up past this */
  char *const limit = path + strlen (path);

  while (src < limit)
    {
      /* SRC is at the beginning of a path element here.  */
      bool dot = src[0] == '.' && (src[1] == '/' || src[1] == '\0');
      bool dotdot = (!dot && src[0] == '.' && src[1] == '.'
                     && (src[2] == '/' || src[2] == '\0'));

      if (dot)
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (dotdot)
        {
          if (dst > barrier)
            {
              /* Undo the previously written element: step back over
                 its trailing slash, then up to the slash before it
                 or to the barrier.  */
              do
                --dst;
              while (dst > barrier && dst[-1] != '/');
              src += 3;
              continue;
            }

          /* Nothing to back up over.  Keep the "../" literally, as an
             ordinary element below, and move the barrier behind it so
             that a later ".." cannot remove it.  */
          barrier = dst + 3;
        }

      /* An ordinary element: advance up to and including the next
         slash.  As long as nothing has been dropped the characters
         are already in place and only the positions move; otherwise
         the element is copied down.  */
      if (dst == src)
        {
          while (src < limit && *src != '/')
            ++src;
          if (src < limit)
            ++src;
          dst = src;
        }
      else
        {
          while (src < limit && *src != '/')
            *dst++ = *src++;
          if (src < limit)
            *dst++ = *src++;
        }
    }

  /* If both positions agree, nothing was ever dropped.  */
  if (dst == src)
    return false;

  *dst = '\0';
  return true;
}
1584 \f
1585 /* Return the length of URL's path.  Path is considered to be
1586    terminated by one of '?', ';', '#', or by the end of the
1587    string.  */
1588
1589 static int
1590 path_length (const char *url)
1591 {
1592   const char *q = strpbrk_or_eos (url, "?;#");
1593   return q - url;
1594 }
1595
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  We might want to use memrchr (a GNU
   extension) under GNU libc.

   The range is half-open: *B is examined, *E never is.  An empty
   range (E <= B) yields NULL.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back first and look second, so that the characters visited
     are e[-1] down to b[0].  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1608
/* Return a freshly allocated, zero-terminated string consisting of
   the first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  Helper of uri_merge.  */

static char *
merge_prefix_and_link (const char *base, int span,
                       const char *link, int linklength)
{
  char *merged = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merged, base, span);
  memcpy (merged + span, link, linklength);
  merged[span + linklength] = '\0';
  return merged;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  Only minimal URL parsing is employed, without
   knowledge of the specifics of schemes.

   path_simplify is deliberately not called on the result, although
   rfc1738 seems to suggest it: that would complicate the code, and
   url_parse has to simplify the path anyway, so it would be wasted
   work.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK that carries its own scheme stands on its own.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* END marks where the path of BASE stops (at '?', ';', '#' or the
     end of the string).  BASE may not be examined past END, except
     where noted.  */
  end = base + path_length (base);
  linklength = strlen (link);

  switch (*link)
    {
    case '\0':
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);

    case '?':
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return merge_prefix_and_link (base, end - base, link, linklength);

    case '#':
      {
        /* LINK only changes the fragment; everything in BASE up to
           its own fragment, the query included, is kept.  */
        /* uri_merge("path",         "#new") -> "path#new"     */
        /* uri_merge("path#foo",     "#new") -> "path#new"     */
        /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
        /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
        const char *fragment = strchr (base, '#');
        int keep = fragment ? fragment - base : (int) strlen (base);

        return merge_prefix_and_link (base, keep, link, linklength);
      }

    case '/':
      if (link[1] == '/')
        {
          /* LINK begins with "//" and so is a net path: everything
             in BASE from its double slash on is replaced by LINK. */

          /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
          /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
          /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

          /* Only the first slash of BASE counts: if it starts a
             double slash, LINK goes there, otherwise LINK replaces
             BASE altogether.  */
          const char *slash = memchr (base, '/', end - base);
          const char *start_insert
            = (slash && slash[1] == '/') ? slash : base;

          return merge_prefix_and_link (base, start_insert - base,
                                        link, linklength);
        }
      else
        {
          /* LINK is an absolute path: everything in BASE from the
             FIRST slash of its path on is replaced by LINK.

             So, if BASE is "http://host/whatever/foo/bar", and LINK
             is "/qux/xyzzy", the result should be
             "http://host/qux/xyzzy".  */
          const char *start_insert;
          const char *slash = memchr (base, '/', end - base);
          bool seen_slash_slash = false;

          /* The first slash does not count if it is part of a double
             slash; in that case look for the next one after it.  */
          if (slash && slash[1] == '/')
            {
              const char *pos = slash + 2;
              seen_slash_slash = true;
              slash = memchr (pos, '/', end - pos);
            }

          /* Keep in mind that LINK itself begins with '/'.  */
          if (!seen_slash_slash)
            /* examples: "foo", "foo/bar" -- LINK replaces all of it */
            start_insert = base;
          else if (slash)
            /* example: "http://something/" -- at the slash after host */
            start_insert = slash;
          else
            /* example: "http://foo" -- right after the host */
            start_insert = end;

          return merge_prefix_and_link (base, start_insert - base,
                                        link, linklength);
        }

    default:
      {
        /* LINK is a relative URL: whatever follows the last slash of
           BASE (possibly nothing) is replaced by LINK.

           So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
           the result should be "whatever/foo/qux/xyzzy".  */
        const char *last_slash = find_last_char (base, end, '/');
        const char *start_insert;
        bool need_explicit_slash = false;
        char *merged;
        int span;

        if (!last_slash)
          {
            /* No slash found at all.  Replace what we have with LINK. */
            start_insert = base;
          }
        else if (last_slash >= base + 2
                 && last_slash[-2] == ':' && last_slash[-1] == '/')
          {
            /* example: "http://host" -- the last slash belongs to
               "://", so BASE has no path.  Keep all of it and leave
               room for one more character, which is overwritten
               with a slash below.  */
            start_insert = end + 1;
            need_explicit_slash = true;
          }
        else
          {
            /* example: "whatever/foo/bar" -- insert after "foo/" */
            start_insert = last_slash + 1;
          }

        span = start_insert - base;
        merged = merge_prefix_and_link (base, span, link, linklength);
        if (need_explicit_slash)
          /* The byte copied from *END is not wanted; put the slash
             between host and LINK there.  */
          merged[span - 1] = '/';
        return merged;
      }
    }
}
1799 \f
/* Copy the string S (without its terminating zero) to where P points
   and advance P past the copied characters.  P must be a modifiable
   pointer to char with enough room behind it; the result is not
   zero-terminated.

   S is evaluated once.  The temporaries carry a trailing underscore
   so that an argument that happens to be called `len' or the like is
   not captured by the expansion.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  int append_len_ = strlen (append_src_);       \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1811
1812 /* Recreate the URL string from the data in URL.
1813
1814    If HIDE is true (as it is when we're calling this on a URL we plan
1815    to print, but not when calling it to canonicalize a URL for use
1816    within the program), password will be hidden.  Unsafe characters in
1817    the URL will be quoted.  */
1818
1819 char *
1820 url_string (const struct url *url, bool hide_password)
1821 {
1822   int size;
1823   char *result, *p;
1824   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1825
1826   int scheme_port = supported_schemes[url->scheme].default_port;
1827   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1828   int fplen = full_path_length (url);
1829
1830   bool brackets_around_host;
1831
1832   assert (scheme_str != NULL);
1833
1834   /* Make sure the user name and password are quoted. */
1835   if (url->user)
1836     {
1837       quoted_user = url_escape_allow_passthrough (url->user);
1838       if (url->passwd)
1839         {
1840           if (hide_password)
1841             quoted_passwd = HIDDEN_PASSWORD;
1842           else
1843             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1844         }
1845     }
1846
1847   /* In the unlikely event that the host name contains non-printable
1848      characters, quote it for displaying to the user.  */
1849   quoted_host = url_escape_allow_passthrough (url->host);
1850
1851   /* Undo the quoting of colons that URL escaping performs.  IPv6
1852      addresses may legally contain colons, and in that case must be
1853      placed in square brackets.  */
1854   if (quoted_host != url->host)
1855     unescape_single_char (quoted_host, ':');
1856   brackets_around_host = strchr (quoted_host, ':') != NULL;
1857
1858   size = (strlen (scheme_str)
1859           + strlen (quoted_host)
1860           + (brackets_around_host ? 2 : 0)
1861           + fplen
1862           + 1);
1863   if (url->port != scheme_port)
1864     size += 1 + numdigit (url->port);
1865   if (quoted_user)
1866     {
1867       size += 1 + strlen (quoted_user);
1868       if (quoted_passwd)
1869         size += 1 + strlen (quoted_passwd);
1870     }
1871
1872   p = result = xmalloc (size);
1873
1874   APPEND (p, scheme_str);
1875   if (quoted_user)
1876     {
1877       APPEND (p, quoted_user);
1878       if (quoted_passwd)
1879         {
1880           *p++ = ':';
1881           APPEND (p, quoted_passwd);
1882         }
1883       *p++ = '@';
1884     }
1885
1886   if (brackets_around_host)
1887     *p++ = '[';
1888   APPEND (p, quoted_host);
1889   if (brackets_around_host)
1890     *p++ = ']';
1891   if (url->port != scheme_port)
1892     {
1893       *p++ = ':';
1894       p = number_to_string (p, url->port);
1895     }
1896
1897   full_path_write (url, p);
1898   p += fplen;
1899   *p++ = '\0';
1900
1901   assert (p - result == size);
1902
1903   if (quoted_user && quoted_user != url->user)
1904     xfree (quoted_user);
1905   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1906     xfree (quoted_passwd);
1907   if (quoted_host != url->host)
1908     xfree (quoted_host);
1909
1910   return result;
1911 }
1912 \f
1913 /* Return true if scheme a is similar to scheme b.
1914
1915    Schemes are similar if they are equal.  If SSL is supported, schemes
1916    are also similar if one is http (SCHEME_HTTP) and the other is https
1917    (SCHEME_HTTPS).  */
1918 bool
1919 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1920 {
1921   if (a == b)
1922     return true;
1923 #ifdef HAVE_SSL
1924   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1925       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1926     return true;
1927 #endif
1928   return false;
1929 }
1930 \f
1931 #if 0
1932 /* Debugging and testing support for path_simplify. */
1933
/* Debugger convenience: duplicate PATH with xstrdup, apply
   path_simplify to the duplicate and return it.  PATH itself is left
   untouched, which makes this handy to call by hand from a debugger.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);

  return simplified;
}
1943
/* Check one path_simplify case.

   A copy of TEST is simplified.  A diagnostic is printed to stdout if
   the outcome differs from EXPECTED_RESULT, and another one if the
   value returned by path_simplify disagrees with EXPECTED_CHANGE.
   Nothing is printed when the case passes.  */
static void
run_test (char *test, char *expected_result, bool expected_change)
{
  char *work = xstrdup (test);
  bool changed = path_simplify (work);
  bool flag_mismatch = (changed != expected_change);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  /* Report which way the "was it modified" flag was wrong.  */
  if (flag_mismatch && expected_change)
    printf ("Expected modification with path_simplify(\"%s\").\n",
            test);
  else if (flag_mismatch)
    printf ("Expected no modification with path_simplify(\"%s\").\n",
            test);

  xfree (work);
}
1966
/* Feed a fixed table of paths to run_test.  Each entry gives the
   input, the string expected after path_simplify, and whether
   path_simplify is expected to report that it changed the input.  */
static void
test_path_simplify (void)
{
  static struct simplify_case {
    char *test, *result;
    bool should_modify;
  } tests[] = {
    { "",                       "",             false },
    { ".",                      "",             true },
    { "./",                     "",             true },
    { "..",                     "..",           false },
    { "../",                    "../",          false },
    { "foo",                    "foo",          false },
    { "foo/bar",                "foo/bar",      false },
    { "foo///bar",              "foo///bar",    false },
    { "foo/.",                  "foo/",         true },
    { "foo/./",                 "foo/",         true },
    { "foo./",                  "foo./",        false },
    { "foo/../bar",             "bar",          true },
    { "foo/../bar/",            "bar/",         true },
    { "foo/bar/..",             "foo/",         true },
    { "foo/bar/../x",           "foo/x",        true },
    { "foo/bar/../x/",          "foo/x/",       true },
    { "foo/..",                 "",             true },
    { "foo/../..",              "..",           true },
    { "foo/../../..",           "../..",        true },
    { "foo/../../bar/../../baz", "../../baz",   true },
    { "a/b/../../c",            "c",            true },
    { "./a/../b",               "b",            true }
  };
  struct simplify_case *tc;
  struct simplify_case *end = tests + countof (tests);

  /* Walk the table in order, one run_test call per entry.  */
  for (tc = tests; tc < end; tc++)
    run_test (tc->test, tc->result, tc->should_modify);
}
2007 #endif