sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
  55 struct scheme_data
  56 {
  57   const char *name;
  58   const char *leading_string;
  59   int default_port;
  60   int enabled;
  61 };
  62
  63 /* Supported schemes: */
  64 static struct scheme_data supported_schemes[] =
  65 {
  66   { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
  67 #ifdef HAVE_SSL
  68   { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
  69 #endif
  70   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },
  71
  72   /* SCHEME_INVALID */
  73   { NULL,       NULL,       -1,                 0 }
  74 };
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  91    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  92    as recommended by rfc2396, and minus "~", which is very frequently
  93    used (and sometimes unrecognized as %7E by broken servers).
  94
  95    An unsafe character is the one that should be encoded when URLs are
  96    placed in foreign environments.  E.g. space and newline are unsafe
  97    in HTTP contexts because HTTP uses them as separator and line
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
 104 enum {
 105   /* rfc1738 reserved chars + "$" and ",".  */
 106   urlchr_reserved = 1,
 107
 108   /* rfc1738 unsafe chars, plus non-printables.  */
 109   urlchr_unsafe   = 2
 110 };
 111
 112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
 116 /* Shorthands for the table: */
 117 #define R  urlchr_reserved
 118 #define U  urlchr_unsafe
 119 #define RU R|U
 120
 121 static const unsigned char urlchr_table[256] =
 122 {
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 127   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 128   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 129   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 130   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 131  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 133   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 134   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 135   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 138   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149 };
 150 #undef R
 151 #undef U
 152 #undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           char c;
 179           /* Do nothing if '%' is not followed by two hex digits. */
 180           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 181             goto copychar;
 182           c = X2DIGITS_TO_NUM (h[1], h[2]);
 183           /* Don't unescape %00 because there is no way to insert it
 184              into a C string without effectively truncating it. */
 185           if (c == '\0')
 186             goto copychar;
 187           *t = c;
 188           h += 2;
 189         }
 190     }
 191   *t = '\0';
 192 }
 193
 194 /* The core of url_escape_* functions.  Escapes the characters that
 195    match the provided mask in urlchr_table.
 196
 197    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 198    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 199    freshly allocated string will be returned in all cases.  */
 200
 201 static char *
 202 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 203 {
 204   const char *p1;
 205   char *p2, *newstr;
 206   int newlen;
 207   int addition = 0;
 208
 209   for (p1 = s; *p1; p1++)
 210     if (urlchr_test (*p1, mask))
 211       addition += 2;            /* Two more characters (hex digits) */
 212
 213   if (!addition)
 214     return allow_passthrough ? (char *)s : xstrdup (s);
 215
 216   newlen = (p1 - s) + addition;
 217   newstr = (char *)xmalloc (newlen + 1);
 218
 219   p1 = s;
 220   p2 = newstr;
 221   while (*p1)
 222     {
 223       /* Quote the characters that match the test mask. */
 224       if (urlchr_test (*p1, mask))
 225         {
 226           unsigned char c = *p1++;
 227           *p2++ = '%';
 228           *p2++ = XNUM_TO_DIGIT (c >> 4);
 229           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 230         }
 231       else
 232         *p2++ = *p1++;
 233     }
 234   assert (p2 - newstr == newlen);
 235   *p2 = '\0';
 236
 237   return newstr;
 238 }
 239
 240 /* URL-escape the unsafe characters (see urlchr_table) in a given
 241    string, returning a freshly allocated string.  */
 242
 243 char *
 244 url_escape (const char *s)
 245 {
 246   return url_escape_1 (s, urlchr_unsafe, 0);
 247 }
 248
 249 /* URL-escape the unsafe characters (see urlchr_table) in a given
 250    string.  If no characters are unsafe, S is returned.  */
 251
 252 static char *
 253 url_escape_allow_passthrough (const char *s)
 254 {
 255   return url_escape_1 (s, urlchr_unsafe, 1);
 256 }
 257 \f
 258 /* Decide whether the char at position P needs to be encoded.  (It is
 259    not enough to pass a single char *P because the function may need
 260    to inspect the surrounding context.)
 261
 262    Return 1 if the char should be escaped as %XX, 0 otherwise.  */
 263
 264 static inline int
 265 char_needs_escaping (const char *p)
 266 {
 267   if (*p == '%')
 268     {
 269       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 270         return 0;
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return 1;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return 1;
 277   else
 278     return 0;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't can use url_unescape to get to the intended
 289    data.  This function is stable: once the input is transformed,
 290    further transformations of the result yield the same output.
 291
 292    Let's discuss why this function is needed.
 293
 294    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 295    a raw space character would mess up the HTTP request, it needs to
 296    be quoted, like this:
 297
 298        GET /abc%20def HTTP/1.0
 299
 300    It would appear that the unsafe chars need to be quoted, for
 301    example with url_escape.  But what if we're requested to download
 302    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 303    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 304    part of URL syntax, "%20" is the correct way to denote a literal
 305    space on the Wget command line.  This leads to the conclusion that
 306    in that case Wget should not call url_escape, but leave the `%20'
 307    as is.  This is clearly contradictory, but it only gets worse.
 308
 309    What if the requested URI is `abc%20 def'?  If we call url_escape,
 310    we end up with `/abc%2520%20def', which is almost certainly not
 311    intended.  If we don't call url_escape, we are left with the
 312    embedded space and cannot complete the request.  What the user
 313    meant was for Wget to request `/abc%20%20def', and this is where
 314    reencode_escapes kicks in.
 315
 316    Wget used to solve this by first decoding %-quotes, and then
 317    encoding all the "unsafe" characters found in the resulting string.
 318    This was wrong because it didn't preserve certain URL special
 319    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 320    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 321    whether we considered `+' reserved (it is).  One of these results
 322    is inevitable because by the second step we would lose information
 323    on whether the `+' was originally encoded or not.  Both results
 324    were wrong because in CGI parameters + means space, while %2B means
 325    literal plus.  reencode_escapes correctly translates the above to
 326    "a%2B+b", i.e. returns the original string.
 327
 328    This function uses a modified version of the algorithm originally
 329    proposed by Anon Sricharoenchai:
 330
 331    * Encode all "unsafe" characters, except those that are also
 332      "reserved", to %XX.  See urlchr_table for which characters are
 333      unsafe and reserved.
 334
 335    * Encode the "%" characters not followed by two hex digits to
 336      "%25".
 337
 338    * Pass through all other characters and %XX escapes as-is.  (Up to
 339      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 340      characters, but that was obtrusive and broke some servers.)
 341
 342    Anon's test case:
 343
 344    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 345    ->
 346    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 347
 348    Simpler test cases:
 349
 350    "foo bar"         -> "foo%20bar"
 351    "foo%20bar"       -> "foo%20bar"
 352    "foo %20bar"      -> "foo%20%20bar"
 353    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 354    "foo%25%20bar"    -> "foo%25%20bar"
 355    "foo%2%20bar"     -> "foo%252%20bar"
 356    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 357    "foo%2b+bar"      -> "foo%2b+bar"  */
 358
 359 static char *
 360 reencode_escapes (const char *s)
 361 {
 362   const char *p1;
 363   char *newstr, *p2;
 364   int oldlen, newlen;
 365
 366   int encode_count = 0;
 367
 368   /* First pass: inspect the string to see if there's anything to do,
 369      and to calculate the new length.  */
 370   for (p1 = s; *p1; p1++)
 371     if (char_needs_escaping (p1))
 372       ++encode_count;
 373
 374   if (!encode_count)
 375     /* The string is good as it is. */
 376     return (char *) s;          /* C const model sucks. */
 377
 378   oldlen = p1 - s;
 379   /* Each encoding adds two characters (hex digits).  */
 380   newlen = oldlen + 2 * encode_count;
 381   newstr = xmalloc (newlen + 1);
 382
 383   /* Second pass: copy the string to the destination address, encoding
 384      chars when needed.  */
 385   p1 = s;
 386   p2 = newstr;
 387
 388   while (*p1)
 389     if (char_needs_escaping (p1))
 390       {
 391         unsigned char c = *p1++;
 392         *p2++ = '%';
 393         *p2++ = XNUM_TO_DIGIT (c >> 4);
 394         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 395       }
 396     else
 397       *p2++ = *p1++;
 398
 399   *p2 = '\0';
 400   assert (p2 - newstr == newlen);
 401   return newstr;
 402 }
 403 \f
 404 /* Returns the scheme type if the scheme is supported, or
 405    SCHEME_INVALID if not.  */
 406
 407 enum url_scheme
 408 url_scheme (const char *url)
 409 {
 410   int i;
 411
 412   for (i = 0; supported_schemes[i].leading_string; i++)
 413     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 414                           strlen (supported_schemes[i].leading_string)))
 415       {
 416         if (supported_schemes[i].enabled)
 417           return (enum url_scheme) i;
 418         else
 419           return SCHEME_INVALID;
 420       }
 421
 422   return SCHEME_INVALID;
 423 }
 424
 425 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 426
 427 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 428    currently implemented, it returns true if URL begins with
 429    [-+a-zA-Z0-9]+: .  */
 430
 431 int
 432 url_has_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* The first char must be a scheme char. */
 437   if (!*p || !SCHEME_CHAR (*p))
 438     return 0;
 439   ++p;
 440   /* Followed by 0 or more scheme chars. */
 441   while (*p && SCHEME_CHAR (*p))
 442     ++p;
 443   /* Terminated by ':'. */
 444   return *p == ':';
 445 }
 446
 447 int
 448 scheme_default_port (enum url_scheme scheme)
 449 {
 450   return supported_schemes[scheme].default_port;
 451 }
 452
 453 void
 454 scheme_disable (enum url_scheme scheme)
 455 {
 456   supported_schemes[scheme].enabled = 0;
 457 }
 458
 459 /* Skip the username and password, if present in the URL.  The
 460    function should *not* be called with the complete URL, but with the
 461    portion after the scheme.
 462
 463    If no username and password are found, return URL.  */
 464
 465 static const char *
 466 url_skip_credentials (const char *url)
 467 {
 468   /* Look for '@' that comes before terminators, such as '/', '?',
 469      '#', or ';'.  */
 470   const char *p = (const char *)strpbrk (url, "@/?#;");
 471   if (!p || *p != '@')
 472     return url;
 473   return p + 1;
 474 }
 475
 476 /* Parse credentials contained in [BEG, END).  The region is expected
 477    to have come from a URL and is unescaped.  */
 478
 479 static int
 480 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 481 {
 482   char *colon;
 483   const char *userend;
 484
 485   if (beg == end)
 486     return 0;                   /* empty user name */
 487
 488   colon = memchr (beg, ':', end - beg);
 489   if (colon == beg)
 490     return 0;                   /* again empty user name */
 491
 492   if (colon)
 493     {
 494       *passwd = strdupdelim (colon + 1, end);
 495       userend = colon;
 496       url_unescape (*passwd);
 497     }
 498   else
 499     {
 500       *passwd = NULL;
 501       userend = end;
 502     }
 503   *user = strdupdelim (beg, userend);
 504   url_unescape (*user);
 505   return 1;
 506 }
 507
 508 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 509    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 510
 511    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 512    www.foo.com[:port]            -> http://www.foo.com[:port]
 513
 514    FTP shorthands look like this:
 515
 516    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 517    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 518
 519    If the URL needs not or cannot be rewritten, return NULL.  */
 520
 521 char *
 522 rewrite_shorthand_url (const char *url)
 523 {
 524   const char *p;
 525
 526   if (url_scheme (url) != SCHEME_INVALID)
 527     return NULL;
 528
 529   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 530      latter Netscape.  */
 531   for (p = url; *p && *p != ':' && *p != '/'; p++)
 532     ;
 533
 534   if (p == url)
 535     return NULL;
 536
 537   /* If we're looking at "://", it means the URL uses a scheme we
 538      don't support, which may include "https" when compiled without
 539      SSL support.  Don't bogusly rewrite such URLs.  */
 540   if (p[0] == ':' && p[1] == '/' && p[2] == '/')
 541     return NULL;
 542
 543   if (*p == ':')
 544     {
 545       const char *pp;
 546       char *res;
 547       /* If the characters after the colon and before the next slash
 548          or end of string are all digits, it's HTTP.  */
 549       int digits = 0;
 550       for (pp = p + 1; ISDIGIT (*pp); pp++)
 551         ++digits;
 552       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 553         goto http;
 554
 555       /* Prepend "ftp://" to the entire URL... */
 556       res = xmalloc (6 + strlen (url) + 1);
 557       sprintf (res, "ftp://%s", url);
 558       /* ...and replace ':' with '/'. */
 559       res[6 + (p - url)] = '/';
 560       return res;
 561     }
 562   else
 563     {
 564       char *res;
 565     http:
 566       /* Just prepend "http://" to what we have. */
 567       res = xmalloc (7 + strlen (url) + 1);
 568       sprintf (res, "http://%s", url);
 569       return res;
 570     }
 571 }
 572 \f
 573 static void split_path PARAMS ((const char *, char **, char **));
 574
 575 /* Like strpbrk, with the exception that it returns the pointer to the
 576    terminating zero (end-of-string aka "eos") if no matching character
 577    is found.
 578
 579    Although I normally balk at Gcc-specific optimizations, it probably
 580    makes sense here: glibc has optimizations that detect strpbrk being
 581    called with literal string as ACCEPT and inline the search.  That
 582    optimization is defeated if strpbrk is hidden within the call to
 583    another function.  (And no, making strpbrk_or_eos inline doesn't
 584    help because the check for literal accept is in the
 585    preprocessor.)  */
 586
 587 #ifdef __GNUC__
 588
 589 #define strpbrk_or_eos(s, accept) ({            \
 590   char *SOE_p = strpbrk (s, accept);            \
 591   if (!SOE_p)                                   \
 592     SOE_p = strchr (s, '\0');                   \
 593   SOE_p;                                        \
 594 })
 595
 596 #else  /* not __GNUC__ */
 597
 598 static inline char *
 599 strpbrk_or_eos (const char *s, const char *accept)
 600 {
 601   char *p = strpbrk (s, accept);
 602   if (!p)
 603     p = strchr (s, '\0');
 604   return p;
 605 }
 606 #endif /* not __GNUC__ */
 607
 608 /* Turn STR into lowercase; return non-zero if a character was
 609    actually changed. */
 610
 611 static int
 612 lowercase_str (char *str)
 613 {
 614   int change = 0;
 615   for (; *str; str++)
 616     if (ISUPPER (*str))
 617       {
 618         change = 1;
 619         *str = TOLOWER (*str);
 620       }
 621   return change;
 622 }
 623
 624 static const char *parse_errors[] = {
 625 #define PE_NO_ERROR                     0
 626   N_("No error"),
 627 #define PE_UNSUPPORTED_SCHEME           1
 628   N_("Unsupported scheme"),
 629 #define PE_EMPTY_HOST                   2
 630   N_("Empty host"),
 631 #define PE_BAD_PORT_NUMBER              3
 632   N_("Bad port number"),
 633 #define PE_INVALID_USER_NAME            4
 634   N_("Invalid user name"),
 635 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 636   N_("Unterminated IPv6 numeric address"),
 637 #define PE_IPV6_NOT_SUPPORTED           6
 638   N_("IPv6 addresses not supported"),
 639 #define PE_INVALID_IPV6_ADDRESS         7
 640   N_("Invalid IPv6 numeric address")
 641 };
 642
 643 /* Parse a URL.
 644
 645    Return a new struct url if successful, NULL on error.  In case of
 646    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 647    error code. */
 648 struct url *
 649 url_parse (const char *url, int *error)
 650 {
 651   struct url *u;
 652   const char *p;
 653   int path_modified, host_modified;
 654
 655   enum url_scheme scheme;
 656
 657   const char *uname_b,     *uname_e;
 658   const char *host_b,      *host_e;
 659   const char *path_b,      *path_e;
 660   const char *params_b,    *params_e;
 661   const char *query_b,     *query_e;
 662   const char *fragment_b,  *fragment_e;
 663
 664   int port;
 665   char *user = NULL, *passwd = NULL;
 666
 667   char *url_encoded = NULL;
 668
 669   int error_code;
 670
 671   scheme = url_scheme (url);
 672   if (scheme == SCHEME_INVALID)
 673     {
 674       error_code = PE_UNSUPPORTED_SCHEME;
 675       goto err;
 676     }
 677
 678   url_encoded = reencode_escapes (url);
 679   p = url_encoded;
 680
 681   p += strlen (supported_schemes[scheme].leading_string);
 682   uname_b = p;
 683   p = url_skip_credentials (p);
 684   uname_e = p;
 685
 686   /* scheme://user:pass@host[:port]... */
 687   /*                    ^              */
 688
 689   /* We attempt to break down the URL into the components path,
 690      params, query, and fragment.  They are ordered like this:
 691
 692        scheme://host[:port][/path][;params][?query][#fragment]  */
 693
 694   params_b   = params_e   = NULL;
 695   query_b    = query_e    = NULL;
 696   fragment_b = fragment_e = NULL;
 697
 698   host_b = p;
 699
 700   if (*p == '[')
 701     {
 702       /* Handle IPv6 address inside square brackets.  Ideally we'd
 703          just look for the terminating ']', but rfc2732 mandates
 704          rejecting invalid IPv6 addresses.  */
 705
 706       /* The address begins after '['. */
 707       host_b = p + 1;
 708       host_e = strchr (host_b, ']');
 709
 710       if (!host_e)
 711         {
 712           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 713           goto err;
 714         }
 715
 716 #ifdef ENABLE_IPV6
 717       /* Check if the IPv6 address is valid. */
 718       if (!is_valid_ipv6_address(host_b, host_e))
 719         {
 720           error_code = PE_INVALID_IPV6_ADDRESS;
 721           goto err;
 722         }
 723
 724       /* Continue parsing after the closing ']'. */
 725       p = host_e + 1;
 726 #else
 727       error_code = PE_IPV6_NOT_SUPPORTED;
 728       goto err;
 729 #endif
 730     }
 731   else
 732     {
 733       p = strpbrk_or_eos (p, ":/;?#");
 734       host_e = p;
 735     }
 736
 737   if (host_b == host_e)
 738     {
 739       error_code = PE_EMPTY_HOST;
 740       goto err;
 741     }
 742
 743   port = scheme_default_port (scheme);
 744   if (*p == ':')
 745     {
 746       const char *port_b, *port_e, *pp;
 747
 748       /* scheme://host:port/tralala */
 749       /*              ^             */
 750       ++p;
 751       port_b = p;
 752       p = strpbrk_or_eos (p, "/;?#");
 753       port_e = p;
 754
 755       /* Allow empty port, as per rfc2396. */
 756       if (port_b != port_e)
 757         {
 758           for (port = 0, pp = port_b; pp < port_e; pp++)
 759             {
 760               if (!ISDIGIT (*pp))
 761                 {
 762                   /* http://host:12randomgarbage/blah */
 763                   /*               ^                  */
 764                   error_code = PE_BAD_PORT_NUMBER;
 765                   goto err;
 766                 }
 767               port = 10 * port + (*pp - '0');
 768               /* Check for too large port numbers here, before we have
 769                  a chance to overflow on bogus port values.  */
 770               if (port > 65535)
 771                 {
 772                   error_code = PE_BAD_PORT_NUMBER;
 773                   goto err;
 774                 }
 775             }
 776         }
 777     }
 778
 779   if (*p == '/')
 780     {
 781       ++p;
 782       path_b = p;
 783       p = strpbrk_or_eos (p, ";?#");
 784       path_e = p;
 785     }
 786   else
 787     {
 788       /* Path is not allowed not to exist. */
 789       path_b = path_e = p;
 790     }
 791
 792   if (*p == ';')
 793     {
 794       ++p;
 795       params_b = p;
 796       p = strpbrk_or_eos (p, "?#");
 797       params_e = p;
 798     }
 799   if (*p == '?')
 800     {
 801       ++p;
 802       query_b = p;
 803       p = strpbrk_or_eos (p, "#");
 804       query_e = p;
 805
 806       /* Hack that allows users to use '?' (a wildcard character) in
 807          FTP URLs without it being interpreted as a query string
 808          delimiter.  */
 809       if (scheme == SCHEME_FTP)
 810         {
 811           query_b = query_e = NULL;
 812           path_e = p;
 813         }
 814     }
 815   if (*p == '#')
 816     {
 817       ++p;
 818       fragment_b = p;
 819       p += strlen (p);
 820       fragment_e = p;
 821     }
 822   assert (*p == 0);
 823
 824   if (uname_b != uname_e)
 825     {
 826       /* http://user:pass@host */
 827       /*        ^         ^    */
 828       /*     uname_b   uname_e */
 829       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 830         {
 831           error_code = PE_INVALID_USER_NAME;
 832           goto err;
 833         }
 834     }
 835
 836   u = xnew0 (struct url);
 837   u->scheme = scheme;
 838   u->host   = strdupdelim (host_b, host_e);
 839   u->port   = port;
 840   u->user   = user;
 841   u->passwd = passwd;
 842
 843   u->path = strdupdelim (path_b, path_e);
 844   path_modified = path_simplify (u->path);
 845   split_path (u->path, &u->dir, &u->file);
 846
 847   host_modified = lowercase_str (u->host);
 848
 849   /* Decode %HH sequences in host name.  This is important not so much
 850      to support %HH sequences in host names (which other browser
 851      don't), but to support binary characters (which will have been
 852      converted to %HH by reencode_escapes).  */
 853   if (strchr (u->host, '%'))
 854     {
 855       url_unescape (u->host);
 856       host_modified = 1;
 857     }
 858
 859   if (params_b)
 860     u->params = strdupdelim (params_b, params_e);
 861   if (query_b)
 862     u->query = strdupdelim (query_b, query_e);
 863   if (fragment_b)
 864     u->fragment = strdupdelim (fragment_b, fragment_e);
 865
 866   if (path_modified || u->fragment || host_modified || path_b == path_e)
 867     {
 868       /* If we suspect that a transformation has rendered what
 869          url_string might return different from URL_ENCODED, rebuild
 870          u->url using url_string.  */
 871       u->url = url_string (u, 0);
 872
 873       if (url_encoded != url)
 874         xfree ((char *) url_encoded);
 875     }
 876   else
 877     {
 878       if (url_encoded == url)
 879         u->url = xstrdup (url);
 880       else
 881         u->url = url_encoded;
 882     }
 883
 884   return u;
 885
 886  err:
 887   /* Cleanup in case of error: */
 888   if (url_encoded && url_encoded != url)
 889     xfree (url_encoded);
 890
 891   /* Transmit the error code to the caller, if the caller wants to
 892      know.  */
 893   if (error)
 894     *error = error_code;
 895   return NULL;
 896 }
 897
 898 /* Return the error message string from ERROR_CODE, which should have
 899    been retrieved from url_parse.  The error message is translated.  */
 900
 901 const char *
 902 url_error (int error_code)
 903 {
 904   assert (error_code >= 0 && error_code < countof (parse_errors));
 905   return _(parse_errors[error_code]);
 906 }
 907
 908 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 909    expected to be URL-escaped.
 910
 911    The path is split into directory (the part up to the last slash)
 912    and file (the part after the last slash), which are subsequently
 913    unescaped.  Examples:
 914
 915    PATH                 DIR           FILE
 916    "foo/bar/baz"        "foo/bar"     "baz"
 917    "foo/bar/"           "foo/bar"     ""
 918    "foo"                ""            "foo"
 919    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 920
 921    DIR and FILE are freshly allocated.  */
 922
 923 static void
 924 split_path (const char *path, char **dir, char **file)
 925 {
 926   char *last_slash = strrchr (path, '/');
 927   if (!last_slash)
 928     {
 929       *dir = xstrdup ("");
 930       *file = xstrdup (path);
 931     }
 932   else
 933     {
 934       *dir = strdupdelim (path, last_slash);
 935       *file = xstrdup (last_slash + 1);
 936     }
 937   url_unescape (*dir);
 938   url_unescape (*file);
 939 }
 940
 941 /* Note: URL's "full path" is the path with the query string and
 942    params appended.  The "fragment" (#foo) is intentionally ignored,
 943    but that might be changed.  For example, if the original URL was
 944    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 945    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 946
 947 /* Return the length of the full path, without the terminating
 948    zero.  */
 949
 950 static int
 951 full_path_length (const struct url *url)
 952 {
 953   int len = 0;
 954
 955 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 956
 957   FROB (path);
 958   FROB (params);
 959   FROB (query);
 960
 961 #undef FROB
 962
 963   return len;
 964 }
 965
 966 /* Write out the full path. */
 967
 968 static void
 969 full_path_write (const struct url *url, char *where)
 970 {
 971 #define FROB(el, chr) do {                      \
 972   char *f_el = url->el;                         \
 973   if (f_el) {                                   \
 974     int l = strlen (f_el);                      \
 975     *where++ = chr;                             \
 976     memcpy (where, f_el, l);                    \
 977     where += l;                                 \
 978   }                                             \
 979 } while (0)
 980
 981   FROB (path, '/');
 982   FROB (params, ';');
 983   FROB (query, '?');
 984
 985 #undef FROB
 986 }
 987
 988 /* Public function for getting the "full path".  E.g. if u->path is
 989    "foo/bar" and u->query is "param=value", full_path will be
 990    "/foo/bar?param=value". */
 991
 992 char *
 993 url_full_path (const struct url *url)
 994 {
 995   int length = full_path_length (url);
 996   char *full_path = (char *) xmalloc (length + 1);
 997
 998   full_path_write (url, full_path);
 999   full_path[length] = '\0';
1000
1001   return full_path;
1002 }
1003
/* Unescape CHR in an otherwise escaped STR, in place.  Used to
   selectively undo the escaping of certain characters, such as "/"
   and ":".

   Every "%XY" sequence whose two digits equal the XNUM_TO_DIGIT
   rendering of CHR's high and low nibble is replaced by CHR itself;
   all other text is left as it was.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Work on the unsigned value of CHR: shifting a plain char with the
     high bit set would hand XNUM_TO_DIGIT a negative number on
     platforms where char is signed.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: reads the escaped text */
  char *t = str;                /* tortoise: writes the result */
  for (; *h; h++, t++)
    {
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          /* Collapse the three-character escape into CHR.  */
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1027
1028 /* Escape unsafe and reserved characters, except for the slash
1029    characters.  */
1030
1031 static char *
1032 url_escape_dir (const char *dir)
1033 {
1034   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1035   if (newdir == dir)
1036     return (char *)dir;
1037
1038   unescape_single_char (newdir, '/');
1039   return newdir;
1040 }
1041
1042 /* Sync u->path and u->url with u->dir and u->file.  Called after
1043    u->file or u->dir have been changed, typically by the FTP code.  */
1044
1045 static void
1046 sync_path (struct url *u)
1047 {
1048   char *newpath, *efile, *edir;
1049
1050   xfree (u->path);
1051
1052   /* u->dir and u->file are not escaped.  URL-escape them before
1053      reassembling them into u->path.  That way, if they contain
1054      separators like '?' or even if u->file contains slashes, the
1055      path will be correctly assembled.  (u->file can contain slashes
1056      if the URL specifies it with %2f, or if an FTP server returns
1057      it.)  */
1058   edir = url_escape_dir (u->dir);
1059   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1060
1061   if (!*edir)
1062     newpath = xstrdup (efile);
1063   else
1064     {
1065       int dirlen = strlen (edir);
1066       int filelen = strlen (efile);
1067
1068       /* Copy "DIR/FILE" to newpath. */
1069       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1070       memcpy (p, edir, dirlen);
1071       p += dirlen;
1072       *p++ = '/';
1073       memcpy (p, efile, filelen);
1074       p += filelen;
1075       *p = '\0';
1076     }
1077
1078   u->path = newpath;
1079
1080   if (edir != u->dir)
1081     xfree (edir);
1082   if (efile != u->file)
1083     xfree (efile);
1084
1085   /* Regenerate u->url as well.  */
1086   xfree (u->url);
1087   u->url = url_string (u, 0);
1088 }
1089
1090 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1091    This way we can sync u->path and u->url when they get changed.  */
1092
1093 void
1094 url_set_dir (struct url *url, const char *newdir)
1095 {
1096   xfree (url->dir);
1097   url->dir = xstrdup (newdir);
1098   sync_path (url);
1099 }
1100
1101 void
1102 url_set_file (struct url *url, const char *newfile)
1103 {
1104   xfree (url->file);
1105   url->file = xstrdup (newfile);
1106   sync_path (url);
1107 }
1108
1109 void
1110 url_free (struct url *url)
1111 {
1112   xfree (url->host);
1113   xfree (url->path);
1114   xfree (url->url);
1115
1116   xfree_null (url->params);
1117   xfree_null (url->query);
1118   xfree_null (url->fragment);
1119   xfree_null (url->user);
1120   xfree_null (url->passwd);
1121
1122   xfree (url->dir);
1123   xfree (url->file);
1124
1125   xfree (url);
1126 }
1127 \f
1128 /* Create all the necessary directories for PATH (a file).  Calls
1129    make_directory internally.  */
1130 int
1131 mkalldirs (const char *path)
1132 {
1133   const char *p;
1134   char *t;
1135   struct_stat st;
1136   int res;
1137
1138   p = path + strlen (path);
1139   for (; *p != '/' && p != path; p--)
1140     ;
1141
1142   /* Don't create if it's just a file.  */
1143   if ((p == path) && (*p != '/'))
1144     return 0;
1145   t = strdupdelim (path, p);
1146
1147   /* Check whether the directory exists.  */
1148   if ((stat (t, &st) == 0))
1149     {
1150       if (S_ISDIR (st.st_mode))
1151         {
1152           xfree (t);
1153           return 0;
1154         }
1155       else
1156         {
1157           /* If the dir exists as a file name, remove it first.  This
1158              is *only* for Wget to work with buggy old CERN http
1159              servers.  Here is the scenario: When Wget tries to
1160              retrieve a directory without a slash, e.g.
1161              http://foo/bar (bar being a directory), CERN server will
1162              not redirect it too http://foo/bar/ -- it will generate a
1163              directory listing containing links to bar/file1,
1164              bar/file2, etc.  Wget will lose because it saves this
1165              HTML listing to a file `bar', so it cannot create the
1166              directory.  To work around this, if the file of the same
1167              name exists, we just remove it and create the directory
1168              anyway.  */
1169           DEBUGP (("Removing %s because of directory danger!\n", t));
1170           unlink (t);
1171         }
1172     }
1173   res = make_directory (t);
1174   if (res != 0)
1175     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1176   xfree (t);
1177   return res;
1178 }
1179 \f
1180 /* Functions for constructing the file name out of URL components.  */
1181
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   The contents are not zero-terminated unless a zero is appended
   explicitly.  */

struct growable {
  char *base;                   /* start of the buffer; url_file_name
                                   starts out with NULL */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset from BASE at which the next
                                   append will be written */
};
1195
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   Both arguments are parenthesized so that arbitrary expressions can
   be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string, as a pointer. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1210
1211 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1212    terminated.  */
1213
1214 static void
1215 append_string (const char *str, struct growable *dest)
1216 {
1217   int l = strlen (str);
1218   GROW (dest, l);
1219   memcpy (TAIL (dest), str, l);
1220   TAIL_INCR (dest, l);
1221 }
1222
1223 /* Append CH to DEST.  For example, append_char (0, DEST)
1224    zero-terminates DEST.  */
1225
1226 static void
1227 append_char (char ch, struct growable *dest)
1228 {
1229   GROW (dest, 1);
1230   *TAIL (dest) = ch;
1231   TAIL_INCR (dest, 1);
1232 }
1233
/* Reasons for which a character may not appear verbatim in a file
   name.  The values are bit flags that are OR-ed together both in
   filechr_table and in the mask given to FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the properties in MASK.  C is
   converted to unsigned char before it is used as the table index.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   Every character listed for filechr_not_windows above, including
   `|', carries the W flag.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1291
/* FN_PORT_SEP separates host and port in file names when the port is
   non-standard.  Windows cannot handle ':' in file names, so '+' is
   used when file names are restricted for Windows; otherwise the
   separator is ':', as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  Windows
   cannot handle '?' as part of a file name, so '@' is used when file
   names are restricted for Windows; otherwise the separator is
   '?'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1302
1303 /* Quote path element, characters in [b, e), as file name, and append
1304    the quoted string to DEST.  Each character is quoted as per
1305    file_unsafe_char and the corresponding table.
1306
1307    If ESCAPED_P is non-zero, the path element is considered to be
1308    URL-escaped and will be unescaped prior to inspection.  */
1309
1310 static void
1311 append_uri_pathel (const char *b, const char *e, int escaped_p,
1312                    struct growable *dest)
1313 {
1314   const char *p;
1315   int quoted, outlen;
1316
1317   int mask;
1318   if (opt.restrict_files_os == restrict_unix)
1319     mask = filechr_not_unix;
1320   else
1321     mask = filechr_not_windows;
1322   if (opt.restrict_files_ctrl)
1323     mask |= filechr_control;
1324
1325   /* Copy [b, e) to PATHEL and URL-unescape it. */
1326   if (escaped_p)
1327     {
1328       char *unescaped;
1329       BOUNDED_TO_ALLOCA (b, e, unescaped);
1330       url_unescape (unescaped);
1331       b = unescaped;
1332       e = unescaped + strlen (unescaped);
1333     }
1334
1335   /* Defang ".." when found as component of path.  Remember that path
1336      comes from the URL and might contain malicious input.  */
1337   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1338     {
1339       b = "%2E%2E";
1340       e = b + 6;
1341     }
1342
1343   /* Walk the PATHEL string and check how many characters we'll need
1344      to quote.  */
1345   quoted = 0;
1346   for (p = b; p < e; p++)
1347     if (FILE_CHAR_TEST (*p, mask))
1348       ++quoted;
1349
1350   /* Calculate the length of the output string.  e-b is the input
1351      string length.  Each quoted char introduces two additional
1352      characters in the string, hence 2*quoted.  */
1353   outlen = (e - b) + (2 * quoted);
1354   GROW (dest, outlen);
1355
1356   if (!quoted)
1357     {
1358       /* If there's nothing to quote, we can simply append the string
1359          without processing it again.  */
1360       memcpy (TAIL (dest), b, outlen);
1361     }
1362   else
1363     {
1364       char *q = TAIL (dest);
1365       for (p = b; p < e; p++)
1366         {
1367           if (!FILE_CHAR_TEST (*p, mask))
1368             *q++ = *p;
1369           else
1370             {
1371               unsigned char ch = *p;
1372               *q++ = '%';
1373               *q++ = XNUM_TO_DIGIT (ch >> 4);
1374               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1375             }
1376         }
1377       assert (q - TAIL (dest) == outlen);
1378     }
1379   TAIL_INCR (dest, outlen);
1380 }
1381
1382 /* Append to DEST the directory structure that corresponds the
1383    directory part of URL's path.  For example, if the URL is
1384    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1385
1386    Each path element ("dir1" and "dir2" in the above example) is
1387    examined, url-unescaped, and re-escaped as file name element.
1388
1389    Additionally, it cuts as many directories from the path as
1390    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1391    will produce "bar" for the above example.  For 2 or more, it will
1392    produce "".
1393
1394    Each component of the path is quoted for use as file name.  */
1395
1396 static void
1397 append_dir_structure (const struct url *u, struct growable *dest)
1398 {
1399   char *pathel, *next;
1400   int cut = opt.cut_dirs;
1401
1402   /* Go through the path components, de-URL-quote them, and quote them
1403      (if necessary) as file names.  */
1404
1405   pathel = u->path;
1406   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1407     {
1408       if (cut-- > 0)
1409         continue;
1410       if (pathel == next)
1411         /* Ignore empty pathels.  */
1412         continue;
1413
1414       if (dest->tail)
1415         append_char ('/', dest);
1416       append_uri_pathel (pathel, next, 1, dest);
1417     }
1418 }
1419
1420 /* Return a unique file name that matches the given URL as good as
1421    possible.  Does not create directories on the file system.  */
1422
1423 char *
1424 url_file_name (const struct url *u)
1425 {
1426   struct growable fnres;        /* stands for "file name result" */
1427
1428   const char *u_file, *u_query;
1429   char *fname, *unique;
1430
1431   fnres.base = NULL;
1432   fnres.size = 0;
1433   fnres.tail = 0;
1434
1435   /* Start with the directory prefix, if specified. */
1436   if (opt.dir_prefix)
1437     append_string (opt.dir_prefix, &fnres);
1438
1439   /* If "dirstruct" is turned on (typically the case with -r), add
1440      the host and port (unless those have been turned off) and
1441      directory structure.  */
1442   if (opt.dirstruct)
1443     {
1444       if (opt.protocol_directories)
1445         {
1446           if (fnres.tail)
1447             append_char ('/', &fnres);
1448           append_string (supported_schemes[u->scheme].name, &fnres);
1449         }
1450       if (opt.add_hostdir)
1451         {
1452           if (fnres.tail)
1453             append_char ('/', &fnres);
1454           if (0 != strcmp (u->host, ".."))
1455             append_string (u->host, &fnres);
1456           else
1457             /* Host name can come from the network; malicious DNS may
1458                allow ".." to be resolved, causing us to write to
1459                "../<file>".  Defang such host names.  */
1460             append_string ("%2E%2E", &fnres);
1461           if (u->port != scheme_default_port (u->scheme))
1462             {
1463               char portstr[24];
1464               number_to_string (portstr, u->port);
1465               append_char (FN_PORT_SEP, &fnres);
1466               append_string (portstr, &fnres);
1467             }
1468         }
1469
1470       append_dir_structure (u, &fnres);
1471     }
1472
1473   /* Add the file name. */
1474   if (fnres.tail)
1475     append_char ('/', &fnres);
1476   u_file = *u->file ? u->file : "index.html";
1477   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1478
1479   /* Append "?query" to the file name. */
1480   u_query = u->query && *u->query ? u->query : NULL;
1481   if (u_query)
1482     {
1483       append_char (FN_QUERY_SEP, &fnres);
1484       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1485     }
1486
1487   /* Zero-terminate the file name. */
1488   append_char ('\0', &fnres);
1489
1490   fname = fnres.base;
1491
1492   /* Check the cases in which the unique extensions are not used:
1493      1) Clobbering is turned off (-nc).
1494      2) Retrieval with regetting.
1495      3) Timestamping is used.
1496      4) Hierarchy is built.
1497
1498      The exception is the case when file does exist and is a
1499      directory (see `mkalldirs' for explanation).  */
1500
1501   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1502       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1503     return fname;
1504
1505   unique = unique_name (fname, 1);
1506   if (unique != fname)
1507     xfree (fname);
1508   return unique;
1509 }
1510 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.

   The work is done in place with two pointers: the hare H reads the
   original text while the tortoise T marks where the simplified text
   ends.  The result can only be shorter than or equal to the
   input.  */

static int
path_simplify (char *path)
{
  char *h = path;               /* hare */
  char *t = path;               /* tortoise */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./" (or a final "."): advance the hare only, so
             the element is dropped from the output. */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past the beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  */
              beg = t + 3;
              goto regular;
            }
          /* Skip over the "../" that has just been resolved. */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T and H differ exactly when something was dropped; in that case
     the shortened string still needs its terminator. */
  if (t != h)
    *t = '\0';

  return t != h;
}
1593 \f
1594 /* Return the length of URL's path.  Path is considered to be
1595    terminated by one of '?', ';', '#', or by the end of the
1596    string.  */
1597
1598 static int
1599 path_length (const char *url)
1600 {
1601   const char *q = strpbrk_or_eos (url, "?;#");
1602   return q - url;
1603 }
1604
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E is not part of the
   range and is never read; the character at B is.  We might want to
   use memrchr (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back *before* testing: E points one past the range, so the
     first candidate is e[-1] and the last one is b[0].  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1617
/* Merge BASE with LINK and return the resulting URI.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is always a newly allocated string (from xstrdup or
   xmalloc).  The cases, in the order they are tested, are: LINK has
   a scheme, LINK is empty, LINK starts with '?', with '#', with
   "//", with a single '/', and finally LINK is a relative path.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;
  char *merge;

  /* A LINK that carries its own scheme is returned as a copy,
     independent of BASE. */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }
  else if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      int baselength = end - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '#')
    {
      /* LINK only changes the fragment: keep BASE up to its own '#'
         (or all of it if it has none) and append LINK.  Examples: */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      int baselength;
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      baselength = end1 - base;
      merge = xmalloc (baselength + linklength + 1);
      memcpy (merge, base, baselength);
      memcpy (merge + baselength, link, linklength);
      merge[baselength + linklength] = '\0';
    }
  else if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      int span;
      const char *slash;
      const char *start_insert;

      /* Look for first slash. */
      slash = memchr (base, '/', end - base);
      /* If found slash and it is a double slash, then replace
         from this point, else default to replacing from the
         beginning.  */
      if (slash && *(slash + 1) == '/')
        start_insert = slash;
      else
        start_insert = base;

      /* SPAN is the number of characters of BASE that are kept. */
      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      int span;
      const char *slash;
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = base;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
         double slash.  Only the first "//" is skipped; the search
         is then repeated once, starting right after it. */
    again:
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
          {
            pos = slash + 2;
            seen_slash_slash = 1;
            goto again;
          }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  When
         examining the last two examples, keep in mind that LINK
         begins with '/'. */

      if (!slash && !seen_slash_slash)
        /* example: "foo" */
        /*           ^    */
        start_insert = base;
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
        /*           ^        */
        start_insert = base;
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      int span;
      const char *start_insert;
      const char *last_slash = find_last_char (base, end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        }
      else if (last_slash && last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* The last slash is the second slash of "://", so BASE has
             no path.  One extra character of BASE is copied below
             and then overwritten with the slash that separates the
             host from LINK. */
          /* example: "http://host"  */
          /*                      ^  */
          start_insert = end + 1;
          need_explicit_slash = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          start_insert = last_slash + 1;
        }

      span = start_insert - base;
      merge = (char *)xmalloc (span + linklength + 1);
      if (span)
        memcpy (merge, base, span);
      if (need_explicit_slash)
        merge[span - 1] = '/';
      memcpy (merge + span, link, linklength);
      merge[span + linklength] = '\0';
    }

  return merge;
}
1808 \f
/* Copy the string S (without its terminating zero) to the location P
   and advance P past the copied characters.  P must be a modifiable
   pointer with enough room behind it.  S is evaluated exactly once,
   and the macro's own temporaries carry a trailing underscore so that
   they do not capture identifiers used in the arguments.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  int append_len_ = strlen (append_src_);               \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1814
/* Placeholder used instead of the password when the actual password
   is supposed to be hidden (url_string substitutes it when its
   HIDE_PASSWORD argument is non-zero).  We intentionally use a fixed,
   generic string so that the output does not give away the number of
   characters in the password, which previous versions did.  */
#define HIDDEN_PASSWORD "*password*"
1820
1821 /* Recreate the URL string from the data in URL.
1822
1823    If HIDE is non-zero (as it is when we're calling this on a URL we
1824    plan to print, but not when calling it to canonicalize a URL for
1825    use within the program), password will be hidden.  Unsafe
1826    characters in the URL will be quoted.  */
1827
1828 char *
1829 url_string (const struct url *url, int hide_password)
1830 {
1831   int size;
1832   char *result, *p;
1833   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1834
1835   int scheme_port  = supported_schemes[url->scheme].default_port;
1836   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1837   int fplen = full_path_length (url);
1838
1839   int brackets_around_host;
1840
1841   assert (scheme_str != NULL);
1842
1843   /* Make sure the user name and password are quoted. */
1844   if (url->user)
1845     {
1846       quoted_user = url_escape_allow_passthrough (url->user);
1847       if (url->passwd)
1848         {
1849           if (hide_password)
1850             quoted_passwd = HIDDEN_PASSWORD;
1851           else
1852             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1853         }
1854     }
1855
1856   /* In the unlikely event that the host name contains non-printable
1857      characters, quote it for displaying to the user.  */
1858   quoted_host = url_escape_allow_passthrough (url->host);
1859
1860   /* Undo the quoting of colons that URL escaping performs.  IPv6
1861      addresses may legally contain colons, and in that case must be
1862      placed in square brackets.  */
1863   if (quoted_host != url->host)
1864     unescape_single_char (quoted_host, ':');
1865   brackets_around_host = strchr (quoted_host, ':') != NULL;
1866
1867   size = (strlen (scheme_str)
1868           + strlen (quoted_host)
1869           + (brackets_around_host ? 2 : 0)
1870           + fplen
1871           + 1);
1872   if (url->port != scheme_port)
1873     size += 1 + numdigit (url->port);
1874   if (quoted_user)
1875     {
1876       size += 1 + strlen (quoted_user);
1877       if (quoted_passwd)
1878         size += 1 + strlen (quoted_passwd);
1879     }
1880
1881   p = result = xmalloc (size);
1882
1883   APPEND (p, scheme_str);
1884   if (quoted_user)
1885     {
1886       APPEND (p, quoted_user);
1887       if (quoted_passwd)
1888         {
1889           *p++ = ':';
1890           APPEND (p, quoted_passwd);
1891         }
1892       *p++ = '@';
1893     }
1894
1895   if (brackets_around_host)
1896     *p++ = '[';
1897   APPEND (p, quoted_host);
1898   if (brackets_around_host)
1899     *p++ = ']';
1900   if (url->port != scheme_port)
1901     {
1902       *p++ = ':';
1903       p = number_to_string (p, url->port);
1904     }
1905
1906   full_path_write (url, p);
1907   p += fplen;
1908   *p++ = '\0';
1909
1910   assert (p - result == size);
1911
1912   if (quoted_user && quoted_user != url->user)
1913     xfree (quoted_user);
1914   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1915     xfree (quoted_passwd);
1916   if (quoted_host != url->host)
1917     xfree (quoted_host);
1918
1919   return result;
1920 }
1921 \f
1922 /* Return non-zero if scheme a is similar to scheme b.
1923
1924    Schemes are similar if they are equal.  If SSL is supported, schemes
1925    are also similar if one is http (SCHEME_HTTP) and the other is https
1926    (SCHEME_HTTPS).  */
1927 int
1928 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1929 {
1930   if (a == b)
1931     return 1;
1932 #ifdef HAVE_SSL
1933   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1934       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1935     return 1;
1936 #endif
1937   return 0;
1938 }
1939 \f
1940 #if 0
1941 /* Debugging and testing support for path_simplify. */
1942
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and return the duplicate.  PATH
   itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1952
/* Check a single path_simplify case.

   path_simplify is run on a private copy of TEST.  A message is
   printed to stdout if the simplified copy differs from
   EXPECTED_RESULT, and another one if the value returned by
   path_simplify differs from EXPECTED_CHANGE.  Nothing is printed
   when both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (work);
}
1975
/* Feed a fixed table of paths to run_test.  Each entry gives the
   input path, the string expected after path_simplify, and the value
   path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *simplified;
    int changes;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int n;

  for (n = 0; n < countof (cases); n++)
    run_test (cases[n].input, cases[n].simplified, cases[n].changes);
}
2016 #endif