sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* Description of one URL scheme; the entries of supported_schemes
   are of this type.  */
struct scheme_data
{
  const char *name;             /* bare scheme name, e.g. "http" */
  const char *leading_string;   /* prefix url_scheme matches, e.g. "http://" */
  int default_port;             /* value returned by scheme_default_port */
  int enabled;                  /* non-zero if url_scheme may report this
                                   scheme; cleared by scheme_disable */
};
  62
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, and scheme_default_port and
   scheme_disable index this table with an enum url_scheme value, so
   the order of the entries is significant (presumably it has to
   mirror the enum declaration in url.h -- confirm before
   reordering).  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID.  The NULL leading_string of this entry is also
     what stops the search loop in url_scheme.  */
  { NULL,       NULL,       -1,                 0 }
};
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  Wget
  91    uses the rfc1738 set of reserved characters, plus "$" and ",", as
  92    recommended by rfc2396.
  93
  94    An unsafe character is one that should be encoded when URLs
  95    are placed in foreign environments.  E.g. space and newline are
  96    unsafe in HTTP contexts because HTTP uses them as separator and
  97    terminator, so they must be encoded to %20 and %0A respectively.
  98    "*" is unsafe in shell context, etc.
  99
 100    We determine whether a character is unsafe through static table
 101    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 102
 103 enum {
 104   /* rfc1738 reserved chars + "$" and ",".  */
 105   urlchr_reserved = 1,
 106
 107   /* rfc1738 unsafe chars, plus non-printables.  */
 108   urlchr_unsafe   = 2
 109 };
 110
 111 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 112 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 113 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 114
 115 /* Shorthands for the table: */
 116 #define R  urlchr_reserved
 117 #define U  urlchr_unsafe
 118 #define RU R|U
 119
 120 const static unsigned char urlchr_table[256] =
 121 {
 122   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 126   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 127   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 128   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 129   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 130  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 133   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 134   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 137   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 138
 139   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143
 144   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148 };
 149 #undef R
 150 #undef U
 151 #undef RU
 152
 153 /* URL-unescape the string S.
 154
 155    This is done by transforming the sequences "%HH" to the character
 156    represented by the hexadecimal digits HH.  If % is not followed by
 157    two hexadecimal digits, it is inserted literally.
 158
 159    The transformation is done in place.  If you need the original
 160    string intact, make a copy before calling this function.  */
 161
 162 static void
 163 url_unescape (char *s)
 164 {
 165   char *t = s;                  /* t - tortoise */
 166   char *h = s;                  /* h - hare     */
 167
 168   for (; *h; h++, t++)
 169     {
 170       if (*h != '%')
 171         {
 172         copychar:
 173           *t = *h;
 174         }
 175       else
 176         {
 177           /* Do nothing if '%' is not followed by two hex digits. */
 178           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 179             goto copychar;
 180           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 181           h += 2;
 182         }
 183     }
 184   *t = '\0';
 185 }
 186
 187 /* The core of url_escape_* functions.  Escapes the characters that
 188    match the provided mask in urlchr_table.
 189
 190    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 191    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 192    freshly allocated string will be returned in all cases.  */
 193
 194 static char *
 195 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 196 {
 197   const char *p1;
 198   char *p2, *newstr;
 199   int newlen;
 200   int addition = 0;
 201
 202   for (p1 = s; *p1; p1++)
 203     if (urlchr_test (*p1, mask))
 204       addition += 2;            /* Two more characters (hex digits) */
 205
 206   if (!addition)
 207     return allow_passthrough ? (char *)s : xstrdup (s);
 208
 209   newlen = (p1 - s) + addition;
 210   newstr = (char *)xmalloc (newlen + 1);
 211
 212   p1 = s;
 213   p2 = newstr;
 214   while (*p1)
 215     {
 216       /* Quote the characters that match the test mask. */
 217       if (urlchr_test (*p1, mask))
 218         {
 219           unsigned char c = *p1++;
 220           *p2++ = '%';
 221           *p2++ = XNUM_TO_DIGIT (c >> 4);
 222           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 223         }
 224       else
 225         *p2++ = *p1++;
 226     }
 227   assert (p2 - newstr == newlen);
 228   *p2 = '\0';
 229
 230   return newstr;
 231 }
 232
 233 /* URL-escape the unsafe characters (see urlchr_table) in a given
 234    string, returning a freshly allocated string.  */
 235
 236 char *
 237 url_escape (const char *s)
 238 {
 239   return url_escape_1 (s, urlchr_unsafe, 0);
 240 }
 241
 242 /* URL-escape the unsafe characters (see urlchr_table) in a given
 243    string.  If no characters are unsafe, S is returned.  */
 244
 245 static char *
 246 url_escape_allow_passthrough (const char *s)
 247 {
 248   return url_escape_1 (s, urlchr_unsafe, 1);
 249 }
 250 \f
 251 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 252
 253 /* Decide whether to encode, decode, or pass through the char at P.
 254    This used to be a macro, but it got a little too convoluted.  */
 255 static inline enum copy_method
 256 decide_copy_method (const char *p)
 257 {
 258   if (*p == '%')
 259     {
 260       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 261         {
 262           /* %xx sequence: decode it, unless it would decode to an
 263              unsafe or a reserved char; in that case, leave it as
 264              is. */
 265           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 266           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 267             return CM_PASSTHROUGH;
 268           else
 269             return CM_DECODE;
 270         }
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return CM_ENCODE;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return CM_ENCODE;
 277   else
 278     return CM_PASSTHROUGH;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't call url_unescape() to get to the intended
 289    data.  This function is also stable: after an input string is
 290    transformed the first time, all further transformations of the
 291    result yield the same result string.
 292
 293    Let's discuss why this function is needed.
 294
 295    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 296    space character would mess up the HTTP request, it needs to be
 297    quoted, like this:
 298
 299        GET /abc%20def HTTP/1.0
 300
 301    It appears that the unsafe chars need to be quoted, for example
 302    with url_escape.  But what if we're requested to download
 303    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 304    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 305    part of URL syntax, "%20" is the correct way to denote a literal
 306    space on the Wget command line.  This leads us to the conclusion
 307    that in that case Wget should not call url_escape, but leave the
 308    `%20' as is.
 309
 310    And what if the requested URI is `abc%20 def'?  If we call
 311    url_escape, we end up with `/abc%2520%20def', which is almost
 312    certainly not intended.  If we don't call url_escape, we are left
 313    with the embedded space and cannot complete the request.  What the
 314    user meant was for Wget to request `/abc%20%20def', and this is
 315    where reencode_escapes kicks in.
 316
 317    Wget used to solve this by first decoding %-quotes, and then
 318    encoding all the "unsafe" characters found in the resulting string.
 319    This was wrong because it didn't preserve certain URL special
 320    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 321    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 322    whether we considered `+' reserved (it is).  One of these results
 323    is inevitable because by the second step we would lose information
 324    on whether the `+' was originally encoded or not.  Both results
 325    were wrong because in CGI parameters + means space, while %2B means
 326    literal plus.  reencode_escapes correctly translates the above to
 327    "a%2B+b", i.e. returns the original string.
 328
 329    This function uses an algorithm proposed by Anon Sricharoenchai:
 330
 331    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 332       hexdigits.
 333
 334    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 335       "+".
 336
 337    ...except that this code conflates the two steps, and decides
 338    whether to encode, decode, or pass through each character in turn.
 339    The function still uses two passes, but their logic is the same --
 340    the first pass exists merely for the sake of allocation.  Another
 341    small difference is that we include `+' to URL_RESERVED.
 342
 343    Anon's test case:
 344
 345    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 346    ->
 347    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 348
 349    Simpler test cases:
 350
 351    "foo bar"         -> "foo%20bar"
 352    "foo%20bar"       -> "foo%20bar"
 353    "foo %20bar"      -> "foo%20%20bar"
 354    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 355    "foo%25%20bar"    -> "foo%25%20bar"
 356    "foo%2%20bar"     -> "foo%252%20bar"
 357    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 358    "foo%2b+bar"      -> "foo%2b+bar"  */
 359
 360 static char *
 361 reencode_escapes (const char *s)
 362 {
 363   const char *p1;
 364   char *newstr, *p2;
 365   int oldlen, newlen;
 366
 367   int encode_count = 0;
 368   int decode_count = 0;
 369
 370   /* First, pass through the string to see if there's anything to do,
 371      and to calculate the new length.  */
 372   for (p1 = s; *p1; p1++)
 373     {
 374       switch (decide_copy_method (p1))
 375         {
 376         case CM_ENCODE:
 377           ++encode_count;
 378           break;
 379         case CM_DECODE:
 380           ++decode_count;
 381           break;
 382         case CM_PASSTHROUGH:
 383           break;
 384         }
 385     }
 386
 387   if (!encode_count && !decode_count)
 388     /* The string is good as it is. */
 389     return (char *)s;           /* C const model sucks. */
 390
 391   oldlen = p1 - s;
 392   /* Each encoding adds two characters (hex digits), while each
 393      decoding removes two characters.  */
 394   newlen = oldlen + 2 * (encode_count - decode_count);
 395   newstr = xmalloc (newlen + 1);
 396
 397   p1 = s;
 398   p2 = newstr;
 399
 400   while (*p1)
 401     {
 402       switch (decide_copy_method (p1))
 403         {
 404         case CM_ENCODE:
 405           {
 406             unsigned char c = *p1++;
 407             *p2++ = '%';
 408             *p2++ = XNUM_TO_DIGIT (c >> 4);
 409             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 410           }
 411           break;
 412         case CM_DECODE:
 413           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 414           p1 += 3;              /* skip %xx */
 415           break;
 416         case CM_PASSTHROUGH:
 417           *p2++ = *p1++;
 418         }
 419     }
 420   *p2 = '\0';
 421   assert (p2 - newstr == newlen);
 422   return newstr;
 423 }
 424 \f
 425 /* Returns the scheme type if the scheme is supported, or
 426    SCHEME_INVALID if not.  */
 427
 428 enum url_scheme
 429 url_scheme (const char *url)
 430 {
 431   int i;
 432
 433   for (i = 0; supported_schemes[i].leading_string; i++)
 434     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 435                           strlen (supported_schemes[i].leading_string)))
 436       {
 437         if (supported_schemes[i].enabled)
 438           return (enum url_scheme) i;
 439         else
 440           return SCHEME_INVALID;
 441       }
 442
 443   return SCHEME_INVALID;
 444 }
 445
 446 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 447
 448 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 449    currently implemented, it returns true if URL begins with
 450    [-+a-zA-Z0-9]+: .  */
 451
 452 int
 453 url_has_scheme (const char *url)
 454 {
 455   const char *p = url;
 456
 457   /* The first char must be a scheme char. */
 458   if (!*p || !SCHEME_CHAR (*p))
 459     return 0;
 460   ++p;
 461   /* Followed by 0 or more scheme chars. */
 462   while (*p && SCHEME_CHAR (*p))
 463     ++p;
 464   /* Terminated by ':'. */
 465   return *p == ':';
 466 }
 467
 468 int
 469 scheme_default_port (enum url_scheme scheme)
 470 {
 471   return supported_schemes[scheme].default_port;
 472 }
 473
 474 void
 475 scheme_disable (enum url_scheme scheme)
 476 {
 477   supported_schemes[scheme].enabled = 0;
 478 }
 479
 480 /* Skip the username and password, if present in the URL.  The
 481    function should *not* be called with the complete URL, but with the
 482    portion after the scheme.
 483
 484    If no username and password are found, return URL.  */
 485
 486 static const char *
 487 url_skip_credentials (const char *url)
 488 {
 489   /* Look for '@' that comes before terminators, such as '/', '?',
 490      '#', or ';'.  */
 491   const char *p = (const char *)strpbrk (url, "@/?#;");
 492   if (!p || *p != '@')
 493     return url;
 494   return p + 1;
 495 }
 496
 497 /* Parse credentials contained in [BEG, END).  The region is expected
 498    to have come from a URL and is unescaped.  */
 499
 500 static int
 501 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 502 {
 503   char *colon;
 504   const char *userend;
 505
 506   if (beg == end)
 507     return 0;                   /* empty user name */
 508
 509   colon = memchr (beg, ':', end - beg);
 510   if (colon == beg)
 511     return 0;                   /* again empty user name */
 512
 513   if (colon)
 514     {
 515       *passwd = strdupdelim (colon + 1, end);
 516       userend = colon;
 517       url_unescape (*passwd);
 518     }
 519   else
 520     {
 521       *passwd = NULL;
 522       userend = end;
 523     }
 524   *user = strdupdelim (beg, userend);
 525   url_unescape (*user);
 526   return 1;
 527 }
 528
 529 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 530    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 531
 532    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 533    www.foo.com[:port]            -> http://www.foo.com[:port]
 534
 535    FTP shorthands look like this:
 536
 537    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 538    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 539
 540    If the URL needs not or cannot be rewritten, return NULL.  */
 541
 542 char *
 543 rewrite_shorthand_url (const char *url)
 544 {
 545   const char *p;
 546
 547   if (url_has_scheme (url))
 548     return NULL;
 549
 550   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 551      latter Netscape.  */
 552   for (p = url; *p && *p != ':' && *p != '/'; p++)
 553     ;
 554
 555   if (p == url)
 556     return NULL;
 557
 558   if (*p == ':')
 559     {
 560       const char *pp;
 561       char *res;
 562       /* If the characters after the colon and before the next slash
 563          or end of string are all digits, it's HTTP.  */
 564       int digits = 0;
 565       for (pp = p + 1; ISDIGIT (*pp); pp++)
 566         ++digits;
 567       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 568         goto http;
 569
 570       /* Prepend "ftp://" to the entire URL... */
 571       res = xmalloc (6 + strlen (url) + 1);
 572       sprintf (res, "ftp://%s", url);
 573       /* ...and replace ':' with '/'. */
 574       res[6 + (p - url)] = '/';
 575       return res;
 576     }
 577   else
 578     {
 579       char *res;
 580     http:
 581       /* Just prepend "http://" to what we have. */
 582       res = xmalloc (7 + strlen (url) + 1);
 583       sprintf (res, "http://%s", url);
 584       return res;
 585     }
 586 }
 587 \f
 588 static void split_path PARAMS ((const char *, char **, char **));
 589
 590 /* Like strpbrk, with the exception that it returns the pointer to the
 591    terminating zero (end-of-string aka "eos") if no matching character
 592    is found.
 593
 594    Although I normally balk at Gcc-specific optimizations, it probably
 595    makes sense here: glibc has optimizations that detect strpbrk being
 596    called with literal string as ACCEPT and inline the search.  That
 597    optimization is defeated if strpbrk is hidden within the call to
 598    another function.  (And no, making strpbrk_or_eos inline doesn't
 599    help because the check for literal accept is in the
 600    preprocessor.)  */
 601
 602 #ifdef __GNUC__
 603
 604 #define strpbrk_or_eos(s, accept) ({            \
 605   char *SOE_p = strpbrk (s, accept);            \
 606   if (!SOE_p)                                   \
 607     SOE_p = (char *)s + strlen (s);             \
 608   SOE_p;                                        \
 609 })
 610
 611 #else  /* not __GNUC__ */
 612
 613 static char *
 614 strpbrk_or_eos (const char *s, const char *accept)
 615 {
 616   char *p = strpbrk (s, accept);
 617   if (!p)
 618     p = (char *)s + strlen (s);
 619   return p;
 620 }
 621 #endif
 622
 623 /* Turn STR into lowercase; return non-zero if a character was
 624    actually changed. */
 625
 626 static int
 627 lowercase_str (char *str)
 628 {
 629   int change = 0;
 630   for (; *str; str++)
 631     if (ISUPPER (*str))
 632       {
 633         change = 1;
 634         *str = TOLOWER (*str);
 635       }
 636   return change;
 637 }
 638
 639 static const char *parse_errors[] = {
 640 #define PE_NO_ERROR                     0
 641   N_("No error"),
 642 #define PE_UNSUPPORTED_SCHEME           1
 643   N_("Unsupported scheme"),
 644 #define PE_EMPTY_HOST                   2
 645   N_("Empty host"),
 646 #define PE_BAD_PORT_NUMBER              3
 647   N_("Bad port number"),
 648 #define PE_INVALID_USER_NAME            4
 649   N_("Invalid user name"),
 650 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 651   N_("Unterminated IPv6 numeric address"),
 652 #define PE_IPV6_NOT_SUPPORTED           6
 653   N_("IPv6 addresses not supported"),
 654 #define PE_INVALID_IPV6_ADDRESS         7
 655   N_("Invalid IPv6 numeric address")
 656 };
 657
 658 /* Parse a URL.
 659
 660    Return a new struct url if successful, NULL on error.  In case of
 661    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 662    error code. */
 663 struct url *
 664 url_parse (const char *url, int *error)
 665 {
 666   struct url *u;
 667   const char *p;
 668   int path_modified, host_modified;
 669
 670   enum url_scheme scheme;
 671
 672   const char *uname_b,     *uname_e;
 673   const char *host_b,      *host_e;
 674   const char *path_b,      *path_e;
 675   const char *params_b,    *params_e;
 676   const char *query_b,     *query_e;
 677   const char *fragment_b,  *fragment_e;
 678
 679   int port;
 680   char *user = NULL, *passwd = NULL;
 681
 682   char *url_encoded = NULL;
 683
 684   int error_code;
 685
 686   scheme = url_scheme (url);
 687   if (scheme == SCHEME_INVALID)
 688     {
 689       error_code = PE_UNSUPPORTED_SCHEME;
 690       goto error;
 691     }
 692
 693   url_encoded = reencode_escapes (url);
 694   p = url_encoded;
 695
 696   p += strlen (supported_schemes[scheme].leading_string);
 697   uname_b = p;
 698   p = url_skip_credentials (p);
 699   uname_e = p;
 700
 701   /* scheme://user:pass@host[:port]... */
 702   /*                    ^              */
 703
 704   /* We attempt to break down the URL into the components path,
 705      params, query, and fragment.  They are ordered like this:
 706
 707        scheme://host[:port][/path][;params][?query][#fragment]  */
 708
 709   params_b   = params_e   = NULL;
 710   query_b    = query_e    = NULL;
 711   fragment_b = fragment_e = NULL;
 712
 713   host_b = p;
 714
 715   if (*p == '[')
 716     {
 717       /* Handle IPv6 address inside square brackets.  Ideally we'd
 718          just look for the terminating ']', but rfc2732 mandates
 719          rejecting invalid IPv6 addresses.  */
 720
 721       /* The address begins after '['. */
 722       host_b = p + 1;
 723       host_e = strchr (host_b, ']');
 724
 725       if (!host_e)
 726         {
 727           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 728           goto error;
 729         }
 730
 731 #ifdef ENABLE_IPV6
 732       /* Check if the IPv6 address is valid. */
 733       if (!is_valid_ipv6_address(host_b, host_e))
 734         {
 735           error_code = PE_INVALID_IPV6_ADDRESS;
 736           goto error;
 737         }
 738
 739       /* Continue parsing after the closing ']'. */
 740       p = host_e + 1;
 741 #else
 742       error_code = PE_IPV6_NOT_SUPPORTED;
 743       goto error;
 744 #endif
 745     }
 746   else
 747     {
 748       p = strpbrk_or_eos (p, ":/;?#");
 749       host_e = p;
 750     }
 751
 752   if (host_b == host_e)
 753     {
 754       error_code = PE_EMPTY_HOST;
 755       goto error;
 756     }
 757
 758   port = scheme_default_port (scheme);
 759   if (*p == ':')
 760     {
 761       const char *port_b, *port_e, *pp;
 762
 763       /* scheme://host:port/tralala */
 764       /*              ^             */
 765       ++p;
 766       port_b = p;
 767       p = strpbrk_or_eos (p, "/;?#");
 768       port_e = p;
 769
 770       /* Allow empty port, as per rfc2396. */
 771       if (port_b != port_e)
 772         {
 773           for (port = 0, pp = port_b; pp < port_e; pp++)
 774             {
 775               if (!ISDIGIT (*pp))
 776                 {
 777                   /* http://host:12randomgarbage/blah */
 778                   /*               ^                  */
 779                   error_code = PE_BAD_PORT_NUMBER;
 780                   goto error;
 781                 }
 782               port = 10 * port + (*pp - '0');
 783               /* Check for too large port numbers here, before we have
 784                  a chance to overflow on bogus port values.  */
 785               if (port > 65535)
 786                 {
 787                   error_code = PE_BAD_PORT_NUMBER;
 788                   goto error;
 789                 }
 790             }
 791         }
 792     }
 793
 794   if (*p == '/')
 795     {
 796       ++p;
 797       path_b = p;
 798       p = strpbrk_or_eos (p, ";?#");
 799       path_e = p;
 800     }
 801   else
 802     {
 803       /* Path is not allowed not to exist. */
 804       path_b = path_e = p;
 805     }
 806
 807   if (*p == ';')
 808     {
 809       ++p;
 810       params_b = p;
 811       p = strpbrk_or_eos (p, "?#");
 812       params_e = p;
 813     }
 814   if (*p == '?')
 815     {
 816       ++p;
 817       query_b = p;
 818       p = strpbrk_or_eos (p, "#");
 819       query_e = p;
 820
 821       /* Hack that allows users to use '?' (a wildcard character) in
 822          FTP URLs without it being interpreted as a query string
 823          delimiter.  */
 824       if (scheme == SCHEME_FTP)
 825         {
 826           query_b = query_e = NULL;
 827           path_e = p;
 828         }
 829     }
 830   if (*p == '#')
 831     {
 832       ++p;
 833       fragment_b = p;
 834       p += strlen (p);
 835       fragment_e = p;
 836     }
 837   assert (*p == 0);
 838
 839   if (uname_b != uname_e)
 840     {
 841       /* http://user:pass@host */
 842       /*        ^         ^    */
 843       /*     uname_b   uname_e */
 844       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 845         {
 846           error_code = PE_INVALID_USER_NAME;
 847           goto error;
 848         }
 849     }
 850
 851   u = xnew0 (struct url);
 852   u->scheme = scheme;
 853   u->host   = strdupdelim (host_b, host_e);
 854   u->port   = port;
 855   u->user   = user;
 856   u->passwd = passwd;
 857
 858   u->path = strdupdelim (path_b, path_e);
 859   path_modified = path_simplify (u->path);
 860   split_path (u->path, &u->dir, &u->file);
 861
 862   host_modified = lowercase_str (u->host);
 863
 864   /* Decode %HH sequences in host name.  This is important not so much
 865      to support %HH sequences, but to support binary characters (which
 866      will have been converted to %HH by reencode_escapes).  */
 867   if (strchr (u->host, '%'))
 868     {
 869       url_unescape (u->host);
 870       host_modified = 1;
 871     }
 872
 873   if (params_b)
 874     u->params = strdupdelim (params_b, params_e);
 875   if (query_b)
 876     u->query = strdupdelim (query_b, query_e);
 877   if (fragment_b)
 878     u->fragment = strdupdelim (fragment_b, fragment_e);
 879
 880   if (path_modified || u->fragment || host_modified || path_b == path_e)
 881     {
 882       /* If we suspect that a transformation has rendered what
 883          url_string might return different from URL_ENCODED, rebuild
 884          u->url using url_string.  */
 885       u->url = url_string (u, 0);
 886
 887       if (url_encoded != url)
 888         xfree ((char *) url_encoded);
 889     }
 890   else
 891     {
 892       if (url_encoded == url)
 893         u->url = xstrdup (url);
 894       else
 895         u->url = url_encoded;
 896     }
 897   url_encoded = NULL;
 898
 899   return u;
 900
 901  error:
 902   /* Cleanup in case of error: */
 903   if (url_encoded && url_encoded != url)
 904     xfree (url_encoded);
 905
 906   /* Transmit the error code to the caller, if the caller wants to
 907      know.  */
 908   if (error)
 909     *error = error_code;
 910   return NULL;
 911 }
 912
 913 /* Return the error message string from ERROR_CODE, which should have
 914    been retrieved from url_parse.  The error message is translated.  */
 915
 916 const char *
 917 url_error (int error_code)
 918 {
 919   assert (error_code >= 0 && error_code < countof (parse_errors));
 920   return _(parse_errors[error_code]);
 921 }
 922
 923 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 924    expected to be URL-escaped.
 925
 926    The path is split into directory (the part up to the last slash)
 927    and file (the part after the last slash), which are subsequently
 928    unescaped.  Examples:
 929
 930    PATH                 DIR           FILE
 931    "foo/bar/baz"        "foo/bar"     "baz"
 932    "foo/bar/"           "foo/bar"     ""
 933    "foo"                ""            "foo"
 934    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 935
 936    DIR and FILE are freshly allocated.  */
 937
 938 static void
 939 split_path (const char *path, char **dir, char **file)
 940 {
 941   char *last_slash = strrchr (path, '/');
 942   if (!last_slash)
 943     {
 944       *dir = xstrdup ("");
 945       *file = xstrdup (path);
 946     }
 947   else
 948     {
 949       *dir = strdupdelim (path, last_slash);
 950       *file = xstrdup (last_slash + 1);
 951     }
 952   url_unescape (*dir);
 953   url_unescape (*file);
 954 }
 955
 956 /* Note: URL's "full path" is the path with the query string and
 957    params appended.  The "fragment" (#foo) is intentionally ignored,
 958    but that might be changed.  For example, if the original URL was
 959    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 960    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 961
 962 /* Return the length of the full path, without the terminating
 963    zero.  */
 964
 965 static int
 966 full_path_length (const struct url *url)
 967 {
 968   int len = 0;
 969
 970 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 971
 972   FROB (path);
 973   FROB (params);
 974   FROB (query);
 975
 976 #undef FROB
 977
 978   return len;
 979 }
 980
 981 /* Write out the full path. */
 982
 983 static void
 984 full_path_write (const struct url *url, char *where)
 985 {
 986 #define FROB(el, chr) do {                      \
 987   char *f_el = url->el;                         \
 988   if (f_el) {                                   \
 989     int l = strlen (f_el);                      \
 990     *where++ = chr;                             \
 991     memcpy (where, f_el, l);                    \
 992     where += l;                                 \
 993   }                                             \
 994 } while (0)
 995
 996   FROB (path, '/');
 997   FROB (params, ';');
 998   FROB (query, '?');
 999
1000 #undef FROB
1001 }
1002
/* Return the "full path" of URL as a freshly allocated,
   zero-terminated string.  For instance, with u->path "foo/bar" and
   u->query "param=value" the result is "/foo/bar?param=value".  The
   caller owns the returned memory.  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *result = xmalloc (len + 1);

  /* full_path_write leaves the buffer unterminated, so finish it
     here.  */
  full_path_write (url, result);
  result[len] = '\0';

  return result;
}
1018
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   STR is rewritten in place: every "%XY" sequence whose two digits
   equal the ones XNUM_TO_DIGIT yields for the high and low nibble of
   CHR is replaced with CHR itself; all other text, including other
   escapes, is copied unchanged.  The string can only get shorter.
   Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Extract the nibbles from an unsigned copy: where plain char is
     signed, shifting a CHR with the high bit set would otherwise
     hand XNUM_TO_DIGIT a negative value.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */

  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating zero, so h[1] and h[2]
         are never read past the end of STR.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* skip the two hex digits */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1042
1043 /* Escape unsafe and reserved characters, except for the slash
1044    characters.  */
1045
1046 static char *
1047 url_escape_dir (const char *dir)
1048 {
1049   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1050   if (newdir == dir)
1051     return (char *)dir;
1052
1053   unescape_single_char (newdir, '/');
1054   return newdir;
1055 }
1056
1057 /* Sync u->path and u->url with u->dir and u->file.  Called after
1058    u->file or u->dir have been changed, typically by the FTP code.  */
1059
1060 static void
1061 sync_path (struct url *u)
1062 {
1063   char *newpath, *efile, *edir;
1064
1065   xfree (u->path);
1066
1067   /* u->dir and u->file are not escaped.  URL-escape them before
1068      reassembling them into u->path.  That way, if they contain
1069      separators like '?' or even if u->file contains slashes, the
1070      path will be correctly assembled.  (u->file can contain slashes
1071      if the URL specifies it with %2f, or if an FTP server returns
1072      it.)  */
1073   edir = url_escape_dir (u->dir);
1074   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1075
1076   if (!*edir)
1077     newpath = xstrdup (efile);
1078   else
1079     {
1080       int dirlen = strlen (edir);
1081       int filelen = strlen (efile);
1082
1083       /* Copy "DIR/FILE" to newpath. */
1084       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1085       memcpy (p, edir, dirlen);
1086       p += dirlen;
1087       *p++ = '/';
1088       memcpy (p, efile, filelen);
1089       p += filelen;
1090       *p++ = '\0';
1091     }
1092
1093   u->path = newpath;
1094
1095   if (edir != u->dir)
1096     xfree (edir);
1097   if (efile != u->file)
1098     xfree (efile);
1099
1100   /* Regenerate u->url as well.  */
1101   xfree (u->url);
1102   u->url = url_string (u, 0);
1103 }
1104
1105 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1106    This way we can sync u->path and u->url when they get changed.  */
1107
1108 void
1109 url_set_dir (struct url *url, const char *newdir)
1110 {
1111   xfree (url->dir);
1112   url->dir = xstrdup (newdir);
1113   sync_path (url);
1114 }
1115
1116 void
1117 url_set_file (struct url *url, const char *newfile)
1118 {
1119   xfree (url->file);
1120   url->file = xstrdup (newfile);
1121   sync_path (url);
1122 }
1123
1124 void
1125 url_free (struct url *url)
1126 {
1127   xfree (url->host);
1128   xfree (url->path);
1129   xfree (url->url);
1130
1131   xfree_null (url->params);
1132   xfree_null (url->query);
1133   xfree_null (url->fragment);
1134   xfree_null (url->user);
1135   xfree_null (url->passwd);
1136
1137   xfree (url->dir);
1138   xfree (url->file);
1139
1140   xfree (url);
1141 }
1142 \f
1143 /* Create all the necessary directories for PATH (a file).  Calls
1144    mkdirhier() internally.  */
1145 int
1146 mkalldirs (const char *path)
1147 {
1148   const char *p;
1149   char *t;
1150   struct_stat st;
1151   int res;
1152
1153   p = path + strlen (path);
1154   for (; *p != '/' && p != path; p--)
1155     ;
1156
1157   /* Don't create if it's just a file.  */
1158   if ((p == path) && (*p != '/'))
1159     return 0;
1160   t = strdupdelim (path, p);
1161
1162   /* Check whether the directory exists.  */
1163   if ((stat (t, &st) == 0))
1164     {
1165       if (S_ISDIR (st.st_mode))
1166         {
1167           xfree (t);
1168           return 0;
1169         }
1170       else
1171         {
1172           /* If the dir exists as a file name, remove it first.  This
1173              is *only* for Wget to work with buggy old CERN http
1174              servers.  Here is the scenario: When Wget tries to
1175              retrieve a directory without a slash, e.g.
1176              http://foo/bar (bar being a directory), CERN server will
1177              not redirect it too http://foo/bar/ -- it will generate a
1178              directory listing containing links to bar/file1,
1179              bar/file2, etc.  Wget will lose because it saves this
1180              HTML listing to a file `bar', so it cannot create the
1181              directory.  To work around this, if the file of the same
1182              name exists, we just remove it and create the directory
1183              anyway.  */
1184           DEBUGP (("Removing %s because of directory danger!\n", t));
1185           unlink (t);
1186         }
1187     }
1188   res = make_directory (t);
1189   if (res != 0)
1190     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1191   xfree (t);
1192   return res;
1193 }
1194 \f
1195 /* Functions for constructing the file name out of URL components.  */
1196
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer; may be NULL */
  int size;                     /* size variable handed to DO_REALLOC */
  int tail;                     /* offset at which the next append goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position by asking DO_REALLOC for at least
   TAIL+APPEND_SIZE characters.  G is evaluated once; both arguments
   are parenthesized so that arbitrary expressions may be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1225
1226 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1227    terminated.  */
1228
1229 static void
1230 append_string (const char *str, struct growable *dest)
1231 {
1232   int l = strlen (str);
1233   GROW (dest, l);
1234   memcpy (TAIL (dest), str, l);
1235   TAIL_INCR (dest, l);
1236 }
1237
1238 /* Append CH to DEST.  For example, append_char (0, DEST)
1239    zero-terminates DEST.  */
1240
1241 static void
1242 append_char (char ch, struct growable *dest)
1243 {
1244   GROW (dest, 1);
1245   *TAIL (dest) = ch;
1246   TAIL_INCR (dest, 1);
1247 }
1248
/* Bit flags classifying why a character may not be usable in a file
   name.  They are the values stored in filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the bits in MASK set in
   filechr_table.  C is converted to unsigned char before indexing,
   so plain (possibly signed) chars are safe to pass.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is marked as unusable on Windows, in line with the list given
   for filechr_not_windows, and DEL (127) is marked as a control
   character alongside 0-31 and 128-159.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1306
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when opt.restrict_files_os
   is restrict_windows, because Windows can't handle ':' in file
   names, and ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when opt.restrict_files_os is restrict_windows, because
   Windows cannot handle '?' as part of a file name, and '?'
   otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1317
1318 /* Quote path element, characters in [b, e), as file name, and append
1319    the quoted string to DEST.  Each character is quoted as per
1320    file_unsafe_char and the corresponding table.
1321
1322    If ESCAPED_P is non-zero, the path element is considered to be
1323    URL-escaped and will be unescaped prior to inspection.  */
1324
1325 static void
1326 append_uri_pathel (const char *b, const char *e, int escaped_p,
1327                    struct growable *dest)
1328 {
1329   const char *p;
1330   int quoted, outlen;
1331
1332   int mask;
1333   if (opt.restrict_files_os == restrict_unix)
1334     mask = filechr_not_unix;
1335   else
1336     mask = filechr_not_windows;
1337   if (opt.restrict_files_ctrl)
1338     mask |= filechr_control;
1339
1340   /* Copy [b, e) to PATHEL and URL-unescape it. */
1341   if (escaped_p)
1342     {
1343       char *unescaped;
1344       BOUNDED_TO_ALLOCA (b, e, unescaped);
1345       url_unescape (unescaped);
1346       b = unescaped;
1347       e = unescaped + strlen (unescaped);
1348     }
1349
1350   /* Defang ".." when found as component of path.  Remember that path
1351      comes from the URL and might contain malicious input.  */
1352   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1353     {
1354       b = "%2E%2E";
1355       e = b + 6;
1356     }
1357
1358   /* Walk the PATHEL string and check how many characters we'll need
1359      to quote.  */
1360   quoted = 0;
1361   for (p = b; p < e; p++)
1362     if (FILE_CHAR_TEST (*p, mask))
1363       ++quoted;
1364
1365   /* Calculate the length of the output string.  e-b is the input
1366      string length.  Each quoted char introduces two additional
1367      characters in the string, hence 2*quoted.  */
1368   outlen = (e - b) + (2 * quoted);
1369   GROW (dest, outlen);
1370
1371   if (!quoted)
1372     {
1373       /* If there's nothing to quote, we can simply append the string
1374          without processing it again.  */
1375       memcpy (TAIL (dest), b, outlen);
1376     }
1377   else
1378     {
1379       char *q = TAIL (dest);
1380       for (p = b; p < e; p++)
1381         {
1382           if (!FILE_CHAR_TEST (*p, mask))
1383             *q++ = *p;
1384           else
1385             {
1386               unsigned char ch = *p;
1387               *q++ = '%';
1388               *q++ = XNUM_TO_DIGIT (ch >> 4);
1389               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1390             }
1391         }
1392       assert (q - TAIL (dest) == outlen);
1393     }
1394   TAIL_INCR (dest, outlen);
1395 }
1396
1397 /* Append to DEST the directory structure that corresponds the
1398    directory part of URL's path.  For example, if the URL is
1399    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1400
1401    Each path element ("dir1" and "dir2" in the above example) is
1402    examined, url-unescaped, and re-escaped as file name element.
1403
1404    Additionally, it cuts as many directories from the path as
1405    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1406    will produce "bar" for the above example.  For 2 or more, it will
1407    produce "".
1408
1409    Each component of the path is quoted for use as file name.  */
1410
1411 static void
1412 append_dir_structure (const struct url *u, struct growable *dest)
1413 {
1414   char *pathel, *next;
1415   int cut = opt.cut_dirs;
1416
1417   /* Go through the path components, de-URL-quote them, and quote them
1418      (if necessary) as file names.  */
1419
1420   pathel = u->path;
1421   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1422     {
1423       if (cut-- > 0)
1424         continue;
1425       if (pathel == next)
1426         /* Ignore empty pathels.  */
1427         continue;
1428
1429       if (dest->tail)
1430         append_char ('/', dest);
1431       append_uri_pathel (pathel, next, 1, dest);
1432     }
1433 }
1434
1435 /* Return a unique file name that matches the given URL as good as
1436    possible.  Does not create directories on the file system.  */
1437
1438 char *
1439 url_file_name (const struct url *u)
1440 {
1441   struct growable fnres;        /* stands for "file name result" */
1442
1443   const char *u_file, *u_query;
1444   char *fname, *unique;
1445
1446   fnres.base = NULL;
1447   fnres.size = 0;
1448   fnres.tail = 0;
1449
1450   /* Start with the directory prefix, if specified. */
1451   if (opt.dir_prefix)
1452     append_string (opt.dir_prefix, &fnres);
1453
1454   /* If "dirstruct" is turned on (typically the case with -r), add
1455      the host and port (unless those have been turned off) and
1456      directory structure.  */
1457   if (opt.dirstruct)
1458     {
1459       if (opt.protocol_directories)
1460         {
1461           if (fnres.tail)
1462             append_char ('/', &fnres);
1463           append_string (supported_schemes[u->scheme].name, &fnres);
1464         }
1465       if (opt.add_hostdir)
1466         {
1467           if (fnres.tail)
1468             append_char ('/', &fnres);
1469           if (0 != strcmp (u->host, ".."))
1470             append_string (u->host, &fnres);
1471           else
1472             /* Host name can come from the network; malicious DNS may
1473                allow ".." to be resolved, causing us to write to
1474                "../<file>".  Defang such host names.  */
1475             append_string ("%2E%2E", &fnres);
1476           if (u->port != scheme_default_port (u->scheme))
1477             {
1478               char portstr[24];
1479               number_to_string (portstr, u->port);
1480               append_char (FN_PORT_SEP, &fnres);
1481               append_string (portstr, &fnres);
1482             }
1483         }
1484
1485       append_dir_structure (u, &fnres);
1486     }
1487
1488   /* Add the file name. */
1489   if (fnres.tail)
1490     append_char ('/', &fnres);
1491   u_file = *u->file ? u->file : "index.html";
1492   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1493
1494   /* Append "?query" to the file name. */
1495   u_query = u->query && *u->query ? u->query : NULL;
1496   if (u_query)
1497     {
1498       append_char (FN_QUERY_SEP, &fnres);
1499       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1500     }
1501
1502   /* Zero-terminate the file name. */
1503   append_char ('\0', &fnres);
1504
1505   fname = fnres.base;
1506
1507   /* Check the cases in which the unique extensions are not used:
1508      1) Clobbering is turned off (-nc).
1509      2) Retrieval with regetting.
1510      3) Timestamping is used.
1511      4) Hierarchy is built.
1512
1513      The exception is the case when file does exist and is a
1514      directory (see `mkalldirs' for explanation).  */
1515
1516   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1517       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1518     return fname;
1519
1520   unique = unique_name (fname, 1);
1521   if (unique != fname)
1522     xfree (fname);
1523   return unique;
1524 }
1525 \f
/* Resolve "." and ".." elements of PATH in place.  Returns 1 if PATH
   was changed and 0 if it was left untouched.

   The approach is similar in spirit to the one in rfc1808 but works
   in a single pass with a read pointer and a write pointer.  Elements
   consisting only of "." are dropped, and ".." removes the element
   written before it.  A ".." that has nothing left to remove is kept
   literally and protected from being removed by a later "..".
   Single leading and trailing slashes are preserved.

   URL escapes are not interpreted here.  Callers passing paths from
   URLs have to turn "%2e" and "%2E" into "." beforehand so that the
   dots can be found.  (Wget's URL parser calls reencode_escapes,
   which see.)

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure no
   test case has been broken.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position (the "hare") */
  char *dst = path;             /* write position (the "tortoise") */
  char *limit = path;           /* dst never retreats below this */
  char *const end = path + strlen (path);

  while (src < end)
    {
      /* Invariant: src is at the beginning of a path element.  */

      if (src[0] == '.' && (src[1] == '/' || src[1] == '\0'))
        {
          /* "./" or a final ".": drop it.  */
          src += 2;
          continue;
        }

      if (src[0] == '.' && src[1] == '.'
          && (src[2] == '/' || src[2] == '\0'))
        {
          if (dst > limit)
            {
              /* Back the write position up to the beginning of the
                 previously written element, or to LIMIT.  */
              --dst;
              while (dst > limit && dst[-1] != '/')
                --dst;
              src += 3;
              continue;
            }

          /* Nothing to back up over.  The "../" is kept literally by
             falling through to the regular-element code, and LIMIT
             is moved past it so a later ".." cannot remove it.  */
          limit = dst + 3;
        }

      /* A regular path element, up to and including its slash.  */
      if (dst == src)
        {
          /* Nothing has been removed so far, so the element is
             already in place; just step over it.  */
          while (src < end && *src != '/')
            src++;
          if (src < end)
            src++;
          dst = src;
        }
      else
        {
          /* Slide the element down to the write position.  */
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  /* The pointers differ exactly when something was removed.  */
  if (dst == src)
    return 0;

  *dst = '\0';
  return 1;
}
1613 \f
1614 /* Return the length of URL's path.  Path is considered to be
1615    terminated by one of '?', ';', '#', or by the end of the
1616    string.  */
1617
1618 static int
1619 path_length (const char *url)
1620 {
1621   const char *q = strpbrk_or_eos (url, "?;#");
1622   return q - url;
1623 }
1624
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The range is half-open: *B is
   examined, *E is not, and an empty range (e == b) yields NULL.  We
   might want to use memrchr (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the scan covers e[-1] down to
     b[0] and never touches the byte at E itself.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1637
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merged = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merged, base, span);
  memcpy (merged + span, link, linklength);
  merged[span + linklength] = '\0';
  return merged;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  Only minimal URL parsing is done, with
   no knowledge of scheme specifics.  Depending on how LINK starts,
   the result is:

     has a scheme   a copy of LINK
     empty          a copy of BASE, query string and all
     "?..."         BASE's path with LINK as the new query
     "#..."         BASE up to its fragment, with LINK as the fragment
     "//..."        BASE up to its "//", followed by LINK
     "/..."         BASE up to the first slash after "//", then LINK
     anything else  BASE up to and including its last slash, then LINK

   path_simplify is deliberately not called on the result: it would
   complicate the code, and url_parse has to simplify the path
   anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;              /* BASE may not be examined past this */
  int linklength;
  int span;                     /* number of BASE bytes that are kept */

  if (url_has_scheme (link))
    return xstrdup (link);

  end = base + path_length (base);
  linklength = strlen (link);

  /* Empty LINK points back to BASE, query string and all. */
  if (!*link)
    return xstrdup (base);

  if (*link == '?')
    {
      /* Same location, new query string:
           uri_merge("path",         "?new") -> "path?new"
           uri_merge("path?foo",     "?new") -> "path?new"
           uri_merge("path?foo#bar", "?new") -> "path?new"
           uri_merge("path#foo",     "?new") -> "path?new"  */
      span = end - base;
      return uri_merge_concat (base, span, link, linklength);
    }

  if (*link == '#')
    {
      /* Same location, new fragment:
           uri_merge("path",         "#new") -> "path#new"
           uri_merge("path#foo",     "#new") -> "path#new"
           uri_merge("path?foo",     "#new") -> "path?foo#new"
           uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *fragment = strchr (base, '#');
      if (!fragment)
        fragment = base + strlen (base);
      span = fragment - base;
      return uri_merge_concat (base, span, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: everything from BASE's double slash on
         is replaced with LINK.
           uri_merge("foo", "//new/bar")            -> "//new/bar"
           uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
           uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *slash = memchr (base, '/', end - base);

      /* Keep BASE up to its first slash only if that slash starts a
         double slash; otherwise LINK replaces BASE entirely.  */
      if (slash && slash[1] == '/')
        span = slash - base;
      else
        span = 0;
      return uri_merge_concat (base, span, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: everything from the first slash of
         BASE on (not counting a "//") is replaced with LINK.  With
         BASE "http://host/whatever/foo/bar" and LINK "/qux/xyzzy"
         the result is "http://host/qux/xyzzy".  */
      const char *start_insert;
      int seen_slash_slash = 0;
      const char *slash = memchr (base, '/', end - base);

      if (slash && slash[1] == '/')
        {
          /* Step over the double slash and look for the next slash.  */
          const char *after = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (after, '/', end - after);
        }

      if (!seen_slash_slash)
        /* "foo" or "foo/bar": LINK replaces all of BASE.  */
        start_insert = base;
      else if (slash)
        /* "http://something/": insert at the slash after the host.  */
        start_insert = slash;
      else
        /* "http://foo": insert at the end of the path.  */
        start_insert = end;

      span = start_insert - base;
      return uri_merge_concat (base, span, link, linklength);
    }

  /* LINK is a relative URL: everything after the last slash of BASE
     (possibly nothing) is replaced with LINK.  With BASE
     "whatever/foo/bar" and LINK "qux/xyzzy" the result is
     "whatever/foo/qux/xyzzy".  */
  {
    char *merge;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all: LINK replaces all of BASE.  */
        merge = uri_merge_concat (base, 0, link, linklength);
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The last slash belongs to "://", as in "http://host".  One
           byte beyond END is taken along and then overwritten with
           the slash that has to separate the host from LINK.  */
        span = end + 1 - base;
        merge = uri_merge_concat (base, span, link, linklength);
        merge[span - 1] = '/';
      }
    else
      {
        /* "whatever/foo/bar": keep everything through the slash.  */
        span = last_slash + 1 - base;
        merge = uri_merge_concat (base, span, link, linklength);
      }

    return merge;
  }
}
1828 \f
/* Copy the zero-terminated string S to the location P points at and
   advance P past the copied characters.  No terminating zero is
   written.  P must be a modifiable char pointer lvalue.  S is
   evaluated exactly once, and the macro's temporaries carry an A_
   prefix so that they do not capture identifiers used in the
   arguments.  */
#define APPEND(p, s) do {                       \
  const char *A_str = (s);                      \
  int A_len = strlen (A_str);                   \
  memcpy ((p), A_str, A_len);                   \
  (p) += A_len;                                 \
} while (0)
1834
1835 /* Use this instead of password when the actual password is supposed
1836    to be hidden.  We intentionally use a generic string without giving
1837    away the number of characters in the password, like previous
1838    versions did.  */
1839 #define HIDDEN_PASSWORD "*password*"
1840
1841 /* Recreate the URL string from the data in URL.
1842
1843    If HIDE is non-zero (as it is when we're calling this on a URL we
1844    plan to print, but not when calling it to canonicalize a URL for
1845    use within the program), password will be hidden.  Unsafe
1846    characters in the URL will be quoted.  */
1847
1848 char *
1849 url_string (const struct url *url, int hide_password)
1850 {
1851   int size;
1852   char *result, *p;
1853   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1854
1855   int scheme_port  = supported_schemes[url->scheme].default_port;
1856   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1857   int fplen = full_path_length (url);
1858
1859   int brackets_around_host;
1860
1861   assert (scheme_str != NULL);
1862
1863   /* Make sure the user name and password are quoted. */
1864   if (url->user)
1865     {
1866       quoted_user = url_escape_allow_passthrough (url->user);
1867       if (url->passwd)
1868         {
1869           if (hide_password)
1870             quoted_passwd = HIDDEN_PASSWORD;
1871           else
1872             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1873         }
1874     }
1875
1876   /* In the unlikely event that the host name contains non-printable
1877      characters, quote it for displaying to the user.  */
1878   quoted_host = url_escape_allow_passthrough (url->host);
1879
1880   /* Undo the quoting of colons that URL escaping performs.  IPv6
1881      addresses may legally contain colons, and in that case must be
1882      placed in square brackets.  */
1883   if (quoted_host != url->host)
1884     unescape_single_char (quoted_host, ':');
1885   brackets_around_host = strchr (quoted_host, ':') != NULL;
1886
1887   size = (strlen (scheme_str)
1888           + strlen (quoted_host)
1889           + (brackets_around_host ? 2 : 0)
1890           + fplen
1891           + 1);
1892   if (url->port != scheme_port)
1893     size += 1 + numdigit (url->port);
1894   if (quoted_user)
1895     {
1896       size += 1 + strlen (quoted_user);
1897       if (quoted_passwd)
1898         size += 1 + strlen (quoted_passwd);
1899     }
1900
1901   p = result = xmalloc (size);
1902
1903   APPEND (p, scheme_str);
1904   if (quoted_user)
1905     {
1906       APPEND (p, quoted_user);
1907       if (quoted_passwd)
1908         {
1909           *p++ = ':';
1910           APPEND (p, quoted_passwd);
1911         }
1912       *p++ = '@';
1913     }
1914
1915   if (brackets_around_host)
1916     *p++ = '[';
1917   APPEND (p, quoted_host);
1918   if (brackets_around_host)
1919     *p++ = ']';
1920   if (url->port != scheme_port)
1921     {
1922       *p++ = ':';
1923       p = number_to_string (p, url->port);
1924     }
1925
1926   full_path_write (url, p);
1927   p += fplen;
1928   *p++ = '\0';
1929
1930   assert (p - result == size);
1931
1932   if (quoted_user && quoted_user != url->user)
1933     xfree (quoted_user);
1934   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1935     xfree (quoted_passwd);
1936   if (quoted_host != url->host)
1937     xfree (quoted_host);
1938
1939   return result;
1940 }
1941 \f
1942 /* Return non-zero if scheme a is similar to scheme b.
1943
1944    Schemes are similar if they are equal.  If SSL is supported, schemes
1945    are also similar if one is http (SCHEME_HTTP) and the other is https
1946    (SCHEME_HTTPS).  */
1947 int
1948 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1949 {
1950   if (a == b)
1951     return 1;
1952 #ifdef HAVE_SSL
1953   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1954       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1955     return 1;
1956 #endif
1957   return 0;
1958 }
1959 \f
1960 #if 0
1961 /* Debugging and testing support for path_simplify. */
1962
/* Debugger helper: duplicate PATH with xstrdup, run path_simplify on
   the duplicate, and return it.  PATH itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified;

  simplified = xstrdup (path);
  path_simplify (simplified);

  return simplified;
}
1972
/* Run path_simplify on a private copy of TEST and print a diagnostic
   for each of two possible mismatches: the simplified string differs
   from EXPECTED_RESULT, or the value returned by path_simplify differs
   from EXPECTED_CHANGE.  Nothing is printed when both agree.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *scratch = xstrdup (test);
  int changed = path_simplify (scratch);
  int wrong_output = (strcmp (scratch, expected_result) != 0);

  if (wrong_output)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, scratch);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (scratch);
}
1995
/* Feed a fixed table of inputs to run_test.  Each entry gives the
   path handed to path_simplify, the string expected afterwards, and
   the expected "was modified" result.  */
static void
test_path_simplify (void)
{
  static struct simplify_case {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",                        "",           0 },
    { ".",                       "",           1 },
    { "./",                      "",           1 },
    { "..",                      "..",         0 },
    { "../",                     "../",        0 },
    { "foo",                     "foo",        0 },
    { "foo/bar",                 "foo/bar",    0 },
    { "foo///bar",               "foo///bar",  0 },
    { "foo/.",                   "foo/",       1 },
    { "foo/./",                  "foo/",       1 },
    { "foo./",                   "foo./",      0 },
    { "foo/../bar",              "bar",        1 },
    { "foo/../bar/",             "bar/",       1 },
    { "foo/bar/..",              "foo/",       1 },
    { "foo/bar/../x",            "foo/x",      1 },
    { "foo/bar/../x/",           "foo/x/",     1 },
    { "foo/..",                  "",           1 },
    { "foo/../..",               "..",         1 },
    { "foo/../../..",            "../..",      1 },
    { "foo/../../bar/../../baz", "../../baz",  1 },
    { "a/b/../../c",             "c",          1 },
    { "./a/../b",                "b",          1 }
  };
  struct simplify_case *tc;

  /* Walk the table in order, one run_test call per entry.  */
  for (tc = tests; tc < tests + countof (tests); tc++)
    run_test (tc->test, tc->result, tc->should_modify);
}
2036 #endif