sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50 #include "host.h"  /* for is_valid_ipv6_address */
  51
  52 #ifndef errno
  53 extern int errno;
  54 #endif
  55
  56 struct scheme_data            /* one row of the supported_schemes table */
  57 {
  58   const char *name;            /* scheme name, e.g. "http" */
  59   const char *leading_string;  /* URL prefix compared by url_scheme, e.g. "http://" */
  60   int default_port;            /* value returned by scheme_default_port; -1 in the sentinel row */
  61   int enabled;                 /* checked by url_scheme; set to 0 by scheme_disable */
  62 };
  63
  64 /* Supported schemes; url_scheme returns a row's index cast to enum url_scheme. */
  65 static struct scheme_data supported_schemes[] =
  66 {
  67   { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
  68 #ifdef HAVE_SSL
  69   { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },   /* present only when HAVE_SSL is defined */
  70 #endif
  71   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },
  72
  73   /* SCHEME_INVALID; its NULL leading_string also ends the scan in url_scheme. */
  74   { NULL,       NULL,       -1,                 0 }
  75 };
  76
  77 /* Forward declarations: */
  78
  79 static int path_simplify PARAMS ((char *));  /* url_parse keeps the result as its path_modified flag */
  80 \f
  81 /* Support for escaping and unescaping of URL strings.  */
  82
  83 /* Table of "reserved" and "unsafe" characters.  Those terms are
  84    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  85    specs, but the general idea remains.
  86
  87    A reserved character is one that you can't decode without
  88    changing the meaning of the URL.  For example, you can't decode
  89    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  90    path components is different.  Non-reserved characters can be
  91    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  Wget
  92    uses the rfc1738 set of reserved characters, plus "$" and ",", as
  93    recommended by rfc2396.
  94
  95    An unsafe character is one that should be encoded when URLs
  96    are placed in foreign environments.  E.g. space and newline are
  97    unsafe in HTTP contexts because HTTP uses them as separator and
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
 104 enum {
 105   /* rfc1738 reserved chars + "$" and ",".  */
 106   urlchr_reserved = 1,
 107
 108   /* rfc1738 unsafe chars, plus non-printables.  */
 109   urlchr_unsafe   = 2
 110 };
 111
 112 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 113 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 114 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
 116 /* Shorthands for the table: */
 117 #define R  urlchr_reserved
 118 #define U  urlchr_unsafe
 119 #define RU R|U
 120
 121 const static unsigned char urlchr_table[256] =
 122 {
 123   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 124   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 127   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 128   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 129   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 130   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 131  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 133   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 134   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 135   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 138   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149 };
 150 #undef R
 151 #undef U
 152 #undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           /* Do nothing if '%' is not followed by two hex digits. */
 179           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 180             goto copychar;
 181           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 182           h += 2;
 183         }
 184     }
 185   *t = '\0';
 186 }
 187
 188 /* The core of url_escape_* functions.  Escapes the characters that
 189    match the provided mask in urlchr_table.
 190
 191    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 192    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 193    freshly allocated string will be returned in all cases.  */
 194
 195 static char *
 196 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 197 {
 198   const char *p1;
 199   char *p2, *newstr;
 200   int newlen;
 201   int addition = 0;
 202
 203   for (p1 = s; *p1; p1++)
 204     if (urlchr_test (*p1, mask))
 205       addition += 2;            /* Two more characters (hex digits) */
 206
 207   if (!addition)
 208     return allow_passthrough ? (char *)s : xstrdup (s);
 209
 210   newlen = (p1 - s) + addition;
 211   newstr = (char *)xmalloc (newlen + 1);
 212
 213   p1 = s;
 214   p2 = newstr;
 215   while (*p1)
 216     {
 217       /* Quote the characters that match the test mask. */
 218       if (urlchr_test (*p1, mask))
 219         {
 220           unsigned char c = *p1++;
 221           *p2++ = '%';
 222           *p2++ = XNUM_TO_DIGIT (c >> 4);
 223           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 224         }
 225       else
 226         *p2++ = *p1++;
 227     }
 228   assert (p2 - newstr == newlen);
 229   *p2 = '\0';
 230
 231   return newstr;
 232 }
 233
 234 /* URL-escape the unsafe characters (see urlchr_table) in a given
 235    string, returning a freshly allocated string.  */
 236
 237 char *
 238 url_escape (const char *s)
 239 {
 240   return url_escape_1 (s, urlchr_unsafe, 0);
 241 }
 242
 243 /* URL-escape the unsafe characters (see urlchr_table) in a given
 244    string.  If no characters are unsafe, S is returned.  */
 245
 246 static char *
 247 url_escape_allow_passthrough (const char *s)
 248 {
 249   return url_escape_1 (s, urlchr_unsafe, 1);
 250 }
 251 \f
 252 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 253
 254 /* Decide whether to encode, decode, or pass through the char at P.
 255    This used to be a macro, but it got a little too convoluted.  */
 256 static inline enum copy_method
 257 decide_copy_method (const char *p)
 258 {
 259   if (*p == '%')
 260     {
 261       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 262         {
 263           /* %xx sequence: decode it, unless it would decode to an
 264              unsafe or a reserved char; in that case, leave it as
 265              is. */
 266           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 267           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 268             return CM_PASSTHROUGH;
 269           else
 270             return CM_DECODE;
 271         }
 272       else
 273         /* Garbled %.. sequence: encode `%'. */
 274         return CM_ENCODE;
 275     }
 276   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 277     return CM_ENCODE;
 278   else
 279     return CM_PASSTHROUGH;
 280 }
 281
 282 /* Translate a %-escaped (but possibly non-conformant) input string S
 283    into a %-escaped (and conformant) output string.  If no characters
 284    are encoded or decoded, return the same string S; otherwise, return
 285    a freshly allocated string with the new contents.
 286
 287    After a URL has been run through this function, the protocols that
 288    use `%' as the quote character can use the resulting string as-is,
 289    while those that don't call url_unescape() to get to the intended
 290    data.  This function is also stable: after an input string is
 291    transformed the first time, all further transformations of the
 292    result yield the same result string.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 297    space character would mess up the HTTP request, it needs to be
 298    quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It appears that the unsafe chars need to be quoted, for example
 303    with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leads us to the conclusion
 308    that in that case Wget should not call url_escape, but leave the
 309    `%20' as is.
 310
 311    And what if the requested URI is `abc%20 def'?  If we call
 312    url_escape, we end up with `/abc%2520%20def', which is almost
 313    certainly not intended.  If we don't call url_escape, we are left
 314    with the embedded space and cannot complete the request.  What the
 315    user meant was for Wget to request `/abc%20%20def', and this is
 316    where reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses an algorithm proposed by Anon Sricharoenchai:
 331
 332    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 333       hexdigits.
 334
 335    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 336       "+".
 337
 338    ...except that this code conflates the two steps, and decides
 339    whether to encode, decode, or pass through each character in turn.
 340    The function still uses two passes, but their logic is the same --
 341    the first pass exists merely for the sake of allocation.  Another
 342    small difference is that we include `+' to URL_RESERVED.
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369   int decode_count = 0;
 370
 371   /* First, pass through the string to see if there's anything to do,
 372      and to calculate the new length.  */
 373   for (p1 = s; *p1; p1++)
 374     {
 375       switch (decide_copy_method (p1))
 376         {
 377         case CM_ENCODE:
 378           ++encode_count;
 379           break;
 380         case CM_DECODE:
 381           ++decode_count;
 382           break;
 383         case CM_PASSTHROUGH:
 384           break;
 385         }
 386     }
 387
 388   if (!encode_count && !decode_count)
 389     /* The string is good as it is. */
 390     return (char *)s;           /* C const model sucks. */
 391
 392   oldlen = p1 - s;
 393   /* Each encoding adds two characters (hex digits), while each
 394      decoding removes two characters.  */
 395   newlen = oldlen + 2 * (encode_count - decode_count);
 396   newstr = xmalloc (newlen + 1);
 397
 398   p1 = s;
 399   p2 = newstr;
 400
 401   while (*p1)
 402     {
 403       switch (decide_copy_method (p1))
 404         {
 405         case CM_ENCODE:
 406           {
 407             unsigned char c = *p1++;
 408             *p2++ = '%';
 409             *p2++ = XNUM_TO_DIGIT (c >> 4);
 410             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 411           }
 412           break;
 413         case CM_DECODE:
 414           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 415           p1 += 3;              /* skip %xx */
 416           break;
 417         case CM_PASSTHROUGH:
 418           *p2++ = *p1++;
 419         }
 420     }
 421   *p2 = '\0';
 422   assert (p2 - newstr == newlen);
 423   return newstr;
 424 }
 425 \f
 426 /* Returns the scheme type if the scheme is supported, or
 427    SCHEME_INVALID if not.  */
 428
 429 enum url_scheme
 430 url_scheme (const char *url)
 431 {
 432   int i;
 433
 434   for (i = 0; supported_schemes[i].leading_string; i++)
 435     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 436                           strlen (supported_schemes[i].leading_string)))
 437       {
 438         if (supported_schemes[i].enabled)
 439           return (enum url_scheme) i;
 440         else
 441           return SCHEME_INVALID;
 442       }
 443
 444   return SCHEME_INVALID;
 445 }
 446
 447 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 448
 449 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 450    currently implemented, it returns true if URL begins with
 451    [-+a-zA-Z0-9]+: .  */
 452
 453 int
 454 url_has_scheme (const char *url)
 455 {
 456   const char *p = url;
 457
 458   /* The first char must be a scheme char. */
 459   if (!*p || !SCHEME_CHAR (*p))
 460     return 0;
 461   ++p;
 462   /* Followed by 0 or more scheme chars. */
 463   while (*p && SCHEME_CHAR (*p))
 464     ++p;
 465   /* Terminated by ':'. */
 466   return *p == ':';
 467 }
 468
 469 int
 470 scheme_default_port (enum url_scheme scheme)
 471 {
 472   return supported_schemes[scheme].default_port;
 473 }
 474
 475 void
 476 scheme_disable (enum url_scheme scheme)
 477 {
 478   supported_schemes[scheme].enabled = 0;
 479 }
 480
 481 /* Skip the username and password, if present in the URL.  The
 482    function should *not* be called with the complete URL, but with the
 483    portion after the scheme.
 484
 485    If no username and password are found, return URL.  */
 486
 487 static const char *
 488 url_skip_credentials (const char *url)
 489 {
 490   /* Look for '@' that comes before terminators, such as '/', '?',
 491      '#', or ';'.  */
 492   const char *p = (const char *)strpbrk (url, "@/?#;");
 493   if (!p || *p != '@')
 494     return url;
 495   return p + 1;
 496 }
 497
 498 /* Parse credentials contained in [BEG, END).  The region is expected
 499    to have come from a URL and is unescaped.  */
 500
 501 static int
 502 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 503 {
 504   char *colon;
 505   const char *userend;
 506
 507   if (beg == end)
 508     return 0;                   /* empty user name */
 509
 510   colon = memchr (beg, ':', end - beg);
 511   if (colon == beg)
 512     return 0;                   /* again empty user name */
 513
 514   if (colon)
 515     {
 516       *passwd = strdupdelim (colon + 1, end);
 517       userend = colon;
 518       url_unescape (*passwd);
 519     }
 520   else
 521     {
 522       *passwd = NULL;
 523       userend = end;
 524     }
 525   *user = strdupdelim (beg, userend);
 526   url_unescape (*user);
 527   return 1;
 528 }
 529
 530 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 531    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 532
 533    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 534    www.foo.com[:port]            -> http://www.foo.com[:port]
 535
 536    FTP shorthands look like this:
 537
 538    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 539    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 540
 541    If the URL needs not or cannot be rewritten, return NULL.  */
 542
 543 char *
 544 rewrite_shorthand_url (const char *url)
 545 {
 546   const char *p;
 547
 548   if (url_has_scheme (url))
 549     return NULL;
 550
 551   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 552      latter Netscape.  */
 553   for (p = url; *p && *p != ':' && *p != '/'; p++)
 554     ;
 555
 556   if (p == url)
 557     return NULL;
 558
 559   if (*p == ':')
 560     {
 561       const char *pp;
 562       char *res;
 563       /* If the characters after the colon and before the next slash
 564          or end of string are all digits, it's HTTP.  */
 565       int digits = 0;
 566       for (pp = p + 1; ISDIGIT (*pp); pp++)
 567         ++digits;
 568       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 569         goto http;
 570
 571       /* Prepend "ftp://" to the entire URL... */
 572       res = xmalloc (6 + strlen (url) + 1);
 573       sprintf (res, "ftp://%s", url);
 574       /* ...and replace ':' with '/'. */
 575       res[6 + (p - url)] = '/';
 576       return res;
 577     }
 578   else
 579     {
 580       char *res;
 581     http:
 582       /* Just prepend "http://" to what we have. */
 583       res = xmalloc (7 + strlen (url) + 1);
 584       sprintf (res, "http://%s", url);
 585       return res;
 586     }
 587 }
 588 \f
 589 static void split_path PARAMS ((const char *, char **, char **));  /* defined after url_parse, which calls it */
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.
 594
 595    Although I normally balk at Gcc-specific optimizations, it probably
 596    makes sense here: glibc has optimizations that detect strpbrk being
 597    called with literal string as ACCEPT and inline the search.  That
 598    optimization is defeated if strpbrk is hidden within the call to
 599    another function.  (And no, making strpbrk_or_eos inline doesn't
 600    help because the check for literal accept is in the
 601    preprocessor.)  */
 602
 603 #ifdef __GNUC__
 604
 605 #define strpbrk_or_eos(s, accept) ({            \
 606   char *SOE_p = strpbrk (s, accept);            \
 607   if (!SOE_p)                                   \
 608     SOE_p = (char *)s + strlen (s);             \
 609   SOE_p;                                        \
 610 })
 611
 612 #else  /* not __GNUC__ */
 613
 614 static char *
 615 strpbrk_or_eos (const char *s, const char *accept)
 616 {
 617   char *p = strpbrk (s, accept);
 618   if (!p)
 619     p = (char *)s + strlen (s);
 620   return p;
 621 }
 622 #endif
 623
 624 /* Turn STR into lowercase; return non-zero if a character was
 625    actually changed. */
 626
 627 static int
 628 lowercase_str (char *str)
 629 {
 630   int change = 0;
 631   for (; *str; str++)
 632     if (ISUPPER (*str))
 633       {
 634         change = 1;
 635         *str = TOLOWER (*str);
 636       }
 637   return change;
 638 }
 639
 640 static const char *parse_errors[] = {      /* indexed by the PE_* codes below; read by url_error */
 641 #define PE_NO_ERROR                     0
 642   N_("No error"),
 643 #define PE_UNSUPPORTED_SCHEME           1
 644   N_("Unsupported scheme"),                /* url_scheme returned SCHEME_INVALID */
 645 #define PE_EMPTY_HOST                   2
 646   N_("Empty host"),                        /* host_b == host_e in url_parse */
 647 #define PE_BAD_PORT_NUMBER              3
 648   N_("Bad port number"),                   /* non-digit in the port, or value above 65535 */
 649 #define PE_INVALID_USER_NAME            4
 650   N_("Invalid user name"),                 /* parse_credentials returned 0 */
 651 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 652   N_("Unterminated IPv6 numeric address"), /* '[' without a closing ']' */
 653 #define PE_IPV6_NOT_SUPPORTED           6
 654   N_("IPv6 addresses not supported"),      /* '[' seen in a build without ENABLE_IPV6 */
 655 #define PE_INVALID_IPV6_ADDRESS         7
 656   N_("Invalid IPv6 numeric address")       /* bracketed address failed validation */
 657 };
 658
 659 /* Parse a URL.
 660
 661    Return a new struct url if successful, NULL on error.  In case of
 662    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 663    error code. */
 664 struct url *
 665 url_parse (const char *url, int *error)
 666 {
 667   struct url *u;
 668   const char *p;
 669   int path_modified, host_modified;
 670
 671   enum url_scheme scheme;
 672
 673   const char *uname_b,     *uname_e;
 674   const char *host_b,      *host_e;
 675   const char *path_b,      *path_e;
 676   const char *params_b,    *params_e;
 677   const char *query_b,     *query_e;
 678   const char *fragment_b,  *fragment_e;
 679
 680   int port;
 681   char *user = NULL, *passwd = NULL;
 682
 683   char *url_encoded = NULL;
 684
 685   int error_code;
 686
 687   scheme = url_scheme (url);
 688   if (scheme == SCHEME_INVALID)
 689     {
 690       error_code = PE_UNSUPPORTED_SCHEME;
 691       goto error;
 692     }
 693
 694   url_encoded = reencode_escapes (url);
 695   p = url_encoded;
 696
 697   p += strlen (supported_schemes[scheme].leading_string);
 698   uname_b = p;
 699   p = url_skip_credentials (p);
 700   uname_e = p;
 701
 702   /* scheme://user:pass@host[:port]... */
 703   /*                    ^              */
 704
 705   /* We attempt to break down the URL into the components path,
 706      params, query, and fragment.  They are ordered like this:
 707
 708        scheme://host[:port][/path][;params][?query][#fragment]  */
 709
 710   params_b   = params_e   = NULL;
 711   query_b    = query_e    = NULL;
 712   fragment_b = fragment_e = NULL;
 713
 714   host_b = p;
 715
 716   if (*p == '[')
 717     {
 718       /* Handle IPv6 address inside square brackets.  Ideally we'd
 719          just look for the terminating ']', but rfc2732 mandates
 720          rejecting invalid IPv6 addresses.  */
 721
 722       /* The address begins after '['. */
 723       host_b = p + 1;
 724       host_e = strchr (host_b, ']');
 725
 726       if (!host_e)
 727         {
 728           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 729           goto error;
 730         }
 731
 732 #ifdef ENABLE_IPV6
 733       /* Check if the IPv6 address is valid. */
 734       if (!is_valid_ipv6_address(host_b, host_e))
 735         {
 736           error_code = PE_INVALID_IPV6_ADDRESS;
 737           goto error;
 738         }
 739
 740       /* Continue parsing after the closing ']'. */
 741       p = host_e + 1;
 742 #else
 743       error_code = PE_IPV6_NOT_SUPPORTED;
 744       goto error;
 745 #endif
 746     }
 747   else
 748     {
 749       p = strpbrk_or_eos (p, ":/;?#");
 750       host_e = p;
 751     }
 752
 753   if (host_b == host_e)
 754     {
 755       error_code = PE_EMPTY_HOST;
 756       goto error;
 757     }
 758
 759   port = scheme_default_port (scheme);
 760   if (*p == ':')
 761     {
 762       const char *port_b, *port_e, *pp;
 763
 764       /* scheme://host:port/tralala */
 765       /*              ^             */
 766       ++p;
 767       port_b = p;
 768       p = strpbrk_or_eos (p, "/;?#");
 769       port_e = p;
 770
 771       /* Allow empty port, as per rfc2396. */
 772       if (port_b != port_e)
 773         {
 774           for (port = 0, pp = port_b; pp < port_e; pp++)
 775             {
 776               if (!ISDIGIT (*pp))
 777                 {
 778                   /* http://host:12randomgarbage/blah */
 779                   /*               ^                  */
 780                   error_code = PE_BAD_PORT_NUMBER;
 781                   goto error;
 782                 }
 783               port = 10 * port + (*pp - '0');
 784               /* Check for too large port numbers here, before we have
 785                  a chance to overflow on bogus port values.  */
 786               if (port > 65535)
 787                 {
 788                   error_code = PE_BAD_PORT_NUMBER;
 789                   goto error;
 790                 }
 791             }
 792         }
 793     }
 794
 795   if (*p == '/')
 796     {
 797       ++p;
 798       path_b = p;
 799       p = strpbrk_or_eos (p, ";?#");
 800       path_e = p;
 801     }
 802   else
 803     {
 804       /* Path is not allowed not to exist. */
 805       path_b = path_e = p;
 806     }
 807
 808   if (*p == ';')
 809     {
 810       ++p;
 811       params_b = p;
 812       p = strpbrk_or_eos (p, "?#");
 813       params_e = p;
 814     }
 815   if (*p == '?')
 816     {
 817       ++p;
 818       query_b = p;
 819       p = strpbrk_or_eos (p, "#");
 820       query_e = p;
 821
 822       /* Hack that allows users to use '?' (a wildcard character) in
 823          FTP URLs without it being interpreted as a query string
 824          delimiter.  */
 825       if (scheme == SCHEME_FTP)
 826         {
 827           query_b = query_e = NULL;
 828           path_e = p;
 829         }
 830     }
 831   if (*p == '#')
 832     {
 833       ++p;
 834       fragment_b = p;
 835       p += strlen (p);
 836       fragment_e = p;
 837     }
 838   assert (*p == 0);
 839
 840   if (uname_b != uname_e)
 841     {
 842       /* http://user:pass@host */
 843       /*        ^         ^    */
 844       /*     uname_b   uname_e */
 845       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 846         {
 847           error_code = PE_INVALID_USER_NAME;
 848           goto error;
 849         }
 850     }
 851
 852   u = xnew0 (struct url);
 853   u->scheme = scheme;
 854   u->host   = strdupdelim (host_b, host_e);
 855   u->port   = port;
 856   u->user   = user;
 857   u->passwd = passwd;
 858
 859   u->path = strdupdelim (path_b, path_e);
 860   path_modified = path_simplify (u->path);
 861   split_path (u->path, &u->dir, &u->file);
 862
 863   host_modified = lowercase_str (u->host);
 864
 865   /* Decode %HH sequences in host name.  This is important not so much
 866      to support %HH sequences, but to support binary characters (which
 867      will have been converted to %HH by reencode_escapes).  */
 868   if (strchr (u->host, '%'))
 869     {
 870       url_unescape (u->host);
 871       host_modified = 1;
 872     }
 873
 874   if (params_b)
 875     u->params = strdupdelim (params_b, params_e);
 876   if (query_b)
 877     u->query = strdupdelim (query_b, query_e);
 878   if (fragment_b)
 879     u->fragment = strdupdelim (fragment_b, fragment_e);
 880
 881   if (path_modified || u->fragment || host_modified || path_b == path_e)
 882     {
 883       /* If we suspect that a transformation has rendered what
 884          url_string might return different from URL_ENCODED, rebuild
 885          u->url using url_string.  */
 886       u->url = url_string (u, 0);
 887
 888       if (url_encoded != url)
 889         xfree ((char *) url_encoded);
 890     }
 891   else
 892     {
 893       if (url_encoded == url)
 894         u->url = xstrdup (url);
 895       else
 896         u->url = url_encoded;
 897     }
 898   url_encoded = NULL;
 899
 900   return u;
 901
 902  error:
 903   /* Cleanup in case of error: */
 904   if (url_encoded && url_encoded != url)
 905     xfree (url_encoded);
 906
 907   /* Transmit the error code to the caller, if the caller wants to
 908      know.  */
 909   if (error)
 910     *error = error_code;
 911   return NULL;
 912 }
 913
 914 /* Return the error message string from ERROR_CODE, which should have
 915    been retrieved from url_parse.  The error message is translated.  */
 916
 917 const char *
 918 url_error (int error_code)
 919 {
 920   assert (error_code >= 0 && error_code < countof (parse_errors));
 921   return _(parse_errors[error_code]);
 922 }
 923
 924 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 925    expected to be URL-escaped.
 926
 927    The path is split into directory (the part up to the last slash)
 928    and file (the part after the last slash), which are subsequently
 929    unescaped.  Examples:
 930
 931    PATH                 DIR           FILE
 932    "foo/bar/baz"        "foo/bar"     "baz"
 933    "foo/bar/"           "foo/bar"     ""
 934    "foo"                ""            "foo"
 935    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 936
 937    DIR and FILE are freshly allocated.  */
 938
 939 static void
 940 split_path (const char *path, char **dir, char **file)
 941 {
 942   char *last_slash = strrchr (path, '/');
 943   if (!last_slash)
 944     {
 945       *dir = xstrdup ("");
 946       *file = xstrdup (path);
 947     }
 948   else
 949     {
 950       *dir = strdupdelim (path, last_slash);
 951       *file = xstrdup (last_slash + 1);
 952     }
 953   url_unescape (*dir);
 954   url_unescape (*file);
 955 }
 956
 957 /* Note: URL's "full path" is the path with the query string and
 958    params appended.  The "fragment" (#foo) is intentionally ignored,
 959    but that might be changed.  For example, if the original URL was
 960    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 961    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 962
 963 /* Return the length of the full path, without the terminating
 964    zero.  */
 965
 966 static int
 967 full_path_length (const struct url *url)
 968 {
 969   int len = 0;
 970
 971 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 972
 973   FROB (path);
 974   FROB (params);
 975   FROB (query);
 976
 977 #undef FROB
 978
 979   return len;
 980 }
 981
 982 /* Write out the full path. */
 983
 984 static void
 985 full_path_write (const struct url *url, char *where)
 986 {
 987 #define FROB(el, chr) do {                      \
 988   char *f_el = url->el;                         \
 989   if (f_el) {                                   \
 990     int l = strlen (f_el);                      \
 991     *where++ = chr;                             \
 992     memcpy (where, f_el, l);                    \
 993     where += l;                                 \
 994   }                                             \
 995 } while (0)
 996
 997   FROB (path, '/');
 998   FROB (params, ';');
 999   FROB (query, '?');
1000
1001 #undef FROB
1002 }
1003
/* Return the "full path" of URL as a freshly allocated,
   zero-terminated string that the caller must free.  For example,
   with u->path "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int fplen = full_path_length (url);
  char *buf = (char *) xmalloc (fplen + 1);

  /* full_path_write does not terminate the string; do it here.  */
  full_path_write (url, buf);
  buf[fplen] = '\0';

  return buf;
}
1019
1020 /* Escape unsafe and reserved characters, except for the slash
1021    characters.  */
1022
1023 static char *
1024 url_escape_dir (const char *dir)
1025 {
1026   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1027   char *h, *t;
1028   if (newdir == dir)
1029     return (char *)dir;
1030
1031   /* Unescape slashes in NEWDIR. */
1032
1033   h = newdir;                   /* hare */
1034   t = newdir;                   /* tortoise */
1035
1036   for (; *h; h++, t++)
1037     {
1038       /* url_escape_1 having converted '/' to "%2F" exactly. */
1039       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1040         {
1041           *t = '/';
1042           h += 2;
1043         }
1044       else
1045         *t = *h;
1046     }
1047   *t = '\0';
1048
1049   return newdir;
1050 }
1051
1052 /* Sync u->path and u->url with u->dir and u->file.  Called after
1053    u->file or u->dir have been changed, typically by the FTP code.  */
1054
1055 static void
1056 sync_path (struct url *u)
1057 {
1058   char *newpath, *efile, *edir;
1059
1060   xfree (u->path);
1061
1062   /* u->dir and u->file are not escaped.  URL-escape them before
1063      reassembling them into u->path.  That way, if they contain
1064      separators like '?' or even if u->file contains slashes, the
1065      path will be correctly assembled.  (u->file can contain slashes
1066      if the URL specifies it with %2f, or if an FTP server returns
1067      it.)  */
1068   edir = url_escape_dir (u->dir);
1069   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1070
1071   if (!*edir)
1072     newpath = xstrdup (efile);
1073   else
1074     {
1075       int dirlen = strlen (edir);
1076       int filelen = strlen (efile);
1077
1078       /* Copy "DIR/FILE" to newpath. */
1079       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1080       memcpy (p, edir, dirlen);
1081       p += dirlen;
1082       *p++ = '/';
1083       memcpy (p, efile, filelen);
1084       p += filelen;
1085       *p++ = '\0';
1086     }
1087
1088   u->path = newpath;
1089
1090   if (edir != u->dir)
1091     xfree (edir);
1092   if (efile != u->file)
1093     xfree (efile);
1094
1095   /* Regenerate u->url as well.  */
1096   xfree (u->url);
1097   u->url = url_string (u, 0);
1098 }
1099
1100 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1101    This way we can sync u->path and u->url when they get changed.  */
1102
1103 void
1104 url_set_dir (struct url *url, const char *newdir)
1105 {
1106   xfree (url->dir);
1107   url->dir = xstrdup (newdir);
1108   sync_path (url);
1109 }
1110
1111 void
1112 url_set_file (struct url *url, const char *newfile)
1113 {
1114   xfree (url->file);
1115   url->file = xstrdup (newfile);
1116   sync_path (url);
1117 }
1118
1119 void
1120 url_free (struct url *url)
1121 {
1122   xfree (url->host);
1123   xfree (url->path);
1124   xfree (url->url);
1125
1126   xfree_null (url->params);
1127   xfree_null (url->query);
1128   xfree_null (url->fragment);
1129   xfree_null (url->user);
1130   xfree_null (url->passwd);
1131
1132   xfree (url->dir);
1133   xfree (url->file);
1134
1135   xfree (url);
1136 }
1137 \f
1138 /* Create all the necessary directories for PATH (a file).  Calls
1139    mkdirhier() internally.  */
1140 int
1141 mkalldirs (const char *path)
1142 {
1143   const char *p;
1144   char *t;
1145   struct_stat st;
1146   int res;
1147
1148   p = path + strlen (path);
1149   for (; *p != '/' && p != path; p--)
1150     ;
1151
1152   /* Don't create if it's just a file.  */
1153   if ((p == path) && (*p != '/'))
1154     return 0;
1155   t = strdupdelim (path, p);
1156
1157   /* Check whether the directory exists.  */
1158   if ((stat (t, &st) == 0))
1159     {
1160       if (S_ISDIR (st.st_mode))
1161         {
1162           xfree (t);
1163           return 0;
1164         }
1165       else
1166         {
1167           /* If the dir exists as a file name, remove it first.  This
1168              is *only* for Wget to work with buggy old CERN http
1169              servers.  Here is the scenario: When Wget tries to
1170              retrieve a directory without a slash, e.g.
1171              http://foo/bar (bar being a directory), CERN server will
1172              not redirect it too http://foo/bar/ -- it will generate a
1173              directory listing containing links to bar/file1,
1174              bar/file2, etc.  Wget will lose because it saves this
1175              HTML listing to a file `bar', so it cannot create the
1176              directory.  To work around this, if the file of the same
1177              name exists, we just remove it and create the directory
1178              anyway.  */
1179           DEBUGP (("Removing %s because of directory danger!\n", t));
1180           unlink (t);
1181         }
1182     }
1183   res = make_directory (t);
1184   if (res != 0)
1185     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1186   xfree (t);
1187   return res;
1188 }
1189 \f
1190 /* Functions for constructing the file name out of URL components.  */
1191
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer, or NULL */
  int size;                     /* allocated size, handed to DO_REALLOC */
  int tail;                     /* offset at which the next append goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion, so expressions
   such as `n << 1' or `a ? b : c' can be passed as APPEND_SIZE.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1220
1221 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1222    terminated.  */
1223
1224 static void
1225 append_string (const char *str, struct growable *dest)
1226 {
1227   int l = strlen (str);
1228   GROW (dest, l);
1229   memcpy (TAIL (dest), str, l);
1230   TAIL_INCR (dest, l);
1231 }
1232
1233 /* Append CH to DEST.  For example, append_char (0, DEST)
1234    zero-terminates DEST.  */
1235
1236 static void
1237 append_char (char ch, struct growable *dest)
1238 {
1239   GROW (dest, 1);
1240   *TAIL (dest) = ch;
1241   TAIL_INCR (dest, 1);
1242 }
1243
/* Reasons why a character may be unusable in a file name.  The values
   are bit flags and are OR-ed together in filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set.  C is
   converted to unsigned char so that chars with the high bit set
   index the table correctly.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   Note that `|' carries W: it is one of the characters listed as
   unusable on Windows above.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1301
/* FN_PORT_SEP separates the host from a non-standard port number in
   file names, as in "www.xemacs.org:4001/index.html".  It is ':'
   unless file names are restricted for Windows, which cannot handle
   ':' in file names; in that case '+' is used.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless file names are restricted for Windows, which cannot
   handle '?' in file names; in that case '@' is used.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1312
1313 /* Quote path element, characters in [b, e), as file name, and append
1314    the quoted string to DEST.  Each character is quoted as per
1315    file_unsafe_char and the corresponding table.
1316
1317    If ESCAPED_P is non-zero, the path element is considered to be
1318    URL-escaped and will be unescaped prior to inspection.  */
1319
1320 static void
1321 append_uri_pathel (const char *b, const char *e, int escaped_p,
1322                    struct growable *dest)
1323 {
1324   const char *p;
1325   int quoted, outlen;
1326
1327   int mask;
1328   if (opt.restrict_files_os == restrict_unix)
1329     mask = filechr_not_unix;
1330   else
1331     mask = filechr_not_windows;
1332   if (opt.restrict_files_ctrl)
1333     mask |= filechr_control;
1334
1335   /* Copy [b, e) to PATHEL and URL-unescape it. */
1336   if (escaped_p)
1337     {
1338       char *unescaped;
1339       BOUNDED_TO_ALLOCA (b, e, unescaped);
1340       url_unescape (unescaped);
1341       b = unescaped;
1342       e = unescaped + strlen (unescaped);
1343     }
1344
1345   /* Defang ".." when found as component of path.  Remember that path
1346      comes from the URL and might contain malicious input.  */
1347   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1348     {
1349       b = "%2E%2E";
1350       e = b + 6;
1351     }
1352
1353   /* Walk the PATHEL string and check how many characters we'll need
1354      to quote.  */
1355   quoted = 0;
1356   for (p = b; p < e; p++)
1357     if (FILE_CHAR_TEST (*p, mask))
1358       ++quoted;
1359
1360   /* Calculate the length of the output string.  e-b is the input
1361      string length.  Each quoted char introduces two additional
1362      characters in the string, hence 2*quoted.  */
1363   outlen = (e - b) + (2 * quoted);
1364   GROW (dest, outlen);
1365
1366   if (!quoted)
1367     {
1368       /* If there's nothing to quote, we can simply append the string
1369          without processing it again.  */
1370       memcpy (TAIL (dest), b, outlen);
1371     }
1372   else
1373     {
1374       char *q = TAIL (dest);
1375       for (p = b; p < e; p++)
1376         {
1377           if (!FILE_CHAR_TEST (*p, mask))
1378             *q++ = *p;
1379           else
1380             {
1381               unsigned char ch = *p;
1382               *q++ = '%';
1383               *q++ = XNUM_TO_DIGIT (ch >> 4);
1384               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1385             }
1386         }
1387       assert (q - TAIL (dest) == outlen);
1388     }
1389   TAIL_INCR (dest, outlen);
1390 }
1391
1392 /* Append to DEST the directory structure that corresponds the
1393    directory part of URL's path.  For example, if the URL is
1394    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1395
1396    Each path element ("dir1" and "dir2" in the above example) is
1397    examined, url-unescaped, and re-escaped as file name element.
1398
1399    Additionally, it cuts as many directories from the path as
1400    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1401    will produce "bar" for the above example.  For 2 or more, it will
1402    produce "".
1403
1404    Each component of the path is quoted for use as file name.  */
1405
1406 static void
1407 append_dir_structure (const struct url *u, struct growable *dest)
1408 {
1409   char *pathel, *next;
1410   int cut = opt.cut_dirs;
1411
1412   /* Go through the path components, de-URL-quote them, and quote them
1413      (if necessary) as file names.  */
1414
1415   pathel = u->path;
1416   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1417     {
1418       if (cut-- > 0)
1419         continue;
1420       if (pathel == next)
1421         /* Ignore empty pathels.  */
1422         continue;
1423
1424       if (dest->tail)
1425         append_char ('/', dest);
1426       append_uri_pathel (pathel, next, 1, dest);
1427     }
1428 }
1429
1430 /* Return a unique file name that matches the given URL as good as
1431    possible.  Does not create directories on the file system.  */
1432
1433 char *
1434 url_file_name (const struct url *u)
1435 {
1436   struct growable fnres;        /* stands for "file name result" */
1437
1438   const char *u_file, *u_query;
1439   char *fname, *unique;
1440
1441   fnres.base = NULL;
1442   fnres.size = 0;
1443   fnres.tail = 0;
1444
1445   /* Start with the directory prefix, if specified. */
1446   if (opt.dir_prefix)
1447     append_string (opt.dir_prefix, &fnres);
1448
1449   /* If "dirstruct" is turned on (typically the case with -r), add
1450      the host and port (unless those have been turned off) and
1451      directory structure.  */
1452   if (opt.dirstruct)
1453     {
1454       if (opt.protocol_directories)
1455         {
1456           if (fnres.tail)
1457             append_char ('/', &fnres);
1458           append_string (supported_schemes[u->scheme].name, &fnres);
1459         }
1460       if (opt.add_hostdir)
1461         {
1462           if (fnres.tail)
1463             append_char ('/', &fnres);
1464           if (0 != strcmp (u->host, ".."))
1465             append_string (u->host, &fnres);
1466           else
1467             /* Host name can come from the network; malicious DNS may
1468                allow ".." to be resolved, causing us to write to
1469                "../<file>".  Defang such host names.  */
1470             append_string ("%2E%2E", &fnres);
1471           if (u->port != scheme_default_port (u->scheme))
1472             {
1473               char portstr[24];
1474               number_to_string (portstr, u->port);
1475               append_char (FN_PORT_SEP, &fnres);
1476               append_string (portstr, &fnres);
1477             }
1478         }
1479
1480       append_dir_structure (u, &fnres);
1481     }
1482
1483   /* Add the file name. */
1484   if (fnres.tail)
1485     append_char ('/', &fnres);
1486   u_file = *u->file ? u->file : "index.html";
1487   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1488
1489   /* Append "?query" to the file name. */
1490   u_query = u->query && *u->query ? u->query : NULL;
1491   if (u_query)
1492     {
1493       append_char (FN_QUERY_SEP, &fnres);
1494       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1495     }
1496
1497   /* Zero-terminate the file name. */
1498   append_char ('\0', &fnres);
1499
1500   fname = fnres.base;
1501
1502   /* Check the cases in which the unique extensions are not used:
1503      1) Clobbering is turned off (-nc).
1504      2) Retrieval with regetting.
1505      3) Timestamping is used.
1506      4) Hierarchy is built.
1507
1508      The exception is the case when file does exist and is a
1509      directory (see `mkalldirs' for explanation).  */
1510
1511   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1512       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1513     return fname;
1514
1515   unique = unique_name (fname, 1);
1516   if (unique != fname)
1517     xfree (fname);
1518   return unique;
1519 }
1520 \f
/* Resolve the "." and ".." elements of PATH, modifying PATH in place.
   Returns non-zero if PATH was changed and zero if it was left as it
   was.

   The approach resembles the one described in rfc1808 but works in a
   single pass: an element that is just "." is dropped, and ".." backs
   up over the element before it.  A ".." with nothing left to back up
   over is kept literally and protected from later ".." elements.

   URL escapes are not interpreted here.  Callers passing paths from
   URLs have to turn "%2e" and "%2E" into "." beforehand so that the
   dots can be found.  (Wget's URL parser calls reencode_escapes,
   which see.)

   For example, "a/b/c/./../d/.." becomes "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position (the hare) */
  char *dst = path;             /* write position (the tortoise) */
  char *limit = path;           /* DST never backs up past this */
  char *end = path + strlen (path);

  while (src < end)
    {
      /* SRC is at the beginning of a path element here.  */
      int is_dot = (src[0] == '.' && (src[1] == '/' || src[1] == '\0'));
      int is_dotdot = (!is_dot && src[0] == '.' && src[1] == '.'
                       && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (is_dotdot)
        {
          if (dst > limit)
            {
              /* Back DST up to the beginning of the previous path
                 element, or to LIMIT, whichever comes first.  */
              do
                --dst;
              while (dst > limit && dst[-1] != '/');
              src += 3;
              continue;
            }

          /* Nothing to back up over: the "../" is kept literally
             below, and LIMIT moves past it so that a later ".."
             cannot remove it.  */
          limit = dst + 3;
        }

      /* A regular path element (or a literal ".."), taken up to and
         including the following slash.  */
      if (dst == src)
        {
          /* Nothing has been removed so far; the element already
             sits where it belongs, so only advance.  */
          while (src < end && *src != '/')
            ++src;
          if (src < end)
            ++src;
          dst = src;
        }
      else
        {
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  if (dst == src)
    return 0;

  *dst = '\0';
  return 1;
}
1608 \f
1609 /* Return the length of URL's path.  Path is considered to be
1610    terminated by one of '?', ';', '#', or by the end of the
1611    string.  */
1612
1613 static int
1614 path_length (const char *url)
1615 {
1616   const char *q = strpbrk_or_eos (url, "?;#");
1617   return q - url;
1618 }
1619
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  B itself is examined and E is not, as
   the half-open range implies; an empty range yields NULL.  We might
   want to use memrchr (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the first character inspected
     is e[-1] and the last one is b[0].  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1632
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string made of the first SPAN characters of BASE followed by the
   LINKLENGTH characters of LINK.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *joined = (char *) xmalloc (span + linklength + 1);

  if (span)
    memcpy (joined, base, span);
  memcpy (joined + span, link, linklength);
  joined[span + linklength] = '\0';

  return joined;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is deliberately not run through path_simplify, although
   rfc1738 seems to suggest it: 1) it would complicate the code, and
   2) url_parse has to simplify the path anyway, so it would be
   wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK that carries its own scheme stands by itself.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK keeps the location but replaces the query string:
           uri_merge("path",         "?new") -> "path?new"
           uri_merge("path?foo",     "?new") -> "path?new"
           uri_merge("path?foo#bar", "?new") -> "path?new"
           uri_merge("path#foo",     "?new") -> "path?new"  */
      return uri_merge_concat (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK replaces the fragment only:
           uri_merge("path",         "#new") -> "path#new"
           uri_merge("path#foo",     "#new") -> "path#new"
           uri_merge("path?foo",     "#new") -> "path?foo#new"
           uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *hash = strchr (base, '#');
      int baselength;

      if (!hash)
        hash = base + strlen (base);
      baselength = hash - base;
      return uri_merge_concat (base, baselength, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: everything in BASE from its double slash
         on is replaced with LINK.
           uri_merge("foo", "//new/bar")            -> "//new/bar"
           uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
           uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert;

      /* Replace from the first slash if it begins a double slash,
         otherwise from the very beginning.  */
      if (slash && slash[1] == '/')
        start_insert = slash;
      else
        start_insert = base;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: everything in BASE from the first
         slash that follows the host on is replaced with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      const char *slash = memchr (base, '/', end - base);
      int seen_slash_slash = 0;

      /* We want the first slash, but a double slash is skipped
         (once) and the search resumes right after it.  */
      if (slash && slash[1] == '/')
        {
          const char *after = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (after, '/', end - after);
        }

      /* Keep in mind that LINK itself begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace from the start.  */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash.  */
        start_insert = slash;
      else
        /* example: "http://foo" -- append at END.  */
        start_insert = end;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  {
    /* LINK is a relative URL: everything after the last slash of
       BASE (possibly nothing) is replaced with LINK.

       So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
       our result should be "whatever/foo/qux/xyzzy".  */
    int need_explicit_slash = 0;
    int span;
    char *merge;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the slash found belongs to
           "://", so LINK goes after END with a '/' in between.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* example: "whatever/foo/bar" -- LINK goes right after the
           last slash.  */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = uri_merge_concat (base, span, link, linklength);
    if (need_explicit_slash)
      /* The character copied from END is overwritten by the slash.  */
      merge[span - 1] = '/';

    return merge;
  }
}
1823 \f
/* Copy the zero-terminated string S (without its terminator) to P and
   advance P past the copied characters.  P must be a modifiable
   pointer lvalue with enough room behind it.

   S is evaluated exactly once and the temporaries carry names that
   are unlikely to clash, so S may be any expression -- including one
   that mentions a caller variable called `len'.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  int append_len_ = strlen (append_src_);       \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1835
1836 /* Recreate the URL string from the data in URL.
1837
1838    If HIDE is non-zero (as it is when we're calling this on a URL we
1839    plan to print, but not when calling it to canonicalize a URL for
1840    use within the program), password will be hidden.  Unsafe
1841    characters in the URL will be quoted.  */
1842
1843 char *
1844 url_string (const struct url *url, int hide_password)
1845 {
1846   int size;
1847   char *result, *p;
1848   char *quoted_user = NULL, *quoted_passwd = NULL;
1849
1850   int scheme_port  = supported_schemes[url->scheme].default_port;
1851   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1852   int fplen = full_path_length (url);
1853
1854   int brackets_around_host;
1855
1856   assert (scheme_str != NULL);
1857
1858   /* Make sure the user name and password are quoted. */
1859   if (url->user)
1860     {
1861       quoted_user = url_escape_allow_passthrough (url->user);
1862       if (url->passwd)
1863         {
1864           if (hide_password)
1865             quoted_passwd = HIDDEN_PASSWORD;
1866           else
1867             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1868         }
1869     }
1870
1871   /* Numeric IPv6 addresses can contain ':' and need to be quoted with
1872      brackets.  */
1873   brackets_around_host = strchr (url->host, ':') != NULL;
1874
1875   size = (strlen (scheme_str)
1876           + strlen (url->host)
1877           + (brackets_around_host ? 2 : 0)
1878           + fplen
1879           + 1);
1880   if (url->port != scheme_port)
1881     size += 1 + numdigit (url->port);
1882   if (quoted_user)
1883     {
1884       size += 1 + strlen (quoted_user);
1885       if (quoted_passwd)
1886         size += 1 + strlen (quoted_passwd);
1887     }
1888
1889   p = result = xmalloc (size);
1890
1891   APPEND (p, scheme_str);
1892   if (quoted_user)
1893     {
1894       APPEND (p, quoted_user);
1895       if (quoted_passwd)
1896         {
1897           *p++ = ':';
1898           APPEND (p, quoted_passwd);
1899         }
1900       *p++ = '@';
1901     }
1902
1903   if (brackets_around_host)
1904     *p++ = '[';
1905   APPEND (p, url->host);
1906   if (brackets_around_host)
1907     *p++ = ']';
1908   if (url->port != scheme_port)
1909     {
1910       *p++ = ':';
1911       p = number_to_string (p, url->port);
1912     }
1913
1914   full_path_write (url, p);
1915   p += fplen;
1916   *p++ = '\0';
1917
1918   assert (p - result == size);
1919
1920   if (quoted_user && quoted_user != url->user)
1921     xfree (quoted_user);
1922   if (quoted_passwd && !hide_password
1923       && quoted_passwd != url->passwd)
1924     xfree (quoted_passwd);
1925
1926   return result;
1927 }
1928 \f
1929 /* Return non-zero if scheme a is similar to scheme b.
1930
1931    Schemes are similar if they are equal.  If SSL is supported, schemes
1932    are also similar if one is http (SCHEME_HTTP) and the other is https
1933    (SCHEME_HTTPS).  */
1934 int
1935 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1936 {
1937   if (a == b)
1938     return 1;
1939 #ifdef HAVE_SSL
1940   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1941       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1942     return 1;
1943 #endif
1944   return 0;
1945 }
1946 \f
1947 #if 0
1948 /* Debugging and testing support for path_simplify. */
1949
/* Debugger helper: apply path_simplify to a private copy of PATH and
   return that copy, leaving PATH itself untouched.  Handy for calling
   by hand from the debugger.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1959
/* Run path_simplify on a copy of TEST and print a message to stdout
   when the simplified string differs from EXPECTED_RESULT, and
   another when path_simplify's return value differs from
   EXPECTED_CHANGE.  Nothing is printed when both match.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change && expected_change == 1)
    printf ("Expected modification with path_simplify(\"%s\").\n", test);
  else if (changed != expected_change)
    printf ("Expected no modification with path_simplify(\"%s\").\n", test);

  xfree (work);
}
1982
/* Feed a fixed table of paths through run_test.  Each entry gives the
   input path, the string expected after path_simplify, and the value
   path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int modifies;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int idx;

  for (idx = 0; idx < countof (cases); idx++)
    run_test (cases[idx].input, cases[idx].expected, cases[idx].modifies);
}
2023 #endif