sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* Static description of one URL scheme known to this file.  */
struct scheme_data
{
  const char *name;             /* bare scheme name, e.g. "http" */
  const char *leading_string;   /* prefix compared against URLs, e.g. "http://" */
  int default_port;             /* port used when the URL does not give one */
  int enabled;                  /* non-zero unless cleared by scheme_disable */
};
  62
/* Supported schemes.  url_scheme() returns the index of the matching
   entry cast to enum url_scheme, and the other scheme_* helpers index
   this array with an enum url_scheme value, so the entry order
   presumably has to mirror the enumerator order declared in url.h --
   verify before reordering.  The entry whose leading_string is NULL
   ends the search loop in url_scheme().  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  75
  76 /* Forward declarations: */
  77
  78 static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  91    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  92    as recommended by rfc2396, and minus "~", which is very frequently
  93    used (and sometimes unrecognized as %7E by broken servers).
  94
  95    An unsafe character is the one that should be encoded when URLs are
  96    placed in foreign environments.  E.g. space and newline are unsafe
  97    in HTTP contexts because HTTP uses them as separator and line
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
/* Bit flags stored in urlchr_table; a table entry may carry either,
   both, or neither.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

/* Non-zero if the urlchr_table entry for C has any bit of MASK set.
   C is converted to unsigned char before indexing, so a plain
   (possibly signed) char is a valid argument.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Convenience tests for the two individual flags.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Per-byte classification, indexed by the byte value.  The first 128
   entries are laid out eight per row with the corresponding ASCII
   characters named in the trailing comment; the last 128 entries
   (bytes with the high bit set) are all marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
/* The shorthands are only meant for the initializer above.  */
#undef R
#undef U
#undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           /* Do nothing if '%' is not followed by two hex digits. */
 179           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 180             goto copychar;
 181           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 182           h += 2;
 183         }
 184     }
 185   *t = '\0';
 186 }
 187
 188 /* The core of url_escape_* functions.  Escapes the characters that
 189    match the provided mask in urlchr_table.
 190
 191    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 192    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 193    freshly allocated string will be returned in all cases.  */
 194
 195 static char *
 196 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 197 {
 198   const char *p1;
 199   char *p2, *newstr;
 200   int newlen;
 201   int addition = 0;
 202
 203   for (p1 = s; *p1; p1++)
 204     if (urlchr_test (*p1, mask))
 205       addition += 2;            /* Two more characters (hex digits) */
 206
 207   if (!addition)
 208     return allow_passthrough ? (char *)s : xstrdup (s);
 209
 210   newlen = (p1 - s) + addition;
 211   newstr = (char *)xmalloc (newlen + 1);
 212
 213   p1 = s;
 214   p2 = newstr;
 215   while (*p1)
 216     {
 217       /* Quote the characters that match the test mask. */
 218       if (urlchr_test (*p1, mask))
 219         {
 220           unsigned char c = *p1++;
 221           *p2++ = '%';
 222           *p2++ = XNUM_TO_DIGIT (c >> 4);
 223           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 224         }
 225       else
 226         *p2++ = *p1++;
 227     }
 228   assert (p2 - newstr == newlen);
 229   *p2 = '\0';
 230
 231   return newstr;
 232 }
 233
 234 /* URL-escape the unsafe characters (see urlchr_table) in a given
 235    string, returning a freshly allocated string.  */
 236
 237 char *
 238 url_escape (const char *s)
 239 {
 240   return url_escape_1 (s, urlchr_unsafe, 0);
 241 }
 242
 243 /* URL-escape the unsafe characters (see urlchr_table) in a given
 244    string.  If no characters are unsafe, S is returned.  */
 245
 246 static char *
 247 url_escape_allow_passthrough (const char *s)
 248 {
 249   return url_escape_1 (s, urlchr_unsafe, 1);
 250 }
 251 \f
/* What to do with the input at the current position while re-encoding
   a URL: turn "%XX" into the byte it denotes, turn a byte into "%XX",
   or copy one byte unchanged.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 253
 254 /* Decide whether to encode, decode, or pass through the char at P.
 255    This used to be a macro, but it got a little too convoluted.  */
 256 static inline enum copy_method
 257 decide_copy_method (const char *p)
 258 {
 259   if (*p == '%')
 260     {
 261       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 262         {
 263           /* %xx sequence: decode it, unless it would decode to an
 264              unsafe or a reserved char; in that case, leave it as
 265              is. */
 266           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 267           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 268             return CM_PASSTHROUGH;
 269           else
 270             return CM_DECODE;
 271         }
 272       else
 273         /* Garbled %.. sequence: encode `%'. */
 274         return CM_ENCODE;
 275     }
 276   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 277     return CM_ENCODE;
 278   else
 279     return CM_PASSTHROUGH;
 280 }
 281
 282 /* Translate a %-escaped (but possibly non-conformant) input string S
 283    into a %-escaped (and conformant) output string.  If no characters
 284    are encoded or decoded, return the same string S; otherwise, return
 285    a freshly allocated string with the new contents.
 286
 287    After a URL has been run through this function, the protocols that
 288    use `%' as the quote character can use the resulting string as-is,
 289    while those that don't call url_unescape() to get to the intended
 290    data.  This function is also stable: after an input string is
 291    transformed the first time, all further transformations of the
 292    result yield the same result string.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 297    space character would mess up the HTTP request, it needs to be
 298    quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It appears that the unsafe chars need to be quoted, for example
 303    with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leaves us in the conclusion
 308    that in that case Wget should not call url_escape, but leave the
 309    `%20' as is.
 310
 311    And what if the requested URI is `abc%20 def'?  If we call
 312    url_escape, we end up with `/abc%2520%20def', which is almost
 313    certainly not intended.  If we don't call url_escape, we are left
 314    with the embedded space and cannot complete the request.  What the
 315    user meant was for Wget to request `/abc%20%20def', and this is
 316    where reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses an algorithm proposed by Anon Sricharoenchai:
 331
 332    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 333       hexdigits.
 334
 335    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 336       "+".
 337
 338    ...except that this code conflates the two steps, and decides
 339    whether to encode, decode, or pass through each character in turn.
 340    The function still uses two passes, but their logic is the same --
 341    the first pass exists merely for the sake of allocation.  Another
 342    small difference is that we include `+' to URL_RESERVED.
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369   int decode_count = 0;
 370
 371   /* First, pass through the string to see if there's anything to do,
 372      and to calculate the new length.  */
 373   for (p1 = s; *p1; p1++)
 374     {
 375       switch (decide_copy_method (p1))
 376         {
 377         case CM_ENCODE:
 378           ++encode_count;
 379           break;
 380         case CM_DECODE:
 381           ++decode_count;
 382           break;
 383         case CM_PASSTHROUGH:
 384           break;
 385         }
 386     }
 387
 388   if (!encode_count && !decode_count)
 389     /* The string is good as it is. */
 390     return (char *)s;           /* C const model sucks. */
 391
 392   oldlen = p1 - s;
 393   /* Each encoding adds two characters (hex digits), while each
 394      decoding removes two characters.  */
 395   newlen = oldlen + 2 * (encode_count - decode_count);
 396   newstr = xmalloc (newlen + 1);
 397
 398   p1 = s;
 399   p2 = newstr;
 400
 401   while (*p1)
 402     {
 403       switch (decide_copy_method (p1))
 404         {
 405         case CM_ENCODE:
 406           {
 407             unsigned char c = *p1++;
 408             *p2++ = '%';
 409             *p2++ = XNUM_TO_DIGIT (c >> 4);
 410             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 411           }
 412           break;
 413         case CM_DECODE:
 414           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 415           p1 += 3;              /* skip %xx */
 416           break;
 417         case CM_PASSTHROUGH:
 418           *p2++ = *p1++;
 419         }
 420     }
 421   *p2 = '\0';
 422   assert (p2 - newstr == newlen);
 423   return newstr;
 424 }
 425 \f
 426 /* Returns the scheme type if the scheme is supported, or
 427    SCHEME_INVALID if not.  */
 428
 429 enum url_scheme
 430 url_scheme (const char *url)
 431 {
 432   int i;
 433
 434   for (i = 0; supported_schemes[i].leading_string; i++)
 435     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 436                           strlen (supported_schemes[i].leading_string)))
 437       {
 438         if (supported_schemes[i].enabled)
 439           return (enum url_scheme) i;
 440         else
 441           return SCHEME_INVALID;
 442       }
 443
 444   return SCHEME_INVALID;
 445 }
 446
 447 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 448
 449 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 450    currently implemented, it returns true if URL begins with
 451    [-+a-zA-Z0-9]+: .  */
 452
 453 int
 454 url_has_scheme (const char *url)
 455 {
 456   const char *p = url;
 457
 458   /* The first char must be a scheme char. */
 459   if (!*p || !SCHEME_CHAR (*p))
 460     return 0;
 461   ++p;
 462   /* Followed by 0 or more scheme chars. */
 463   while (*p && SCHEME_CHAR (*p))
 464     ++p;
 465   /* Terminated by ':'. */
 466   return *p == ':';
 467 }
 468
 469 int
 470 scheme_default_port (enum url_scheme scheme)
 471 {
 472   return supported_schemes[scheme].default_port;
 473 }
 474
 475 void
 476 scheme_disable (enum url_scheme scheme)
 477 {
 478   supported_schemes[scheme].enabled = 0;
 479 }
 480
 481 /* Skip the username and password, if present in the URL.  The
 482    function should *not* be called with the complete URL, but with the
 483    portion after the scheme.
 484
 485    If no username and password are found, return URL.  */
 486
 487 static const char *
 488 url_skip_credentials (const char *url)
 489 {
 490   /* Look for '@' that comes before terminators, such as '/', '?',
 491      '#', or ';'.  */
 492   const char *p = (const char *)strpbrk (url, "@/?#;");
 493   if (!p || *p != '@')
 494     return url;
 495   return p + 1;
 496 }
 497
 498 /* Parse credentials contained in [BEG, END).  The region is expected
 499    to have come from a URL and is unescaped.  */
 500
 501 static int
 502 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 503 {
 504   char *colon;
 505   const char *userend;
 506
 507   if (beg == end)
 508     return 0;                   /* empty user name */
 509
 510   colon = memchr (beg, ':', end - beg);
 511   if (colon == beg)
 512     return 0;                   /* again empty user name */
 513
 514   if (colon)
 515     {
 516       *passwd = strdupdelim (colon + 1, end);
 517       userend = colon;
 518       url_unescape (*passwd);
 519     }
 520   else
 521     {
 522       *passwd = NULL;
 523       userend = end;
 524     }
 525   *user = strdupdelim (beg, userend);
 526   url_unescape (*user);
 527   return 1;
 528 }
 529
 530 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 531    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 532
 533    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 534    www.foo.com[:port]            -> http://www.foo.com[:port]
 535
 536    FTP shorthands look like this:
 537
 538    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 539    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 540
 541    If the URL needs not or cannot be rewritten, return NULL.  */
 542
 543 char *
 544 rewrite_shorthand_url (const char *url)
 545 {
 546   const char *p;
 547
 548   if (url_scheme (url) != SCHEME_INVALID)
 549     return NULL;
 550
 551   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 552      latter Netscape.  */
 553   for (p = url; *p && *p != ':' && *p != '/'; p++)
 554     ;
 555
 556   if (p == url)
 557     return NULL;
 558
 559   if (*p == ':')
 560     {
 561       const char *pp;
 562       char *res;
 563       /* If the characters after the colon and before the next slash
 564          or end of string are all digits, it's HTTP.  */
 565       int digits = 0;
 566       for (pp = p + 1; ISDIGIT (*pp); pp++)
 567         ++digits;
 568       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 569         goto http;
 570
 571       /* Prepend "ftp://" to the entire URL... */
 572       res = xmalloc (6 + strlen (url) + 1);
 573       sprintf (res, "ftp://%s", url);
 574       /* ...and replace ':' with '/'. */
 575       res[6 + (p - url)] = '/';
 576       return res;
 577     }
 578   else
 579     {
 580       char *res;
 581     http:
 582       /* Just prepend "http://" to what we have. */
 583       res = xmalloc (7 + strlen (url) + 1);
 584       sprintf (res, "http://%s", url);
 585       return res;
 586     }
 587 }
 588 \f
 589 static void split_path PARAMS ((const char *, char **, char **));
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.
 594
 595    Although I normally balk at Gcc-specific optimizations, it probably
 596    makes sense here: glibc has optimizations that detect strpbrk being
 597    called with literal string as ACCEPT and inline the search.  That
 598    optimization is defeated if strpbrk is hidden within the call to
 599    another function.  (And no, making strpbrk_or_eos inline doesn't
 600    help because the check for literal accept is in the
 601    preprocessor.)  */
 602
 603 #ifdef __GNUC__
 604
 605 #define strpbrk_or_eos(s, accept) ({            \
 606   char *SOE_p = strpbrk (s, accept);            \
 607   if (!SOE_p)                                   \
 608     SOE_p = (char *)s + strlen (s);             \
 609   SOE_p;                                        \
 610 })
 611
 612 #else  /* not __GNUC__ */
 613
 614 static char *
 615 strpbrk_or_eos (const char *s, const char *accept)
 616 {
 617   char *p = strpbrk (s, accept);
 618   if (!p)
 619     p = (char *)s + strlen (s);
 620   return p;
 621 }
 622 #endif
 623
 624 /* Turn STR into lowercase; return non-zero if a character was
 625    actually changed. */
 626
 627 static int
 628 lowercase_str (char *str)
 629 {
 630   int change = 0;
 631   for (; *str; str++)
 632     if (ISUPPER (*str))
 633       {
 634         change = 1;
 635         *str = TOLOWER (*str);
 636       }
 637   return change;
 638 }
 639
 640 static const char *parse_errors[] = {
 641 #define PE_NO_ERROR                     0
 642   N_("No error"),
 643 #define PE_UNSUPPORTED_SCHEME           1
 644   N_("Unsupported scheme"),
 645 #define PE_EMPTY_HOST                   2
 646   N_("Empty host"),
 647 #define PE_BAD_PORT_NUMBER              3
 648   N_("Bad port number"),
 649 #define PE_INVALID_USER_NAME            4
 650   N_("Invalid user name"),
 651 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 652   N_("Unterminated IPv6 numeric address"),
 653 #define PE_IPV6_NOT_SUPPORTED           6
 654   N_("IPv6 addresses not supported"),
 655 #define PE_INVALID_IPV6_ADDRESS         7
 656   N_("Invalid IPv6 numeric address")
 657 };
 658
 659 /* Parse a URL.
 660
 661    Return a new struct url if successful, NULL on error.  In case of
 662    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 663    error code. */
 664 struct url *
 665 url_parse (const char *url, int *error)
 666 {
 667   struct url *u;
 668   const char *p;
 669   int path_modified, host_modified;
 670
 671   enum url_scheme scheme;
 672
 673   const char *uname_b,     *uname_e;
 674   const char *host_b,      *host_e;
 675   const char *path_b,      *path_e;
 676   const char *params_b,    *params_e;
 677   const char *query_b,     *query_e;
 678   const char *fragment_b,  *fragment_e;
 679
 680   int port;
 681   char *user = NULL, *passwd = NULL;
 682
 683   char *url_encoded = NULL;
 684
 685   int error_code;
 686
 687   scheme = url_scheme (url);
 688   if (scheme == SCHEME_INVALID)
 689     {
 690       error_code = PE_UNSUPPORTED_SCHEME;
 691       goto err;
 692     }
 693
 694   url_encoded = reencode_escapes (url);
 695   p = url_encoded;
 696
 697   p += strlen (supported_schemes[scheme].leading_string);
 698   uname_b = p;
 699   p = url_skip_credentials (p);
 700   uname_e = p;
 701
 702   /* scheme://user:pass@host[:port]... */
 703   /*                    ^              */
 704
 705   /* We attempt to break down the URL into the components path,
 706      params, query, and fragment.  They are ordered like this:
 707
 708        scheme://host[:port][/path][;params][?query][#fragment]  */
 709
 710   params_b   = params_e   = NULL;
 711   query_b    = query_e    = NULL;
 712   fragment_b = fragment_e = NULL;
 713
 714   host_b = p;
 715
 716   if (*p == '[')
 717     {
 718       /* Handle IPv6 address inside square brackets.  Ideally we'd
 719          just look for the terminating ']', but rfc2732 mandates
 720          rejecting invalid IPv6 addresses.  */
 721
 722       /* The address begins after '['. */
 723       host_b = p + 1;
 724       host_e = strchr (host_b, ']');
 725
 726       if (!host_e)
 727         {
 728           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 729           goto err;
 730         }
 731
 732 #ifdef ENABLE_IPV6
 733       /* Check if the IPv6 address is valid. */
 734       if (!is_valid_ipv6_address(host_b, host_e))
 735         {
 736           error_code = PE_INVALID_IPV6_ADDRESS;
 737           goto err;
 738         }
 739
 740       /* Continue parsing after the closing ']'. */
 741       p = host_e + 1;
 742 #else
 743       error_code = PE_IPV6_NOT_SUPPORTED;
 744       goto err;
 745 #endif
 746     }
 747   else
 748     {
 749       p = strpbrk_or_eos (p, ":/;?#");
 750       host_e = p;
 751     }
 752
 753   if (host_b == host_e)
 754     {
 755       error_code = PE_EMPTY_HOST;
 756       goto err;
 757     }
 758
 759   port = scheme_default_port (scheme);
 760   if (*p == ':')
 761     {
 762       const char *port_b, *port_e, *pp;
 763
 764       /* scheme://host:port/tralala */
 765       /*              ^             */
 766       ++p;
 767       port_b = p;
 768       p = strpbrk_or_eos (p, "/;?#");
 769       port_e = p;
 770
 771       /* Allow empty port, as per rfc2396. */
 772       if (port_b != port_e)
 773         {
 774           for (port = 0, pp = port_b; pp < port_e; pp++)
 775             {
 776               if (!ISDIGIT (*pp))
 777                 {
 778                   /* http://host:12randomgarbage/blah */
 779                   /*               ^                  */
 780                   error_code = PE_BAD_PORT_NUMBER;
 781                   goto err;
 782                 }
 783               port = 10 * port + (*pp - '0');
 784               /* Check for too large port numbers here, before we have
 785                  a chance to overflow on bogus port values.  */
 786               if (port > 65535)
 787                 {
 788                   error_code = PE_BAD_PORT_NUMBER;
 789                   goto err;
 790                 }
 791             }
 792         }
 793     }
 794
 795   if (*p == '/')
 796     {
 797       ++p;
 798       path_b = p;
 799       p = strpbrk_or_eos (p, ";?#");
 800       path_e = p;
 801     }
 802   else
 803     {
 804       /* Path is not allowed not to exist. */
 805       path_b = path_e = p;
 806     }
 807
 808   if (*p == ';')
 809     {
 810       ++p;
 811       params_b = p;
 812       p = strpbrk_or_eos (p, "?#");
 813       params_e = p;
 814     }
 815   if (*p == '?')
 816     {
 817       ++p;
 818       query_b = p;
 819       p = strpbrk_or_eos (p, "#");
 820       query_e = p;
 821
 822       /* Hack that allows users to use '?' (a wildcard character) in
 823          FTP URLs without it being interpreted as a query string
 824          delimiter.  */
 825       if (scheme == SCHEME_FTP)
 826         {
 827           query_b = query_e = NULL;
 828           path_e = p;
 829         }
 830     }
 831   if (*p == '#')
 832     {
 833       ++p;
 834       fragment_b = p;
 835       p += strlen (p);
 836       fragment_e = p;
 837     }
 838   assert (*p == 0);
 839
 840   if (uname_b != uname_e)
 841     {
 842       /* http://user:pass@host */
 843       /*        ^         ^    */
 844       /*     uname_b   uname_e */
 845       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 846         {
 847           error_code = PE_INVALID_USER_NAME;
 848           goto err;
 849         }
 850     }
 851
 852   u = xnew0 (struct url);
 853   u->scheme = scheme;
 854   u->host   = strdupdelim (host_b, host_e);
 855   u->port   = port;
 856   u->user   = user;
 857   u->passwd = passwd;
 858
 859   u->path = strdupdelim (path_b, path_e);
 860   path_modified = path_simplify (u->path);
 861   split_path (u->path, &u->dir, &u->file);
 862
 863   host_modified = lowercase_str (u->host);
 864
 865   /* Decode %HH sequences in host name.  This is important not so much
 866      to support %HH sequences, but to support binary characters (which
 867      will have been converted to %HH by reencode_escapes).  */
 868   if (strchr (u->host, '%'))
 869     {
 870       url_unescape (u->host);
 871       host_modified = 1;
 872     }
 873
 874   if (params_b)
 875     u->params = strdupdelim (params_b, params_e);
 876   if (query_b)
 877     u->query = strdupdelim (query_b, query_e);
 878   if (fragment_b)
 879     u->fragment = strdupdelim (fragment_b, fragment_e);
 880
 881   if (path_modified || u->fragment || host_modified || path_b == path_e)
 882     {
 883       /* If we suspect that a transformation has rendered what
 884          url_string might return different from URL_ENCODED, rebuild
 885          u->url using url_string.  */
 886       u->url = url_string (u, 0);
 887
 888       if (url_encoded != url)
 889         xfree ((char *) url_encoded);
 890     }
 891   else
 892     {
 893       if (url_encoded == url)
 894         u->url = xstrdup (url);
 895       else
 896         u->url = url_encoded;
 897     }
 898   url_encoded = NULL;
 899
 900   return u;
 901
 902  err:
 903   /* Cleanup in case of error: */
 904   if (url_encoded && url_encoded != url)
 905     xfree (url_encoded);
 906
 907   /* Transmit the error code to the caller, if the caller wants to
 908      know.  */
 909   if (error)
 910     *error = error_code;
 911   return NULL;
 912 }
 913
 914 /* Return the error message string from ERROR_CODE, which should have
 915    been retrieved from url_parse.  The error message is translated.  */
 916
 917 const char *
 918 url_error (int error_code)
 919 {
 920   assert (error_code >= 0 && error_code < countof (parse_errors));
 921   return _(parse_errors[error_code]);
 922 }
 923
 924 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 925    expected to be URL-escaped.
 926
 927    The path is split into directory (the part up to the last slash)
 928    and file (the part after the last slash), which are subsequently
 929    unescaped.  Examples:
 930
 931    PATH                 DIR           FILE
 932    "foo/bar/baz"        "foo/bar"     "baz"
 933    "foo/bar/"           "foo/bar"     ""
 934    "foo"                ""            "foo"
 935    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 936
 937    DIR and FILE are freshly allocated.  */
 938
 939 static void
 940 split_path (const char *path, char **dir, char **file)
 941 {
 942   char *last_slash = strrchr (path, '/');
 943   if (!last_slash)
 944     {
 945       *dir = xstrdup ("");
 946       *file = xstrdup (path);
 947     }
 948   else
 949     {
 950       *dir = strdupdelim (path, last_slash);
 951       *file = xstrdup (last_slash + 1);
 952     }
 953   url_unescape (*dir);
 954   url_unescape (*file);
 955 }
 956
 957 /* Note: URL's "full path" is the path with the query string and
 958    params appended.  The "fragment" (#foo) is intentionally ignored,
 959    but that might be changed.  For example, if the original URL was
 960    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 961    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 962
 963 /* Return the length of the full path, without the terminating
 964    zero.  */
 965
 966 static int
 967 full_path_length (const struct url *url)
 968 {
 969   int len = 0;
 970
 971 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 972
 973   FROB (path);
 974   FROB (params);
 975   FROB (query);
 976
 977 #undef FROB
 978
 979   return len;
 980 }
 981
 982 /* Write out the full path. */
 983
 984 static void
 985 full_path_write (const struct url *url, char *where)
 986 {
 987 #define FROB(el, chr) do {                      \
 988   char *f_el = url->el;                         \
 989   if (f_el) {                                   \
 990     int l = strlen (f_el);                      \
 991     *where++ = chr;                             \
 992     memcpy (where, f_el, l);                    \
 993     where += l;                                 \
 994   }                                             \
 995 } while (0)
 996
 997   FROB (path, '/');
 998   FROB (params, ';');
 999   FROB (query, '?');
1000
1001 #undef FROB
1002 }
1003
1004 /* Public function for getting the "full path".  E.g. if u->path is
1005    "foo/bar" and u->query is "param=value", full_path will be
1006    "/foo/bar?param=value". */
1007
char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *buf = (char *) xmalloc (len + 1);

  /* full_path_write emits exactly LEN bytes and no terminator, so
     the string is terminated here.  The caller owns the result.  */
  full_path_write (url, buf);
  buf[len] = '\0';

  return buf;
}
1019
/* Replace every occurrence of the escape sequence for CHR ("%XY",
   where X and Y are the digits XNUM_TO_DIGIT produces for the high
   and low nibble of CHR) in STR with CHR itself.  All other escapes
   are left untouched.  Used to selectively undo the escaping of
   certain characters, such as "/" and ":".

   STR is modified in place and can only get shorter.  Nothing is
   returned.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  char *src = str;              /* read position */
  char *dst = str;              /* write position, never ahead of SRC */

  while (*src)
    {
      /* The short-circuit keeps us from reading past the terminating
         zero: src[2] is examined only when src[1] matched HI.  */
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
        {
          *dst++ = chr;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
1043
1044 /* Escape unsafe and reserved characters, except for the slash
1045    characters.  */
1046
1047 static char *
1048 url_escape_dir (const char *dir)
1049 {
1050   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1051   if (newdir == dir)
1052     return (char *)dir;
1053
1054   unescape_single_char (newdir, '/');
1055   return newdir;
1056 }
1057
1058 /* Sync u->path and u->url with u->dir and u->file.  Called after
1059    u->file or u->dir have been changed, typically by the FTP code.  */
1060
1061 static void
1062 sync_path (struct url *u)
1063 {
1064   char *newpath, *efile, *edir;
1065
1066   xfree (u->path);
1067
1068   /* u->dir and u->file are not escaped.  URL-escape them before
1069      reassembling them into u->path.  That way, if they contain
1070      separators like '?' or even if u->file contains slashes, the
1071      path will be correctly assembled.  (u->file can contain slashes
1072      if the URL specifies it with %2f, or if an FTP server returns
1073      it.)  */
1074   edir = url_escape_dir (u->dir);
1075   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1076
1077   if (!*edir)
1078     newpath = xstrdup (efile);
1079   else
1080     {
1081       int dirlen = strlen (edir);
1082       int filelen = strlen (efile);
1083
1084       /* Copy "DIR/FILE" to newpath. */
1085       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1086       memcpy (p, edir, dirlen);
1087       p += dirlen;
1088       *p++ = '/';
1089       memcpy (p, efile, filelen);
1090       p += filelen;
1091       *p++ = '\0';
1092     }
1093
1094   u->path = newpath;
1095
1096   if (edir != u->dir)
1097     xfree (edir);
1098   if (efile != u->file)
1099     xfree (efile);
1100
1101   /* Regenerate u->url as well.  */
1102   xfree (u->url);
1103   u->url = url_string (u, 0);
1104 }
1105
1106 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1107    This way we can sync u->path and u->url when they get changed.  */
1108
1109 void
1110 url_set_dir (struct url *url, const char *newdir)
1111 {
1112   xfree (url->dir);
1113   url->dir = xstrdup (newdir);
1114   sync_path (url);
1115 }
1116
1117 void
1118 url_set_file (struct url *url, const char *newfile)
1119 {
1120   xfree (url->file);
1121   url->file = xstrdup (newfile);
1122   sync_path (url);
1123 }
1124
1125 void
1126 url_free (struct url *url)
1127 {
1128   xfree (url->host);
1129   xfree (url->path);
1130   xfree (url->url);
1131
1132   xfree_null (url->params);
1133   xfree_null (url->query);
1134   xfree_null (url->fragment);
1135   xfree_null (url->user);
1136   xfree_null (url->passwd);
1137
1138   xfree (url->dir);
1139   xfree (url->file);
1140
1141   xfree (url);
1142 }
1143 \f
1144 /* Create all the necessary directories for PATH (a file).  Calls
1145    mkdirhier() internally.  */
1146 int
1147 mkalldirs (const char *path)
1148 {
1149   const char *p;
1150   char *t;
1151   struct_stat st;
1152   int res;
1153
1154   p = path + strlen (path);
1155   for (; *p != '/' && p != path; p--)
1156     ;
1157
1158   /* Don't create if it's just a file.  */
1159   if ((p == path) && (*p != '/'))
1160     return 0;
1161   t = strdupdelim (path, p);
1162
1163   /* Check whether the directory exists.  */
1164   if ((stat (t, &st) == 0))
1165     {
1166       if (S_ISDIR (st.st_mode))
1167         {
1168           xfree (t);
1169           return 0;
1170         }
1171       else
1172         {
1173           /* If the dir exists as a file name, remove it first.  This
1174              is *only* for Wget to work with buggy old CERN http
1175              servers.  Here is the scenario: When Wget tries to
1176              retrieve a directory without a slash, e.g.
1177              http://foo/bar (bar being a directory), CERN server will
1178              not redirect it too http://foo/bar/ -- it will generate a
1179              directory listing containing links to bar/file1,
1180              bar/file2, etc.  Wget will lose because it saves this
1181              HTML listing to a file `bar', so it cannot create the
1182              directory.  To work around this, if the file of the same
1183              name exists, we just remove it and create the directory
1184              anyway.  */
1185           DEBUGP (("Removing %s because of directory danger!\n", t));
1186           unlink (t);
1187         }
1188     }
1189   res = make_directory (t);
1190   if (res != 0)
1191     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1192   xfree (t);
1193   return res;
1194 }
1195 \f
1196 /* Functions for constructing the file name out of URL components.  */
1197
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* size variable handed to DO_REALLOC */
  int tail;                     /* offset in BASE of the next append */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   G is evaluated exactly once.  Both arguments are parenthesized so
   that arbitrary expressions may be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1226
1227 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1228    terminated.  */
1229
1230 static void
1231 append_string (const char *str, struct growable *dest)
1232 {
1233   int l = strlen (str);
1234   GROW (dest, l);
1235   memcpy (TAIL (dest), str, l);
1236   TAIL_INCR (dest, l);
1237 }
1238
1239 /* Append CH to DEST.  For example, append_char (0, DEST)
1240    zero-terminates DEST.  */
1241
1242 static void
1243 append_char (char ch, struct growable *dest)
1244 {
1245   GROW (dest, 1);
1246   *TAIL (dest) = ch;
1247   TAIL_INCR (dest, 1);
1248 }
1249
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the MASK bits set in
   filechr_table.  C is converted to unsigned char so that bytes above
   127 index the table correctly even where char is signed.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is marked as unusable on Windows, in line with the list in the
   enum above, and DEL (127) is marked as a control character.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1307
1308 /* FN_PORT_SEP is the separator between host and port in file names
1309    for non-standard port numbers.  On Unix this is normally ':', as in
1310    "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
1311    because Windows can't handle ':' in file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')
1313
1314 /* FN_QUERY_SEP is the separator between the file name and the URL
1315    query, normally '?'.  Since Windows cannot handle '?' as part of
1316    file name, we use '@' instead there.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1318
1319 /* Quote path element, characters in [b, e), as file name, and append
1320    the quoted string to DEST.  Each character is quoted as per
1321    file_unsafe_char and the corresponding table.
1322
1323    If ESCAPED_P is non-zero, the path element is considered to be
1324    URL-escaped and will be unescaped prior to inspection.  */
1325
1326 static void
1327 append_uri_pathel (const char *b, const char *e, int escaped_p,
1328                    struct growable *dest)
1329 {
1330   const char *p;
1331   int quoted, outlen;
1332
1333   int mask;
1334   if (opt.restrict_files_os == restrict_unix)
1335     mask = filechr_not_unix;
1336   else
1337     mask = filechr_not_windows;
1338   if (opt.restrict_files_ctrl)
1339     mask |= filechr_control;
1340
1341   /* Copy [b, e) to PATHEL and URL-unescape it. */
1342   if (escaped_p)
1343     {
1344       char *unescaped;
1345       BOUNDED_TO_ALLOCA (b, e, unescaped);
1346       url_unescape (unescaped);
1347       b = unescaped;
1348       e = unescaped + strlen (unescaped);
1349     }
1350
1351   /* Defang ".." when found as component of path.  Remember that path
1352      comes from the URL and might contain malicious input.  */
1353   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1354     {
1355       b = "%2E%2E";
1356       e = b + 6;
1357     }
1358
1359   /* Walk the PATHEL string and check how many characters we'll need
1360      to quote.  */
1361   quoted = 0;
1362   for (p = b; p < e; p++)
1363     if (FILE_CHAR_TEST (*p, mask))
1364       ++quoted;
1365
1366   /* Calculate the length of the output string.  e-b is the input
1367      string length.  Each quoted char introduces two additional
1368      characters in the string, hence 2*quoted.  */
1369   outlen = (e - b) + (2 * quoted);
1370   GROW (dest, outlen);
1371
1372   if (!quoted)
1373     {
1374       /* If there's nothing to quote, we can simply append the string
1375          without processing it again.  */
1376       memcpy (TAIL (dest), b, outlen);
1377     }
1378   else
1379     {
1380       char *q = TAIL (dest);
1381       for (p = b; p < e; p++)
1382         {
1383           if (!FILE_CHAR_TEST (*p, mask))
1384             *q++ = *p;
1385           else
1386             {
1387               unsigned char ch = *p;
1388               *q++ = '%';
1389               *q++ = XNUM_TO_DIGIT (ch >> 4);
1390               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1391             }
1392         }
1393       assert (q - TAIL (dest) == outlen);
1394     }
1395   TAIL_INCR (dest, outlen);
1396 }
1397
1398 /* Append to DEST the directory structure that corresponds the
1399    directory part of URL's path.  For example, if the URL is
1400    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1401
1402    Each path element ("dir1" and "dir2" in the above example) is
1403    examined, url-unescaped, and re-escaped as file name element.
1404
1405    Additionally, it cuts as many directories from the path as
1406    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1407    will produce "bar" for the above example.  For 2 or more, it will
1408    produce "".
1409
1410    Each component of the path is quoted for use as file name.  */
1411
1412 static void
1413 append_dir_structure (const struct url *u, struct growable *dest)
1414 {
1415   char *pathel, *next;
1416   int cut = opt.cut_dirs;
1417
1418   /* Go through the path components, de-URL-quote them, and quote them
1419      (if necessary) as file names.  */
1420
1421   pathel = u->path;
1422   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1423     {
1424       if (cut-- > 0)
1425         continue;
1426       if (pathel == next)
1427         /* Ignore empty pathels.  */
1428         continue;
1429
1430       if (dest->tail)
1431         append_char ('/', dest);
1432       append_uri_pathel (pathel, next, 1, dest);
1433     }
1434 }
1435
1436 /* Return a unique file name that matches the given URL as good as
1437    possible.  Does not create directories on the file system.  */
1438
1439 char *
1440 url_file_name (const struct url *u)
1441 {
1442   struct growable fnres;        /* stands for "file name result" */
1443
1444   const char *u_file, *u_query;
1445   char *fname, *unique;
1446
1447   fnres.base = NULL;
1448   fnres.size = 0;
1449   fnres.tail = 0;
1450
1451   /* Start with the directory prefix, if specified. */
1452   if (opt.dir_prefix)
1453     append_string (opt.dir_prefix, &fnres);
1454
1455   /* If "dirstruct" is turned on (typically the case with -r), add
1456      the host and port (unless those have been turned off) and
1457      directory structure.  */
1458   if (opt.dirstruct)
1459     {
1460       if (opt.protocol_directories)
1461         {
1462           if (fnres.tail)
1463             append_char ('/', &fnres);
1464           append_string (supported_schemes[u->scheme].name, &fnres);
1465         }
1466       if (opt.add_hostdir)
1467         {
1468           if (fnres.tail)
1469             append_char ('/', &fnres);
1470           if (0 != strcmp (u->host, ".."))
1471             append_string (u->host, &fnres);
1472           else
1473             /* Host name can come from the network; malicious DNS may
1474                allow ".." to be resolved, causing us to write to
1475                "../<file>".  Defang such host names.  */
1476             append_string ("%2E%2E", &fnres);
1477           if (u->port != scheme_default_port (u->scheme))
1478             {
1479               char portstr[24];
1480               number_to_string (portstr, u->port);
1481               append_char (FN_PORT_SEP, &fnres);
1482               append_string (portstr, &fnres);
1483             }
1484         }
1485
1486       append_dir_structure (u, &fnres);
1487     }
1488
1489   /* Add the file name. */
1490   if (fnres.tail)
1491     append_char ('/', &fnres);
1492   u_file = *u->file ? u->file : "index.html";
1493   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1494
1495   /* Append "?query" to the file name. */
1496   u_query = u->query && *u->query ? u->query : NULL;
1497   if (u_query)
1498     {
1499       append_char (FN_QUERY_SEP, &fnres);
1500       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1501     }
1502
1503   /* Zero-terminate the file name. */
1504   append_char ('\0', &fnres);
1505
1506   fname = fnres.base;
1507
1508   /* Check the cases in which the unique extensions are not used:
1509      1) Clobbering is turned off (-nc).
1510      2) Retrieval with regetting.
1511      3) Timestamping is used.
1512      4) Hierarchy is built.
1513
1514      The exception is the case when file does exist and is a
1515      directory (see `mkalldirs' for explanation).  */
1516
1517   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1518       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1519     return fname;
1520
1521   unique = unique_name (fname, 1);
1522   if (unique != fname)
1523     xfree (fname);
1524   return unique;
1525 }
1526 \f
1527 /* Resolve "." and ".." elements of PATH by destructively modifying
1528    PATH and return non-zero if PATH has been modified, zero otherwise.
1529
1530    The algorithm is in spirit similar to the one described in rfc1808,
1531    although implemented differently, in one pass.  To recap, path
1532    elements containing only "." are removed, and ".." is taken to mean
1533    "back up one element".  Single leading and trailing slashes are
1534    preserved.
1535
1536    This function does not handle URL escapes explicitly.  If you're
1537    passing paths from URLs, make sure to unquote "%2e" and "%2E" to
1538    ".", so that this function can find the dots.  (Wget's URL parser
1539    calls reencode_escapes, which see.)
1540
1541    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1542    test examples are provided below.  If you change anything in this
1543    function, run test_path_simplify to make sure you haven't broken a
1544    test case.  */
1545
static int
path_simplify (char *path)
{
  char *src = path;             /* read position ("hare") */
  char *dst = path;             /* write position ("tortoise") */
  char *floor = path;           /* DST may not retreat below this */
  char *const limit = path + strlen (path);

  while (src < limit)
    {
      /* SRC is at the beginning of a path element here.  */
      const int is_dot
        = src[0] == '.' && (src[1] == '/' || src[1] == '\0');
      const int is_dotdot
        = (src[0] == '.' && src[1] == '.'
           && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Ignore "./".  */
          src += 2;
          continue;
        }

      if (is_dotdot && dst > floor)
        {
          /* Handle "../" by retreating DST to the beginning of the
             previous path element, or to FLOOR, whichever comes
             first.  */
          do
            --dst;
          while (dst > floor && dst[-1] != '/');
          src += 3;
          continue;
        }

      if (is_dotdot)
        /* There is nothing left to back over.  Keep this "../"
           literally and raise FLOOR, so that a later ".." does not
           remove it.  */
        floor = dst + 3;

      /* Transfer one path element, including its trailing slash if
         there is one.  While nothing has been dropped yet (DST equals
         SRC) the bytes are already in place and nothing is
         written.  */
      while (src < limit && *src != '/')
        {
          if (dst != src)
            *dst = *src;
          dst++, src++;
        }
      if (src < limit)
        {
          if (dst != src)
            *dst = *src;
          dst++, src++;
        }
    }

  /* DST differs from SRC exactly when something was removed; only
     then does the string need a new terminator.  */
  if (dst != src)
    *dst = '\0';

  return dst != src;
}
1614 \f
1615 /* Return the length of URL's path.  Path is considered to be
1616    terminated by one of '?', ';', '#', or by the end of the
1617    string.  */
1618
static int
path_length (const char *url)
{
  /* The path ends at the first '?', ';' or '#', or at the end of the
     string if none of them occurs.  strcspn yields exactly that
     offset.  */
  return strcspn (url, "?;#");
}
1625
1626 /* Find the last occurrence of character C in the range [b, e), or
1627    NULL, if none are present.  We might want to use memrchr (a GNU
1628    extension) under GNU libc.  */
1629
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Scan the half-open range [b, e) backwards.  E itself lies outside
     the range and must not be examined, while B is the last position
     to be checked.  Hence the pre-decrement before each
     comparison.  */
  while (e > b)
    if (*--e == c)
      return e;

  /* Not found, or the range is empty.  */
  return NULL;
}
1638
1639 /* Merge BASE with LINK and return the resulting URI.
1640
1641    Either of the URIs may be absolute or relative, complete with the
1642    host name, or path only.  This tries to reasonably handle all
1643    foreseeable cases.  It only employs minimal URL parsing, without
1644    knowledge of the specifics of schemes.
1645
1646    I briefly considered making this function call path_simplify after
1647    the merging process, as rfc1738 seems to suggest.  This is a bad
1648    idea for several reasons: 1) it complexifies the code, and 2)
1649    url_parse has to simplify path anyway, so it's wasteful to boot.  */
1650
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_join (const char *base, int span, const char *link, int linklength)
{
  char *merge = (char *) xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

char *
uri_merge (const char *base, const char *link)
{
  const char *end;              /* BASE's path may not be examined past this */
  int linklength;

  /* A LINK with a scheme is already a complete URL.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* Empty LINK points back to BASE, query string and all.  */
  if (*link == '\0')
    return xstrdup (base);

  end = base + path_length (base);
  linklength = strlen (link);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Keep BASE up to the end of its path.  Examples:

         uri_merge("path",         "?new") -> "path?new"
         uri_merge("path?foo",     "?new") -> "path?new"
         uri_merge("path?foo#bar", "?new") -> "path?new"
         uri_merge("path#foo",     "?new") -> "path?new"  */
      return uri_merge_join (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK only changes the fragment.  Keep BASE up to its own
         fragment, or all of it if it has none.  Examples:

         uri_merge("path",         "#new") -> "path#new"
         uri_merge("path#foo",     "#new") -> "path#new"
         uri_merge("path?foo",     "#new") -> "path?foo#new"
         uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return uri_merge_join (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: replace
         everything after (and including) BASE's double slash with
         LINK.  Examples:

         uri_merge("foo", "//new/bar")            -> "//new/bar"
         uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
         uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *slash = memchr (base, '/', end - base);
      int span = 0;

      /* Only when the first slash of BASE starts a double slash is
         what precedes it kept; otherwise LINK replaces all of
         BASE.  */
      if (slash && slash[1] == '/')
        span = slash - base;

      return uri_merge_join (base, span, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: replace everything after (and
         including) the FIRST slash with LINK, where the double slash
         that follows the scheme does not count.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *slash = memchr (base, '/', end - base);
      int seen_slash_slash = 0;
      int span;

      if (slash && slash[1] == '/')
        {
          /* Skip the double slash and look for the next slash after
             it.  */
          const char *after = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (after, '/', end - after);
        }

      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- LINK replaces everything.  */
        span = 0;
      else if (slash)
        /* example: "http://something/" -- keep up to that slash.  */
        span = slash - base;
      else
        /* example: "http://foo" -- keep the whole path.  */
        span = end - base;

      return uri_merge_join (base, span, link, linklength);
    }

  /* LINK is a relative URL: replace everything after the last slash
     (possibly nothing) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", our
     result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    int need_explicit_slash = 0;
    int span;
    char *merge;

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK.  */
      span = 0;
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the last slash belongs to "://".
           Keep the whole path plus one extra byte, which is turned
           into the separating slash below.  */
        span = end + 1 - base;
        need_explicit_slash = 1;
      }
    else
      /* example: "whatever/foo/bar" -- keep up to and including the
         last slash.  */
      span = last_slash + 1 - base;

    merge = uri_merge_join (base, span, link, linklength);
    if (need_explicit_slash)
      merge[span - 1] = '/';
    return merge;
  }
}
1829 \f
/* Copy the string S, without its terminating zero, to the location
   P points to, and advance P past the copied bytes.  P must be a
   modifiable char pointer lvalue.  S is evaluated exactly once, and
   the temporaries carry a trailing underscore so that they cannot
   capture identifiers used in the arguments.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  int append_len_ = strlen (append_src_);       \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)
1835
1836 /* Use this instead of password when the actual password is supposed
1837    to be hidden.  We intentionally use a generic string without giving
1838    away the number of characters in the password, like previous
1839    versions did.  */
1840 #define HIDDEN_PASSWORD "*password*"
1841
1842 /* Recreate the URL string from the data in URL.
1843
1844    If HIDE is non-zero (as it is when we're calling this on a URL we
1845    plan to print, but not when calling it to canonicalize a URL for
1846    use within the program), password will be hidden.  Unsafe
1847    characters in the URL will be quoted.  */
1848
1849 char *
1850 url_string (const struct url *url, int hide_password)
1851 {
1852   int size;
1853   char *result, *p;
1854   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1855
1856   int scheme_port  = supported_schemes[url->scheme].default_port;
1857   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1858   int fplen = full_path_length (url);
1859
1860   int brackets_around_host;
1861
1862   assert (scheme_str != NULL);
1863
1864   /* Make sure the user name and password are quoted. */
1865   if (url->user)
1866     {
1867       quoted_user = url_escape_allow_passthrough (url->user);
1868       if (url->passwd)
1869         {
1870           if (hide_password)
1871             quoted_passwd = HIDDEN_PASSWORD;
1872           else
1873             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1874         }
1875     }
1876
1877   /* In the unlikely event that the host name contains non-printable
1878      characters, quote it for displaying to the user.  */
1879   quoted_host = url_escape_allow_passthrough (url->host);
1880
1881   /* Undo the quoting of colons that URL escaping performs.  IPv6
1882      addresses may legally contain colons, and in that case must be
1883      placed in square brackets.  */
1884   if (quoted_host != url->host)
1885     unescape_single_char (quoted_host, ':');
1886   brackets_around_host = strchr (quoted_host, ':') != NULL;
1887
1888   size = (strlen (scheme_str)
1889           + strlen (quoted_host)
1890           + (brackets_around_host ? 2 : 0)
1891           + fplen
1892           + 1);
1893   if (url->port != scheme_port)
1894     size += 1 + numdigit (url->port);
1895   if (quoted_user)
1896     {
1897       size += 1 + strlen (quoted_user);
1898       if (quoted_passwd)
1899         size += 1 + strlen (quoted_passwd);
1900     }
1901
1902   p = result = xmalloc (size);
1903
1904   APPEND (p, scheme_str);
1905   if (quoted_user)
1906     {
1907       APPEND (p, quoted_user);
1908       if (quoted_passwd)
1909         {
1910           *p++ = ':';
1911           APPEND (p, quoted_passwd);
1912         }
1913       *p++ = '@';
1914     }
1915
1916   if (brackets_around_host)
1917     *p++ = '[';
1918   APPEND (p, quoted_host);
1919   if (brackets_around_host)
1920     *p++ = ']';
1921   if (url->port != scheme_port)
1922     {
1923       *p++ = ':';
1924       p = number_to_string (p, url->port);
1925     }
1926
1927   full_path_write (url, p);
1928   p += fplen;
1929   *p++ = '\0';
1930
1931   assert (p - result == size);
1932
1933   if (quoted_user && quoted_user != url->user)
1934     xfree (quoted_user);
1935   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1936     xfree (quoted_passwd);
1937   if (quoted_host != url->host)
1938     xfree (quoted_host);
1939
1940   return result;
1941 }
1942 \f
1943 /* Return non-zero if scheme a is similar to scheme b.
1944
1945    Schemes are similar if they are equal.  If SSL is supported, schemes
1946    are also similar if one is http (SCHEME_HTTP) and the other is https
1947    (SCHEME_HTTPS).  */
1948 int
1949 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1950 {
1951   if (a == b)
1952     return 1;
1953 #ifdef HAVE_SSL
1954   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1955       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1956     return 1;
1957 #endif
1958   return 0;
1959 }
1960 \f
1961 #if 0
1962 /* Debugging and testing support for path_simplify. */
1963
/* Debugger convenience: duplicate PATH with xstrdup, run
   path_simplify on the duplicate, and return the duplicate.  PATH
   itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1973
/* Run path_simplify on a private copy of TEST and print a diagnostic
   for each expectation that does not hold: the simplified text must
   equal EXPECTED_RESULT, and the value returned by path_simplify must
   equal EXPECTED_CHANGE.  Nothing is printed on success.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *scratch = xstrdup (test);
  int changed = path_simplify (scratch);

  if (strcmp (scratch, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, scratch);

  if (changed != expected_change)
    {
      if (expected_change != 1)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (scratch);
}
1996
/* Feed a fixed table of cases to run_test, in table order.  Each
   entry holds the input path, the text expected after path_simplify,
   and the value path_simplify is expected to return for it.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int changes;
  } cases[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int idx;

  for (idx = 0; idx < countof (cases); idx++)
    run_test (cases[idx].input, cases[idx].expected, cases[idx].changes);
}
2037 #endif