sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"  /* for is_valid_ipv6_address */
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
/* Static description of one URL scheme known to this file.  */
struct scheme_data
{
  const char *name;             /* bare scheme name, e.g. "http" */
  const char *leading_string;   /* URL prefix matched by url_scheme, e.g. "http://" */
  int default_port;             /* value returned by scheme_default_port */
  int enabled;                  /* set to 0 by scheme_disable; url_scheme
                                   then reports the scheme as invalid */
};
  62
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, and scheme_default_port and
   scheme_disable index this table by scheme value, so the order of
   the entries has to agree with enum url_scheme (declared outside
   this file -- presumably in url.h; confirm before reordering).  The
   final entry, whose strings are NULL, terminates the scan in
   url_scheme.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  75
/* Forward declarations: */

/* Defined later in this file.  url_parse stores the return value in
   its path_modified flag, i.e. treats non-zero as "the path was
   changed".  */
static int path_simplify PARAMS ((char *));
  79 \f
  80 /* Support for escaping and unescaping of URL strings.  */
  81
  82 /* Table of "reserved" and "unsafe" characters.  Those terms are
  83    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  84    specs, but the general idea remains.
  85
  86    A reserved character is the one that you can't decode without
  87    changing the meaning of the URL.  For example, you can't decode
  88    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  89    path components is different.  Non-reserved characters can be
  90    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  91    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  92    as recommended by rfc2396, and minus "~", which is very frequently
  93    used (and sometimes unrecognized as %7E by broken servers).
  94
  95    An unsafe character is the one that should be encoded when URLs are
  96    placed in foreign environments.  E.g. space and newline are unsafe
  97    in HTTP contexts because HTTP uses them as separator and line
  98    terminator, so they must be encoded to %20 and %0A respectively.
  99    "*" is unsafe in shell context, etc.
 100
 101    We determine whether a character is unsafe through static table
 102    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 103
/* Bit flags stored in urlchr_table.  They are distinct bits, so a
   single character may carry both.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};
 111
/* Non-zero if character C has any of the MASK bits set in
   urlchr_table.  C is cast to unsigned char so that chars with the
   high bit set index the upper half of the table rather than
   producing a negative index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Convenience tests for the two individual flag bits.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
/* Shorthands for the table.  They exist only to keep the initializer
   below readable and are #undef'ed right after it.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Per-character flags, indexed by the character's value as an
   unsigned char.  The first 128 entries are annotated with the ASCII
   character they describe; 0 means neither reserved nor unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* Values 128-255: every one is marked unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
/* Drop the shorthands so the one-letter names do not leak into the
   rest of the file.  */
#undef R
#undef U
#undef RU
 153
 154 /* URL-unescape the string S.
 155
 156    This is done by transforming the sequences "%HH" to the character
 157    represented by the hexadecimal digits HH.  If % is not followed by
 158    two hexadecimal digits, it is inserted literally.
 159
 160    The transformation is done in place.  If you need the original
 161    string intact, make a copy before calling this function.  */
 162
 163 static void
 164 url_unescape (char *s)
 165 {
 166   char *t = s;                  /* t - tortoise */
 167   char *h = s;                  /* h - hare     */
 168
 169   for (; *h; h++, t++)
 170     {
 171       if (*h != '%')
 172         {
 173         copychar:
 174           *t = *h;
 175         }
 176       else
 177         {
 178           char c;
 179           /* Do nothing if '%' is not followed by two hex digits. */
 180           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 181             goto copychar;
 182           c = X2DIGITS_TO_NUM (h[1], h[2]);
 183           /* Don't unescape %00 because there is no way to insert it
 184              into a C string without effectively truncating it. */
 185           if (c == '\0')
 186             goto copychar;
 187           *t = c;
 188           h += 2;
 189         }
 190     }
 191   *t = '\0';
 192 }
 193
 194 /* The core of url_escape_* functions.  Escapes the characters that
 195    match the provided mask in urlchr_table.
 196
 197    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 198    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 199    freshly allocated string will be returned in all cases.  */
 200
 201 static char *
 202 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 203 {
 204   const char *p1;
 205   char *p2, *newstr;
 206   int newlen;
 207   int addition = 0;
 208
 209   for (p1 = s; *p1; p1++)
 210     if (urlchr_test (*p1, mask))
 211       addition += 2;            /* Two more characters (hex digits) */
 212
 213   if (!addition)
 214     return allow_passthrough ? (char *)s : xstrdup (s);
 215
 216   newlen = (p1 - s) + addition;
 217   newstr = (char *)xmalloc (newlen + 1);
 218
 219   p1 = s;
 220   p2 = newstr;
 221   while (*p1)
 222     {
 223       /* Quote the characters that match the test mask. */
 224       if (urlchr_test (*p1, mask))
 225         {
 226           unsigned char c = *p1++;
 227           *p2++ = '%';
 228           *p2++ = XNUM_TO_DIGIT (c >> 4);
 229           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 230         }
 231       else
 232         *p2++ = *p1++;
 233     }
 234   assert (p2 - newstr == newlen);
 235   *p2 = '\0';
 236
 237   return newstr;
 238 }
 239
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  Because the
   passthrough flag handed to url_escape_1 is 0, a copy is made even
   when S contains nothing to escape, so the result is always a
   separate allocation from S.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 0);
}
 248
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S itself is returned (not a
   copy); otherwise the result is freshly allocated.  A caller that
   wants to free the result therefore has to check whether it differs
   from S first.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 1);
}
 257 \f
 258 /* Decide whether the char at position P needs to be encoded.  (It is
 259    not enough to pass a single char *P because the function may need
 260    to inspect the surrounding context.)
 261
 262    Return 1 if the char should be escaped as %XX, 0 otherwise.  */
 263
 264 static inline int
 265 char_needs_escaping (const char *p)
 266 {
 267   if (*p == '%')
 268     {
 269       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 270         return 0;
 271       else
 272         /* Garbled %.. sequence: encode `%'. */
 273         return 1;
 274     }
 275   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 276     return 1;
 277   else
 278     return 0;
 279 }
 280
 281 /* Translate a %-escaped (but possibly non-conformant) input string S
 282    into a %-escaped (and conformant) output string.  If no characters
 283    are encoded or decoded, return the same string S; otherwise, return
 284    a freshly allocated string with the new contents.
 285
 286    After a URL has been run through this function, the protocols that
 287    use `%' as the quote character can use the resulting string as-is,
 288    while those that don't can use url_unescape to get to the intended
 289    data.  This function is stable: once the input is transformed,
 290    further transformations of the result yield the same output.
 291
 292    Let's discuss why this function is needed.
 293
 294    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 295    a raw space character would mess up the HTTP request, it needs to
 296    be quoted, like this:
 297
 298        GET /abc%20def HTTP/1.0
 299
 300    It would appear that the unsafe chars need to be quoted, for
 301    example with url_escape.  But what if we're requested to download
 302    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 303    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 304    part of URL syntax, "%20" is the correct way to denote a literal
 305    space on the Wget command line.  This leads to the conclusion that
 306    in that case Wget should not call url_escape, but leave the `%20'
 307    as is.  This is clearly contradictory, but it only gets worse.
 308
 309    What if the requested URI is `abc%20 def'?  If we call url_escape,
 310    we end up with `/abc%2520%20def', which is almost certainly not
 311    intended.  If we don't call url_escape, we are left with the
 312    embedded space and cannot complete the request.  What the user
 313    meant was for Wget to request `/abc%20%20def', and this is where
 314    reencode_escapes kicks in.
 315
 316    Wget used to solve this by first decoding %-quotes, and then
 317    encoding all the "unsafe" characters found in the resulting string.
 318    This was wrong because it didn't preserve certain URL special
 319    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 320    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 321    whether we considered `+' reserved (it is).  One of these results
 322    is inevitable because by the second step we would lose information
 323    on whether the `+' was originally encoded or not.  Both results
 324    were wrong because in CGI parameters + means space, while %2B means
 325    literal plus.  reencode_escapes correctly translates the above to
 326    "a%2B+b", i.e. returns the original string.
 327
 328    This function uses a modified version of the algorithm originally
 329    proposed by Anon Sricharoenchai:
 330
 331    * Encode all "unsafe" characters, except those that are also
 332      "reserved", to %XX.  See urlchr_table for which characters are
 333      unsafe and reserved.
 334
 335    * Encode the "%" characters not followed by two hex digits to
 336      "%25".
 337
 338    * Pass through all other characters and %XX escapes as-is.  (Up to
 339      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 340      characters, but that was obtrusive and broke some servers.)
 341
 342    Anon's test case:
 343
 344    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 345    ->
 346    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 347
 348    Simpler test cases:
 349
 350    "foo bar"         -> "foo%20bar"
 351    "foo%20bar"       -> "foo%20bar"
 352    "foo %20bar"      -> "foo%20%20bar"
 353    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 354    "foo%25%20bar"    -> "foo%25%20bar"
 355    "foo%2%20bar"     -> "foo%252%20bar"
 356    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 357    "foo%2b+bar"      -> "foo%2b+bar"  */
 358
 359 static char *
 360 reencode_escapes (const char *s)
 361 {
 362   const char *p1;
 363   char *newstr, *p2;
 364   int oldlen, newlen;
 365
 366   int encode_count = 0;
 367
 368   /* First pass: inspect the string to see if there's anything to do,
 369      and to calculate the new length.  */
 370   for (p1 = s; *p1; p1++)
 371     if (char_needs_escaping (p1))
 372       ++encode_count;
 373
 374   if (!encode_count)
 375     /* The string is good as it is. */
 376     return (char *) s;          /* C const model sucks. */
 377
 378   oldlen = p1 - s;
 379   /* Each encoding adds two characters (hex digits).  */
 380   newlen = oldlen + 2 * encode_count;
 381   newstr = xmalloc (newlen + 1);
 382
 383   /* Second pass: copy the string to the destination address, encoding
 384      chars when needed.  */
 385   p1 = s;
 386   p2 = newstr;
 387
 388   while (*p1)
 389     if (char_needs_escaping (p1))
 390       {
 391         unsigned char c = *p1++;
 392         *p2++ = '%';
 393         *p2++ = XNUM_TO_DIGIT (c >> 4);
 394         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 395       }
 396     else
 397       *p2++ = *p1++;
 398
 399   *p2 = '\0';
 400   assert (p2 - newstr == newlen);
 401   return newstr;
 402 }
 403 \f
 404 /* Returns the scheme type if the scheme is supported, or
 405    SCHEME_INVALID if not.  */
 406
 407 enum url_scheme
 408 url_scheme (const char *url)
 409 {
 410   int i;
 411
 412   for (i = 0; supported_schemes[i].leading_string; i++)
 413     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 414                           strlen (supported_schemes[i].leading_string)))
 415       {
 416         if (supported_schemes[i].enabled)
 417           return (enum url_scheme) i;
 418         else
 419           return SCHEME_INVALID;
 420       }
 421
 422   return SCHEME_INVALID;
 423 }
 424
 425 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 426
 427 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 428    currently implemented, it returns true if URL begins with
 429    [-+a-zA-Z0-9]+: .  */
 430
 431 int
 432 url_has_scheme (const char *url)
 433 {
 434   const char *p = url;
 435
 436   /* The first char must be a scheme char. */
 437   if (!*p || !SCHEME_CHAR (*p))
 438     return 0;
 439   ++p;
 440   /* Followed by 0 or more scheme chars. */
 441   while (*p && SCHEME_CHAR (*p))
 442     ++p;
 443   /* Terminated by ':'. */
 444   return *p == ':';
 445 }
 446
/* Return the default port recorded for SCHEME in supported_schemes.
   SCHEME is used directly as the table index with no range check;
   the table's terminating entry carries -1.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
 452
/* Mark SCHEME as disabled.  From then on url_scheme returns
   SCHEME_INVALID for URLs that use it.  Nothing in the visible part
   of this file re-enables a scheme.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = 0;
}
 458
 459 /* Skip the username and password, if present in the URL.  The
 460    function should *not* be called with the complete URL, but with the
 461    portion after the scheme.
 462
 463    If no username and password are found, return URL.  */
 464
 465 static const char *
 466 url_skip_credentials (const char *url)
 467 {
 468   /* Look for '@' that comes before terminators, such as '/', '?',
 469      '#', or ';'.  */
 470   const char *p = (const char *)strpbrk (url, "@/?#;");
 471   if (!p || *p != '@')
 472     return url;
 473   return p + 1;
 474 }
 475
 476 /* Parse credentials contained in [BEG, END).  The region is expected
 477    to have come from a URL and is unescaped.  */
 478
 479 static int
 480 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 481 {
 482   char *colon;
 483   const char *userend;
 484
 485   if (beg == end)
 486     return 0;                   /* empty user name */
 487
 488   colon = memchr (beg, ':', end - beg);
 489   if (colon == beg)
 490     return 0;                   /* again empty user name */
 491
 492   if (colon)
 493     {
 494       *passwd = strdupdelim (colon + 1, end);
 495       userend = colon;
 496       url_unescape (*passwd);
 497     }
 498   else
 499     {
 500       *passwd = NULL;
 501       userend = end;
 502     }
 503   *user = strdupdelim (beg, userend);
 504   url_unescape (*user);
 505   return 1;
 506 }
 507
 508 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 509    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 510
 511    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 512    www.foo.com[:port]            -> http://www.foo.com[:port]
 513
 514    FTP shorthands look like this:
 515
 516    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 517    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 518
 519    If the URL needs not or cannot be rewritten, return NULL.  */
 520
 521 char *
 522 rewrite_shorthand_url (const char *url)
 523 {
 524   const char *p;
 525
 526   if (url_scheme (url) != SCHEME_INVALID)
 527     return NULL;
 528
 529   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 530      latter Netscape.  */
 531   for (p = url; *p && *p != ':' && *p != '/'; p++)
 532     ;
 533
 534   if (p == url)
 535     return NULL;
 536
 537   if (*p == ':')
 538     {
 539       const char *pp;
 540       char *res;
 541       /* If the characters after the colon and before the next slash
 542          or end of string are all digits, it's HTTP.  */
 543       int digits = 0;
 544       for (pp = p + 1; ISDIGIT (*pp); pp++)
 545         ++digits;
 546       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 547         goto http;
 548
 549       /* Prepend "ftp://" to the entire URL... */
 550       res = xmalloc (6 + strlen (url) + 1);
 551       sprintf (res, "ftp://%s", url);
 552       /* ...and replace ':' with '/'. */
 553       res[6 + (p - url)] = '/';
 554       return res;
 555     }
 556   else
 557     {
 558       char *res;
 559     http:
 560       /* Just prepend "http://" to what we have. */
 561       res = xmalloc (7 + strlen (url) + 1);
 562       sprintf (res, "http://%s", url);
 563       return res;
 564     }
 565 }
 566 \f
/* Forward declaration; the definition is further down in this file.
   The arguments are (path, &dir, &file).  */
static void split_path PARAMS ((const char *, char **, char **));
 568
 569 /* Like strpbrk, with the exception that it returns the pointer to the
 570    terminating zero (end-of-string aka "eos") if no matching character
 571    is found.
 572
 573    Although I normally balk at Gcc-specific optimizations, it probably
 574    makes sense here: glibc has optimizations that detect strpbrk being
 575    called with literal string as ACCEPT and inline the search.  That
 576    optimization is defeated if strpbrk is hidden within the call to
 577    another function.  (And no, making strpbrk_or_eos inline doesn't
 578    help because the check for literal accept is in the
 579    preprocessor.)  */
 580
 581 #ifdef __GNUC__
 582
 583 #define strpbrk_or_eos(s, accept) ({            \
 584   char *SOE_p = strpbrk (s, accept);            \
 585   if (!SOE_p)                                   \
 586     SOE_p = strchr (s, '\0');                   \
 587   SOE_p;                                        \
 588 })
 589
 590 #else  /* not __GNUC__ */
 591
 592 static inline char *
 593 strpbrk_or_eos (const char *s, const char *accept)
 594 {
 595   char *p = strpbrk (s, accept);
 596   if (!p)
 597     p = strchr (s, '\0');
 598   return p;
 599 }
 600 #endif /* not __GNUC__ */
 601
 602 /* Turn STR into lowercase; return non-zero if a character was
 603    actually changed. */
 604
 605 static int
 606 lowercase_str (char *str)
 607 {
 608   int change = 0;
 609   for (; *str; str++)
 610     if (ISUPPER (*str))
 611       {
 612         change = 1;
 613         *str = TOLOWER (*str);
 614       }
 615   return change;
 616 }
 617
/* Messages for the error codes that url_parse stores in *ERROR.  Each
   PE_* constant is defined on the line directly above the message it
   indexes, which keeps code and string in step; a new error needs
   both lines added at the end.  The strings are marked with N_()
   here and passed through _() by url_error.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_EMPTY_HOST                   2
  N_("Empty host"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error)
 644 {
 645   struct url *u;
 646   const char *p;
 647   int path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650
 651   const char *uname_b,     *uname_e;
 652   const char *host_b,      *host_e;
 653   const char *path_b,      *path_e;
 654   const char *params_b,    *params_e;
 655   const char *query_b,     *query_e;
 656   const char *fragment_b,  *fragment_e;
 657
 658   int port;
 659   char *user = NULL, *passwd = NULL;
 660
 661   char *url_encoded = NULL;
 662
 663   int error_code;
 664
 665   scheme = url_scheme (url);
 666   if (scheme == SCHEME_INVALID)
 667     {
 668       error_code = PE_UNSUPPORTED_SCHEME;
 669       goto err;
 670     }
 671
 672   url_encoded = reencode_escapes (url);
 673   p = url_encoded;
 674
 675   p += strlen (supported_schemes[scheme].leading_string);
 676   uname_b = p;
 677   p = url_skip_credentials (p);
 678   uname_e = p;
 679
 680   /* scheme://user:pass@host[:port]... */
 681   /*                    ^              */
 682
 683   /* We attempt to break down the URL into the components path,
 684      params, query, and fragment.  They are ordered like this:
 685
 686        scheme://host[:port][/path][;params][?query][#fragment]  */
 687
 688   params_b   = params_e   = NULL;
 689   query_b    = query_e    = NULL;
 690   fragment_b = fragment_e = NULL;
 691
 692   host_b = p;
 693
 694   if (*p == '[')
 695     {
 696       /* Handle IPv6 address inside square brackets.  Ideally we'd
 697          just look for the terminating ']', but rfc2732 mandates
 698          rejecting invalid IPv6 addresses.  */
 699
 700       /* The address begins after '['. */
 701       host_b = p + 1;
 702       host_e = strchr (host_b, ']');
 703
 704       if (!host_e)
 705         {
 706           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 707           goto err;
 708         }
 709
 710 #ifdef ENABLE_IPV6
 711       /* Check if the IPv6 address is valid. */
 712       if (!is_valid_ipv6_address(host_b, host_e))
 713         {
 714           error_code = PE_INVALID_IPV6_ADDRESS;
 715           goto err;
 716         }
 717
 718       /* Continue parsing after the closing ']'. */
 719       p = host_e + 1;
 720 #else
 721       error_code = PE_IPV6_NOT_SUPPORTED;
 722       goto err;
 723 #endif
 724     }
 725   else
 726     {
 727       p = strpbrk_or_eos (p, ":/;?#");
 728       host_e = p;
 729     }
 730
 731   if (host_b == host_e)
 732     {
 733       error_code = PE_EMPTY_HOST;
 734       goto err;
 735     }
 736
 737   port = scheme_default_port (scheme);
 738   if (*p == ':')
 739     {
 740       const char *port_b, *port_e, *pp;
 741
 742       /* scheme://host:port/tralala */
 743       /*              ^             */
 744       ++p;
 745       port_b = p;
 746       p = strpbrk_or_eos (p, "/;?#");
 747       port_e = p;
 748
 749       /* Allow empty port, as per rfc2396. */
 750       if (port_b != port_e)
 751         {
 752           for (port = 0, pp = port_b; pp < port_e; pp++)
 753             {
 754               if (!ISDIGIT (*pp))
 755                 {
 756                   /* http://host:12randomgarbage/blah */
 757                   /*               ^                  */
 758                   error_code = PE_BAD_PORT_NUMBER;
 759                   goto err;
 760                 }
 761               port = 10 * port + (*pp - '0');
 762               /* Check for too large port numbers here, before we have
 763                  a chance to overflow on bogus port values.  */
 764               if (port > 65535)
 765                 {
 766                   error_code = PE_BAD_PORT_NUMBER;
 767                   goto err;
 768                 }
 769             }
 770         }
 771     }
 772
 773   if (*p == '/')
 774     {
 775       ++p;
 776       path_b = p;
 777       p = strpbrk_or_eos (p, ";?#");
 778       path_e = p;
 779     }
 780   else
 781     {
 782       /* Path is not allowed not to exist. */
 783       path_b = path_e = p;
 784     }
 785
 786   if (*p == ';')
 787     {
 788       ++p;
 789       params_b = p;
 790       p = strpbrk_or_eos (p, "?#");
 791       params_e = p;
 792     }
 793   if (*p == '?')
 794     {
 795       ++p;
 796       query_b = p;
 797       p = strpbrk_or_eos (p, "#");
 798       query_e = p;
 799
 800       /* Hack that allows users to use '?' (a wildcard character) in
 801          FTP URLs without it being interpreted as a query string
 802          delimiter.  */
 803       if (scheme == SCHEME_FTP)
 804         {
 805           query_b = query_e = NULL;
 806           path_e = p;
 807         }
 808     }
 809   if (*p == '#')
 810     {
 811       ++p;
 812       fragment_b = p;
 813       p += strlen (p);
 814       fragment_e = p;
 815     }
 816   assert (*p == 0);
 817
 818   if (uname_b != uname_e)
 819     {
 820       /* http://user:pass@host */
 821       /*        ^         ^    */
 822       /*     uname_b   uname_e */
 823       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 824         {
 825           error_code = PE_INVALID_USER_NAME;
 826           goto err;
 827         }
 828     }
 829
 830   u = xnew0 (struct url);
 831   u->scheme = scheme;
 832   u->host   = strdupdelim (host_b, host_e);
 833   u->port   = port;
 834   u->user   = user;
 835   u->passwd = passwd;
 836
 837   u->path = strdupdelim (path_b, path_e);
 838   path_modified = path_simplify (u->path);
 839   split_path (u->path, &u->dir, &u->file);
 840
 841   host_modified = lowercase_str (u->host);
 842
 843   /* Decode %HH sequences in host name.  This is important not so much
 844      to support %HH sequences in host names (which other browser
 845      don't), but to support binary characters (which will have been
 846      converted to %HH by reencode_escapes).  */
 847   if (strchr (u->host, '%'))
 848     {
 849       url_unescape (u->host);
 850       host_modified = 1;
 851     }
 852
 853   if (params_b)
 854     u->params = strdupdelim (params_b, params_e);
 855   if (query_b)
 856     u->query = strdupdelim (query_b, query_e);
 857   if (fragment_b)
 858     u->fragment = strdupdelim (fragment_b, fragment_e);
 859
 860   if (path_modified || u->fragment || host_modified || path_b == path_e)
 861     {
 862       /* If we suspect that a transformation has rendered what
 863          url_string might return different from URL_ENCODED, rebuild
 864          u->url using url_string.  */
 865       u->url = url_string (u, 0);
 866
 867       if (url_encoded != url)
 868         xfree ((char *) url_encoded);
 869     }
 870   else
 871     {
 872       if (url_encoded == url)
 873         u->url = xstrdup (url);
 874       else
 875         u->url = url_encoded;
 876     }
 877
 878   return u;
 879
 880  err:
 881   /* Cleanup in case of error: */
 882   if (url_encoded && url_encoded != url)
 883     xfree (url_encoded);
 884
 885   /* Transmit the error code to the caller, if the caller wants to
 886      know.  */
 887   if (error)
 888     *error = error_code;
 889   return NULL;
 890 }
 891
/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated
   (looked up through _()).  ERROR_CODE must be a valid index into
   parse_errors; this is only enforced by the assert.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}
 901
 902 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 903    expected to be URL-escaped.
 904
 905    The path is split into directory (the part up to the last slash)
 906    and file (the part after the last slash), which are subsequently
 907    unescaped.  Examples:
 908
 909    PATH                 DIR           FILE
 910    "foo/bar/baz"        "foo/bar"     "baz"
 911    "foo/bar/"           "foo/bar"     ""
 912    "foo"                ""            "foo"
 913    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 914
 915    DIR and FILE are freshly allocated.  */
 916
 917 static void
 918 split_path (const char *path, char **dir, char **file)
 919 {
 920   char *last_slash = strrchr (path, '/');
 921   if (!last_slash)
 922     {
 923       *dir = xstrdup ("");
 924       *file = xstrdup (path);
 925     }
 926   else
 927     {
 928       *dir = strdupdelim (path, last_slash);
 929       *file = xstrdup (last_slash + 1);
 930     }
 931   url_unescape (*dir);
 932   url_unescape (*file);
 933 }
 934
 935 /* Note: URL's "full path" is the path with the query string and
 936    params appended.  The "fragment" (#foo) is intentionally ignored,
 937    but that might be changed.  For example, if the original URL was
 938    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 939    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 940
 941 /* Return the length of the full path, without the terminating
 942    zero.  */
 943
 944 static int
 945 full_path_length (const struct url *url)
 946 {
 947   int len = 0;
 948
 949 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 950
 951   FROB (path);
 952   FROB (params);
 953   FROB (query);
 954
 955 #undef FROB
 956
 957   return len;
 958 }
 959
 960 /* Write out the full path. */
 961
 962 static void
 963 full_path_write (const struct url *url, char *where)
 964 {
 965 #define FROB(el, chr) do {                      \
 966   char *f_el = url->el;                         \
 967   if (f_el) {                                   \
 968     int l = strlen (f_el);                      \
 969     *where++ = chr;                             \
 970     memcpy (where, f_el, l);                    \
 971     where += l;                                 \
 972   }                                             \
 973 } while (0)
 974
 975   FROB (path, '/');
 976   FROB (params, ';');
 977   FROB (query, '?');
 978
 979 #undef FROB
 980 }
 981
 982 /* Public function for getting the "full path".  E.g. if u->path is
 983    "foo/bar" and u->query is "param=value", full_path will be
 984    "/foo/bar?param=value". */
 985
 986 char *
 987 url_full_path (const struct url *url)
 988 {
 989   int length = full_path_length (url);
 990   char *full_path = (char *) xmalloc (length + 1);
 991
 992   full_path_write (url, full_path);
 993   full_path[length] = '\0';
 994
 995   return full_path;
 996 }
 997
 998 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 999    escaping of certain characters, such as "/" and ":".  Returns a
1000    count of unescaped chars.  */
1001
1002 static void
1003 unescape_single_char (char *str, char chr)
1004 {
1005   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1006   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1007   char *h = str;                /* hare */
1008   char *t = str;                /* tortoise */
1009   for (; *h; h++, t++)
1010     {
1011       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1012         {
1013           *t = chr;
1014           h += 2;
1015         }
1016       else
1017         *t = *h;
1018     }
1019   *t = '\0';
1020 }
1021
1022 /* Escape unsafe and reserved characters, except for the slash
1023    characters.  */
1024
1025 static char *
1026 url_escape_dir (const char *dir)
1027 {
1028   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1029   if (newdir == dir)
1030     return (char *)dir;
1031
1032   unescape_single_char (newdir, '/');
1033   return newdir;
1034 }
1035
1036 /* Sync u->path and u->url with u->dir and u->file.  Called after
1037    u->file or u->dir have been changed, typically by the FTP code.  */
1038
1039 static void
1040 sync_path (struct url *u)
1041 {
1042   char *newpath, *efile, *edir;
1043
1044   xfree (u->path);
1045
1046   /* u->dir and u->file are not escaped.  URL-escape them before
1047      reassembling them into u->path.  That way, if they contain
1048      separators like '?' or even if u->file contains slashes, the
1049      path will be correctly assembled.  (u->file can contain slashes
1050      if the URL specifies it with %2f, or if an FTP server returns
1051      it.)  */
1052   edir = url_escape_dir (u->dir);
1053   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1054
1055   if (!*edir)
1056     newpath = xstrdup (efile);
1057   else
1058     {
1059       int dirlen = strlen (edir);
1060       int filelen = strlen (efile);
1061
1062       /* Copy "DIR/FILE" to newpath. */
1063       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1064       memcpy (p, edir, dirlen);
1065       p += dirlen;
1066       *p++ = '/';
1067       memcpy (p, efile, filelen);
1068       p += filelen;
1069       *p = '\0';
1070     }
1071
1072   u->path = newpath;
1073
1074   if (edir != u->dir)
1075     xfree (edir);
1076   if (efile != u->file)
1077     xfree (efile);
1078
1079   /* Regenerate u->url as well.  */
1080   xfree (u->url);
1081   u->url = url_string (u, 0);
1082 }
1083
1084 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1085    This way we can sync u->path and u->url when they get changed.  */
1086
1087 void
1088 url_set_dir (struct url *url, const char *newdir)
1089 {
1090   xfree (url->dir);
1091   url->dir = xstrdup (newdir);
1092   sync_path (url);
1093 }
1094
1095 void
1096 url_set_file (struct url *url, const char *newfile)
1097 {
1098   xfree (url->file);
1099   url->file = xstrdup (newfile);
1100   sync_path (url);
1101 }
1102
1103 void
1104 url_free (struct url *url)
1105 {
1106   xfree (url->host);
1107   xfree (url->path);
1108   xfree (url->url);
1109
1110   xfree_null (url->params);
1111   xfree_null (url->query);
1112   xfree_null (url->fragment);
1113   xfree_null (url->user);
1114   xfree_null (url->passwd);
1115
1116   xfree (url->dir);
1117   xfree (url->file);
1118
1119   xfree (url);
1120 }
1121 \f
1122 /* Create all the necessary directories for PATH (a file).  Calls
1123    mkdirhier() internally.  */
1124 int
1125 mkalldirs (const char *path)
1126 {
1127   const char *p;
1128   char *t;
1129   struct_stat st;
1130   int res;
1131
1132   p = path + strlen (path);
1133   for (; *p != '/' && p != path; p--)
1134     ;
1135
1136   /* Don't create if it's just a file.  */
1137   if ((p == path) && (*p != '/'))
1138     return 0;
1139   t = strdupdelim (path, p);
1140
1141   /* Check whether the directory exists.  */
1142   if ((stat (t, &st) == 0))
1143     {
1144       if (S_ISDIR (st.st_mode))
1145         {
1146           xfree (t);
1147           return 0;
1148         }
1149       else
1150         {
1151           /* If the dir exists as a file name, remove it first.  This
1152              is *only* for Wget to work with buggy old CERN http
1153              servers.  Here is the scenario: When Wget tries to
1154              retrieve a directory without a slash, e.g.
1155              http://foo/bar (bar being a directory), CERN server will
1156              not redirect it too http://foo/bar/ -- it will generate a
1157              directory listing containing links to bar/file1,
1158              bar/file2, etc.  Wget will lose because it saves this
1159              HTML listing to a file `bar', so it cannot create the
1160              directory.  To work around this, if the file of the same
1161              name exists, we just remove it and create the directory
1162              anyway.  */
1163           DEBUGP (("Removing %s because of directory danger!\n", t));
1164           unlink (t);
1165         }
1166     }
1167   res = make_directory (t);
1168   if (res != 0)
1169     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1170   xfree (t);
1171   return res;
1172 }
1173 \f
1174 /* Functions for constructing the file name out of URL components.  */
1175
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer; reallocated by GROW */
  int size;                     /* size bookkeeping handed to DO_REALLOC */
  int tail;                     /* offset in BASE where the next append goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   Both arguments are parenthesized so that arbitrary expressions
   (e.g. a conditional) can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1204
/* Copy the characters of STR (without its terminating zero) to the
   tail of DEST, growing DEST as needed.  NOTICE: the string in DEST
   is left unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  int nbytes = strlen (str);

  /* Make room first; GROW may move DEST's buffer, so TAIL must be
     evaluated afterwards.  */
  GROW (dest, nbytes);
  memcpy (TAIL (dest), str, nbytes);
  TAIL_INCR (dest, nbytes);
}
1216
1217 /* Append CH to DEST.  For example, append_char (0, DEST)
1218    zero-terminates DEST.  */
1219
1220 static void
1221 append_char (char ch, struct growable *dest)
1222 {
1223   GROW (dest, 1);
1224   *TAIL (dest) = ch;
1225   TAIL_INCR (dest, 1);
1226 }
1227
/* Flags describing why a character is unsuitable for a file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set in
   filechr_table.  C is converted to unsigned char before indexing, so
   plain (possibly signed) chars are safe to pass.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' carries the Windows flag, as listed in the description of
   filechr_not_windows, and DEL (127) is treated as a control
   character like 0-31 and 128-159.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1285
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when file names are restricted for Windows, which cannot handle
   '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1296
1297 /* Quote path element, characters in [b, e), as file name, and append
1298    the quoted string to DEST.  Each character is quoted as per
1299    file_unsafe_char and the corresponding table.
1300
1301    If ESCAPED_P is non-zero, the path element is considered to be
1302    URL-escaped and will be unescaped prior to inspection.  */
1303
1304 static void
1305 append_uri_pathel (const char *b, const char *e, int escaped_p,
1306                    struct growable *dest)
1307 {
1308   const char *p;
1309   int quoted, outlen;
1310
1311   int mask;
1312   if (opt.restrict_files_os == restrict_unix)
1313     mask = filechr_not_unix;
1314   else
1315     mask = filechr_not_windows;
1316   if (opt.restrict_files_ctrl)
1317     mask |= filechr_control;
1318
1319   /* Copy [b, e) to PATHEL and URL-unescape it. */
1320   if (escaped_p)
1321     {
1322       char *unescaped;
1323       BOUNDED_TO_ALLOCA (b, e, unescaped);
1324       url_unescape (unescaped);
1325       b = unescaped;
1326       e = unescaped + strlen (unescaped);
1327     }
1328
1329   /* Defang ".." when found as component of path.  Remember that path
1330      comes from the URL and might contain malicious input.  */
1331   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1332     {
1333       b = "%2E%2E";
1334       e = b + 6;
1335     }
1336
1337   /* Walk the PATHEL string and check how many characters we'll need
1338      to quote.  */
1339   quoted = 0;
1340   for (p = b; p < e; p++)
1341     if (FILE_CHAR_TEST (*p, mask))
1342       ++quoted;
1343
1344   /* Calculate the length of the output string.  e-b is the input
1345      string length.  Each quoted char introduces two additional
1346      characters in the string, hence 2*quoted.  */
1347   outlen = (e - b) + (2 * quoted);
1348   GROW (dest, outlen);
1349
1350   if (!quoted)
1351     {
1352       /* If there's nothing to quote, we can simply append the string
1353          without processing it again.  */
1354       memcpy (TAIL (dest), b, outlen);
1355     }
1356   else
1357     {
1358       char *q = TAIL (dest);
1359       for (p = b; p < e; p++)
1360         {
1361           if (!FILE_CHAR_TEST (*p, mask))
1362             *q++ = *p;
1363           else
1364             {
1365               unsigned char ch = *p;
1366               *q++ = '%';
1367               *q++ = XNUM_TO_DIGIT (ch >> 4);
1368               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1369             }
1370         }
1371       assert (q - TAIL (dest) == outlen);
1372     }
1373   TAIL_INCR (dest, outlen);
1374 }
1375
1376 /* Append to DEST the directory structure that corresponds the
1377    directory part of URL's path.  For example, if the URL is
1378    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1379
1380    Each path element ("dir1" and "dir2" in the above example) is
1381    examined, url-unescaped, and re-escaped as file name element.
1382
1383    Additionally, it cuts as many directories from the path as
1384    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1385    will produce "bar" for the above example.  For 2 or more, it will
1386    produce "".
1387
1388    Each component of the path is quoted for use as file name.  */
1389
1390 static void
1391 append_dir_structure (const struct url *u, struct growable *dest)
1392 {
1393   char *pathel, *next;
1394   int cut = opt.cut_dirs;
1395
1396   /* Go through the path components, de-URL-quote them, and quote them
1397      (if necessary) as file names.  */
1398
1399   pathel = u->path;
1400   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1401     {
1402       if (cut-- > 0)
1403         continue;
1404       if (pathel == next)
1405         /* Ignore empty pathels.  */
1406         continue;
1407
1408       if (dest->tail)
1409         append_char ('/', dest);
1410       append_uri_pathel (pathel, next, 1, dest);
1411     }
1412 }
1413
1414 /* Return a unique file name that matches the given URL as good as
1415    possible.  Does not create directories on the file system.  */
1416
1417 char *
1418 url_file_name (const struct url *u)
1419 {
1420   struct growable fnres;        /* stands for "file name result" */
1421
1422   const char *u_file, *u_query;
1423   char *fname, *unique;
1424
1425   fnres.base = NULL;
1426   fnres.size = 0;
1427   fnres.tail = 0;
1428
1429   /* Start with the directory prefix, if specified. */
1430   if (opt.dir_prefix)
1431     append_string (opt.dir_prefix, &fnres);
1432
1433   /* If "dirstruct" is turned on (typically the case with -r), add
1434      the host and port (unless those have been turned off) and
1435      directory structure.  */
1436   if (opt.dirstruct)
1437     {
1438       if (opt.protocol_directories)
1439         {
1440           if (fnres.tail)
1441             append_char ('/', &fnres);
1442           append_string (supported_schemes[u->scheme].name, &fnres);
1443         }
1444       if (opt.add_hostdir)
1445         {
1446           if (fnres.tail)
1447             append_char ('/', &fnres);
1448           if (0 != strcmp (u->host, ".."))
1449             append_string (u->host, &fnres);
1450           else
1451             /* Host name can come from the network; malicious DNS may
1452                allow ".." to be resolved, causing us to write to
1453                "../<file>".  Defang such host names.  */
1454             append_string ("%2E%2E", &fnres);
1455           if (u->port != scheme_default_port (u->scheme))
1456             {
1457               char portstr[24];
1458               number_to_string (portstr, u->port);
1459               append_char (FN_PORT_SEP, &fnres);
1460               append_string (portstr, &fnres);
1461             }
1462         }
1463
1464       append_dir_structure (u, &fnres);
1465     }
1466
1467   /* Add the file name. */
1468   if (fnres.tail)
1469     append_char ('/', &fnres);
1470   u_file = *u->file ? u->file : "index.html";
1471   append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
1472
1473   /* Append "?query" to the file name. */
1474   u_query = u->query && *u->query ? u->query : NULL;
1475   if (u_query)
1476     {
1477       append_char (FN_QUERY_SEP, &fnres);
1478       append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
1479     }
1480
1481   /* Zero-terminate the file name. */
1482   append_char ('\0', &fnres);
1483
1484   fname = fnres.base;
1485
1486   /* Check the cases in which the unique extensions are not used:
1487      1) Clobbering is turned off (-nc).
1488      2) Retrieval with regetting.
1489      3) Timestamping is used.
1490      4) Hierarchy is built.
1491
1492      The exception is the case when file does exist and is a
1493      directory (see `mkalldirs' for explanation).  */
1494
1495   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1496       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1497     return fname;
1498
1499   unique = unique_name (fname, 1);
1500   if (unique != fname)
1501     xfree (fname);
1502   return unique;
1503 }
1504 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return non-zero if PATH has been modified, zero otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  A ".." that has nothing left to back up
   over is kept literally.  Single leading and trailing slashes are
   preserved.

   For example, "a/b/c/./../d/.." will yield "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.  */

static int
path_simplify (char *path)
{
  char *src = path;             /* read position (the hare) */
  char *dst = path;             /* write position (the tortoise) */
  char *floor = path;           /* DST may not retreat below this */
  char *end = path + strlen (path);

  while (src < end)
    {
      /* SRC is at the beginning of a path element here.  */
      int is_dot = (src[0] == '.' && (src[1] == '/' || src[1] == '\0'));
      int is_dotdot = (!is_dot
                       && src[0] == '.' && src[1] == '.'
                       && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./" altogether.  */
          src += 2;
          continue;
        }

      if (is_dotdot && dst > floor)
        {
          /* Handle "../" by retreating DST to the beginning of the
             previous path element, or to FLOOR, whichever comes
             first.  */
          do
            --dst;
          while (dst > floor && dst[-1] != '/');
          src += 3;
          continue;
        }

      if (is_dotdot)
        /* Nothing to back up over: keep this "../" literally and move
           FLOOR past it so that a later ".." doesn't remove it.  */
        floor = dst + 3;

      /* A regular path element (or a literal "..").  */
      if (dst == src)
        {
          /* Nothing has been removed so far; just step over the
             element, including its slash, without writing.  */
          while (src < end && *src != '/')
            src++;
          if (src < end)
            src++;
          dst = src;
        }
      else
        {
          /* Copy the element, including its final slash, down to the
             write position.  */
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  /* The positions differ exactly when something was removed; only
     then does the string need a new terminator.  */
  if (dst != src)
    *dst = '\0';

  return dst != src;
}
1587 \f
/* Return the length of URL's path, i.e. the number of leading
   characters of URL that precede the first '?', ';' or '#', or the
   whole length of the string if none of them occurs.  */

static int
path_length (const char *url)
{
  /* strcspn yields the length of the initial segment free of the
     terminator characters, stopping at the end of string otherwise.  */
  return (int) strcspn (url, "?;#");
}
1598
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The byte at E itself is outside the
   range and is never examined, while the byte at B is.  We might want
   to use memrchr (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing, so that the scan covers e[-1] down to
     b[0] inclusive.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1611
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
merge_prefix_with_link (const char *base, int span,
                        const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is deliberately not run through path_simplify: that
   would complicate the code, and url_parse has to simplify the path
   anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK that carries its own scheme stands on its own.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return merge_prefix_with_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Only the fragment of BASE, if any, is replaced.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *frag = strchr (base, '#');
      if (!frag)
        frag = base + strlen (base);
      return merge_prefix_with_link (base, frag - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from that point; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *slash;
      const char *start_insert;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore one
         double slash: if the first slash found starts "//", search
         again right after it.  */
      slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first / after "//", or the first slash
         altogether.  Keep in mind that LINK begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo", "foo/bar" -- replace from the start.  */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash.  */
        start_insert = slash;
      else
        /* example: "http://foo" -- append at the end of the path.  */
        start_insert = end;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything after
     last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    char *merge;
    int span;

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      return merge_prefix_with_link (base, 0, link, linklength);

    if (last_slash >= base + 2
        && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the last slash belongs to "://".
           Copy one byte more than the path and turn that byte into
           the slash that must separate the host from LINK.  */
        span = (end + 1) - base;
        merge = merge_prefix_with_link (base, span, link, linklength);
        merge[span - 1] = '/';
        return merge;
      }

    /* example: "whatever/foo/bar" -- keep everything up to and
       including the last slash.  */
    return merge_prefix_with_link (base, (last_slash + 1) - base,
                                   link, linklength);
  }
}
1802 \f
/* Copy the zero-terminated string S (without its terminator) to the
   position P and advance P past the copied bytes.  S is evaluated
   exactly once, and the macro's locals carry a trailing underscore so
   that they cannot capture identifiers used in the arguments.  P must
   be a modifiable pointer lvalue.  */
#define APPEND(p, s) do {                       \
  const char *append_s_ = (s);                  \
  int append_len_ = strlen (append_s_);         \
  memcpy ((p), append_s_, append_len_);         \
  (p) += append_len_;                           \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1814
1815 /* Recreate the URL string from the data in URL.
1816
1817    If HIDE is non-zero (as it is when we're calling this on a URL we
1818    plan to print, but not when calling it to canonicalize a URL for
1819    use within the program), password will be hidden.  Unsafe
1820    characters in the URL will be quoted.  */
1821
1822 char *
1823 url_string (const struct url *url, int hide_password)
1824 {
1825   int size;
1826   char *result, *p;
1827   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1828
1829   int scheme_port  = supported_schemes[url->scheme].default_port;
1830   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1831   int fplen = full_path_length (url);
1832
1833   int brackets_around_host;
1834
1835   assert (scheme_str != NULL);
1836
1837   /* Make sure the user name and password are quoted. */
1838   if (url->user)
1839     {
1840       quoted_user = url_escape_allow_passthrough (url->user);
1841       if (url->passwd)
1842         {
1843           if (hide_password)
1844             quoted_passwd = HIDDEN_PASSWORD;
1845           else
1846             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1847         }
1848     }
1849
1850   /* In the unlikely event that the host name contains non-printable
1851      characters, quote it for displaying to the user.  */
1852   quoted_host = url_escape_allow_passthrough (url->host);
1853
1854   /* Undo the quoting of colons that URL escaping performs.  IPv6
1855      addresses may legally contain colons, and in that case must be
1856      placed in square brackets.  */
1857   if (quoted_host != url->host)
1858     unescape_single_char (quoted_host, ':');
1859   brackets_around_host = strchr (quoted_host, ':') != NULL;
1860
1861   size = (strlen (scheme_str)
1862           + strlen (quoted_host)
1863           + (brackets_around_host ? 2 : 0)
1864           + fplen
1865           + 1);
1866   if (url->port != scheme_port)
1867     size += 1 + numdigit (url->port);
1868   if (quoted_user)
1869     {
1870       size += 1 + strlen (quoted_user);
1871       if (quoted_passwd)
1872         size += 1 + strlen (quoted_passwd);
1873     }
1874
1875   p = result = xmalloc (size);
1876
1877   APPEND (p, scheme_str);
1878   if (quoted_user)
1879     {
1880       APPEND (p, quoted_user);
1881       if (quoted_passwd)
1882         {
1883           *p++ = ':';
1884           APPEND (p, quoted_passwd);
1885         }
1886       *p++ = '@';
1887     }
1888
1889   if (brackets_around_host)
1890     *p++ = '[';
1891   APPEND (p, quoted_host);
1892   if (brackets_around_host)
1893     *p++ = ']';
1894   if (url->port != scheme_port)
1895     {
1896       *p++ = ':';
1897       p = number_to_string (p, url->port);
1898     }
1899
1900   full_path_write (url, p);
1901   p += fplen;
1902   *p++ = '\0';
1903
1904   assert (p - result == size);
1905
1906   if (quoted_user && quoted_user != url->user)
1907     xfree (quoted_user);
1908   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1909     xfree (quoted_passwd);
1910   if (quoted_host != url->host)
1911     xfree (quoted_host);
1912
1913   return result;
1914 }
1915 \f
1916 /* Return non-zero if scheme a is similar to scheme b.
1917
1918    Schemes are similar if they are equal.  If SSL is supported, schemes
1919    are also similar if one is http (SCHEME_HTTP) and the other is https
1920    (SCHEME_HTTPS).  */
1921 int
1922 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1923 {
1924   if (a == b)
1925     return 1;
1926 #ifdef HAVE_SSL
1927   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1928       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1929     return 1;
1930 #endif
1931   return 0;
1932 }
1933 \f
1934 #if 0
1935 /* Debugging and testing support for path_simplify. */
1936
/* Debugger aid: duplicate PATH with xstrdup, run path_simplify on the
   duplicate and return the duplicate.  Handy for calling from the
   debugger.  */
static char *
ps (char *path)
{
  char *simplified;

  simplified = xstrdup (path);
  (void) path_simplify (simplified);
  return simplified;
}
1946
/* Run path_simplify on a private copy of TEST and print a diagnostic
   to stdout when the simplified string differs from EXPECTED_RESULT,
   or when the value returned by path_simplify differs from
   EXPECTED_CHANGE.  Nothing is printed when both agree.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *work = xstrdup (test);
  int changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    printf (expected_change == 1
            ? "Expected modification with path_simplify(\"%s\").\n"
            : "Expected no modification with path_simplify(\"%s\").\n",
            test);

  xfree (work);
}
1969
/* Feed a fixed table of cases through run_test.  Each entry holds the
   path given to path_simplify, the string expected afterwards, and
   the expected return value (1 when the path should be modified, 0
   when it should be left alone).  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",                       "",             0 },
    { ".",                      "",             1 },
    { "./",                     "",             1 },
    { "..",                     "..",           0 },
    { "../",                    "../",          0 },
    { "foo",                    "foo",          0 },
    { "foo/bar",                "foo/bar",      0 },
    { "foo///bar",              "foo///bar",    0 },
    { "foo/.",                  "foo/",         1 },
    { "foo/./",                 "foo/",         1 },
    { "foo./",                  "foo./",        0 },
    { "foo/../bar",             "bar",          1 },
    { "foo/../bar/",            "bar/",         1 },
    { "foo/bar/..",             "foo/",         1 },
    { "foo/bar/../x",           "foo/x",        1 },
    { "foo/bar/../x/",          "foo/x/",       1 },
    { "foo/..",                 "",             1 },
    { "foo/../..",              "..",           1 },
    { "foo/../../..",           "../..",        1 },
    { "foo/../../bar/../../baz", "../../baz",   1 },
    { "a/b/../../c",            "c",            1 },
    { "./a/../b",               "b",            1 }
  };
  int idx = 0;

  while (idx < countof (tests))
    {
      run_test (tests[idx].test,
                tests[idx].result,
                tests[idx].should_modify);
      idx++;
    }
}
2010 #endif