sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996-2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 enum {
  47   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  48   scm_has_params = 2,           /* whether scheme has ;params */
  49   scm_has_query = 4,            /* whether scheme has ?query */
  50   scm_has_fragment = 8          /* whether scheme has #fragment */
  51 };
  52
  53 struct scheme_data
  54 {
  55   /* Short name of the scheme, such as "http" or "ftp". */
  56   const char *name;
  57   /* Leading string that identifies the scheme, such as "https://". */
  58   const char *leading_string;
  59   /* Default port of the scheme when none is specified. */
  60   int default_port;
  61   /* Various flags. */
  62   int flags;
  63 };
  64
  65 /* Supported schemes: */
  66 static struct scheme_data supported_schemes[] =
  67 {
  68   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  69 #ifdef HAVE_SSL
  70   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  71 #endif
  72   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  73
  74   /* SCHEME_INVALID */
  75   { NULL,       NULL,       -1,                 0 }
  76 };
  77
  78 /* Forward declarations: */
  79
  80 static bool path_simplify (char *);
  81 \f
  82 /* Support for escaping and unescaping of URL strings.  */
  83
  84 /* Table of "reserved" and "unsafe" characters.  Those terms are
  85    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  86    specs, but the general idea remains.
  87
  88    A reserved character is the one that you can't decode without
  89    changing the meaning of the URL.  For example, you can't decode
  90    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  91    path components is different.  Non-reserved characters can be
  92    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  93    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  94    as recommended by rfc2396, and minus "~", which is very frequently
  95    used (and sometimes unrecognized as %7E by broken servers).
  96
  97    An unsafe character is the one that should be encoded when URLs are
  98    placed in foreign environments.  E.g. space and newline are unsafe
  99    in HTTP contexts because HTTP uses them as separator and line
 100    terminator, so they must be encoded to %20 and %0A respectively.
 101    "*" is unsafe in shell context, etc.
 102
 103    We determine whether a character is unsafe through static table
 104    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 105
 106 enum {
 107   /* rfc1738 reserved chars + "$" and ",".  */
 108   urlchr_reserved = 1,
 109
 110   /* rfc1738 unsafe chars, plus non-printables.  */
 111   urlchr_unsafe   = 2
 112 };
 113
 114 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 115 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 116 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 117
 118 /* Shorthands for the table: */
 119 #define R  urlchr_reserved
 120 #define U  urlchr_unsafe
 121 #define RU R|U
 122
 123 static const unsigned char urlchr_table[256] =
 124 {
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 127   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 128   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 129   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 130   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 132   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 133  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 134   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 136   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 137   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 140   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 141
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 151 };
 152 #undef R
 153 #undef U
 154 #undef RU
 155
 156 /* URL-unescape the string S.
 157
 158    This is done by transforming the sequences "%HH" to the character
 159    represented by the hexadecimal digits HH.  If % is not followed by
 160    two hexadecimal digits, it is inserted literally.
 161
 162    The transformation is done in place.  If you need the original
 163    string intact, make a copy before calling this function.  */
 164
 165 static void
 166 url_unescape (char *s)
 167 {
 168   char *t = s;                  /* t - tortoise */
 169   char *h = s;                  /* h - hare     */
 170
 171   for (; *h; h++, t++)
 172     {
 173       if (*h != '%')
 174         {
 175         copychar:
 176           *t = *h;
 177         }
 178       else
 179         {
 180           char c;
 181           /* Do nothing if '%' is not followed by two hex digits. */
 182           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 183             goto copychar;
 184           c = X2DIGITS_TO_NUM (h[1], h[2]);
 185           /* Don't unescape %00 because there is no way to insert it
 186              into a C string without effectively truncating it. */
 187           if (c == '\0')
 188             goto copychar;
 189           *t = c;
 190           h += 2;
 191         }
 192     }
 193   *t = '\0';
 194 }
 195
 196 /* The core of url_escape_* functions.  Escapes the characters that
 197    match the provided mask in urlchr_table.
 198
 199    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 200    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 201    allocated string will be returned in all cases.  */
 202
 203 static char *
 204 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 205 {
 206   const char *p1;
 207   char *p2, *newstr;
 208   int newlen;
 209   int addition = 0;
 210
 211   for (p1 = s; *p1; p1++)
 212     if (urlchr_test (*p1, mask))
 213       addition += 2;            /* Two more characters (hex digits) */
 214
 215   if (!addition)
 216     return allow_passthrough ? (char *)s : xstrdup (s);
 217
 218   newlen = (p1 - s) + addition;
 219   newstr = xmalloc (newlen + 1);
 220
 221   p1 = s;
 222   p2 = newstr;
 223   while (*p1)
 224     {
 225       /* Quote the characters that match the test mask. */
 226       if (urlchr_test (*p1, mask))
 227         {
 228           unsigned char c = *p1++;
 229           *p2++ = '%';
 230           *p2++ = XNUM_TO_DIGIT (c >> 4);
 231           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 232         }
 233       else
 234         *p2++ = *p1++;
 235     }
 236   assert (p2 - newstr == newlen);
 237   *p2 = '\0';
 238
 239   return newstr;
 240 }
 241
 242 /* URL-escape the unsafe characters (see urlchr_table) in a given
 243    string, returning a freshly allocated string.  */
 244
 245 char *
 246 url_escape (const char *s)
 247 {
 248   return url_escape_1 (s, urlchr_unsafe, false);
 249 }
 250
 251 /* URL-escape the unsafe characters (see urlchr_table) in a given
 252    string.  If no characters are unsafe, S is returned.  */
 253
 254 static char *
 255 url_escape_allow_passthrough (const char *s)
 256 {
 257   return url_escape_1 (s, urlchr_unsafe, true);
 258 }
 259 \f
 260 /* Decide whether the char at position P needs to be encoded.  (It is
 261    not enough to pass a single char *P because the function may need
 262    to inspect the surrounding context.)
 263
 264    Return true if the char should be escaped as %XX, false otherwise.  */
 265
 266 static inline bool
 267 char_needs_escaping (const char *p)
 268 {
 269   if (*p == '%')
 270     {
 271       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 272         return false;
 273       else
 274         /* Garbled %.. sequence: encode `%'. */
 275         return true;
 276     }
 277   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 278     return true;
 279   else
 280     return false;
 281 }
 282
 283 /* Translate a %-escaped (but possibly non-conformant) input string S
 284    into a %-escaped (and conformant) output string.  If no characters
 285    are encoded or decoded, return the same string S; otherwise, return
 286    a freshly allocated string with the new contents.
 287
 288    After a URL has been run through this function, the protocols that
 289    use `%' as the quote character can use the resulting string as-is,
 290    while those that don't can use url_unescape to get to the intended
 291    data.  This function is stable: once the input is transformed,
 292    further transformations of the result yield the same output.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 297    a raw space character would mess up the HTTP request, it needs to
 298    be quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It would appear that the unsafe chars need to be quoted, for
 303    example with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leads to the conclusion that
 308    in that case Wget should not call url_escape, but leave the `%20'
 309    as is.  This is clearly contradictory, but it only gets worse.
 310
 311    What if the requested URI is `abc%20 def'?  If we call url_escape,
 312    we end up with `/abc%2520%20def', which is almost certainly not
 313    intended.  If we don't call url_escape, we are left with the
 314    embedded space and cannot complete the request.  What the user
 315    meant was for Wget to request `/abc%20%20def', and this is where
 316    reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses a modified version of the algorithm originally
 331    proposed by Anon Sricharoenchai:
 332
 333    * Encode all "unsafe" characters, except those that are also
 334      "reserved", to %XX.  See urlchr_table for which characters are
 335      unsafe and reserved.
 336
 337    * Encode the "%" characters not followed by two hex digits to
 338      "%25".
 339
 340    * Pass through all other characters and %XX escapes as-is.  (Up to
 341      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 342      characters, but that was obtrusive and broke some servers.)
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369
 370   /* First pass: inspect the string to see if there's anything to do,
 371      and to calculate the new length.  */
 372   for (p1 = s; *p1; p1++)
 373     if (char_needs_escaping (p1))
 374       ++encode_count;
 375
 376   if (!encode_count)
 377     /* The string is good as it is. */
 378     return (char *) s;          /* C const model sucks. */
 379
 380   oldlen = p1 - s;
 381   /* Each encoding adds two characters (hex digits).  */
 382   newlen = oldlen + 2 * encode_count;
 383   newstr = xmalloc (newlen + 1);
 384
 385   /* Second pass: copy the string to the destination address, encoding
 386      chars when needed.  */
 387   p1 = s;
 388   p2 = newstr;
 389
 390   while (*p1)
 391     if (char_needs_escaping (p1))
 392       {
 393         unsigned char c = *p1++;
 394         *p2++ = '%';
 395         *p2++ = XNUM_TO_DIGIT (c >> 4);
 396         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 397       }
 398     else
 399       *p2++ = *p1++;
 400
 401   *p2 = '\0';
 402   assert (p2 - newstr == newlen);
 403   return newstr;
 404 }
 405 \f
 406 /* Returns the scheme type if the scheme is supported, or
 407    SCHEME_INVALID if not.  */
 408
 409 enum url_scheme
 410 url_scheme (const char *url)
 411 {
 412   int i;
 413
 414   for (i = 0; supported_schemes[i].leading_string; i++)
 415     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 416                           strlen (supported_schemes[i].leading_string)))
 417       {
 418         if (!(supported_schemes[i].flags & scm_disabled))
 419           return (enum url_scheme) i;
 420         else
 421           return SCHEME_INVALID;
 422       }
 423
 424   return SCHEME_INVALID;
 425 }
 426
 427 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 428
 429 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 430    currently implemented, it returns true if URL begins with
 431    [-+a-zA-Z0-9]+: .  */
 432
 433 bool
 434 url_has_scheme (const char *url)
 435 {
 436   const char *p = url;
 437
 438   /* The first char must be a scheme char. */
 439   if (!*p || !SCHEME_CHAR (*p))
 440     return false;
 441   ++p;
 442   /* Followed by 0 or more scheme chars. */
 443   while (*p && SCHEME_CHAR (*p))
 444     ++p;
 445   /* Terminated by ':'. */
 446   return *p == ':';
 447 }
 448
 449 int
 450 scheme_default_port (enum url_scheme scheme)
 451 {
 452   return supported_schemes[scheme].default_port;
 453 }
 454
 455 void
 456 scheme_disable (enum url_scheme scheme)
 457 {
 458   supported_schemes[scheme].flags |= scm_disabled;
 459 }
 460
 461 /* Skip the username and password, if present in the URL.  The
 462    function should *not* be called with the complete URL, but with the
 463    portion after the scheme.
 464
 465    If no username and password are found, return URL.  */
 466
 467 static const char *
 468 url_skip_credentials (const char *url)
 469 {
 470   /* Look for '@' that comes before terminators, such as '/', '?',
 471      '#', or ';'.  */
 472   const char *p = (const char *)strpbrk (url, "@/?#;");
 473   if (!p || *p != '@')
 474     return url;
 475   return p + 1;
 476 }
 477
 478 /* Parse credentials contained in [BEG, END).  The region is expected
 479    to have come from a URL and is unescaped.  */
 480
 481 static bool
 482 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 483 {
 484   char *colon;
 485   const char *userend;
 486
 487   if (beg == end)
 488     return false;               /* empty user name */
 489
 490   colon = memchr (beg, ':', end - beg);
 491   if (colon == beg)
 492     return false;               /* again empty user name */
 493
 494   if (colon)
 495     {
 496       *passwd = strdupdelim (colon + 1, end);
 497       userend = colon;
 498       url_unescape (*passwd);
 499     }
 500   else
 501     {
 502       *passwd = NULL;
 503       userend = end;
 504     }
 505   *user = strdupdelim (beg, userend);
 506   url_unescape (*user);
 507   return true;
 508 }
 509
 510 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 511    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 512
 513    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 514    www.foo.com[:port]            -> http://www.foo.com[:port]
 515
 516    FTP shorthands look like this:
 517
 518    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 519    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 520
 521    If the URL needs not or cannot be rewritten, return NULL.  */
 522
 523 char *
 524 rewrite_shorthand_url (const char *url)
 525 {
 526   const char *p;
 527
 528   if (url_scheme (url) != SCHEME_INVALID)
 529     return NULL;
 530
 531   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 532      latter Netscape.  */
 533   for (p = url; *p && *p != ':' && *p != '/'; p++)
 534     ;
 535
 536   if (p == url)
 537     return NULL;
 538
 539   /* If we're looking at "://", it means the URL uses a scheme we
 540      don't support, which may include "https" when compiled without
 541      SSL support.  Don't bogusly rewrite such URLs.  */
 542   if (p[0] == ':' && p[1] == '/' && p[2] == '/')
 543     return NULL;
 544
 545   if (*p == ':')
 546     {
 547       const char *pp;
 548       char *res;
 549       /* If the characters after the colon and before the next slash
 550          or end of string are all digits, it's HTTP.  */
 551       int digits = 0;
 552       for (pp = p + 1; ISDIGIT (*pp); pp++)
 553         ++digits;
 554       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 555         goto http;
 556
 557       /* Prepend "ftp://" to the entire URL... */
 558       res = xmalloc (6 + strlen (url) + 1);
 559       sprintf (res, "ftp://%s", url);
 560       /* ...and replace ':' with '/'. */
 561       res[6 + (p - url)] = '/';
 562       return res;
 563     }
 564   else
 565     {
 566       char *res;
 567     http:
 568       /* Just prepend "http://" to what we have. */
 569       res = xmalloc (7 + strlen (url) + 1);
 570       sprintf (res, "http://%s", url);
 571       return res;
 572     }
 573 }
 574 \f
 575 static void split_path (const char *, char **, char **);
 576
 577 /* Like strpbrk, with the exception that it returns the pointer to the
 578    terminating zero (end-of-string aka "eos") if no matching character
 579    is found.
 580
 581    Although I normally balk at Gcc-specific optimizations, it probably
 582    makes sense here: glibc has optimizations that detect strpbrk being
 583    called with literal string as ACCEPT and inline the search.  That
 584    optimization is defeated if strpbrk is hidden within the call to
 585    another function.  (And no, making strpbrk_or_eos inline doesn't
 586    help because the check for literal accept is in the
 587    preprocessor.)  */
 588
 589 #if defined(__GNUC__) && __GNUC__ >= 3
 590
 591 #define strpbrk_or_eos(s, accept) ({            \
 592   char *SOE_p = strpbrk (s, accept);            \
 593   if (!SOE_p)                                   \
 594     SOE_p = strchr (s, '\0');                   \
 595   SOE_p;                                        \
 596 })
 597
 598 #else  /* not __GNUC__ or old gcc */
 599
 600 static inline char *
 601 strpbrk_or_eos (const char *s, const char *accept)
 602 {
 603   char *p = strpbrk (s, accept);
 604   if (!p)
 605     p = strchr (s, '\0');
 606   return p;
 607 }
 608 #endif /* not __GNUC__ or old gcc */
 609
 610 /* Turn STR into lowercase; return true if a character was actually
 611    changed. */
 612
 613 static bool
 614 lowercase_str (char *str)
 615 {
 616   bool changed = false;
 617   for (; *str; str++)
 618     if (ISUPPER (*str))
 619       {
 620         changed = true;
 621         *str = TOLOWER (*str);
 622       }
 623   return changed;
 624 }
 625
 626 static const char *
 627 init_seps (enum url_scheme scheme)
 628 {
 629   static char seps[8] = ":/";
 630   char *p = seps + 2;
 631   int flags = supported_schemes[scheme].flags;
 632
 633   if (flags & scm_has_params)
 634     *p++ = ';';
 635   if (flags & scm_has_query)
 636     *p++ = '?';
 637   if (flags & scm_has_fragment)
 638     *p++ = '#';
 639   *p++ = '\0';
 640   return seps;
 641 }
 642
 643 static const char *parse_errors[] = {
 644 #define PE_NO_ERROR                     0
 645   N_("No error"),
 646 #define PE_UNSUPPORTED_SCHEME           1
 647   N_("Unsupported scheme"),
 648 #define PE_INVALID_HOST_NAME            2
 649   N_("Invalid host name"),
 650 #define PE_BAD_PORT_NUMBER              3
 651   N_("Bad port number"),
 652 #define PE_INVALID_USER_NAME            4
 653   N_("Invalid user name"),
 654 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 655   N_("Unterminated IPv6 numeric address"),
 656 #define PE_IPV6_NOT_SUPPORTED           6
 657   N_("IPv6 addresses not supported"),
 658 #define PE_INVALID_IPV6_ADDRESS         7
 659   N_("Invalid IPv6 numeric address")
 660 };
 661
 662 /* Parse a URL.
 663
 664    Return a new struct url if successful, NULL on error.  In case of
 665    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 666    error code. */
 667 struct url *
 668 url_parse (const char *url, int *error)
 669 {
 670   struct url *u;
 671   const char *p;
 672   bool path_modified, host_modified;
 673
 674   enum url_scheme scheme;
 675   const char *seps;
 676
 677   const char *uname_b,     *uname_e;
 678   const char *host_b,      *host_e;
 679   const char *path_b,      *path_e;
 680   const char *params_b,    *params_e;
 681   const char *query_b,     *query_e;
 682   const char *fragment_b,  *fragment_e;
 683
 684   int port;
 685   char *user = NULL, *passwd = NULL;
 686
 687   char *url_encoded = NULL;
 688
 689   int error_code;
 690
 691   scheme = url_scheme (url);
 692   if (scheme == SCHEME_INVALID)
 693     {
 694       error_code = PE_UNSUPPORTED_SCHEME;
 695       goto error;
 696     }
 697
 698   url_encoded = reencode_escapes (url);
 699   p = url_encoded;
 700
 701   p += strlen (supported_schemes[scheme].leading_string);
 702   uname_b = p;
 703   p = url_skip_credentials (p);
 704   uname_e = p;
 705
 706   /* scheme://user:pass@host[:port]... */
 707   /*                    ^              */
 708
 709   /* We attempt to break down the URL into the components path,
 710      params, query, and fragment.  They are ordered like this:
 711
 712        scheme://host[:port][/path][;params][?query][#fragment]  */
 713
 714   path_b     = path_e     = NULL;
 715   params_b   = params_e   = NULL;
 716   query_b    = query_e    = NULL;
 717   fragment_b = fragment_e = NULL;
 718
 719   /* Initialize separators for optional parts of URL, depending on the
 720      scheme.  For example, FTP has params, and HTTP and HTTPS have
 721      query string and fragment. */
 722   seps = init_seps (scheme);
 723
 724   host_b = p;
 725
 726   if (*p == '[')
 727     {
 728       /* Handle IPv6 address inside square brackets.  Ideally we'd
 729          just look for the terminating ']', but rfc2732 mandates
 730          rejecting invalid IPv6 addresses.  */
 731
 732       /* The address begins after '['. */
 733       host_b = p + 1;
 734       host_e = strchr (host_b, ']');
 735
 736       if (!host_e)
 737         {
 738           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 739           goto error;
 740         }
 741
 742 #ifdef ENABLE_IPV6
 743       /* Check if the IPv6 address is valid. */
 744       if (!is_valid_ipv6_address(host_b, host_e))
 745         {
 746           error_code = PE_INVALID_IPV6_ADDRESS;
 747           goto error;
 748         }
 749
 750       /* Continue parsing after the closing ']'. */
 751       p = host_e + 1;
 752 #else
 753       error_code = PE_IPV6_NOT_SUPPORTED;
 754       goto error;
 755 #endif
 756
 757       /* The closing bracket must be followed by a separator or by the
 758          null char.  */
 759       /* http://[::1]... */
 760       /*             ^   */
 761       if (!strchr (seps, *p))
 762         {
 763           /* Trailing garbage after []-delimited IPv6 address. */
 764           error_code = PE_INVALID_HOST_NAME;
 765           goto error;
 766         }
 767     }
 768   else
 769     {
 770       p = strpbrk_or_eos (p, seps);
 771       host_e = p;
 772     }
 773   ++seps;                       /* advance to '/' */
 774
 775   if (host_b == host_e)
 776     {
 777       error_code = PE_INVALID_HOST_NAME;
 778       goto error;
 779     }
 780
 781   port = scheme_default_port (scheme);
 782   if (*p == ':')
 783     {
 784       const char *port_b, *port_e, *pp;
 785
 786       /* scheme://host:port/tralala */
 787       /*              ^             */
 788       ++p;
 789       port_b = p;
 790       p = strpbrk_or_eos (p, seps);
 791       port_e = p;
 792
 793       /* Allow empty port, as per rfc2396. */
 794       if (port_b != port_e)
 795         for (port = 0, pp = port_b; pp < port_e; pp++)
 796           {
 797             if (!ISDIGIT (*pp))
 798               {
 799                 /* http://host:12randomgarbage/blah */
 800                 /*               ^                  */
 801                 error_code = PE_BAD_PORT_NUMBER;
 802                 goto error;
 803               }
 804             port = 10 * port + (*pp - '0');
 805             /* Check for too large port numbers here, before we have
 806                a chance to overflow on bogus port values.  */
 807             if (port > 0xffff)
 808               {
 809                 error_code = PE_BAD_PORT_NUMBER;
 810                 goto error;
 811               }
 812           }
 813     }
 814   /* Advance to the first separator *after* '/' (either ';' or '?',
 815      depending on the scheme).  */
 816   ++seps;
 817
 818   /* Get the optional parts of URL, each part being delimited by
 819      current location and the position of the next separator.  */
 820 #define GET_URL_PART(sepchar, var) do {                         \
 821   if (*p == sepchar)                                            \
 822     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 823   ++seps;                                                       \
 824 } while (0)
 825
 826   GET_URL_PART ('/', path);
 827   if (supported_schemes[scheme].flags & scm_has_params)
 828     GET_URL_PART (';', params);
 829   if (supported_schemes[scheme].flags & scm_has_query)
 830     GET_URL_PART ('?', query);
 831   if (supported_schemes[scheme].flags & scm_has_fragment)
 832     GET_URL_PART ('#', fragment);
 833
 834 #undef GET_URL_PART
 835   assert (*p == 0);
 836
 837   if (uname_b != uname_e)
 838     {
 839       /* http://user:pass@host */
 840       /*        ^         ^    */
 841       /*     uname_b   uname_e */
 842       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 843         {
 844           error_code = PE_INVALID_USER_NAME;
 845           goto error;
 846         }
 847     }
 848
 849   u = xnew0 (struct url);
 850   u->scheme = scheme;
 851   u->host   = strdupdelim (host_b, host_e);
 852   u->port   = port;
 853   u->user   = user;
 854   u->passwd = passwd;
 855
 856   u->path = strdupdelim (path_b, path_e);
 857   path_modified = path_simplify (u->path);
 858   split_path (u->path, &u->dir, &u->file);
 859
 860   host_modified = lowercase_str (u->host);
 861
 862   /* Decode %HH sequences in host name.  This is important not so much
 863      to support %HH sequences in host names (which other browser
 864      don't), but to support binary characters (which will have been
 865      converted to %HH by reencode_escapes).  */
 866   if (strchr (u->host, '%'))
 867     {
 868       url_unescape (u->host);
 869       host_modified = true;
 870     }
 871
 872   if (params_b)
 873     u->params = strdupdelim (params_b, params_e);
 874   if (query_b)
 875     u->query = strdupdelim (query_b, query_e);
 876   if (fragment_b)
 877     u->fragment = strdupdelim (fragment_b, fragment_e);
 878
 879   if (path_modified || u->fragment || host_modified || path_b == path_e)
 880     {
 881       /* If we suspect that a transformation has rendered what
 882          url_string might return different from URL_ENCODED, rebuild
 883          u->url using url_string.  */
 884       u->url = url_string (u, false);
 885
 886       if (url_encoded != url)
 887         xfree ((char *) url_encoded);
 888     }
 889   else
 890     {
 891       if (url_encoded == url)
 892         u->url = xstrdup (url);
 893       else
 894         u->url = url_encoded;
 895     }
 896
 897   return u;
 898
 899  error:
 900   /* Cleanup in case of error: */
 901   if (url_encoded && url_encoded != url)
 902     xfree (url_encoded);
 903
 904   /* Transmit the error code to the caller, if the caller wants to
 905      know.  */
 906   if (error)
 907     *error = error_code;
 908   return NULL;
 909 }
 910
 911 /* Return the error message string from ERROR_CODE, which should have
 912    been retrieved from url_parse.  The error message is translated.  */
 913
 914 const char *
 915 url_error (int error_code)
 916 {
 917   assert (error_code >= 0 && error_code < countof (parse_errors));
 918   return _(parse_errors[error_code]);
 919 }
 920
 921 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 922    expected to be URL-escaped.
 923
 924    The path is split into directory (the part up to the last slash)
 925    and file (the part after the last slash), which are subsequently
 926    unescaped.  Examples:
 927
 928    PATH                 DIR           FILE
 929    "foo/bar/baz"        "foo/bar"     "baz"
 930    "foo/bar/"           "foo/bar"     ""
 931    "foo"                ""            "foo"
 932    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 933
 934    DIR and FILE are freshly allocated.  */
 935
 936 static void
 937 split_path (const char *path, char **dir, char **file)
 938 {
 939   char *last_slash = strrchr (path, '/');
 940   if (!last_slash)
 941     {
 942       *dir = xstrdup ("");
 943       *file = xstrdup (path);
 944     }
 945   else
 946     {
 947       *dir = strdupdelim (path, last_slash);
 948       *file = xstrdup (last_slash + 1);
 949     }
 950   url_unescape (*dir);
 951   url_unescape (*file);
 952 }
 953
 954 /* Note: URL's "full path" is the path with the query string and
 955    params appended.  The "fragment" (#foo) is intentionally ignored,
 956    but that might be changed.  For example, if the original URL was
 957    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 958    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 959
 960 /* Return the length of the full path, without the terminating
 961    zero.  */
 962
 963 static int
 964 full_path_length (const struct url *url)
 965 {
 966   int len = 0;
 967
 968 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 969
 970   FROB (path);
 971   FROB (params);
 972   FROB (query);
 973
 974 #undef FROB
 975
 976   return len;
 977 }
 978
 979 /* Write out the full path. */
 980
 981 static void
 982 full_path_write (const struct url *url, char *where)
 983 {
 984 #define FROB(el, chr) do {                      \
 985   char *f_el = url->el;                         \
 986   if (f_el) {                                   \
 987     int l = strlen (f_el);                      \
 988     *where++ = chr;                             \
 989     memcpy (where, f_el, l);                    \
 990     where += l;                                 \
 991   }                                             \
 992 } while (0)
 993
 994   FROB (path, '/');
 995   FROB (params, ';');
 996   FROB (query, '?');
 997
 998 #undef FROB
 999 }
1000
/* Public entry point for obtaining the "full path" of URL as a
   zero-terminated string obtained from xmalloc.  For example, with
   u->path "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  full_path_write (url, buf);
  buf[len] = '\0';              /* full_path_write does not terminate */

  return buf;
}
1016
/* Unescape CHR in an otherwise escaped STR, in place.  Used to
   selectively undo the escaping of certain characters, such as "/"
   and ":".  Every "%XY" triplet whose two digits equal those produced
   by XNUM_TO_DIGIT for CHR is replaced by CHR itself; everything else
   is copied through unchanged.  Nothing is returned.

   NOTE(review): the comparison is exact, so it only recognizes the
   digit case that XNUM_TO_DIGIT generates -- presumably upper-case
   hex; confirm against its definition.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set
     cannot hand a negative nibble to XNUM_TO_DIGIT on platforms where
     plain char is signed.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */
  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating zero, so h[1] and h[2]
         are never read past the end of the string.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* the loop header skips the third char */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1040
1041 /* Escape unsafe and reserved characters, except for the slash
1042    characters.  */
1043
1044 static char *
1045 url_escape_dir (const char *dir)
1046 {
1047   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1048   if (newdir == dir)
1049     return (char *)dir;
1050
1051   unescape_single_char (newdir, '/');
1052   return newdir;
1053 }
1054
1055 /* Sync u->path and u->url with u->dir and u->file.  Called after
1056    u->file or u->dir have been changed, typically by the FTP code.  */
1057
1058 static void
1059 sync_path (struct url *u)
1060 {
1061   char *newpath, *efile, *edir;
1062
1063   xfree (u->path);
1064
1065   /* u->dir and u->file are not escaped.  URL-escape them before
1066      reassembling them into u->path.  That way, if they contain
1067      separators like '?' or even if u->file contains slashes, the
1068      path will be correctly assembled.  (u->file can contain slashes
1069      if the URL specifies it with %2f, or if an FTP server returns
1070      it.)  */
1071   edir = url_escape_dir (u->dir);
1072   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1073
1074   if (!*edir)
1075     newpath = xstrdup (efile);
1076   else
1077     {
1078       int dirlen = strlen (edir);
1079       int filelen = strlen (efile);
1080
1081       /* Copy "DIR/FILE" to newpath. */
1082       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1083       memcpy (p, edir, dirlen);
1084       p += dirlen;
1085       *p++ = '/';
1086       memcpy (p, efile, filelen);
1087       p += filelen;
1088       *p = '\0';
1089     }
1090
1091   u->path = newpath;
1092
1093   if (edir != u->dir)
1094     xfree (edir);
1095   if (efile != u->file)
1096     xfree (efile);
1097
1098   /* Regenerate u->url as well.  */
1099   xfree (u->url);
1100   u->url = url_string (u, false);
1101 }
1102
1103 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1104    This way we can sync u->path and u->url when they get changed.  */
1105
1106 void
1107 url_set_dir (struct url *url, const char *newdir)
1108 {
1109   xfree (url->dir);
1110   url->dir = xstrdup (newdir);
1111   sync_path (url);
1112 }
1113
1114 void
1115 url_set_file (struct url *url, const char *newfile)
1116 {
1117   xfree (url->file);
1118   url->file = xstrdup (newfile);
1119   sync_path (url);
1120 }
1121
1122 void
1123 url_free (struct url *url)
1124 {
1125   xfree (url->host);
1126   xfree (url->path);
1127   xfree (url->url);
1128
1129   xfree_null (url->params);
1130   xfree_null (url->query);
1131   xfree_null (url->fragment);
1132   xfree_null (url->user);
1133   xfree_null (url->passwd);
1134
1135   xfree (url->dir);
1136   xfree (url->file);
1137
1138   xfree (url);
1139 }
1140 \f
1141 /* Create all the necessary directories for PATH (a file).  Calls
1142    make_directory internally.  */
1143 int
1144 mkalldirs (const char *path)
1145 {
1146   const char *p;
1147   char *t;
1148   struct_stat st;
1149   int res;
1150
1151   p = path + strlen (path);
1152   for (; *p != '/' && p != path; p--)
1153     ;
1154
1155   /* Don't create if it's just a file.  */
1156   if ((p == path) && (*p != '/'))
1157     return 0;
1158   t = strdupdelim (path, p);
1159
1160   /* Check whether the directory exists.  */
1161   if ((stat (t, &st) == 0))
1162     {
1163       if (S_ISDIR (st.st_mode))
1164         {
1165           xfree (t);
1166           return 0;
1167         }
1168       else
1169         {
1170           /* If the dir exists as a file name, remove it first.  This
1171              is *only* for Wget to work with buggy old CERN http
1172              servers.  Here is the scenario: When Wget tries to
1173              retrieve a directory without a slash, e.g.
1174              http://foo/bar (bar being a directory), CERN server will
1175              not redirect it too http://foo/bar/ -- it will generate a
1176              directory listing containing links to bar/file1,
1177              bar/file2, etc.  Wget will lose because it saves this
1178              HTML listing to a file `bar', so it cannot create the
1179              directory.  To work around this, if the file of the same
1180              name exists, we just remove it and create the directory
1181              anyway.  */
1182           DEBUGP (("Removing %s because of directory danger!\n", t));
1183           unlink (t);
1184         }
1185     }
1186   res = make_directory (t);
1187   if (res != 0)
1188     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1189   xfree (t);
1190   return res;
1191 }
1192 \f
1193 /* Functions for constructing the file name out of URL components.  */
1194
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size of the buffer */
  int tail;                     /* offset at which the next append lands */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   The arguments are parenthesized so that an expression such as
   "a ? b : c" is added to the tail as a whole.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1223
1224 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1225    terminated.  */
1226
1227 static void
1228 append_string (const char *str, struct growable *dest)
1229 {
1230   int l = strlen (str);
1231   GROW (dest, l);
1232   memcpy (TAIL (dest), str, l);
1233   TAIL_INCR (dest, l);
1234 }
1235
1236 /* Append CH to DEST.  For example, append_char (0, DEST)
1237    zero-terminates DEST.  */
1238
1239 static void
1240 append_char (char ch, struct growable *dest)
1241 {
1242   GROW (dest, 1);
1243   *TAIL (dest) = ch;
1244   TAIL_INCR (dest, 1);
1245 }
1246
/* Classes of characters that must be quoted in file names.  The
   values are bit flags, so one character may belong to several
   classes.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C belongs to any of the classes in MASK.  C
   is converted to unsigned char before being used as a table index,
   so plain (possibly signed) chars are safe to pass.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is flagged as unusable on Windows, in agreement with the list
   given for filechr_not_windows, and DEL (127) is flagged as a
   control character like the rest of the non-printing range.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1304
/* FN_PORT_SEP separates the host from a non-standard port number in
   file names.  It is '+' when file names are restricted for Windows,
   which can't handle ':' in file names, and ':' otherwise, as in
   "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when file names are restricted for Windows, which cannot handle
   '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1315
1316 /* Quote path element, characters in [b, e), as file name, and append
1317    the quoted string to DEST.  Each character is quoted as per
1318    file_unsafe_char and the corresponding table.
1319
1320    If ESCAPED is true, the path element is considered to be
1321    URL-escaped and will be unescaped prior to inspection.  */
1322
1323 static void
1324 append_uri_pathel (const char *b, const char *e, bool escaped,
1325                    struct growable *dest)
1326 {
1327   const char *p;
1328   int quoted, outlen;
1329
1330   int mask;
1331   if (opt.restrict_files_os == restrict_unix)
1332     mask = filechr_not_unix;
1333   else
1334     mask = filechr_not_windows;
1335   if (opt.restrict_files_ctrl)
1336     mask |= filechr_control;
1337
1338   /* Copy [b, e) to PATHEL and URL-unescape it. */
1339   if (escaped)
1340     {
1341       char *unescaped;
1342       BOUNDED_TO_ALLOCA (b, e, unescaped);
1343       url_unescape (unescaped);
1344       b = unescaped;
1345       e = unescaped + strlen (unescaped);
1346     }
1347
1348   /* Defang ".." when found as component of path.  Remember that path
1349      comes from the URL and might contain malicious input.  */
1350   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1351     {
1352       b = "%2E%2E";
1353       e = b + 6;
1354     }
1355
1356   /* Walk the PATHEL string and check how many characters we'll need
1357      to quote.  */
1358   quoted = 0;
1359   for (p = b; p < e; p++)
1360     if (FILE_CHAR_TEST (*p, mask))
1361       ++quoted;
1362
1363   /* Calculate the length of the output string.  e-b is the input
1364      string length.  Each quoted char introduces two additional
1365      characters in the string, hence 2*quoted.  */
1366   outlen = (e - b) + (2 * quoted);
1367   GROW (dest, outlen);
1368
1369   if (!quoted)
1370     {
1371       /* If there's nothing to quote, we can simply append the string
1372          without processing it again.  */
1373       memcpy (TAIL (dest), b, outlen);
1374     }
1375   else
1376     {
1377       char *q = TAIL (dest);
1378       for (p = b; p < e; p++)
1379         {
1380           if (!FILE_CHAR_TEST (*p, mask))
1381             *q++ = *p;
1382           else
1383             {
1384               unsigned char ch = *p;
1385               *q++ = '%';
1386               *q++ = XNUM_TO_DIGIT (ch >> 4);
1387               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1388             }
1389         }
1390       assert (q - TAIL (dest) == outlen);
1391     }
1392   TAIL_INCR (dest, outlen);
1393 }
1394
1395 /* Append to DEST the directory structure that corresponds the
1396    directory part of URL's path.  For example, if the URL is
1397    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1398
1399    Each path element ("dir1" and "dir2" in the above example) is
1400    examined, url-unescaped, and re-escaped as file name element.
1401
1402    Additionally, it cuts as many directories from the path as
1403    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1404    will produce "bar" for the above example.  For 2 or more, it will
1405    produce "".
1406
1407    Each component of the path is quoted for use as file name.  */
1408
1409 static void
1410 append_dir_structure (const struct url *u, struct growable *dest)
1411 {
1412   char *pathel, *next;
1413   int cut = opt.cut_dirs;
1414
1415   /* Go through the path components, de-URL-quote them, and quote them
1416      (if necessary) as file names.  */
1417
1418   pathel = u->path;
1419   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1420     {
1421       if (cut-- > 0)
1422         continue;
1423       if (pathel == next)
1424         /* Ignore empty pathels.  */
1425         continue;
1426
1427       if (dest->tail)
1428         append_char ('/', dest);
1429       append_uri_pathel (pathel, next, true, dest);
1430     }
1431 }
1432
1433 /* Return a unique file name that matches the given URL as good as
1434    possible.  Does not create directories on the file system.  */
1435
1436 char *
1437 url_file_name (const struct url *u)
1438 {
1439   struct growable fnres;        /* stands for "file name result" */
1440
1441   const char *u_file, *u_query;
1442   char *fname, *unique;
1443
1444   fnres.base = NULL;
1445   fnres.size = 0;
1446   fnres.tail = 0;
1447
1448   /* Start with the directory prefix, if specified. */
1449   if (opt.dir_prefix)
1450     append_string (opt.dir_prefix, &fnres);
1451
1452   /* If "dirstruct" is turned on (typically the case with -r), add
1453      the host and port (unless those have been turned off) and
1454      directory structure.  */
1455   if (opt.dirstruct)
1456     {
1457       if (opt.protocol_directories)
1458         {
1459           if (fnres.tail)
1460             append_char ('/', &fnres);
1461           append_string (supported_schemes[u->scheme].name, &fnres);
1462         }
1463       if (opt.add_hostdir)
1464         {
1465           if (fnres.tail)
1466             append_char ('/', &fnres);
1467           if (0 != strcmp (u->host, ".."))
1468             append_string (u->host, &fnres);
1469           else
1470             /* Host name can come from the network; malicious DNS may
1471                allow ".." to be resolved, causing us to write to
1472                "../<file>".  Defang such host names.  */
1473             append_string ("%2E%2E", &fnres);
1474           if (u->port != scheme_default_port (u->scheme))
1475             {
1476               char portstr[24];
1477               number_to_string (portstr, u->port);
1478               append_char (FN_PORT_SEP, &fnres);
1479               append_string (portstr, &fnres);
1480             }
1481         }
1482
1483       append_dir_structure (u, &fnres);
1484     }
1485
1486   /* Add the file name. */
1487   if (fnres.tail)
1488     append_char ('/', &fnres);
1489   u_file = *u->file ? u->file : "index.html";
1490   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1491
1492   /* Append "?query" to the file name. */
1493   u_query = u->query && *u->query ? u->query : NULL;
1494   if (u_query)
1495     {
1496       append_char (FN_QUERY_SEP, &fnres);
1497       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1498     }
1499
1500   /* Zero-terminate the file name. */
1501   append_char ('\0', &fnres);
1502
1503   fname = fnres.base;
1504
1505   /* Check the cases in which the unique extensions are not used:
1506      1) Clobbering is turned off (-nc).
1507      2) Retrieval with regetting.
1508      3) Timestamping is used.
1509      4) Hierarchy is built.
1510
1511      The exception is the case when file does exist and is a
1512      directory (see `mkalldirs' for explanation).  */
1513
1514   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1515       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1516     return fname;
1517
1518   unique = unique_name (fname, true);
1519   if (unique != fname)
1520     xfree (fname);
1521   return unique;
1522 }
1523 \f
/* Resolve the "." and ".." elements of PATH in place.  Returns true
   if PATH was modified and false if it was left untouched.

   The algorithm is similar in spirit to the one described in rfc1808,
   but it is implemented differently, in a single pass.  Elements
   consisting only of "." are dropped, and ".." means "back up one
   element".  Single leading and trailing slashes are preserved.  A
   ".." that has nothing left to back over is kept literally.

   For example, "a/b/c/./../d/.." yields "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure no
   test case has been broken.  */

static bool
path_simplify (char *path)
{
  char *src = path;             /* read position */
  char *dst = path;             /* write position */
  char *floor = path;           /* DST never retreats below this */
  char *const limit = path + strlen (path);

  while (src < limit)
    {
      /* SRC is at the beginning of a path element here.  The &&
         chains stop at the terminating zero, so nothing is read past
         the end of the string.  */
      const bool is_dot
        = src[0] == '.' && (src[1] == '/' || src[1] == '\0');
      const bool is_dotdot
        = (src[0] == '.' && src[1] == '.'
           && (src[2] == '/' || src[2] == '\0'));

      if (is_dot)
        {
          /* Drop "./".  */
          src += 2;
          continue;
        }

      if (is_dotdot && dst > floor)
        {
          /* Retreat DST to the start of the previous path element,
             but never below FLOOR, and drop the "../".  */
          do
            --dst;
          while (dst > floor && dst[-1] != '/');
          src += 3;
          continue;
        }

      if (is_dotdot)
        {
          /* Nothing to back over: the "../" is kept literally below,
             and FLOOR moves past it so that a later ".." cannot
             remove it.  */
          floor = dst + 3;
        }

      /* Keep this path element, including its slash if it has one.  */
      if (dst == src)
        {
          /* Nothing has been dropped so far, so the element is
             already where it belongs; just step over it.  */
          while (src < limit && *src != '/')
            src++;
          if (src < limit)
            src++;
          dst = src;
        }
      else
        {
          while (src < limit && *src != '/')
            *dst++ = *src++;
          if (src < limit)
            *dst++ = *src++;
        }
    }

  /* The two positions coincide only if nothing was ever dropped.  */
  if (dst == src)
    return false;

  *dst = '\0';
  return true;
}
1606 \f
/* Measure the path portion at the start of URL.  The path ends at
   the first '?', ';' or '#', or at the end of the string when none of
   those characters occurs.  */

static int
path_length (const char *url)
{
  /* strcspn counts the leading characters that are not in the set,
     which is exactly the distance to the first delimiter or to the
     terminating zero.  */
  return (int) strcspn (url, "?;#");
}
1617
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The range is half-open: *B is
   examined, *E is not.  We might want to use memrchr (a GNU
   extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the scan starts at e[-1] and
     ends with b[0], without ever forming a pointer below B.  */
  while (e > b)
    {
      --e;
      if (*e == c)
        return e;
    }
  return NULL;
}
1630
/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   Every case below boils down to choosing the point in BASE up to
   which BASE is kept; LINK is then appended after that prefix.

   path_simplify is intentionally not called on the result, although
   rfc1738 seems to suggest it: that would complicate the code, and
   url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *base_end;         /* BASE may not be examined past this */
  const char *cut;              /* end of the prefix kept from BASE */
  bool patch_slash = false;     /* overwrite last prefix char with '/' */
  int prefix_len, link_len;
  char *result;

  /* A LINK that carries its own scheme replaces BASE entirely.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* Empty LINK points back to BASE, query string and all.  */
  if (*link == '\0')
    return xstrdup (base);

  base_end = base + path_length (base);
  link_len = strlen (link);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      cut = base_end;
    }
  else if (*link == '#')
    {
      /* LINK only changes the fragment; keep everything up to an
         existing '#', or all of BASE if there is none.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      cut = strchr (base, '#');
      if (cut == NULL)
        cut = base + strlen (base);
    }
  else if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: everything from
         BASE's double slash on is replaced with LINK.  If the first
         slash in BASE does not start a double slash, or there is no
         slash, all of BASE is replaced.  */
      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
      const char *first = memchr (base, '/', base_end - base);
      cut = (first && first[1] == '/') ? first : base;
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: everything from the first slash
         that follows the "//" on is replaced with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *first = memchr (base, '/', base_end - base);
      if (first && first[1] == '/')
        {
          /* BASE's first slash starts a "//"; look for the next slash
             after it.
             found:     "http://something/" -> cut at that slash
             not found: "http://foo"        -> cut at the end  */
          const char *after = first + 2;
          const char *slash = memchr (after, '/', base_end - after);
          cut = slash ? slash : base_end;
        }
      else
        {
          /* "foo" or "foo/bar": no "//" at the first slash, so LINK
             replaces BASE altogether.  */
          cut = base;
        }
    }
  else
    {
      /* LINK is a relative URL: everything after the last slash
         (possibly nothing) is replaced with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      const char *last_slash = find_last_char (base, base_end, '/');
      if (!last_slash)
        {
          /* No slash found at all.  Replace what we have with LINK.  */
          cut = base;
        }
      else if (last_slash >= base + 2
               && last_slash[-2] == ':' && last_slash[-1] == '/')
        {
          /* example: "http://host" -- the last slash belongs to the
             "://", so the whole of BASE is kept.  One extra character
             is copied along and then overwritten with the slash that
             separates the host from LINK.  */
          cut = base_end + 1;
          patch_slash = true;
        }
      else
        {
          /* example: "whatever/foo/bar" -- keep through the slash.  */
          cut = last_slash + 1;
        }
    }

  /* Assemble the kept prefix of BASE followed by LINK.  */
  prefix_len = cut - base;
  result = xmalloc (prefix_len + link_len + 1);
  if (prefix_len)
    memcpy (result, base, prefix_len);
  if (patch_slash)
    result[prefix_len - 1] = '/';
  memcpy (result + prefix_len, link, link_len);
  result[prefix_len + link_len] = '\0';

  return result;
}
1821 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  The terminating NUL is not
   copied; the caller is responsible for terminating the result and
   for making sure the destination is large enough.

   S is evaluated exactly once, so it may be an expression with side
   effects.  P must be a modifiable pointer lvalue and is evaluated
   twice.  The temporaries carry an "append_" prefix and a trailing
   underscore so that they cannot capture a caller's variable named
   in S (e.g. one called "len").  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1827
/* Placeholder emitted in place of the real password whenever the
   password must not be shown.  It is deliberately a fixed, generic
   string: unlike what previous versions did, its length reveals
   nothing about the number of characters in the actual password.  */
#define HIDDEN_PASSWORD "*password*"
1833
1834 /* Recreate the URL string from the data in URL.
1835
1836    If HIDE is true (as it is when we're calling this on a URL we plan
1837    to print, but not when calling it to canonicalize a URL for use
1838    within the program), password will be hidden.  Unsafe characters in
1839    the URL will be quoted.  */
1840
1841 char *
1842 url_string (const struct url *url, bool hide_password)
1843 {
1844   int size;
1845   char *result, *p;
1846   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1847
1848   int scheme_port = supported_schemes[url->scheme].default_port;
1849   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1850   int fplen = full_path_length (url);
1851
1852   bool brackets_around_host;
1853
1854   assert (scheme_str != NULL);
1855
1856   /* Make sure the user name and password are quoted. */
1857   if (url->user)
1858     {
1859       quoted_user = url_escape_allow_passthrough (url->user);
1860       if (url->passwd)
1861         {
1862           if (hide_password)
1863             quoted_passwd = HIDDEN_PASSWORD;
1864           else
1865             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1866         }
1867     }
1868
1869   /* In the unlikely event that the host name contains non-printable
1870      characters, quote it for displaying to the user.  */
1871   quoted_host = url_escape_allow_passthrough (url->host);
1872
1873   /* Undo the quoting of colons that URL escaping performs.  IPv6
1874      addresses may legally contain colons, and in that case must be
1875      placed in square brackets.  */
1876   if (quoted_host != url->host)
1877     unescape_single_char (quoted_host, ':');
1878   brackets_around_host = strchr (quoted_host, ':') != NULL;
1879
1880   size = (strlen (scheme_str)
1881           + strlen (quoted_host)
1882           + (brackets_around_host ? 2 : 0)
1883           + fplen
1884           + 1);
1885   if (url->port != scheme_port)
1886     size += 1 + numdigit (url->port);
1887   if (quoted_user)
1888     {
1889       size += 1 + strlen (quoted_user);
1890       if (quoted_passwd)
1891         size += 1 + strlen (quoted_passwd);
1892     }
1893
1894   p = result = xmalloc (size);
1895
1896   APPEND (p, scheme_str);
1897   if (quoted_user)
1898     {
1899       APPEND (p, quoted_user);
1900       if (quoted_passwd)
1901         {
1902           *p++ = ':';
1903           APPEND (p, quoted_passwd);
1904         }
1905       *p++ = '@';
1906     }
1907
1908   if (brackets_around_host)
1909     *p++ = '[';
1910   APPEND (p, quoted_host);
1911   if (brackets_around_host)
1912     *p++ = ']';
1913   if (url->port != scheme_port)
1914     {
1915       *p++ = ':';
1916       p = number_to_string (p, url->port);
1917     }
1918
1919   full_path_write (url, p);
1920   p += fplen;
1921   *p++ = '\0';
1922
1923   assert (p - result == size);
1924
1925   if (quoted_user && quoted_user != url->user)
1926     xfree (quoted_user);
1927   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1928     xfree (quoted_passwd);
1929   if (quoted_host != url->host)
1930     xfree (quoted_host);
1931
1932   return result;
1933 }
1934 \f
1935 /* Return true if scheme a is similar to scheme b.
1936
1937    Schemes are similar if they are equal.  If SSL is supported, schemes
1938    are also similar if one is http (SCHEME_HTTP) and the other is https
1939    (SCHEME_HTTPS).  */
1940 bool
1941 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1942 {
1943   if (a == b)
1944     return true;
1945 #ifdef HAVE_SSL
1946   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1947       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1948     return true;
1949 #endif
1950   return false;
1951 }
1952 \f
1953 #if 0
1954 /* Debugging and testing support for path_simplify. */
1955
/* Debug helper: duplicate PATH with xstrdup, run path_simplify on the
   duplicate and return it.  Handy for calling from the debugger.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);
  path_simplify (simplified);
  return simplified;
}
1965
/* Run path_simplify on a copy of TEST and report, on stdout, any
   deviation from what is expected: a resulting string other than
   EXPECTED_RESULT, or a "modified" return value that disagrees with
   EXPECTED_CHANGE.  Nothing is printed when both match.  */
static void
run_test (char *test, char *expected_result, bool expected_change)
{
  char *simplified = xstrdup (test);
  bool changed = path_simplify (simplified);

  /* First check: the simplified text itself.  */
  if (strcmp (simplified, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, simplified);

  /* Second check: whether a modification was reported.  */
  if (changed != expected_change)
    {
      if (!expected_change)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (simplified);
}
1988
/* Drive run_test over a fixed table of path_simplify cases.  Each row
   holds the input path, the string expected after simplification, and
   whether path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    bool changes;
  } cases[] = {
    { "",                       "",             false },
    { ".",                      "",             true },
    { "./",                     "",             true },
    { "..",                     "..",           false },
    { "../",                    "../",          false },
    { "foo",                    "foo",          false },
    { "foo/bar",                "foo/bar",      false },
    { "foo///bar",              "foo///bar",    false },
    { "foo/.",                  "foo/",         true },
    { "foo/./",                 "foo/",         true },
    { "foo./",                  "foo./",        false },
    { "foo/../bar",             "bar",          true },
    { "foo/../bar/",            "bar/",         true },
    { "foo/bar/..",             "foo/",         true },
    { "foo/bar/../x",           "foo/x",        true },
    { "foo/bar/../x/",          "foo/x/",       true },
    { "foo/..",                 "",             true },
    { "foo/../..",              "..",           true },
    { "foo/../../..",           "../..",        true },
    { "foo/../../bar/../../baz", "../../baz",   true },
    { "a/b/../../c",            "c",            true },
    { "./a/../b",               "b",            true }
  };
  int idx;

  for (idx = 0; idx < countof (cases); idx++)
    run_test (cases[idx].input, cases[idx].expected, cases[idx].changes);
}
2029 #endif