sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996-2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 enum {
  47   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  48   scm_has_params = 2,           /* whether scheme has ;params */
  49   scm_has_query = 4,            /* whether scheme has ?query */
  50   scm_has_fragment = 8          /* whether scheme has #fragment */
  51 };
  52
/* Static description of one URL scheme.  The entries of the
   supported_schemes table are of this type and are written with
   positional initializers, so the member order below matters.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme compares it case-insensitively against the start of a
     URL. */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Bitwise OR of the scm_* flags. */
  int flags;
};
  64
/* Supported schemes.

   url_scheme returns the index of the matching entry cast to enum
   url_scheme, and scheme_default_port/scheme_disable index this array
   with such a value, so the entry order presumably has to mirror the
   order of the enum url_scheme constants (declared elsewhere --
   confirm in url.h).  The array is not const because scheme_disable
   sets scm_disabled in an entry's flags at run time.  The list is
   terminated by an entry whose leading_string is NULL.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  77
  78 /* Forward declarations: */
  79
  80 static bool path_simplify (char *);
  81 \f
  82 /* Support for escaping and unescaping of URL strings.  */
  83
  84 /* Table of "reserved" and "unsafe" characters.  Those terms are
  85    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  86    specs, but the general idea remains.
  87
  88    A reserved character is the one that you can't decode without
  89    changing the meaning of the URL.  For example, you can't decode
  90    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  91    path components is different.  Non-reserved characters can be
  92    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  93    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  94    as recommended by rfc2396, and minus "~", which is very frequently
  95    used (and sometimes unrecognized as %7E by broken servers).
  96
  97    An unsafe character is the one that should be encoded when URLs are
  98    placed in foreign environments.  E.g. space and newline are unsafe
  99    in HTTP contexts because HTTP uses them as separator and line
 100    terminator, so they must be encoded to %20 and %0A respectively.
 101    "*" is unsafe in shell context, etc.
 102
 103    We determine whether a character is unsafe through static table
 104    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 105
 106 enum {
 107   /* rfc1738 reserved chars + "$" and ",".  */
 108   urlchr_reserved = 1,
 109
 110   /* rfc1738 unsafe chars, plus non-printables.  */
 111   urlchr_unsafe   = 2
 112 };
 113
/* Test character C against the urlchr_* bits in MASK; the result is
   non-zero if any of the requested bits is set for C.  C is cast to
   unsigned char before being used as the table index, so a `char'
   with the high bit set cannot produce a negative index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Non-zero if C is marked reserved in urlchr_table. */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Non-zero if C is marked unsafe in urlchr_table. */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 117
/* Shorthands for the table.  They exist only while the initializer
   below is being written and are #undef'ed right after it.  RU has no
   parentheses of its own, which is fine here because it is only ever
   used as a complete initializer element.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Character classification table, indexed by the character's value as
   an unsigned char.  The first 128 entries are laid out eight per row
   with the corresponding ASCII characters named in the trailing
   comment; the remaining 128 entries (values 128-255) are all marked
   unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
 155
 156 /* URL-unescape the string S.
 157
 158    This is done by transforming the sequences "%HH" to the character
 159    represented by the hexadecimal digits HH.  If % is not followed by
 160    two hexadecimal digits, it is inserted literally.
 161
 162    The transformation is done in place.  If you need the original
 163    string intact, make a copy before calling this function.  */
 164
 165 static void
 166 url_unescape (char *s)
 167 {
 168   char *t = s;                  /* t - tortoise */
 169   char *h = s;                  /* h - hare     */
 170
 171   for (; *h; h++, t++)
 172     {
 173       if (*h != '%')
 174         {
 175         copychar:
 176           *t = *h;
 177         }
 178       else
 179         {
 180           char c;
 181           /* Do nothing if '%' is not followed by two hex digits. */
 182           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 183             goto copychar;
 184           c = X2DIGITS_TO_NUM (h[1], h[2]);
 185           /* Don't unescape %00 because there is no way to insert it
 186              into a C string without effectively truncating it. */
 187           if (c == '\0')
 188             goto copychar;
 189           *t = c;
 190           h += 2;
 191         }
 192     }
 193   *t = '\0';
 194 }
 195
 196 /* The core of url_escape_* functions.  Escapes the characters that
 197    match the provided mask in urlchr_table.
 198
 199    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 200    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 201    allocated string will be returned in all cases.  */
 202
 203 static char *
 204 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 205 {
 206   const char *p1;
 207   char *p2, *newstr;
 208   int newlen;
 209   int addition = 0;
 210
 211   for (p1 = s; *p1; p1++)
 212     if (urlchr_test (*p1, mask))
 213       addition += 2;            /* Two more characters (hex digits) */
 214
 215   if (!addition)
 216     return allow_passthrough ? (char *)s : xstrdup (s);
 217
 218   newlen = (p1 - s) + addition;
 219   newstr = xmalloc (newlen + 1);
 220
 221   p1 = s;
 222   p2 = newstr;
 223   while (*p1)
 224     {
 225       /* Quote the characters that match the test mask. */
 226       if (urlchr_test (*p1, mask))
 227         {
 228           unsigned char c = *p1++;
 229           *p2++ = '%';
 230           *p2++ = XNUM_TO_DIGIT (c >> 4);
 231           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 232         }
 233       else
 234         *p2++ = *p1++;
 235     }
 236   assert (p2 - newstr == newlen);
 237   *p2 = '\0';
 238
 239   return newstr;
 240 }
 241
 242 /* URL-escape the unsafe characters (see urlchr_table) in a given
 243    string, returning a freshly allocated string.  */
 244
 245 char *
 246 url_escape (const char *s)
 247 {
 248   return url_escape_1 (s, urlchr_unsafe, false);
 249 }
 250
 251 /* URL-escape the unsafe characters (see urlchr_table) in a given
 252    string.  If no characters are unsafe, S is returned.  */
 253
 254 static char *
 255 url_escape_allow_passthrough (const char *s)
 256 {
 257   return url_escape_1 (s, urlchr_unsafe, true);
 258 }
 259 \f
 260 /* Decide whether the char at position P needs to be encoded.  (It is
 261    not enough to pass a single char *P because the function may need
 262    to inspect the surrounding context.)
 263
 264    Return true if the char should be escaped as %XX, false otherwise.  */
 265
 266 static inline bool
 267 char_needs_escaping (const char *p)
 268 {
 269   if (*p == '%')
 270     {
 271       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 272         return false;
 273       else
 274         /* Garbled %.. sequence: encode `%'. */
 275         return true;
 276     }
 277   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 278     return true;
 279   else
 280     return false;
 281 }
 282
 283 /* Translate a %-escaped (but possibly non-conformant) input string S
 284    into a %-escaped (and conformant) output string.  If no characters
 285    are encoded or decoded, return the same string S; otherwise, return
 286    a freshly allocated string with the new contents.
 287
 288    After a URL has been run through this function, the protocols that
 289    use `%' as the quote character can use the resulting string as-is,
 290    while those that don't can use url_unescape to get to the intended
 291    data.  This function is stable: once the input is transformed,
 292    further transformations of the result yield the same output.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 297    a raw space character would mess up the HTTP request, it needs to
 298    be quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It would appear that the unsafe chars need to be quoted, for
 303    example with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leads to the conclusion that
 308    in that case Wget should not call url_escape, but leave the `%20'
 309    as is.  This is clearly contradictory, but it only gets worse.
 310
 311    What if the requested URI is `abc%20 def'?  If we call url_escape,
 312    we end up with `/abc%2520%20def', which is almost certainly not
 313    intended.  If we don't call url_escape, we are left with the
 314    embedded space and cannot complete the request.  What the user
 315    meant was for Wget to request `/abc%20%20def', and this is where
 316    reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses a modified version of the algorithm originally
 331    proposed by Anon Sricharoenchai:
 332
 333    * Encode all "unsafe" characters, except those that are also
 334      "reserved", to %XX.  See urlchr_table for which characters are
 335      unsafe and reserved.
 336
 337    * Encode the "%" characters not followed by two hex digits to
 338      "%25".
 339
 340    * Pass through all other characters and %XX escapes as-is.  (Up to
 341      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 342      characters, but that was obtrusive and broke some servers.)
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369
 370   /* First pass: inspect the string to see if there's anything to do,
 371      and to calculate the new length.  */
 372   for (p1 = s; *p1; p1++)
 373     if (char_needs_escaping (p1))
 374       ++encode_count;
 375
 376   if (!encode_count)
 377     /* The string is good as it is. */
 378     return (char *) s;          /* C const model sucks. */
 379
 380   oldlen = p1 - s;
 381   /* Each encoding adds two characters (hex digits).  */
 382   newlen = oldlen + 2 * encode_count;
 383   newstr = xmalloc (newlen + 1);
 384
 385   /* Second pass: copy the string to the destination address, encoding
 386      chars when needed.  */
 387   p1 = s;
 388   p2 = newstr;
 389
 390   while (*p1)
 391     if (char_needs_escaping (p1))
 392       {
 393         unsigned char c = *p1++;
 394         *p2++ = '%';
 395         *p2++ = XNUM_TO_DIGIT (c >> 4);
 396         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 397       }
 398     else
 399       *p2++ = *p1++;
 400
 401   *p2 = '\0';
 402   assert (p2 - newstr == newlen);
 403   return newstr;
 404 }
 405 \f
 406 /* Returns the scheme type if the scheme is supported, or
 407    SCHEME_INVALID if not.  */
 408
 409 enum url_scheme
 410 url_scheme (const char *url)
 411 {
 412   int i;
 413
 414   for (i = 0; supported_schemes[i].leading_string; i++)
 415     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 416                           strlen (supported_schemes[i].leading_string)))
 417       {
 418         if (!(supported_schemes[i].flags & scm_disabled))
 419           return (enum url_scheme) i;
 420         else
 421           return SCHEME_INVALID;
 422       }
 423
 424   return SCHEME_INVALID;
 425 }
 426
 427 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 428
 429 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 430    currently implemented, it returns true if URL begins with
 431    [-+a-zA-Z0-9]+: .  */
 432
 433 bool
 434 url_has_scheme (const char *url)
 435 {
 436   const char *p = url;
 437
 438   /* The first char must be a scheme char. */
 439   if (!*p || !SCHEME_CHAR (*p))
 440     return false;
 441   ++p;
 442   /* Followed by 0 or more scheme chars. */
 443   while (*p && SCHEME_CHAR (*p))
 444     ++p;
 445   /* Terminated by ':'. */
 446   return *p == ':';
 447 }
 448
 449 int
 450 scheme_default_port (enum url_scheme scheme)
 451 {
 452   return supported_schemes[scheme].default_port;
 453 }
 454
 455 void
 456 scheme_disable (enum url_scheme scheme)
 457 {
 458   supported_schemes[scheme].flags |= scm_disabled;
 459 }
 460
 461 /* Skip the username and password, if present in the URL.  The
 462    function should *not* be called with the complete URL, but with the
 463    portion after the scheme.
 464
 465    If no username and password are found, return URL.  */
 466
 467 static const char *
 468 url_skip_credentials (const char *url)
 469 {
 470   /* Look for '@' that comes before terminators, such as '/', '?',
 471      '#', or ';'.  */
 472   const char *p = (const char *)strpbrk (url, "@/?#;");
 473   if (!p || *p != '@')
 474     return url;
 475   return p + 1;
 476 }
 477
 478 /* Parse credentials contained in [BEG, END).  The region is expected
 479    to have come from a URL and is unescaped.  */
 480
 481 static bool
 482 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 483 {
 484   char *colon;
 485   const char *userend;
 486
 487   if (beg == end)
 488     return false;               /* empty user name */
 489
 490   colon = memchr (beg, ':', end - beg);
 491   if (colon == beg)
 492     return false;               /* again empty user name */
 493
 494   if (colon)
 495     {
 496       *passwd = strdupdelim (colon + 1, end);
 497       userend = colon;
 498       url_unescape (*passwd);
 499     }
 500   else
 501     {
 502       *passwd = NULL;
 503       userend = end;
 504     }
 505   *user = strdupdelim (beg, userend);
 506   url_unescape (*user);
 507   return true;
 508 }
 509
 510 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 511    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 512    like this:
 513
 514    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 515    www.foo.com[:port]            -> http://www.foo.com[:port]
 516
 517    FTP shorthands look like this:
 518
 519    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 520    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 521
 522    If the URL needs not or cannot be rewritten, return NULL.  */
 523
 524 char *
 525 rewrite_shorthand_url (const char *url)
 526 {
 527   const char *p;
 528   char *ret;
 529
 530   if (url_scheme (url) != SCHEME_INVALID)
 531     return NULL;
 532
 533   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 534      latter Netscape.  */
 535   p = strpbrk (url, ":/");
 536   if (p == url)
 537     return NULL;
 538
 539   /* If we're looking at "://", it means the URL uses a scheme we
 540      don't support, which may include "https" when compiled without
 541      SSL support.  Don't bogusly rewrite such URLs.  */
 542   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 543     return NULL;
 544
 545   if (p && *p == ':')
 546     {
 547       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 548          special case of http port number ("localhost:10000").  */
 549       int digits = strspn (p + 1, "0123456789");
 550       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 551         goto http;
 552
 553       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 554       ret = aprintf ("ftp://%s", url);
 555       ret[6 + (p - url)] = '/';
 556     }
 557   else
 558     {
 559     http:
 560       /* Just prepend "http://" to URL. */
 561       ret = aprintf ("http://%s", url);
 562     }
 563   return ret;
 564 }
 565 \f
 566 static void split_path (const char *, char **, char **);
 567
 568 /* Like strpbrk, with the exception that it returns the pointer to the
 569    terminating zero (end-of-string aka "eos") if no matching character
 570    is found.  */
 571
 572 static inline char *
 573 strpbrk_or_eos (const char *s, const char *accept)
 574 {
 575   char *p = strpbrk (s, accept);
 576   if (!p)
 577     p = strchr (s, '\0');
 578   return p;
 579 }
 580
 581 /* Turn STR into lowercase; return true if a character was actually
 582    changed. */
 583
 584 static bool
 585 lowercase_str (char *str)
 586 {
 587   bool changed = false;
 588   for (; *str; str++)
 589     if (ISUPPER (*str))
 590       {
 591         changed = true;
 592         *str = TOLOWER (*str);
 593       }
 594   return changed;
 595 }
 596
 597 static const char *
 598 init_seps (enum url_scheme scheme)
 599 {
 600   static char seps[8] = ":/";
 601   char *p = seps + 2;
 602   int flags = supported_schemes[scheme].flags;
 603
 604   if (flags & scm_has_params)
 605     *p++ = ';';
 606   if (flags & scm_has_query)
 607     *p++ = '?';
 608   if (flags & scm_has_fragment)
 609     *p++ = '#';
 610   *p++ = '\0';
 611   return seps;
 612 }
 613
/* Messages for the PE_* error codes that url_parse stores in *ERROR.
   Each PE_* macro is defined right next to its message so that the
   numeric code and the array index stay in step; when adding an
   entry, keep both in the same order.  The strings are wrapped in
   N_() here and passed through _() in url_error.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 632
/* Parse a URL.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values, which url_error turns into a
   message).  *ERROR is not touched on success.

   The URL is first passed through reencode_escapes and then cut into
   user info, host, port, path, params, query and fragment by walking
   a single pointer P over the string.  In the result, the host is
   lowercased and %-unescaped, the path is simplified with
   path_simplify and also split into dir and file, and u->url holds
   the URL string for the parsed result.  */
struct url *
url_parse (const char *url, int *error)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Each component is remembered as a [begin, end) pair of pointers
     into URL_ENCODED.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      error_code = PE_UNSUPPORTED_SCHEME;
      goto error;
    }

  /* URL_ENCODED is either URL itself (nothing needed re-encoding) or
     a fresh allocation.  Every exit path below checks which one it is
     before freeing or keeping it.  */
  url_encoded = reencode_escapes (url);
  p = url_encoded;

  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment.

     SEPS is advanced by one character each time a component has been
     dealt with, so that at every step it lists only the separators
     that can still terminate the component being scanned.  */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  (strchr also matches the terminating null of
         SEPS, which is what lets end-of-string through here.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396.  In that case PORT keeps
         the scheme's default.  */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!ISDIGIT (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.

     If P is at SEPCHAR, VAR_b is set just past it and VAR_e (and P)
     to the next remaining separator or end of string.  SEPS is
     advanced whether or not the part was present.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every separator that init_seps put into SEPS has a matching
     GET_URL_PART above, so the whole string is expected to have been
     consumed at this point.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* UNAME_E points just past the '@', hence the - 1 to exclude
         it from the credentials.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* Nothing below jumps to the error label, so from here on the
     function always returns U.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  /* NOTE(review): PATH_B and PATH_E are both still NULL here when the
     URL has no '/' after the host; this relies on strdupdelim coping
     with a NULL, zero-length range -- confirm in utils.c.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  /* PATH_B == PATH_E holds both for an empty path ("http://host/")
     and for a missing one ("http://host", where both are NULL).  */
  if (path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, false);

      /* URL_ENCODED is no longer needed; free it unless it is the
         caller's own string.  */
      if (url_encoded != url)
        xfree ((char *) url_encoded);
    }
  else
    {
      /* URL_ENCODED is usable as-is: copy it if it is the caller's
         string, otherwise let U take ownership of the allocation.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
 881
 882 /* Return the error message string from ERROR_CODE, which should have
 883    been retrieved from url_parse.  The error message is translated.  */
 884
 885 const char *
 886 url_error (int error_code)
 887 {
 888   assert (error_code >= 0 && error_code < countof (parse_errors));
 889   return _(parse_errors[error_code]);
 890 }
 891
 892 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 893    expected to be URL-escaped.
 894
 895    The path is split into directory (the part up to the last slash)
 896    and file (the part after the last slash), which are subsequently
 897    unescaped.  Examples:
 898
 899    PATH                 DIR           FILE
 900    "foo/bar/baz"        "foo/bar"     "baz"
 901    "foo/bar/"           "foo/bar"     ""
 902    "foo"                ""            "foo"
 903    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 904
 905    DIR and FILE are freshly allocated.  */
 906
 907 static void
 908 split_path (const char *path, char **dir, char **file)
 909 {
 910   char *last_slash = strrchr (path, '/');
 911   if (!last_slash)
 912     {
 913       *dir = xstrdup ("");
 914       *file = xstrdup (path);
 915     }
 916   else
 917     {
 918       *dir = strdupdelim (path, last_slash);
 919       *file = xstrdup (last_slash + 1);
 920     }
 921   url_unescape (*dir);
 922   url_unescape (*file);
 923 }
 924
 925 /* Note: URL's "full path" is the path with the query string and
 926    params appended.  The "fragment" (#foo) is intentionally ignored,
 927    but that might be changed.  For example, if the original URL was
 928    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 929    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 930
 931 /* Return the length of the full path, without the terminating
 932    zero.  */
 933
 934 static int
 935 full_path_length (const struct url *url)
 936 {
 937   int len = 0;
 938
 939 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 940
 941   FROB (path);
 942   FROB (params);
 943   FROB (query);
 944
 945 #undef FROB
 946
 947   return len;
 948 }
 949
 950 /* Write out the full path. */
 951
 952 static void
 953 full_path_write (const struct url *url, char *where)
 954 {
 955 #define FROB(el, chr) do {                      \
 956   char *f_el = url->el;                         \
 957   if (f_el) {                                   \
 958     int l = strlen (f_el);                      \
 959     *where++ = chr;                             \
 960     memcpy (where, f_el, l);                    \
 961     where += l;                                 \
 962   }                                             \
 963 } while (0)
 964
 965   FROB (path, '/');
 966   FROB (params, ';');
 967   FROB (query, '?');
 968
 969 #undef FROB
 970 }
 971
 972 /* Public function for getting the "full path".  E.g. if u->path is
 973    "foo/bar" and u->query is "param=value", full_path will be
 974    "/foo/bar?param=value". */
 975
 976 char *
 977 url_full_path (const struct url *url)
 978 {
 979   int length = full_path_length (url);
 980   char *full_path = xmalloc (length + 1);
 981
 982   full_path_write (url, full_path);
 983   full_path[length] = '\0';
 984
 985   return full_path;
 986 }
 987
 988 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 989    escaping of certain characters, such as "/" and ":".  Returns a
 990    count of unescaped chars.  */
 991
 992 static void
 993 unescape_single_char (char *str, char chr)
 994 {
 995   const char c1 = XNUM_TO_DIGIT (chr >> 4);
 996   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
 997   char *h = str;                /* hare */
 998   char *t = str;                /* tortoise */
 999   for (; *h; h++, t++)
1000     {
1001       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1002         {
1003           *t = chr;
1004           h += 2;
1005         }
1006       else
1007         *t = *h;
1008     }
1009   *t = '\0';
1010 }
1011
1012 /* Escape unsafe and reserved characters, except for the slash
1013    characters.  */
1014
1015 static char *
1016 url_escape_dir (const char *dir)
1017 {
1018   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1019   if (newdir == dir)
1020     return (char *)dir;
1021
1022   unescape_single_char (newdir, '/');
1023   return newdir;
1024 }
1025
1026 /* Sync u->path and u->url with u->dir and u->file.  Called after
1027    u->file or u->dir have been changed, typically by the FTP code.  */
1028
1029 static void
1030 sync_path (struct url *u)
1031 {
1032   char *newpath, *efile, *edir;
1033
1034   xfree (u->path);
1035
1036   /* u->dir and u->file are not escaped.  URL-escape them before
1037      reassembling them into u->path.  That way, if they contain
1038      separators like '?' or even if u->file contains slashes, the
1039      path will be correctly assembled.  (u->file can contain slashes
1040      if the URL specifies it with %2f, or if an FTP server returns
1041      it.)  */
1042   edir = url_escape_dir (u->dir);
1043   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1044
1045   if (!*edir)
1046     newpath = xstrdup (efile);
1047   else
1048     {
1049       int dirlen = strlen (edir);
1050       int filelen = strlen (efile);
1051
1052       /* Copy "DIR/FILE" to newpath. */
1053       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1054       memcpy (p, edir, dirlen);
1055       p += dirlen;
1056       *p++ = '/';
1057       memcpy (p, efile, filelen);
1058       p += filelen;
1059       *p = '\0';
1060     }
1061
1062   u->path = newpath;
1063
1064   if (edir != u->dir)
1065     xfree (edir);
1066   if (efile != u->file)
1067     xfree (efile);
1068
1069   /* Regenerate u->url as well.  */
1070   xfree (u->url);
1071   u->url = url_string (u, false);
1072 }
1073
1074 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1075    This way we can sync u->path and u->url when they get changed.  */
1076
1077 void
1078 url_set_dir (struct url *url, const char *newdir)
1079 {
1080   xfree (url->dir);
1081   url->dir = xstrdup (newdir);
1082   sync_path (url);
1083 }
1084
1085 void
1086 url_set_file (struct url *url, const char *newfile)
1087 {
1088   xfree (url->file);
1089   url->file = xstrdup (newfile);
1090   sync_path (url);
1091 }
1092
1093 void
1094 url_free (struct url *url)
1095 {
1096   xfree (url->host);
1097   xfree (url->path);
1098   xfree (url->url);
1099
1100   xfree_null (url->params);
1101   xfree_null (url->query);
1102   xfree_null (url->fragment);
1103   xfree_null (url->user);
1104   xfree_null (url->passwd);
1105
1106   xfree (url->dir);
1107   xfree (url->file);
1108
1109   xfree (url);
1110 }
1111 \f
1112 /* Create all the necessary directories for PATH (a file).  Calls
1113    make_directory internally.  */
1114 int
1115 mkalldirs (const char *path)
1116 {
1117   const char *p;
1118   char *t;
1119   struct_stat st;
1120   int res;
1121
1122   p = path + strlen (path);
1123   for (; *p != '/' && p != path; p--)
1124     ;
1125
1126   /* Don't create if it's just a file.  */
1127   if ((p == path) && (*p != '/'))
1128     return 0;
1129   t = strdupdelim (path, p);
1130
1131   /* Check whether the directory exists.  */
1132   if ((stat (t, &st) == 0))
1133     {
1134       if (S_ISDIR (st.st_mode))
1135         {
1136           xfree (t);
1137           return 0;
1138         }
1139       else
1140         {
1141           /* If the dir exists as a file name, remove it first.  This
1142              is *only* for Wget to work with buggy old CERN http
1143              servers.  Here is the scenario: When Wget tries to
1144              retrieve a directory without a slash, e.g.
1145              http://foo/bar (bar being a directory), CERN server will
1146              not redirect it too http://foo/bar/ -- it will generate a
1147              directory listing containing links to bar/file1,
1148              bar/file2, etc.  Wget will lose because it saves this
1149              HTML listing to a file `bar', so it cannot create the
1150              directory.  To work around this, if the file of the same
1151              name exists, we just remove it and create the directory
1152              anyway.  */
1153           DEBUGP (("Removing %s because of directory danger!\n", t));
1154           unlink (t);
1155         }
1156     }
1157   res = make_directory (t);
1158   if (res != 0)
1159     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1160   xfree (t);
1161   return res;
1162 }
1163 \f
1164 /* Functions for constructing the file name out of URL components.  */
1165
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in questions, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size of the buffer */
  int tail;                     /* offset at which the next append goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that arbitrary
   expressions (e.g. a conditional) can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1194
1195 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1196    terminated.  */
1197
1198 static void
1199 append_string (const char *str, struct growable *dest)
1200 {
1201   int l = strlen (str);
1202   GROW (dest, l);
1203   memcpy (TAIL (dest), str, l);
1204   TAIL_INCR (dest, l);
1205 }
1206
1207 /* Append CH to DEST.  For example, append_char (0, DEST)
1208    zero-terminates DEST.  */
1209
1210 static void
1211 append_char (char ch, struct growable *dest)
1212 {
1213   GROW (dest, 1);
1214   *TAIL (dest) = ch;
1215   TAIL_INCR (dest, 1);
1216 }
1217
/* Classes of characters that cannot be used verbatim in a file name.
   The values are bit flags so that they can be combined in a mask.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C belongs to any of the classes in MASK.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   `|' is marked as unusable on Windows, in agreement with the list
   given for filechr_not_windows, and DEL (127) is marked as a control
   character like the other non-printing codes.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1275
1276 /* FN_PORT_SEP is the separator between host and port in file names
1277    for non-standard port numbers.  On Unix this is normally ':', as in
1278    "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
1279    because Windows can't handle ':' in file names.  */
1280 #define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
1281
1282 /* FN_QUERY_SEP is the separator between the file name and the URL
1283    query, normally '?'.  Since Windows cannot handle '?' as part of
1284    file name, we use '@' instead there.  */
1285 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1286
1287 /* Quote path element, characters in [b, e), as file name, and append
1288    the quoted string to DEST.  Each character is quoted as per
1289    file_unsafe_char and the corresponding table.
1290
1291    If ESCAPED is true, the path element is considered to be
1292    URL-escaped and will be unescaped prior to inspection.  */
1293
1294 static void
1295 append_uri_pathel (const char *b, const char *e, bool escaped,
1296                    struct growable *dest)
1297 {
1298   const char *p;
1299   int quoted, outlen;
1300
1301   int mask;
1302   if (opt.restrict_files_os == restrict_unix)
1303     mask = filechr_not_unix;
1304   else
1305     mask = filechr_not_windows;
1306   if (opt.restrict_files_ctrl)
1307     mask |= filechr_control;
1308
1309   /* Copy [b, e) to PATHEL and URL-unescape it. */
1310   if (escaped)
1311     {
1312       char *unescaped;
1313       BOUNDED_TO_ALLOCA (b, e, unescaped);
1314       url_unescape (unescaped);
1315       b = unescaped;
1316       e = unescaped + strlen (unescaped);
1317     }
1318
1319   /* Defang ".." when found as component of path.  Remember that path
1320      comes from the URL and might contain malicious input.  */
1321   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1322     {
1323       b = "%2E%2E";
1324       e = b + 6;
1325     }
1326
1327   /* Walk the PATHEL string and check how many characters we'll need
1328      to quote.  */
1329   quoted = 0;
1330   for (p = b; p < e; p++)
1331     if (FILE_CHAR_TEST (*p, mask))
1332       ++quoted;
1333
1334   /* Calculate the length of the output string.  e-b is the input
1335      string length.  Each quoted char introduces two additional
1336      characters in the string, hence 2*quoted.  */
1337   outlen = (e - b) + (2 * quoted);
1338   GROW (dest, outlen);
1339
1340   if (!quoted)
1341     {
1342       /* If there's nothing to quote, we can simply append the string
1343          without processing it again.  */
1344       memcpy (TAIL (dest), b, outlen);
1345     }
1346   else
1347     {
1348       char *q = TAIL (dest);
1349       for (p = b; p < e; p++)
1350         {
1351           if (!FILE_CHAR_TEST (*p, mask))
1352             *q++ = *p;
1353           else
1354             {
1355               unsigned char ch = *p;
1356               *q++ = '%';
1357               *q++ = XNUM_TO_DIGIT (ch >> 4);
1358               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1359             }
1360         }
1361       assert (q - TAIL (dest) == outlen);
1362     }
1363   TAIL_INCR (dest, outlen);
1364 }
1365
1366 /* Append to DEST the directory structure that corresponds the
1367    directory part of URL's path.  For example, if the URL is
1368    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1369
1370    Each path element ("dir1" and "dir2" in the above example) is
1371    examined, url-unescaped, and re-escaped as file name element.
1372
1373    Additionally, it cuts as many directories from the path as
1374    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1375    will produce "bar" for the above example.  For 2 or more, it will
1376    produce "".
1377
1378    Each component of the path is quoted for use as file name.  */
1379
1380 static void
1381 append_dir_structure (const struct url *u, struct growable *dest)
1382 {
1383   char *pathel, *next;
1384   int cut = opt.cut_dirs;
1385
1386   /* Go through the path components, de-URL-quote them, and quote them
1387      (if necessary) as file names.  */
1388
1389   pathel = u->path;
1390   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1391     {
1392       if (cut-- > 0)
1393         continue;
1394       if (pathel == next)
1395         /* Ignore empty pathels.  */
1396         continue;
1397
1398       if (dest->tail)
1399         append_char ('/', dest);
1400       append_uri_pathel (pathel, next, true, dest);
1401     }
1402 }
1403
1404 /* Return a unique file name that matches the given URL as good as
1405    possible.  Does not create directories on the file system.  */
1406
1407 char *
1408 url_file_name (const struct url *u)
1409 {
1410   struct growable fnres;        /* stands for "file name result" */
1411
1412   const char *u_file, *u_query;
1413   char *fname, *unique;
1414
1415   fnres.base = NULL;
1416   fnres.size = 0;
1417   fnres.tail = 0;
1418
1419   /* Start with the directory prefix, if specified. */
1420   if (opt.dir_prefix)
1421     append_string (opt.dir_prefix, &fnres);
1422
1423   /* If "dirstruct" is turned on (typically the case with -r), add
1424      the host and port (unless those have been turned off) and
1425      directory structure.  */
1426   if (opt.dirstruct)
1427     {
1428       if (opt.protocol_directories)
1429         {
1430           if (fnres.tail)
1431             append_char ('/', &fnres);
1432           append_string (supported_schemes[u->scheme].name, &fnres);
1433         }
1434       if (opt.add_hostdir)
1435         {
1436           if (fnres.tail)
1437             append_char ('/', &fnres);
1438           if (0 != strcmp (u->host, ".."))
1439             append_string (u->host, &fnres);
1440           else
1441             /* Host name can come from the network; malicious DNS may
1442                allow ".." to be resolved, causing us to write to
1443                "../<file>".  Defang such host names.  */
1444             append_string ("%2E%2E", &fnres);
1445           if (u->port != scheme_default_port (u->scheme))
1446             {
1447               char portstr[24];
1448               number_to_string (portstr, u->port);
1449               append_char (FN_PORT_SEP, &fnres);
1450               append_string (portstr, &fnres);
1451             }
1452         }
1453
1454       append_dir_structure (u, &fnres);
1455     }
1456
1457   /* Add the file name. */
1458   if (fnres.tail)
1459     append_char ('/', &fnres);
1460   u_file = *u->file ? u->file : "index.html";
1461   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1462
1463   /* Append "?query" to the file name. */
1464   u_query = u->query && *u->query ? u->query : NULL;
1465   if (u_query)
1466     {
1467       append_char (FN_QUERY_SEP, &fnres);
1468       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1469     }
1470
1471   /* Zero-terminate the file name. */
1472   append_char ('\0', &fnres);
1473
1474   fname = fnres.base;
1475
1476   /* Check the cases in which the unique extensions are not used:
1477      1) Clobbering is turned off (-nc).
1478      2) Retrieval with regetting.
1479      3) Timestamping is used.
1480      4) Hierarchy is built.
1481
1482      The exception is the case when file does exist and is a
1483      directory (see `mkalldirs' for explanation).  */
1484
1485   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1486       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1487     return fname;
1488
1489   unique = unique_name (fname, true);
1490   if (unique != fname)
1491     xfree (fname);
1492   return unique;
1493 }
1494 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   The implementation uses two pointers into PATH: the "hare" H reads
   the input one path element at a time, and the "tortoise" T marks
   where the next kept element is written.  T never gets ahead of H,
   so the rewriting can be done in place.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static bool
path_simplify (char *path)
{
  char *h = path;               /* hare */
  char *t = path;               /* tortoise */
  char *beg = path;             /* boundary for backing the tortoise */
  char *end = path + strlen (path);

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./" (or a final "."): advance the hare only, so
             that nothing is written for this element.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  */
              beg = t + 3;
              goto regular;
            }
          /* Skip over the "../" that has just been resolved.  */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T differs from H exactly when something was dropped; in that case
     the shortened string needs a new terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1577 \f
1578 /* Return the length of URL's path.  Path is considered to be
1579    terminated by one or more of the ?query or ;params or #fragment,
1580    depending on the scheme.  */
1581
1582 static const char *
1583 path_end (const char *url)
1584 {
1585   enum url_scheme scheme = url_scheme (url);
1586   const char *seps;
1587   if (scheme == SCHEME_INVALID)
1588     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1589   /* +2 to ignore the first two separators ':' and '/' */
1590   seps = init_seps (scheme) + 2;
1591   return strpbrk_or_eos (url, seps);
1592 }
1593
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is outside
   the range and is never examined; the character at B is inside the
   range and is examined last.  We might want to use memrchr (a GNU
   extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the scan covers e[-1] down to
     b[0] rather than e[0] down to b[1].  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1606
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string consisting of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';

  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The form of LINK selects what part of BASE is kept:

     has a scheme    nothing; LINK is returned as is
     empty           everything; BASE is returned as is
     "?..."          BASE up to the end of its path
     "#..."          BASE up to its fragment
     "//..."         BASE up to its own "//", if it has one
     "/..."          BASE up to the first slash after its "//"
     anything else   BASE up to and including its last slash

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  /* Empty LINK points back to BASE, query string and all. */
  if (!*link)
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_concat (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK replaces only the fragment.  Examples: */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return uri_merge_concat (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* Replace from the first slash of BASE if it starts a double
         slash; otherwise replace from the very beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      bool seen_slash_slash = false;
      const char *start_insert;

      /* We're looking for the first slash, but want to ignore a
         double slash: when the first slash found starts "//", search
         once more from just after it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         location where LINK will be inserted.  Keep in mind that
         LINK begins with '/'.

           BASE                  "//" seen   SLASH   insert at
           "foo"                 no          no      start of BASE
           "foo/bar"             no          yes     start of BASE
           "http://foo"          yes         no      END
           "http://something/"   yes         yes     SLASH          */
      if (!seen_slash_slash)
        start_insert = base;
      else if (slash)
        start_insert = slash;
      else
        start_insert = end;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  {
    /* LINK is a relative URL: we need to replace everything
       after last slash (possibly empty) with LINK.

       So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
       our result should be "whatever/foo/qux/xyzzy".  */
    bool need_explicit_slash = false;
    int span;
    char *merge;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host"  */
        /*                      ^  */
        /* The last slash belongs to "://", so BASE has no path.  Keep
           all of BASE up to END and reserve one more position for the
           slash that must separate the host from LINK.  */
        start_insert = end + 1;
        need_explicit_slash = true;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = uri_merge_concat (base, span, link, linklength);
    if (need_explicit_slash)
      /* Overwrite the character copied from END with the slash.  */
      merge[span - 1] = '/';

    return merge;
  }
}
1797 \f
1798 #define APPEND(p, s) do {                       \
1799   int len = strlen (s);                         \
1800   memcpy (p, s, len);                           \
1801   p += len;                                     \
1802 } while (0)
1803
1804 /* Use this instead of password when the actual password is supposed
1805    to be hidden.  We intentionally use a generic string without giving
1806    away the number of characters in the password, like previous
1807    versions did.  */
1808 #define HIDDEN_PASSWORD "*password*"
1809
1810 /* Recreate the URL string from the data in URL.
1811
1812    If HIDE is true (as it is when we're calling this on a URL we plan
1813    to print, but not when calling it to canonicalize a URL for use
1814    within the program), password will be hidden.  Unsafe characters in
1815    the URL will be quoted.  */
1816
1817 char *
1818 url_string (const struct url *url, bool hide_password)
1819 {
1820   int size;
1821   char *result, *p;
1822   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1823
1824   int scheme_port = supported_schemes[url->scheme].default_port;
1825   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1826   int fplen = full_path_length (url);
1827
1828   bool brackets_around_host;
1829
1830   assert (scheme_str != NULL);
1831
1832   /* Make sure the user name and password are quoted. */
1833   if (url->user)
1834     {
1835       quoted_user = url_escape_allow_passthrough (url->user);
1836       if (url->passwd)
1837         {
1838           if (hide_password)
1839             quoted_passwd = HIDDEN_PASSWORD;
1840           else
1841             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1842         }
1843     }
1844
1845   /* In the unlikely event that the host name contains non-printable
1846      characters, quote it for displaying to the user.  */
1847   quoted_host = url_escape_allow_passthrough (url->host);
1848
1849   /* Undo the quoting of colons that URL escaping performs.  IPv6
1850      addresses may legally contain colons, and in that case must be
1851      placed in square brackets.  */
1852   if (quoted_host != url->host)
1853     unescape_single_char (quoted_host, ':');
1854   brackets_around_host = strchr (quoted_host, ':') != NULL;
1855
1856   size = (strlen (scheme_str)
1857           + strlen (quoted_host)
1858           + (brackets_around_host ? 2 : 0)
1859           + fplen
1860           + 1);
1861   if (url->port != scheme_port)
1862     size += 1 + numdigit (url->port);
1863   if (quoted_user)
1864     {
1865       size += 1 + strlen (quoted_user);
1866       if (quoted_passwd)
1867         size += 1 + strlen (quoted_passwd);
1868     }
1869
1870   p = result = xmalloc (size);
1871
1872   APPEND (p, scheme_str);
1873   if (quoted_user)
1874     {
1875       APPEND (p, quoted_user);
1876       if (quoted_passwd)
1877         {
1878           *p++ = ':';
1879           APPEND (p, quoted_passwd);
1880         }
1881       *p++ = '@';
1882     }
1883
1884   if (brackets_around_host)
1885     *p++ = '[';
1886   APPEND (p, quoted_host);
1887   if (brackets_around_host)
1888     *p++ = ']';
1889   if (url->port != scheme_port)
1890     {
1891       *p++ = ':';
1892       p = number_to_string (p, url->port);
1893     }
1894
1895   full_path_write (url, p);
1896   p += fplen;
1897   *p++ = '\0';
1898
1899   assert (p - result == size);
1900
1901   if (quoted_user && quoted_user != url->user)
1902     xfree (quoted_user);
1903   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1904     xfree (quoted_passwd);
1905   if (quoted_host != url->host)
1906     xfree (quoted_host);
1907
1908   return result;
1909 }
1910 \f
1911 /* Return true if scheme a is similar to scheme b.
1912
1913    Schemes are similar if they are equal.  If SSL is supported, schemes
1914    are also similar if one is http (SCHEME_HTTP) and the other is https
1915    (SCHEME_HTTPS).  */
1916 bool
1917 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1918 {
1919   if (a == b)
1920     return true;
1921 #ifdef HAVE_SSL
1922   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1923       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1924     return true;
1925 #endif
1926   return false;
1927 }
1928 \f
1929 #if 0
1930 /* Debugging and testing support for path_simplify. */
1931
/* Debugger helper: duplicate PATH, run path_simplify on the
   duplicate, and return it.  PATH itself is left untouched; the
   returned string comes from xstrdup.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1941
/* Run path_simplify on a private copy of TEST and print a diagnostic
   if the simplified text differs from EXPECTED_RESULT, or if the
   "was modified" flag returned by path_simplify differs from
   EXPECTED_CHANGE.  Prints nothing when both match.  */
static void
run_test (char *test, char *expected_result, bool expected_change)
{
  char *work = xstrdup (test);
  bool changed = path_simplify (work);

  if (strcmp (work, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  /* Report a mismatch of the modification flag in either direction.  */
  if (expected_change && !changed)
    printf ("Expected modification with path_simplify(\"%s\").\n",
            test);
  else if (!expected_change && changed)
    printf ("Expected no modification with path_simplify(\"%s\").\n",
            test);

  xfree (work);
}
1964
/* Feed a fixed table of paths through run_test.  Each entry gives
   the input path, the text expected after path_simplify, and whether
   path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    bool changes;
  } cases[] = {
    { "",                       "",             false },
    { ".",                      "",             true },
    { "./",                     "",             true },
    { "..",                     "..",           false },
    { "../",                    "../",          false },
    { "foo",                    "foo",          false },
    { "foo/bar",                "foo/bar",      false },
    { "foo///bar",              "foo///bar",    false },
    { "foo/.",                  "foo/",         true },
    { "foo/./",                 "foo/",         true },
    { "foo./",                  "foo./",        false },
    { "foo/../bar",             "bar",          true },
    { "foo/../bar/",            "bar/",         true },
    { "foo/bar/..",             "foo/",         true },
    { "foo/bar/../x",           "foo/x",        true },
    { "foo/bar/../x/",          "foo/x/",       true },
    { "foo/..",                 "",             true },
    { "foo/../..",              "..",           true },
    { "foo/../../..",           "../..",        true },
    { "foo/../../bar/../../baz", "../../baz",   true },
    { "a/b/../../c",            "c",            true },
    { "./a/../b",               "b",            true }
  };
  int idx;

  for (idx = 0; idx < countof (cases); idx++)
    run_test (cases[idx].input, cases[idx].expected, cases[idx].changes);
}
2005 #endif