sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <string.h>
  37 #include <unistd.h>
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "utils.h"
  42 #include "url.h"
  43 #include "host.h"  /* for is_valid_ipv6_address */
  44
  45 #ifdef __VMS
  46 #include "vms.h"
  47 #endif /* def __VMS */
  48
  49 #ifdef TESTING
  50 #include "test.h"
  51 #endif
  52
  53 enum {
  54   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  55   scm_has_params = 2,           /* whether scheme has ;params */
  56   scm_has_query = 4,            /* whether scheme has ?query */
  57   scm_has_fragment = 8          /* whether scheme has #fragment */
  58 };
  59
      /* Static description of one URL scheme.  Instances live in the
         supported_schemes table.  */
   60 struct scheme_data
   61 {
   62   /* Short name of the scheme, such as "http" or "ftp". */
   63   const char *name;
   64   /* Leading string that identifies the scheme, such as "https://".
           url_scheme matches it against the start of a URL without regard
           to case.  */
   65   const char *leading_string;
   66   /* Default port of the scheme when none is specified. */
   67   int default_port;
   68   /* Bitwise OR of the scm_* flags. */
   69   int flags;
   70 };
  71
   72 /* Supported schemes.  The table is indexed by enum url_scheme values:
         url_scheme returns the index of the matching entry, and
         scheme_default_port / scheme_disable index it directly.  It is not
         const because scheme_disable sets scm_disabled in `flags' at run
         time.  The entry whose leading_string is NULL terminates the scan
         in url_scheme.  */
   73 static struct scheme_data supported_schemes[] =
   74 {
   75   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
   76 #ifdef HAVE_SSL
   77   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
   78 #endif
   79   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
   80
   81   /* SCHEME_INVALID */
   82   { NULL,       NULL,       -1,                 0 }
   83 };
  84
  85 /* Forward declarations: */
  86
  87 static bool path_simplify (enum url_scheme, char *);
  88 \f
  89 /* Support for escaping and unescaping of URL strings.  */
  90
  91 /* Table of "reserved" and "unsafe" characters.  Those terms are
  92    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  93    specs, but the general idea remains.
  94
  95    A reserved character is the one that you can't decode without
  96    changing the meaning of the URL.  For example, you can't decode
  97    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  98    path components is different.  Non-reserved characters can be
  99    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
 100    unsafe characters are loosely based on rfc1738, plus "$" and ",",
 101    as recommended by rfc2396, and minus "~", which is very frequently
 102    used (and sometimes unrecognized as %7E by broken servers).
 103
 104    An unsafe character is the one that should be encoded when URLs are
 105    placed in foreign environments.  E.g. space and newline are unsafe
 106    in HTTP contexts because HTTP uses them as separator and line
 107    terminator, so they must be encoded to %20 and %0A respectively.
 108    "*" is unsafe in shell context, etc.
 109
 110    We determine whether a character is unsafe through static table
 111    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 112
      /* Classification bits stored per character in urlchr_table.  A
         character may carry both bits.  */
  113 enum {
  114   /* rfc1738 reserved chars + "$" and ",".  */
  115   urlchr_reserved = 1,
  116
  117   /* rfc1738 unsafe chars, plus non-printables.  */
  118   urlchr_unsafe   = 2
  119 };
  120
      /* Non-zero if character C has any of the bits in MASK set in
         urlchr_table.  The cast to unsigned char keeps the table index in
         the range 0..255 even when C is a negative plain char.  */
  121 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
      /* Convenience tests for a single class.  */
  122 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  123 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 124
  125 /* Shorthands for the table; they are #undef'ed again right after it. */
  126 #define R  urlchr_reserved
  127 #define U  urlchr_unsafe
  128 #define RU R|U
  129
      /* Character class table, indexed by unsigned char value.  Each row
         below covers eight consecutive codes; the trailing comment names
         them.  */
  130 static const unsigned char urlchr_table[256] =
  131 {
  132   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  133   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  134   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  135   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  136   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  137   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  138   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  139   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
  140  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  141   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  142   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  143   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  144   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  145   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  146   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  147   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
  148
        /* Codes 128-255: every one is marked unsafe.  */
  149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  153
  154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  155   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  156   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  157   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  158 };
  159 #undef R
  160 #undef U
  161 #undef RU
 162
 163 /* URL-unescape the string S.
 164
 165    This is done by transforming the sequences "%HH" to the character
 166    represented by the hexadecimal digits HH.  If % is not followed by
 167    two hexadecimal digits, it is inserted literally.
 168
 169    The transformation is done in place.  If you need the original
 170    string intact, make a copy before calling this function.  */
 171
 172 void
 173 url_unescape (char *s)
 174 {
 175   char *t = s;                  /* t - tortoise */
 176   char *h = s;                  /* h - hare     */
 177
 178   for (; *h; h++, t++)
 179     {
 180       if (*h != '%')
 181         {
 182         copychar:
 183           *t = *h;
 184         }
 185       else
 186         {
 187           char c;
 188           /* Do nothing if '%' is not followed by two hex digits. */
 189           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 190             goto copychar;
 191           c = X2DIGITS_TO_NUM (h[1], h[2]);
 192           /* Don't unescape %00 because there is no way to insert it
 193              into a C string without effectively truncating it. */
 194           if (c == '\0')
 195             goto copychar;
 196           *t = c;
 197           h += 2;
 198         }
 199     }
 200   *t = '\0';
 201 }
 202
 203 /* The core of url_escape_* functions.  Escapes the characters that
 204    match the provided mask in urlchr_table.
 205
 206    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 207    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 208    allocated string will be returned in all cases.  */
 209
 210 static char *
 211 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 212 {
 213   const char *p1;
 214   char *p2, *newstr;
 215   int newlen;
 216   int addition = 0;
 217
 218   for (p1 = s; *p1; p1++)
 219     if (urlchr_test (*p1, mask))
 220       addition += 2;            /* Two more characters (hex digits) */
 221
 222   if (!addition)
 223     return allow_passthrough ? (char *)s : xstrdup (s);
 224
 225   newlen = (p1 - s) + addition;
 226   newstr = xmalloc (newlen + 1);
 227
 228   p1 = s;
 229   p2 = newstr;
 230   while (*p1)
 231     {
 232       /* Quote the characters that match the test mask. */
 233       if (urlchr_test (*p1, mask))
 234         {
 235           unsigned char c = *p1++;
 236           *p2++ = '%';
 237           *p2++ = XNUM_TO_DIGIT (c >> 4);
 238           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 239         }
 240       else
 241         *p2++ = *p1++;
 242     }
 243   assert (p2 - newstr == newlen);
 244   *p2 = '\0';
 245
 246   return newstr;
 247 }
 248
 249 /* URL-escape the unsafe characters (see urlchr_table) in a given
 250    string, returning a freshly allocated string.  */
 251
 252 char *
 253 url_escape (const char *s)
 254 {
 255   return url_escape_1 (s, urlchr_unsafe, false);
 256 }
 257
 258 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
 259    a given string, returning a freshly allocated string.  */
 260
 261 char *
 262 url_escape_unsafe_and_reserved (const char *s)
 263 {
 264   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
 265 }
 266
 267 /* URL-escape the unsafe characters (see urlchr_table) in a given
 268    string.  If no characters are unsafe, S is returned.  */
 269
 270 static char *
 271 url_escape_allow_passthrough (const char *s)
 272 {
 273   return url_escape_1 (s, urlchr_unsafe, true);
 274 }
 275 \f
 276 /* Decide whether the char at position P needs to be encoded.  (It is
 277    not enough to pass a single char *P because the function may need
 278    to inspect the surrounding context.)
 279
 280    Return true if the char should be escaped as %XX, false otherwise.  */
 281
 282 static inline bool
 283 char_needs_escaping (const char *p)
 284 {
 285   if (*p == '%')
 286     {
 287       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 288         return false;
 289       else
 290         /* Garbled %.. sequence: encode `%'. */
 291         return true;
 292     }
 293   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 294     return true;
 295   else
 296     return false;
 297 }
 298
 299 /* Translate a %-escaped (but possibly non-conformant) input string S
 300    into a %-escaped (and conformant) output string.  If no characters
 301    are encoded or decoded, return the same string S; otherwise, return
 302    a freshly allocated string with the new contents.
 303
 304    After a URL has been run through this function, the protocols that
 305    use `%' as the quote character can use the resulting string as-is,
 306    while those that don't can use url_unescape to get to the intended
 307    data.  This function is stable: once the input is transformed,
 308    further transformations of the result yield the same output.
 309
 310    Let's discuss why this function is needed.
 311
 312    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 313    a raw space character would mess up the HTTP request, it needs to
 314    be quoted, like this:
 315
 316        GET /abc%20def HTTP/1.0
 317
 318    It would appear that the unsafe chars need to be quoted, for
 319    example with url_escape.  But what if we're requested to download
 320    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 321    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 322    part of URL syntax, "%20" is the correct way to denote a literal
 323    space on the Wget command line.  This leads to the conclusion that
 324    in that case Wget should not call url_escape, but leave the `%20'
 325    as is.  This is clearly contradictory, but it only gets worse.
 326
 327    What if the requested URI is `abc%20 def'?  If we call url_escape,
 328    we end up with `/abc%2520%20def', which is almost certainly not
 329    intended.  If we don't call url_escape, we are left with the
 330    embedded space and cannot complete the request.  What the user
 331    meant was for Wget to request `/abc%20%20def', and this is where
 332    reencode_escapes kicks in.
 333
 334    Wget used to solve this by first decoding %-quotes, and then
 335    encoding all the "unsafe" characters found in the resulting string.
 336    This was wrong because it didn't preserve certain URL special
 337    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 338    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 339    whether we considered `+' reserved (it is).  One of these results
 340    is inevitable because by the second step we would lose information
 341    on whether the `+' was originally encoded or not.  Both results
 342    were wrong because in CGI parameters + means space, while %2B means
 343    literal plus.  reencode_escapes correctly translates the above to
 344    "a%2B+b", i.e. returns the original string.
 345
 346    This function uses a modified version of the algorithm originally
 347    proposed by Anon Sricharoenchai:
 348
 349    * Encode all "unsafe" characters, except those that are also
 350      "reserved", to %XX.  See urlchr_table for which characters are
 351      unsafe and reserved.
 352
 353    * Encode the "%" characters not followed by two hex digits to
 354      "%25".
 355
 356    * Pass through all other characters and %XX escapes as-is.  (Up to
 357      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 358      characters, but that was obtrusive and broke some servers.)
 359
 360    Anon's test case:
 361
 362    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 363    ->
 364    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 365
 366    Simpler test cases:
 367
 368    "foo bar"         -> "foo%20bar"
 369    "foo%20bar"       -> "foo%20bar"
 370    "foo %20bar"      -> "foo%20%20bar"
 371    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 372    "foo%25%20bar"    -> "foo%25%20bar"
 373    "foo%2%20bar"     -> "foo%252%20bar"
 374    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 375    "foo%2b+bar"      -> "foo%2b+bar"  */
 376
 377 static char *
 378 reencode_escapes (const char *s)
 379 {
 380   const char *p1;
 381   char *newstr, *p2;
 382   int oldlen, newlen;
 383
 384   int encode_count = 0;
 385
 386   /* First pass: inspect the string to see if there's anything to do,
 387      and to calculate the new length.  */
 388   for (p1 = s; *p1; p1++)
 389     if (char_needs_escaping (p1))
 390       ++encode_count;
 391
 392   if (!encode_count)
 393     /* The string is good as it is. */
 394     return (char *) s;          /* C const model sucks. */
 395
 396   oldlen = p1 - s;
 397   /* Each encoding adds two characters (hex digits).  */
 398   newlen = oldlen + 2 * encode_count;
 399   newstr = xmalloc (newlen + 1);
 400
 401   /* Second pass: copy the string to the destination address, encoding
 402      chars when needed.  */
 403   p1 = s;
 404   p2 = newstr;
 405
 406   while (*p1)
 407     if (char_needs_escaping (p1))
 408       {
 409         unsigned char c = *p1++;
 410         *p2++ = '%';
 411         *p2++ = XNUM_TO_DIGIT (c >> 4);
 412         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 413       }
 414     else
 415       *p2++ = *p1++;
 416
 417   *p2 = '\0';
 418   assert (p2 - newstr == newlen);
 419   return newstr;
 420 }
 421 \f
 422 /* Returns the scheme type if the scheme is supported, or
 423    SCHEME_INVALID if not.  */
 424
 425 enum url_scheme
 426 url_scheme (const char *url)
 427 {
 428   int i;
 429
 430   for (i = 0; supported_schemes[i].leading_string; i++)
 431     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 432                           strlen (supported_schemes[i].leading_string)))
 433       {
 434         if (!(supported_schemes[i].flags & scm_disabled))
 435           return (enum url_scheme) i;
 436         else
 437           return SCHEME_INVALID;
 438       }
 439
 440   return SCHEME_INVALID;
 441 }
 442
 443 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 444
 445 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 446    currently implemented, it returns true if URL begins with
 447    [-+a-zA-Z0-9]+: .  */
 448
 449 bool
 450 url_has_scheme (const char *url)
 451 {
 452   const char *p = url;
 453
 454   /* The first char must be a scheme char. */
 455   if (!*p || !SCHEME_CHAR (*p))
 456     return false;
 457   ++p;
 458   /* Followed by 0 or more scheme chars. */
 459   while (*p && SCHEME_CHAR (*p))
 460     ++p;
 461   /* Terminated by ':'. */
 462   return *p == ':';
 463 }
 464
 465 bool
 466 url_valid_scheme (const char *url)
 467 {
 468   enum url_scheme scheme = url_scheme (url);
 469   return scheme != SCHEME_INVALID;
 470 }
 471
 472 int
 473 scheme_default_port (enum url_scheme scheme)
 474 {
 475   return supported_schemes[scheme].default_port;
 476 }
 477
 478 void
 479 scheme_disable (enum url_scheme scheme)
 480 {
 481   supported_schemes[scheme].flags |= scm_disabled;
 482 }
 483
 484 /* Skip the username and password, if present in the URL.  The
 485    function should *not* be called with the complete URL, but with the
 486    portion after the scheme.
 487
 488    If no username and password are found, return URL.  */
 489
 490 static const char *
 491 url_skip_credentials (const char *url)
 492 {
 493   /* Look for '@' that comes before terminators, such as '/', '?',
 494      '#', or ';'.  */
 495   const char *p = (const char *)strpbrk (url, "@/?#;");
 496   if (!p || *p != '@')
 497     return url;
 498   return p + 1;
 499 }
 500
 501 /* Parse credentials contained in [BEG, END).  The region is expected
 502    to have come from a URL and is unescaped.  */
 503
 504 static bool
 505 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 506 {
 507   char *colon;
 508   const char *userend;
 509
 510   if (beg == end)
 511     return false;               /* empty user name */
 512
 513   colon = memchr (beg, ':', end - beg);
 514   if (colon == beg)
 515     return false;               /* again empty user name */
 516
 517   if (colon)
 518     {
 519       *passwd = strdupdelim (colon + 1, end);
 520       userend = colon;
 521       url_unescape (*passwd);
 522     }
 523   else
 524     {
 525       *passwd = NULL;
 526       userend = end;
 527     }
 528   *user = strdupdelim (beg, userend);
 529   url_unescape (*user);
 530   return true;
 531 }
 532
 533 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 534    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 535    like this:
 536
 537    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 538    www.foo.com[:port]            -> http://www.foo.com[:port]
 539
 540    FTP shorthands look like this:
 541
 542    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 543    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 544
 545    If the URL needs not or cannot be rewritten, return NULL.  */
 546
 547 char *
 548 rewrite_shorthand_url (const char *url)
 549 {
 550   const char *p;
 551   char *ret;
 552
 553   if (url_scheme (url) != SCHEME_INVALID)
 554     return NULL;
 555
 556   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 557      latter Netscape.  */
 558   p = strpbrk (url, ":/");
 559   if (p == url)
 560     return NULL;
 561
 562   /* If we're looking at "://", it means the URL uses a scheme we
 563      don't support, which may include "https" when compiled without
 564      SSL support.  Don't bogusly rewrite such URLs.  */
 565   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 566     return NULL;
 567
 568   if (p && *p == ':')
 569     {
 570       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 571          special case of http port number ("localhost:10000").  */
 572       int digits = strspn (p + 1, "0123456789");
 573       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 574         goto http;
 575
 576       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 577       ret = aprintf ("ftp://%s", url);
 578       ret[6 + (p - url)] = '/';
 579     }
 580   else
 581     {
 582     http:
 583       /* Just prepend "http://" to URL. */
 584       ret = aprintf ("http://%s", url);
 585     }
 586   return ret;
 587 }
 588 \f
 589 static void split_path (const char *, char **, char **);
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.  */
 594
 595 static inline char *
 596 strpbrk_or_eos (const char *s, const char *accept)
 597 {
 598   char *p = strpbrk (s, accept);
 599   if (!p)
 600     p = strchr (s, '\0');
 601   return p;
 602 }
 603
 604 /* Turn STR into lowercase; return true if a character was actually
 605    changed. */
 606
 607 static bool
 608 lowercase_str (char *str)
 609 {
 610   bool changed = false;
 611   for (; *str; str++)
 612     if (c_isupper (*str))
 613       {
 614         changed = true;
 615         *str = c_tolower (*str);
 616       }
 617   return changed;
 618 }
 619
 620 static const char *
 621 init_seps (enum url_scheme scheme)
 622 {
 623   static char seps[8] = ":/";
 624   char *p = seps + 2;
 625   int flags = supported_schemes[scheme].flags;
 626
 627   if (flags & scm_has_params)
 628     *p++ = ';';
 629   if (flags & scm_has_query)
 630     *p++ = '?';
 631   if (flags & scm_has_fragment)
 632     *p++ = '#';
 633   *p = '\0';
 634   return seps;
 635 }
 636
      /* Messages for the PE_* error codes that url_parse reports.  Each
         #define gives the array index of the string that follows it, so
         the defines and the strings must be kept in step.  The strings
         are marked with N_() here; url_error passes them through _()
         when it builds the message.  */
  637 static const char *parse_errors[] = {
  638 #define PE_NO_ERROR                     0
  639   N_("No error"),
  640 #define PE_UNSUPPORTED_SCHEME           1
  641   N_("Unsupported scheme %s"), /* support for format token only here */
  642 #define PE_MISSING_SCHEME               2
  643   N_("Scheme missing"),
  644 #define PE_INVALID_HOST_NAME            3
  645   N_("Invalid host name"),
  646 #define PE_BAD_PORT_NUMBER              4
  647   N_("Bad port number"),
  648 #define PE_INVALID_USER_NAME            5
  649   N_("Invalid user name"),
  650 #define PE_UNTERMINATED_IPV6_ADDRESS    6
  651   N_("Unterminated IPv6 numeric address"),
  652 #define PE_IPV6_NOT_SUPPORTED           7
  653   N_("IPv6 addresses not supported"),
  654 #define PE_INVALID_IPV6_ADDRESS         8
  655   N_("Invalid IPv6 numeric address")
  656 };
 657
  658 /* Parse a URL.
  659
  660    Return a new struct url if successful, NULL on error.  In case of
  661    error, and if ERROR is not NULL, also set *ERROR to the appropriate
  662    error code (one of the PE_* values, suitable for url_error).
         *ERROR is not written on success.

         IRI may be NULL.  When it is non-NULL and iri->utf8_encode is set,
         the URL is first passed through remote_to_utf8; if that succeeds,
         iri->orig_url is replaced with a copy of URL and percent-encoding
         is forced on.

         If PERCENT_ENCODE is true, the URL is run through reencode_escapes
         before it is broken into components.  */
  663 struct url *
  664 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
  665 {
  666   struct url *u;
  667   const char *p;
  668   bool path_modified, host_modified;
  669
  670   enum url_scheme scheme;
  671   const char *seps;
  672
        /* Each X_b / X_e pair delimits component X as [X_b, X_e) inside
           URL_ENCODED.  */
  673   const char *uname_b,     *uname_e;
  674   const char *host_b,      *host_e;
  675   const char *path_b,      *path_e;
  676   const char *params_b,    *params_e;
  677   const char *query_b,     *query_e;
  678   const char *fragment_b,  *fragment_e;
  679
  680   int port;
  681   char *user = NULL, *passwd = NULL;
  682
        /* URL_ENCODED is the string that actually gets parsed.  It is
           either URL itself or a separate string that this function has
           to free or hand over to u->url.  */
  683   const char *url_encoded = NULL;
  684   char *new_url = NULL;
  685
  686   int error_code;
  687
  688   scheme = url_scheme (url);
  689   if (scheme == SCHEME_INVALID)
  690     {
  691       if (url_has_scheme (url))
  692         error_code = PE_UNSUPPORTED_SCHEME;
  693       else
  694         error_code = PE_MISSING_SCHEME;
  695       goto error;
  696     }
  697
  698   if (iri && iri->utf8_encode)
  699     {
  700       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
  701       if (!iri->utf8_encode)
  702         new_url = NULL;
  703       else
  704         {
                /* NOTE(review): a previous iri->orig_url is overwritten
                   here without being freed; looks like a leak if the iri
                   owns that string -- confirm against the iri code.  */
  705           iri->orig_url = xstrdup (url);
  706           percent_encode = true;
  707         }
  708     }
  709
  710   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
  711   if (percent_encode)
  712     url_encoded = reencode_escapes (new_url ? new_url : url);
  713   else
  714     url_encoded = new_url ? new_url : url;
  715
  716   p = url_encoded;
  717
        /* reencode_escapes returns its argument when nothing changed, so
           NEW_URL is released only if a separate string replaced it.  */
  718   if (new_url && url_encoded != new_url)
  719     xfree (new_url);
  720
  721   p += strlen (supported_schemes[scheme].leading_string);
  722   uname_b = p;
  723   p = url_skip_credentials (p);
  724   uname_e = p;
  725
  726   /* scheme://user:pass@host[:port]... */
  727   /*                    ^              */
  728
  729   /* We attempt to break down the URL into the components path,
  730      params, query, and fragment.  They are ordered like this:
  731
  732        scheme://host[:port][/path][;params][?query][#fragment]  */
  733
  734   path_b     = path_e     = NULL;
  735   params_b   = params_e   = NULL;
  736   query_b    = query_e    = NULL;
  737   fragment_b = fragment_e = NULL;
  738
  739   /* Initialize separators for optional parts of URL, depending on the
  740      scheme.  For example, FTP has params, and HTTP and HTTPS have
  741      query string and fragment. */
  742   seps = init_seps (scheme);
  743
  744   host_b = p;
  745
  746   if (*p == '[')
  747     {
  748       /* Handle IPv6 address inside square brackets.  Ideally we'd
  749          just look for the terminating ']', but rfc2732 mandates
  750          rejecting invalid IPv6 addresses.  */
  751
  752       /* The address begins after '['. */
  753       host_b = p + 1;
  754       host_e = strchr (host_b, ']');
  755
  756       if (!host_e)
  757         {
  758           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
  759           goto error;
  760         }
  761
  762 #ifdef ENABLE_IPV6
  763       /* Check if the IPv6 address is valid. */
  764       if (!is_valid_ipv6_address(host_b, host_e))
  765         {
  766           error_code = PE_INVALID_IPV6_ADDRESS;
  767           goto error;
  768         }
  769
  770       /* Continue parsing after the closing ']'. */
  771       p = host_e + 1;
  772 #else
  773       error_code = PE_IPV6_NOT_SUPPORTED;
  774       goto error;
  775 #endif
  776
  777       /* The closing bracket must be followed by a separator or by the
  778          null char.  (strchr also matches the terminating null of
               SEPS, which is what accepts end-of-string here.)  */
  779       /* http://[::1]... */
  780       /*             ^   */
  781       if (!strchr (seps, *p))
  782         {
  783           /* Trailing garbage after []-delimited IPv6 address. */
  784           error_code = PE_INVALID_HOST_NAME;
  785           goto error;
  786         }
  787     }
  788   else
  789     {
  790       p = strpbrk_or_eos (p, seps);
  791       host_e = p;
  792     }
        /* Each ++seps drops the leading separator from the set, so later
           components are delimited only by the separators that can still
           follow them.  */
  793   ++seps;                       /* advance to '/' */
  794
  795   if (host_b == host_e)
  796     {
  797       error_code = PE_INVALID_HOST_NAME;
  798       goto error;
  799     }
  800
  801   port = scheme_default_port (scheme);
  802   if (*p == ':')
  803     {
  804       const char *port_b, *port_e, *pp;
  805
  806       /* scheme://host:port/tralala */
  807       /*              ^             */
  808       ++p;
  809       port_b = p;
  810       p = strpbrk_or_eos (p, seps);
  811       port_e = p;
  812
  813       /* Allow empty port, as per rfc2396. */
  814       if (port_b != port_e)
  815         for (port = 0, pp = port_b; pp < port_e; pp++)
  816           {
  817             if (!c_isdigit (*pp))
  818               {
  819                 /* http://host:12randomgarbage/blah */
  820                 /*               ^                  */
  821                 error_code = PE_BAD_PORT_NUMBER;
  822                 goto error;
  823               }
  824             port = 10 * port + (*pp - '0');
  825             /* Check for too large port numbers here, before we have
  826                a chance to overflow on bogus port values.  */
  827             if (port > 0xffff)
  828               {
  829                 error_code = PE_BAD_PORT_NUMBER;
  830                 goto error;
  831               }
  832           }
  833     }
  834   /* Advance to the first separator *after* '/' (either ';' or '?',
  835      depending on the scheme).  */
  836   ++seps;
  837
  838   /* Get the optional parts of URL, each part being delimited by
  839      current location and the position of the next separator.  */
  840 #define GET_URL_PART(sepchar, var) do {                         \
  841   if (*p == sepchar)                                            \
  842     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  843   ++seps;                                                       \
  844 } while (0)
  845
  846   GET_URL_PART ('/', path);
  847   if (supported_schemes[scheme].flags & scm_has_params)
  848     GET_URL_PART (';', params);
  849   if (supported_schemes[scheme].flags & scm_has_query)
  850     GET_URL_PART ('?', query);
  851   if (supported_schemes[scheme].flags & scm_has_fragment)
  852     GET_URL_PART ('#', fragment);
  853
  854 #undef GET_URL_PART
        /* Every character of the URL must have been consumed by now.  */
  855   assert (*p == 0);
  856
  857   if (uname_b != uname_e)
  858     {
  859       /* http://user:pass@host */
  860       /*        ^         ^    */
  861       /*     uname_b   uname_e */
            /* UNAME_E points just past the '@', hence the - 1.  */
  862       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
  863         {
  864           error_code = PE_INVALID_USER_NAME;
  865           goto error;
  866         }
  867     }
  868
        /* Nothing below this point jumps to the error label.  */
  869   u = xnew0 (struct url);
  870   u->scheme = scheme;
  871   u->host   = strdupdelim (host_b, host_e);
  872   u->port   = port;
  873   u->user   = user;
  874   u->passwd = passwd;
  875
        /* PATH_B and PATH_E are both still NULL here when the URL has no
           '/' after the host.  */
  876   u->path = strdupdelim (path_b, path_e);
  877   path_modified = path_simplify (scheme, u->path);
  878   split_path (u->path, &u->dir, &u->file);
  879
  880   host_modified = lowercase_str (u->host);
  881
  882   /* Decode %HH sequences in host name.  This is important not so much
  883      to support %HH sequences in host names (which other browsers
  884      don't), but to support binary characters (which will have been
  885      converted to %HH by reencode_escapes).  */
  886   if (strchr (u->host, '%'))
  887     {
  888       url_unescape (u->host);
  889       host_modified = true;
  890
  891       /* Apply IDNA regardless of iri->utf8_encode status */
  892       if (opt.enable_iri && iri)
  893         {
  894           char *new = idn_encode (iri, u->host);
  895           if (new)
  896             {
  897               xfree (u->host);
  898               u->host = new;
  899               host_modified = true;
  900             }
  901         }
  902     }
  903
  904   if (params_b)
  905     u->params = strdupdelim (params_b, params_e);
  906   if (query_b)
  907     u->query = strdupdelim (query_b, query_e);
  908   if (fragment_b)
  909     u->fragment = strdupdelim (fragment_b, fragment_e);
  910
  911   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
  912     {
  913       /* If we suspect that a transformation has rendered what
  914          url_string might return different from URL_ENCODED, rebuild
  915          u->url using url_string.  */
  916       u->url = url_string (u, URL_AUTH_SHOW);
  917
            /* URL_ENCODED is no longer needed; free it unless it is the
               caller's own string.  */
  918       if (url_encoded != url)
  919         xfree ((char *) url_encoded);
  920     }
  921   else
  922     {
            /* Keep URL_ENCODED as u->url: copy it when it is the caller's
               string, otherwise hand the string over to U.  */
  923       if (url_encoded == url)
  924         u->url = xstrdup (url);
  925       else
  926         u->url = (char *) url_encoded;
  927     }
  928
  929   return u;
  930
  931  error:
  932   /* Cleanup in case of error: */
  933   if (url_encoded && url_encoded != url)
  934     xfree ((char *) url_encoded);
  935
  936   /* Transmit the error code to the caller, if the caller wants to
  937      know.  */
  938   if (error)
  939     *error = error_code;
  940   return NULL;
  941 }
 942
 943 /* Return the error message string from ERROR_CODE, which should have
 944    been retrieved from url_parse.  The error message is translated.  */
 945
 946 char *
 947 url_error (const char *url, int error_code)
 948 {
 949   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 950
 951   if (error_code == PE_UNSUPPORTED_SCHEME)
 952     {
 953       char *error, *p;
 954       char *scheme = xstrdup (url);
 955       assert (url_has_scheme (url));
 956
 957       if ((p = strchr (scheme, ':')))
 958         *p = '\0';
 959       if (!strcasecmp (scheme, "https"))
 960         error = aprintf (_("HTTPS support not compiled in"));
 961       else
 962         error = aprintf (_(parse_errors[error_code]), quote (scheme));
 963       xfree (scheme);
 964
 965       return error;
 966     }
 967   else
 968     return xstrdup (_(parse_errors[error_code]));
 969 }
 970
 971 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 972    expected to be URL-escaped.
 973
 974    The path is split into directory (the part up to the last slash)
 975    and file (the part after the last slash), which are subsequently
 976    unescaped.  Examples:
 977
 978    PATH                 DIR           FILE
 979    "foo/bar/baz"        "foo/bar"     "baz"
 980    "foo/bar/"           "foo/bar"     ""
 981    "foo"                ""            "foo"
 982    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 983
 984    DIR and FILE are freshly allocated.  */
 985
 986 static void
 987 split_path (const char *path, char **dir, char **file)
 988 {
 989   char *last_slash = strrchr (path, '/');
 990   if (!last_slash)
 991     {
 992       *dir = xstrdup ("");
 993       *file = xstrdup (path);
 994     }
 995   else
 996     {
 997       *dir = strdupdelim (path, last_slash);
 998       *file = xstrdup (last_slash + 1);
 999     }
1000   url_unescape (*dir);
1001   url_unescape (*file);
1002 }
1003
1004 /* Note: URL's "full path" is the path with the query string and
1005    params appended.  The "fragment" (#foo) is intentionally ignored,
1006    but that might be changed.  For example, if the original URL was
1007    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1008    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1009
1010 /* Return the length of the full path, without the terminating
1011    zero.  */
1012
1013 static int
1014 full_path_length (const struct url *url)
1015 {
1016   int len = 0;
1017
1018 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1019
1020   FROB (path);
1021   FROB (params);
1022   FROB (query);
1023
1024 #undef FROB
1025
1026   return len;
1027 }
1028
1029 /* Write out the full path. */
1030
1031 static void
1032 full_path_write (const struct url *url, char *where)
1033 {
1034 #define FROB(el, chr) do {                      \
1035   char *f_el = url->el;                         \
1036   if (f_el) {                                   \
1037     int l = strlen (f_el);                      \
1038     *where++ = chr;                             \
1039     memcpy (where, f_el, l);                    \
1040     where += l;                                 \
1041   }                                             \
1042 } while (0)
1043
1044   FROB (path, '/');
1045   FROB (params, ';');
1046   FROB (query, '?');
1047
1048 #undef FROB
1049 }
1050
/* Return the "full path" of URL as a newly allocated, zero-terminated
   string.  For instance, with u->path "foo/bar" and u->query
   "param=value" the result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  /* full_path_write does not terminate its output, so the terminator
     is placed here.  */
  buf[len] = '\0';
  full_path_write (url, buf);

  return buf;
}
1066
/* Unescape CHR in an otherwise escaped STR.  Used to selectively undo
   the escaping of certain characters, such as "/" and ":".

   STR is modified in place: every "%XY" sequence whose two digits
   equal the ones XNUM_TO_DIGIT produces for CHR is replaced by CHR
   itself; all other characters are left alone.  Nothing is
   returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that XNUM_TO_DIGIT is never handed a
     negative value when plain char is signed and CHR has its high
     bit set.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */
  for (; *h; h++, t++)
    {
      /* h[1] is only a valid read because h[0] is not the terminator,
         and h[2] is only read once h[1] matched (and thus is not the
         terminator either).  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1090
1091 /* Escape unsafe and reserved characters, except for the slash
1092    characters.  */
1093
1094 static char *
1095 url_escape_dir (const char *dir)
1096 {
1097   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1098   if (newdir == dir)
1099     return (char *)dir;
1100
1101   unescape_single_char (newdir, '/');
1102   return newdir;
1103 }
1104
1105 /* Sync u->path and u->url with u->dir and u->file.  Called after
1106    u->file or u->dir have been changed, typically by the FTP code.  */
1107
1108 static void
1109 sync_path (struct url *u)
1110 {
1111   char *newpath, *efile, *edir;
1112
1113   xfree (u->path);
1114
1115   /* u->dir and u->file are not escaped.  URL-escape them before
1116      reassembling them into u->path.  That way, if they contain
1117      separators like '?' or even if u->file contains slashes, the
1118      path will be correctly assembled.  (u->file can contain slashes
1119      if the URL specifies it with %2f, or if an FTP server returns
1120      it.)  */
1121   edir = url_escape_dir (u->dir);
1122   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1123
1124   if (!*edir)
1125     newpath = xstrdup (efile);
1126   else
1127     {
1128       int dirlen = strlen (edir);
1129       int filelen = strlen (efile);
1130
1131       /* Copy "DIR/FILE" to newpath. */
1132       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1133       memcpy (p, edir, dirlen);
1134       p += dirlen;
1135       *p++ = '/';
1136       memcpy (p, efile, filelen);
1137       p += filelen;
1138       *p = '\0';
1139     }
1140
1141   u->path = newpath;
1142
1143   if (edir != u->dir)
1144     xfree (edir);
1145   if (efile != u->file)
1146     xfree (efile);
1147
1148   /* Regenerate u->url as well.  */
1149   xfree (u->url);
1150   u->url = url_string (u, URL_AUTH_SHOW);
1151 }
1152
1153 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1154    This way we can sync u->path and u->url when they get changed.  */
1155
1156 void
1157 url_set_dir (struct url *url, const char *newdir)
1158 {
1159   xfree (url->dir);
1160   url->dir = xstrdup (newdir);
1161   sync_path (url);
1162 }
1163
1164 void
1165 url_set_file (struct url *url, const char *newfile)
1166 {
1167   xfree (url->file);
1168   url->file = xstrdup (newfile);
1169   sync_path (url);
1170 }
1171
1172 void
1173 url_free (struct url *url)
1174 {
1175   xfree (url->host);
1176   xfree (url->path);
1177   xfree (url->url);
1178
1179   xfree_null (url->params);
1180   xfree_null (url->query);
1181   xfree_null (url->fragment);
1182   xfree_null (url->user);
1183   xfree_null (url->passwd);
1184
1185   xfree (url->dir);
1186   xfree (url->file);
1187
1188   xfree (url);
1189 }
1190 \f
1191 /* Create all the necessary directories for PATH (a file).  Calls
1192    make_directory internally.  */
1193 int
1194 mkalldirs (const char *path)
1195 {
1196   const char *p;
1197   char *t;
1198   struct_stat st;
1199   int res;
1200
1201   p = path + strlen (path);
1202   for (; *p != '/' && p != path; p--)
1203     ;
1204
1205   /* Don't create if it's just a file.  */
1206   if ((p == path) && (*p != '/'))
1207     return 0;
1208   t = strdupdelim (path, p);
1209
1210   /* Check whether the directory exists.  */
1211   if ((stat (t, &st) == 0))
1212     {
1213       if (S_ISDIR (st.st_mode))
1214         {
1215           xfree (t);
1216           return 0;
1217         }
1218       else
1219         {
1220           /* If the dir exists as a file name, remove it first.  This
1221              is *only* for Wget to work with buggy old CERN http
1222              servers.  Here is the scenario: When Wget tries to
1223              retrieve a directory without a slash, e.g.
1224              http://foo/bar (bar being a directory), CERN server will
1225              not redirect it too http://foo/bar/ -- it will generate a
1226              directory listing containing links to bar/file1,
1227              bar/file2, etc.  Wget will lose because it saves this
1228              HTML listing to a file `bar', so it cannot create the
1229              directory.  To work around this, if the file of the same
1230              name exists, we just remove it and create the directory
1231              anyway.  */
1232           DEBUGP (("Removing %s because of directory danger!\n", t));
1233           unlink (t);
1234         }
1235     }
1236   res = make_directory (t);
1237   if (res != 0)
1238     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1239   xfree (t);
1240   return res;
1241 }
1242 \f
1243 /* Functions for constructing the file name out of URL components.  */
1244
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   Functions that write to the members in this struct must make sure
   that base remains null terminated by calling append_null().
   */

struct growable {
  char *base;
  int size;   /* memory allocated */
  int tail;   /* string length */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that
   arbitrary expressions can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1277
1278
1279 /* Append NULL to DEST. */
1280 static void
1281 append_null (struct growable *dest)
1282 {
1283   GROW (dest, 1);
1284   *TAIL (dest) = 0;
1285 }
1286
1287 /* Append CH to DEST. */
1288 static void
1289 append_char (char ch, struct growable *dest)
1290 {
1291   if (ch)
1292     {
1293       GROW (dest, 1);
1294       *TAIL (dest) = ch;
1295       TAIL_INCR (dest, 1);
1296     }
1297
1298   append_null (dest);
1299 }
1300
1301 /* Append the string STR to DEST. */
1302 static void
1303 append_string (const char *str, struct growable *dest)
1304 {
1305   int l = strlen (str);
1306
1307   if (l)
1308     {
1309       GROW (dest, l);
1310       memcpy (TAIL (dest), str, l);
1311       TAIL_INCR (dest, l);
1312     }
1313
1314   append_null (dest);
1315 }
1316
1317
/* Bit flags stored in filechr_table; each one names a condition under
   which a character must be quoted in a file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* True if character C has to be quoted: either non-ASCII characters
   are restricted (opt.restrict_files_nonascii) and C is not ASCII, or
   the table entry for C has at least one of the bits in MASK set.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   The table is indexed by the character's value as unsigned char.
   Entries 128-159 are flagged as control characters; entries 160-255
   carry no flags.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
/* The shorthands are only meant for the table above.  */
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1377
/* FN_PORT_SEP goes between the host and the port in file names when
   the port is not the standard one.  It is ':' (as in
   "www.xemacs.org:4001/index.html") unless file names are restricted
   to what Windows accepts, in which case it is '+', because Windows
   can't handle ':' in file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP goes between the file name and the URL query.  It is
   '?' unless file names are restricted to what Windows accepts, in
   which case it is '@', because Windows cannot handle '?' in file
   names.  FN_QUERY_SEP_STR is the same separator as a string.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
#define FN_QUERY_SEP_STR (opt.restrict_files_os == restrict_windows ? "@" : "?")
1389
1390 /* Quote path element, characters in [b, e), as file name, and append
1391    the quoted string to DEST.  Each character is quoted as per
1392    file_unsafe_char and the corresponding table.
1393
1394    If ESCAPED is true, the path element is considered to be
1395    URL-escaped and will be unescaped prior to inspection.  */
1396
1397 static void
1398 append_uri_pathel (const char *b, const char *e, bool escaped,
1399                    struct growable *dest)
1400 {
1401   const char *p;
1402   int quoted, outlen;
1403
1404   int mask;
1405   if (opt.restrict_files_os == restrict_unix)
1406     mask = filechr_not_unix;
1407   else
1408     mask = filechr_not_windows;
1409   if (opt.restrict_files_ctrl)
1410     mask |= filechr_control;
1411
1412   /* Copy [b, e) to PATHEL and URL-unescape it. */
1413   if (escaped)
1414     {
1415       char *unescaped;
1416       BOUNDED_TO_ALLOCA (b, e, unescaped);
1417       url_unescape (unescaped);
1418       b = unescaped;
1419       e = unescaped + strlen (unescaped);
1420     }
1421
1422   /* Defang ".." when found as component of path.  Remember that path
1423      comes from the URL and might contain malicious input.  */
1424   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1425     {
1426       b = "%2E%2E";
1427       e = b + 6;
1428     }
1429
1430   /* Walk the PATHEL string and check how many characters we'll need
1431      to quote.  */
1432   quoted = 0;
1433   for (p = b; p < e; p++)
1434     if (FILE_CHAR_TEST (*p, mask))
1435       ++quoted;
1436
1437   /* Calculate the length of the output string.  e-b is the input
1438      string length.  Each quoted char introduces two additional
1439      characters in the string, hence 2*quoted.  */
1440   outlen = (e - b) + (2 * quoted);
1441   GROW (dest, outlen);
1442
1443   if (!quoted)
1444     {
1445       /* If there's nothing to quote, we can simply append the string
1446          without processing it again.  */
1447       memcpy (TAIL (dest), b, outlen);
1448     }
1449   else
1450     {
1451       char *q = TAIL (dest);
1452       for (p = b; p < e; p++)
1453         {
1454           if (!FILE_CHAR_TEST (*p, mask))
1455             *q++ = *p;
1456           else
1457             {
1458               unsigned char ch = *p;
1459               *q++ = '%';
1460               *q++ = XNUM_TO_DIGIT (ch >> 4);
1461               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1462             }
1463         }
1464       assert (q - TAIL (dest) == outlen);
1465     }
1466
1467   /* Perform inline case transformation if required.  */
1468   if (opt.restrict_files_case == restrict_lowercase
1469       || opt.restrict_files_case == restrict_uppercase)
1470     {
1471       char *q;
1472       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1473         {
1474           if (opt.restrict_files_case == restrict_lowercase)
1475             *q = c_tolower (*q);
1476           else
1477             *q = c_toupper (*q);
1478         }
1479     }
1480
1481   TAIL_INCR (dest, outlen);
1482   append_null (dest);
1483 }
1484
1485 /* Append to DEST the directory structure that corresponds the
1486    directory part of URL's path.  For example, if the URL is
1487    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1488
1489    Each path element ("dir1" and "dir2" in the above example) is
1490    examined, url-unescaped, and re-escaped as file name element.
1491
1492    Additionally, it cuts as many directories from the path as
1493    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1494    will produce "bar" for the above example.  For 2 or more, it will
1495    produce "".
1496
1497    Each component of the path is quoted for use as file name.  */
1498
1499 static void
1500 append_dir_structure (const struct url *u, struct growable *dest)
1501 {
1502   char *pathel, *next;
1503   int cut = opt.cut_dirs;
1504
1505   /* Go through the path components, de-URL-quote them, and quote them
1506      (if necessary) as file names.  */
1507
1508   pathel = u->path;
1509   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1510     {
1511       if (cut-- > 0)
1512         continue;
1513       if (pathel == next)
1514         /* Ignore empty pathels.  */
1515         continue;
1516
1517       if (dest->tail)
1518         append_char ('/', dest);
1519       append_uri_pathel (pathel, next, true, dest);
1520     }
1521 }
1522
1523 /* Return a unique file name that matches the given URL as well as
1524    possible.  Does not create directories on the file system.  */
1525
1526 char *
1527 url_file_name (const struct url *u, char *replaced_filename)
1528 {
1529   struct growable fnres;        /* stands for "file name result" */
1530   struct growable temp_fnres;
1531
1532   const char *u_file;
1533   char *fname, *unique, *fname_len_check;
1534   const char *index_filename = "index.html"; /* The default index file is index.html */
1535   size_t max_length;
1536
1537   fnres.base = NULL;
1538   fnres.size = 0;
1539   fnres.tail = 0;
1540
1541   temp_fnres.base = NULL;
1542   temp_fnres.size = 0;
1543   temp_fnres.tail = 0;
1544
1545   /* If an alternative index file was defined, change index_filename */
1546   if (opt.default_page)
1547     index_filename = opt.default_page;
1548
1549
1550   /* Start with the directory prefix, if specified. */
1551   if (opt.dir_prefix)
1552     append_string (opt.dir_prefix, &fnres);
1553
1554   /* If "dirstruct" is turned on (typically the case with -r), add
1555      the host and port (unless those have been turned off) and
1556      directory structure.  */
1557   if (opt.dirstruct)
1558     {
1559       if (opt.protocol_directories)
1560         {
1561           if (fnres.tail)
1562             append_char ('/', &fnres);
1563           append_string (supported_schemes[u->scheme].name, &fnres);
1564         }
1565       if (opt.add_hostdir)
1566         {
1567           if (fnres.tail)
1568             append_char ('/', &fnres);
1569           if (0 != strcmp (u->host, ".."))
1570             append_string (u->host, &fnres);
1571           else
1572             /* Host name can come from the network; malicious DNS may
1573                allow ".." to be resolved, causing us to write to
1574                "../<file>".  Defang such host names.  */
1575             append_string ("%2E%2E", &fnres);
1576           if (u->port != scheme_default_port (u->scheme))
1577             {
1578               char portstr[24];
1579               number_to_string (portstr, u->port);
1580               append_char (FN_PORT_SEP, &fnres);
1581               append_string (portstr, &fnres);
1582             }
1583         }
1584
1585       append_dir_structure (u, &fnres);
1586     }
1587
1588   if (!replaced_filename)
1589     {
1590       /* Create the filename. */
1591       u_file = *u->file ? u->file : index_filename;
1592
1593       /* Append "?query" to the file name, even if empty,
1594        * and create fname_len_check. */
1595       if (u->query)
1596         fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
1597       else
1598         fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1599     }
1600   else
1601     {
1602       u_file = replaced_filename;
1603       fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1604     }
1605
1606   append_uri_pathel (fname_len_check,
1607     fname_len_check + strlen (fname_len_check), false, &temp_fnres);
1608
1609   /* Zero-terminate the temporary file name. */
1610   append_char ('\0', &temp_fnres);
1611
1612   /* Check that the length of the file name is acceptable. */
1613 #ifdef WINDOWS
1614   if (MAX_PATH > (fnres.tail + CHOMP_BUFFER + 2))
1615     {
1616       max_length = MAX_PATH - (fnres.tail + CHOMP_BUFFER + 2);
1617       /* FIXME: In Windows a filename is usually limited to 255 characters.
1618       To really be accurate you could call GetVolumeInformation() to get
1619       lpMaximumComponentLength
1620       */
1621       if (max_length > 255)
1622         {
1623           max_length = 255;
1624         }
1625     }
1626   else
1627     {
1628       max_length = 0;
1629     }
1630 #else
1631   max_length = get_max_length (fnres.base, fnres.tail, _PC_NAME_MAX) - CHOMP_BUFFER;
1632 #endif
1633   if (max_length > 0 && strlen (temp_fnres.base) > max_length)
1634     {
1635       logprintf (LOG_NOTQUIET, "The name is too long, %lu chars total.\n",
1636           (unsigned long) strlen (temp_fnres.base));
1637       logprintf (LOG_NOTQUIET, "Trying to shorten...\n");
1638
1639       /* Shorten the file name. */
1640       temp_fnres.base[max_length] = '\0';
1641
1642       logprintf (LOG_NOTQUIET, "New name is %s.\n", temp_fnres.base);
1643     }
1644
1645   free (fname_len_check);
1646
1647   /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
1648    * just append it. */
1649   if (fnres.tail)
1650     append_char ('/', &fnres);
1651   append_string (temp_fnres.base, &fnres);
1652
1653   fname = fnres.base;
1654
1655   /* Make a final check that the path length is acceptable? */
1656   /* TODO: check fnres.base for path length problem */
1657
1658   free (temp_fnres.base);
1659
1660   /* Check the cases in which the unique extensions are not used:
1661      1) Clobbering is turned off (-nc).
1662      2) Retrieval with regetting.
1663      3) Timestamping is used.
1664      4) Hierarchy is built.
1665      5) Backups are specified.
1666
1667      The exception is the case when file does exist and is a
1668      directory (see `mkalldirs' for explanation).  */
1669
1670   if (ALLOW_CLOBBER
1671       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1672     {
1673       unique = fname;
1674     }
1675   else
1676     {
1677       unique = unique_name (fname, true);
1678       if (unique != fname)
1679         xfree (fname);
1680     }
1681
1682 /* On VMS, alter the name as required. */
1683 #ifdef __VMS
1684   {
1685     char *unique2;
1686
1687     unique2 = ods_conform( unique);
1688     if (unique2 != unique)
1689       {
1690         xfree (unique);
1691         unique = unique2;
1692       }
1693   }
1694 #endif /* def __VMS */
1695
1696   return unique;
1697 }
1698 \f
1699 /* Resolve "." and ".." elements of PATH by destructively modifying
1700    PATH and return true if PATH has been modified, false otherwise.
1701
1702    The algorithm is in spirit similar to the one described in rfc1808,
1703    although implemented differently, in one pass.  To recap, path
1704    elements containing only "." are removed, and ".." is taken to mean
1705    "back up one element".  Single leading and trailing slashes are
1706    preserved.
1707
1708    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1709    test examples are provided below.  If you change anything in this
1710    function, run test_path_simplify to make sure you haven't broken a
1711    test case.  */
1712
1713 static bool
1714 path_simplify (enum url_scheme scheme, char *path)
1715 {
1716   char *h = path;               /* hare */
1717   char *t = path;               /* tortoise */
1718   char *beg = path;
1719   char *end = strchr (path, '\0');
1720
1721   while (h < end)
1722     {
1723       /* Hare should be at the beginning of a path element. */
1724
1725       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1726         {
1727           /* Ignore "./". */
1728           h += 2;
1729         }
1730       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1731         {
1732           /* Handle "../" by retreating the tortoise by one path
1733              element -- but not past beggining.  */
1734           if (t > beg)
1735             {
1736               /* Move backwards until T hits the beginning of the
1737                  previous path element or the beginning of path. */
1738               for (--t; t > beg && t[-1] != '/'; t--)
1739                 ;
1740             }
1741           else if (scheme == SCHEME_FTP)
1742             {
1743               /* If we're at the beginning, copy the "../" literally
1744                  and move the beginning so a later ".." doesn't remove
1745                  it.  This violates RFC 3986; but we do it for FTP
1746                  anyway because there is otherwise no way to get at a
1747                  parent directory, when the FTP server drops us in a
1748                  non-root directory (which is not uncommon). */
1749               beg = t + 3;
1750               goto regular;
1751             }
1752           h += 3;
1753         }
1754       else
1755         {
1756         regular:
1757           /* A regular path element.  If H hasn't advanced past T,
1758              simply skip to the next path element.  Otherwise, copy
1759              the path element until the next slash.  */
1760           if (t == h)
1761             {
1762               /* Skip the path element, including the slash.  */
1763               while (h < end && *h != '/')
1764                 t++, h++;
1765               if (h < end)
1766                 t++, h++;
1767             }
1768           else
1769             {
1770               /* Copy the path element, including the final slash.  */
1771               while (h < end && *h != '/')
1772                 *t++ = *h++;
1773               if (h < end)
1774                 *t++ = *h++;
1775             }
1776         }
1777     }
1778
1779   if (t != h)
1780     *t = '\0';
1781
1782   return t != h;
1783 }
1784 \f
1785 /* Return the length of URL's path.  Path is considered to be
1786    terminated by one or more of the ?query or ;params or #fragment,
1787    depending on the scheme.  */
1788
1789 static const char *
1790 path_end (const char *url)
1791 {
1792   enum url_scheme scheme = url_scheme (url);
1793   const char *seps;
1794   if (scheme == SCHEME_INVALID)
1795     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1796   /* +2 to ignore the first two separators ':' and '/' */
1797   seps = init_seps (scheme) + 2;
1798   return strpbrk_or_eos (url, seps);
1799 }
1800
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr
   with the range length computed as E - B; B and E are each used
   twice or once respectively, so avoid arguments with side effects
   for B.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1804
/* Return a newly allocated, NUL-terminated string made of the first
   PREFIX_LEN bytes of BASE followed by the LINK_LEN bytes of LINK.  */

static char *
uri_merge_concat (const char *base, int prefix_len,
                  const char *link, int link_len)
{
  char *joined = xmalloc (prefix_len + link_len + 1);

  if (prefix_len)
    memcpy (joined, base, prefix_len);
  memcpy (joined + prefix_len, link, link_len);
  joined[prefix_len + link_len] = '\0';
  return joined;
}

/* Resolve LINK against BASE and return the merged URI in freshly
   allocated storage.

   BASE and LINK may each be absolute or relative, and may or may not
   carry a host name.  Only minimal, scheme-agnostic parsing is done;
   the goal is to behave sensibly for every foreseeable combination.

   The result is deliberately not passed through path_simplify, even
   though rfc1738 seems to suggest it: that would complicate this
   function, and url_parse has to simplify the path anyway, so the
   work would simply be done twice.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *path_limit;
  const char *keep_until;
  int link_len;

  /* A LINK that carries its own scheme replaces BASE entirely.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* The path part of BASE must not be examined past PATH_LIMIT.  */
  path_limit = path_end (base);
  link_len = strlen (link);

  /* An empty LINK refers back to BASE, query string and all.  */
  if (*link == '\0')
    return xstrdup (base);

  if (*link == '?')
    {
      /* Same location, new query string:
           uri_merge("path",         "?new") -> "path?new"
           uri_merge("path?foo",     "?new") -> "path?new"
           uri_merge("path?foo#bar", "?new") -> "path?new"
           uri_merge("path#foo",     "?new") -> "path?new"  */
      return uri_merge_concat (base, path_limit - base, link, link_len);
    }

  if (*link == '#')
    {
      /* Same location and query, new fragment:
           uri_merge("path",         "#new") -> "path#new"
           uri_merge("path#foo",     "#new") -> "path#new"
           uri_merge("path?foo",     "#new") -> "path?foo#new"
           uri_merge("path?foo#bar", "#new") -> "path?foo#new"  */
      keep_until = strchr (base, '#');
      if (keep_until == NULL)
        keep_until = base + strlen (base);
      return uri_merge_concat (base, keep_until - base, link, link_len);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK is a net path: everything in BASE from its double slash
         onwards is replaced.  When the first slash of BASE is not
         doubled (or there is none), all of BASE is replaced.
           uri_merge("foo", "//new/bar")            -> "//new/bar"
           uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
           uri_merge("http://old/foo", "//new/bar") -> "http://new/bar"  */
      const char *first_slash = memchr (base, '/', path_limit - base);

      keep_until = (first_slash && first_slash[1] == '/') ? first_slash : base;
      return uri_merge_concat (base, keep_until - base, link, link_len);
    }

  if (link[0] == '/')
    {
      /* LINK is an absolute path: it replaces everything in BASE from
         the first single slash onwards, so BASE
         "http://host/whatever/foo/bar" with LINK "/qux/xyzzy" gives
         "http://host/qux/xyzzy".  */
      bool after_double_slash = false;
      const char *slash = memchr (base, '/', path_limit - base);

      /* If the first slash is the start of "//", skip that pair once
         and look for the next slash after it.  */
      if (slash && slash[1] == '/')
        {
          const char *rest = slash + 2;
          after_double_slash = true;
          slash = memchr (rest, '/', path_limit - rest);
        }

      if (!after_double_slash)
        /* "foo" or "foo/bar": no host part, LINK replaces it all.  */
        keep_until = base;
      else if (slash)
        /* "http://something/": keep up to the slash after the host.  */
        keep_until = slash;
      else
        /* "http://foo": nothing after the host, append at the end.  */
        keep_until = path_limit;

      return uri_merge_concat (base, keep_until - base, link, link_len);
    }

  /* LINK is a relative URL: it replaces whatever follows the last
     slash of BASE (possibly nothing), so BASE "whatever/foo/bar"
     with LINK "qux/xyzzy" gives "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, path_limit, '/');
    bool slash_needed = false;
    char *merged;
    int keep;

    if (!last_slash)
      {
        /* No slash at all: LINK replaces BASE.  */
        keep_until = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The last slash is the second one of "://", as in
           "http://host".  Keep the whole path part plus one extra
           byte, which is turned into the separating slash below.  */
        keep_until = path_limit + 1;
        slash_needed = true;
      }
    else
      {
        /* "whatever/foo/bar": keep through the last slash.  */
        keep_until = last_slash + 1;
      }

    keep = keep_until - base;
    merged = uri_merge_concat (base, keep, link, link_len);
    if (slash_needed)
      merged[keep - 1] = '/';
    return merged;
  }
}
1995 \f
/* Copy the NUL-terminated string S (without its terminator) to the
   location P points at, and advance P past the copied bytes.  P must
   be a modifiable pointer lvalue.  S is evaluated exactly once, and
   the temporaries carry a trailing underscore so that they cannot
   capture an identifier used in the S or P expressions.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
2001
2002 /* Use this instead of password when the actual password is supposed
2003    to be hidden.  We intentionally use a generic string without giving
2004    away the number of characters in the password, like previous
2005    versions did.  */
2006 #define HIDDEN_PASSWORD "*password*"
2007
2008 /* Recreate the URL string from the data in URL.
2009
2010    If HIDE is true (as it is when we're calling this on a URL we plan
2011    to print, but not when calling it to canonicalize a URL for use
2012    within the program), password will be hidden.  Unsafe characters in
2013    the URL will be quoted.  */
2014
2015 char *
2016 url_string (const struct url *url, enum url_auth_mode auth_mode)
2017 {
2018   int size;
2019   char *result, *p;
2020   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
2021
2022   int scheme_port = supported_schemes[url->scheme].default_port;
2023   const char *scheme_str = supported_schemes[url->scheme].leading_string;
2024   int fplen = full_path_length (url);
2025
2026   bool brackets_around_host;
2027
2028   assert (scheme_str != NULL);
2029
2030   /* Make sure the user name and password are quoted. */
2031   if (url->user)
2032     {
2033       if (auth_mode != URL_AUTH_HIDE)
2034         {
2035           quoted_user = url_escape_allow_passthrough (url->user);
2036           if (url->passwd)
2037             {
2038               if (auth_mode == URL_AUTH_HIDE_PASSWD)
2039                 quoted_passwd = (char *) HIDDEN_PASSWORD;
2040               else
2041                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2042             }
2043         }
2044     }
2045
2046   /* In the unlikely event that the host name contains non-printable
2047      characters, quote it for displaying to the user.  */
2048   quoted_host = url_escape_allow_passthrough (url->host);
2049
2050   /* Undo the quoting of colons that URL escaping performs.  IPv6
2051      addresses may legally contain colons, and in that case must be
2052      placed in square brackets.  */
2053   if (quoted_host != url->host)
2054     unescape_single_char (quoted_host, ':');
2055   brackets_around_host = strchr (quoted_host, ':') != NULL;
2056
2057   size = (strlen (scheme_str)
2058           + strlen (quoted_host)
2059           + (brackets_around_host ? 2 : 0)
2060           + fplen
2061           + 1);
2062   if (url->port != scheme_port)
2063     size += 1 + numdigit (url->port);
2064   if (quoted_user)
2065     {
2066       size += 1 + strlen (quoted_user);
2067       if (quoted_passwd)
2068         size += 1 + strlen (quoted_passwd);
2069     }
2070
2071   p = result = xmalloc (size);
2072
2073   APPEND (p, scheme_str);
2074   if (quoted_user)
2075     {
2076       APPEND (p, quoted_user);
2077       if (quoted_passwd)
2078         {
2079           *p++ = ':';
2080           APPEND (p, quoted_passwd);
2081         }
2082       *p++ = '@';
2083     }
2084
2085   if (brackets_around_host)
2086     *p++ = '[';
2087   APPEND (p, quoted_host);
2088   if (brackets_around_host)
2089     *p++ = ']';
2090   if (url->port != scheme_port)
2091     {
2092       *p++ = ':';
2093       p = number_to_string (p, url->port);
2094     }
2095
2096   full_path_write (url, p);
2097   p += fplen;
2098   *p++ = '\0';
2099
2100   assert (p - result == size);
2101
2102   if (quoted_user && quoted_user != url->user)
2103     xfree (quoted_user);
2104   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2105       && quoted_passwd != url->passwd)
2106     xfree (quoted_passwd);
2107   if (quoted_host != url->host)
2108     xfree (quoted_host);
2109
2110   return result;
2111 }
2112 \f
2113 /* Return true if scheme a is similar to scheme b.
2114
2115    Schemes are similar if they are equal.  If SSL is supported, schemes
2116    are also similar if one is http (SCHEME_HTTP) and the other is https
2117    (SCHEME_HTTPS).  */
2118 bool
2119 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2120 {
2121   if (a == b)
2122     return true;
2123 #ifdef HAVE_SSL
2124   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2125       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2126     return true;
2127 #endif
2128   return false;
2129 }
2130 \f
/* Read one logical character from the start of the URL-escaped
   string STR, which must be non-NULL and non-empty, and store it
   in *C.

   A "%XX" sequence with two hexadecimal digits is decoded, unless
   the decoded value is a URL reserved character; in that case, and
   also when '%' is not followed by two hex digits, the '%' itself is
   the character produced.  Any other byte is produced unchanged.

   Returns the number of bytes of STR that were consumed: 3 when an
   escape was decoded, 1 in every other case.  The value is never 0;
   the former "invalid string" return could not be reached, because
   it was only tested after STR[2] had already been found to be a
   hexadecimal digit.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The && chain stops at the first failure, so STR[2] is read only
     when STR[1] is a hex digit, i.e. never past the terminator.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }

      /* Escaped reserved characters stay escaped: hand out the '%'
         now and let the two digits follow as ordinary characters.  */
      *c = '%';
      return 1;
    }

  /* Ordinary byte, or a '%' that does not start a valid escape.  */
  *c = str[0];
  return 1;
}
2168
/* Compare the URL strings U1 and U2, both of which must be non-NULL.

   The strings are walked in parallel, one logical character at a
   time as produced by getchar_from_escaped_string, and the characters
   are compared after folding with c_tolower.  Returns true only if
   both strings are exhausted without a mismatch.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left, *right;

  assert(u1 && u2);

  left = u1;
  right = u2;

  while (*left && *right)
    {
      char left_ch, right_ch;
      int left_step, right_step;

      left_step = getchar_from_escaped_string (left, &left_ch);
      if (!left_step)
        break;

      right_step = getchar_from_escaped_string (right, &right_ch);
      if (!right_step)
        break;

      if (c_tolower(left_ch) != c_tolower(right_ch))
        break;

      left += left_step;
      right += right_step;
    }

  /* Equal only when neither side has anything left over.  */
  return *left == 0 && *right == 0;
}
2191 \f
2192 #ifdef TESTING
2193 /* Debugging and testing support for path_simplify. */
2194
#if 0
/* Debug: run path_simplify on a copy of PATH and return that copy as
   a new string.  Useful for calling from the debugger.

   path_simplify is called with a scheme as its first argument, the
   way run_test calls it; SCHEME_HTTP is used here.  This helper is
   compiled out.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2206
2207 static const char *
2208 run_test (const char *test, const char *expected_result, enum url_scheme scheme,
2209           bool expected_change)
2210 {
2211   char *test_copy = xstrdup (test);
2212   bool modified = path_simplify (scheme, test_copy);
2213
2214   if (0 != strcmp (test_copy, expected_result))
2215     {
2216       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2217               test, expected_result, test_copy);
2218       mu_assert ("", 0);
2219     }
2220   if (modified != expected_change)
2221     {
2222       if (expected_change)
2223         printf ("Expected modification with path_simplify(\"%s\").\n",
2224                 test);
2225       else
2226         printf ("Expected no modification with path_simplify(\"%s\").\n",
2227                 test);
2228     }
2229   xfree (test_copy);
2230   mu_assert ("", modified == expected_change);
2231   return NULL;
2232 }
2233
2234 const char *
2235 test_path_simplify (void)
2236 {
2237   static const struct {
2238     const char *test, *result;
2239     enum url_scheme scheme;
2240     bool should_modify;
2241   } tests[] = {
2242     { "",                       "",             SCHEME_HTTP, false },
2243     { ".",                      "",             SCHEME_HTTP, true },
2244     { "./",                     "",             SCHEME_HTTP, true },
2245     { "..",                     "",             SCHEME_HTTP, true },
2246     { "../",                    "",             SCHEME_HTTP, true },
2247     { "..",                     "..",           SCHEME_FTP,  false },
2248     { "../",                    "../",          SCHEME_FTP,  false },
2249     { "foo",                    "foo",          SCHEME_HTTP, false },
2250     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2251     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2252     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2253     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2254     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2255     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2256     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2257     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2258     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2259     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2260     { "foo/..",                 "",             SCHEME_HTTP, true },
2261     { "foo/../..",              "",             SCHEME_HTTP, true },
2262     { "foo/../../..",           "",             SCHEME_HTTP, true },
2263     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2264     { "foo/../..",              "..",           SCHEME_FTP,  true },
2265     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2266     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2267     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2268     { "./a/../b",               "b",            SCHEME_HTTP, true }
2269   };
2270   unsigned i;
2271
2272   for (i = 0; i < countof (tests); i++)
2273     {
2274       const char *message;
2275       const char *test = tests[i].test;
2276       const char *expected_result = tests[i].result;
2277       enum url_scheme scheme = tests[i].scheme;
2278       bool  expected_change = tests[i].should_modify;
2279
2280       message = run_test (test, expected_result, scheme, expected_change);
2281       if (message) return message;
2282     }
2283   return NULL;
2284 }
2285
2286 const char *
2287 test_append_uri_pathel(void)
2288 {
2289   unsigned i;
2290   static const struct {
2291     const char *original_url;
2292     const char *input;
2293     bool escaped;
2294     const char *expected_result;
2295   } test_array[] = {
2296     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2297   };
2298
2299   for (i = 0; i < countof(test_array); ++i)
2300     {
2301       struct growable dest;
2302       const char *p = test_array[i].input;
2303
2304       memset (&dest, 0, sizeof (dest));
2305
2306       append_string (test_array[i].original_url, &dest);
2307       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2308
2309       mu_assert ("test_append_uri_pathel: wrong result",
2310                  strcmp (dest.base, test_array[i].expected_result) == 0);
2311     }
2312
2313   return NULL;
2314 }
2315
2316 const char *
2317 test_are_urls_equal(void)
2318 {
2319   unsigned i;
2320   static const struct {
2321     const char *url1;
2322     const char *url2;
2323     bool expected_result;
2324   } test_array[] = {
2325     { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
2326     { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2327     { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
2328     { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
2329     { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
2330     { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
2331   };
2332
2333   for (i = 0; i < countof(test_array); ++i)
2334     {
2335       mu_assert ("test_are_urls_equal: wrong result",
2336                  are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2337     }
2338
2339   return NULL;
2340 }
2341
2342 #endif /* TESTING */
2343
2344 /*
2345  * vim: et ts=2 sw=2
2346  */
2347