sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 #ifdef TESTING
  47 #include "test.h"
  48 #endif
  49
/* Flag bits stored in the `flags' member of struct scheme_data.  They
   are combined with `|' in the supported_schemes initializers below.  */
enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};

/* Static description of one URL scheme.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://". */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags. */
  int flags;
};

/* Supported schemes: */
/* The table is indexed by enum url_scheme values (url_scheme returns
   the matching array index cast to that enum) and is terminated by an
   entry whose leading_string is NULL.  The table is not const because
   scheme_disable sets scm_disabled in an entry's flags at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  /* Sentinel: a NULL leading_string stops the scan in url_scheme.  */
  { NULL,       NULL,       -1,                 0 }
};
  81
  82 /* Forward declarations: */
  83
  84 static bool path_simplify (enum url_scheme, char *);
  85 \f
  86 /* Support for escaping and unescaping of URL strings.  */
  87
  88 /* Table of "reserved" and "unsafe" characters.  Those terms are
  89    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  90    specs, but the general idea remains.
  91
  92    A reserved character is the one that you can't decode without
  93    changing the meaning of the URL.  For example, you can't decode
  94    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  95    path components is different.  Non-reserved characters can be
  96    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  97    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  98    as recommended by rfc2396, and minus "~", which is very frequently
  99    used (and sometimes unrecognized as %7E by broken servers).
 100
 101    An unsafe character is the one that should be encoded when URLs are
 102    placed in foreign environments.  E.g. space and newline are unsafe
 103    in HTTP contexts because HTTP uses them as separator and line
 104    terminator, so they must be encoded to %20 and %0A respectively.
 105    "*" is unsafe in shell context, etc.
 106
 107    We determine whether a character is unsafe through static table
 108    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 109
/* Classification bits stored in urlchr_table; a byte may carry
   neither, either, or both of them.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

/* Nonzero if the table entry for character C has any bit of MASK set.
   C is converted to unsigned char before indexing, so plain (possibly
   signed) char values are safe to pass.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
/* (They are #undef'ed again right after the table.)  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* One entry per byte value.  The first sixteen rows cover ASCII 0-127,
   eight characters per row, with the characters named in the trailing
   comment of each row.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* Bytes 128-255 (sixteen per row): every one is marked unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
 159
 160 /* URL-unescape the string S.
 161
 162    This is done by transforming the sequences "%HH" to the character
 163    represented by the hexadecimal digits HH.  If % is not followed by
 164    two hexadecimal digits, it is inserted literally.
 165
 166    The transformation is done in place.  If you need the original
 167    string intact, make a copy before calling this function.  */
 168
 169 static void
 170 url_unescape (char *s)
 171 {
 172   char *t = s;                  /* t - tortoise */
 173   char *h = s;                  /* h - hare     */
 174
 175   for (; *h; h++, t++)
 176     {
 177       if (*h != '%')
 178         {
 179         copychar:
 180           *t = *h;
 181         }
 182       else
 183         {
 184           char c;
 185           /* Do nothing if '%' is not followed by two hex digits. */
 186           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 187             goto copychar;
 188           c = X2DIGITS_TO_NUM (h[1], h[2]);
 189           /* Don't unescape %00 because there is no way to insert it
 190              into a C string without effectively truncating it. */
 191           if (c == '\0')
 192             goto copychar;
 193           *t = c;
 194           h += 2;
 195         }
 196     }
 197   *t = '\0';
 198 }
 199
 200 /* The core of url_escape_* functions.  Escapes the characters that
 201    match the provided mask in urlchr_table.
 202
 203    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 204    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 205    allocated string will be returned in all cases.  */
 206
 207 static char *
 208 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 209 {
 210   const char *p1;
 211   char *p2, *newstr;
 212   int newlen;
 213   int addition = 0;
 214
 215   for (p1 = s; *p1; p1++)
 216     if (urlchr_test (*p1, mask))
 217       addition += 2;            /* Two more characters (hex digits) */
 218
 219   if (!addition)
 220     return allow_passthrough ? (char *)s : xstrdup (s);
 221
 222   newlen = (p1 - s) + addition;
 223   newstr = xmalloc (newlen + 1);
 224
 225   p1 = s;
 226   p2 = newstr;
 227   while (*p1)
 228     {
 229       /* Quote the characters that match the test mask. */
 230       if (urlchr_test (*p1, mask))
 231         {
 232           unsigned char c = *p1++;
 233           *p2++ = '%';
 234           *p2++ = XNUM_TO_DIGIT (c >> 4);
 235           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 236         }
 237       else
 238         *p2++ = *p1++;
 239     }
 240   assert (p2 - newstr == newlen);
 241   *p2 = '\0';
 242
 243   return newstr;
 244 }
 245
 246 /* URL-escape the unsafe characters (see urlchr_table) in a given
 247    string, returning a freshly allocated string.  */
 248
 249 char *
 250 url_escape (const char *s)
 251 {
 252   return url_escape_1 (s, urlchr_unsafe, false);
 253 }
 254
 255 /* URL-escape the unsafe characters (see urlchr_table) in a given
 256    string.  If no characters are unsafe, S is returned.  */
 257
 258 static char *
 259 url_escape_allow_passthrough (const char *s)
 260 {
 261   return url_escape_1 (s, urlchr_unsafe, true);
 262 }
 263 \f
 264 /* Decide whether the char at position P needs to be encoded.  (It is
 265    not enough to pass a single char *P because the function may need
 266    to inspect the surrounding context.)
 267
 268    Return true if the char should be escaped as %XX, false otherwise.  */
 269
 270 static inline bool
 271 char_needs_escaping (const char *p)
 272 {
 273   if (*p == '%')
 274     {
 275       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 276         return false;
 277       else
 278         /* Garbled %.. sequence: encode `%'. */
 279         return true;
 280     }
 281   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 282     return true;
 283   else
 284     return false;
 285 }
 286
 287 /* Translate a %-escaped (but possibly non-conformant) input string S
 288    into a %-escaped (and conformant) output string.  If no characters
 289    are encoded or decoded, return the same string S; otherwise, return
 290    a freshly allocated string with the new contents.
 291
 292    After a URL has been run through this function, the protocols that
 293    use `%' as the quote character can use the resulting string as-is,
 294    while those that don't can use url_unescape to get to the intended
 295    data.  This function is stable: once the input is transformed,
 296    further transformations of the result yield the same output.
 297
 298    Let's discuss why this function is needed.
 299
 300    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 301    a raw space character would mess up the HTTP request, it needs to
 302    be quoted, like this:
 303
 304        GET /abc%20def HTTP/1.0
 305
 306    It would appear that the unsafe chars need to be quoted, for
 307    example with url_escape.  But what if we're requested to download
 308    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 309    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 310    part of URL syntax, "%20" is the correct way to denote a literal
 311    space on the Wget command line.  This leads to the conclusion that
 312    in that case Wget should not call url_escape, but leave the `%20'
 313    as is.  This is clearly contradictory, but it only gets worse.
 314
 315    What if the requested URI is `abc%20 def'?  If we call url_escape,
 316    we end up with `/abc%2520%20def', which is almost certainly not
 317    intended.  If we don't call url_escape, we are left with the
 318    embedded space and cannot complete the request.  What the user
 319    meant was for Wget to request `/abc%20%20def', and this is where
 320    reencode_escapes kicks in.
 321
 322    Wget used to solve this by first decoding %-quotes, and then
 323    encoding all the "unsafe" characters found in the resulting string.
 324    This was wrong because it didn't preserve certain URL special
 325    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 326    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 327    whether we considered `+' reserved (it is).  One of these results
 328    is inevitable because by the second step we would lose information
 329    on whether the `+' was originally encoded or not.  Both results
 330    were wrong because in CGI parameters + means space, while %2B means
 331    literal plus.  reencode_escapes correctly translates the above to
 332    "a%2B+b", i.e. returns the original string.
 333
 334    This function uses a modified version of the algorithm originally
 335    proposed by Anon Sricharoenchai:
 336
 337    * Encode all "unsafe" characters, except those that are also
 338      "reserved", to %XX.  See urlchr_table for which characters are
 339      unsafe and reserved.
 340
 341    * Encode the "%" characters not followed by two hex digits to
 342      "%25".
 343
 344    * Pass through all other characters and %XX escapes as-is.  (Up to
 345      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 346      characters, but that was obtrusive and broke some servers.)
 347
 348    Anon's test case:
 349
 350    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 351    ->
 352    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 353
 354    Simpler test cases:
 355
 356    "foo bar"         -> "foo%20bar"
 357    "foo%20bar"       -> "foo%20bar"
 358    "foo %20bar"      -> "foo%20%20bar"
 359    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 360    "foo%25%20bar"    -> "foo%25%20bar"
 361    "foo%2%20bar"     -> "foo%252%20bar"
 362    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 363    "foo%2b+bar"      -> "foo%2b+bar"  */
 364
 365 static char *
 366 reencode_escapes (const char *s)
 367 {
 368   const char *p1;
 369   char *newstr, *p2;
 370   int oldlen, newlen;
 371
 372   int encode_count = 0;
 373
 374   /* First pass: inspect the string to see if there's anything to do,
 375      and to calculate the new length.  */
 376   for (p1 = s; *p1; p1++)
 377     if (char_needs_escaping (p1))
 378       ++encode_count;
 379
 380   if (!encode_count)
 381     /* The string is good as it is. */
 382     return (char *) s;          /* C const model sucks. */
 383
 384   oldlen = p1 - s;
 385   /* Each encoding adds two characters (hex digits).  */
 386   newlen = oldlen + 2 * encode_count;
 387   newstr = xmalloc (newlen + 1);
 388
 389   /* Second pass: copy the string to the destination address, encoding
 390      chars when needed.  */
 391   p1 = s;
 392   p2 = newstr;
 393
 394   while (*p1)
 395     if (char_needs_escaping (p1))
 396       {
 397         unsigned char c = *p1++;
 398         *p2++ = '%';
 399         *p2++ = XNUM_TO_DIGIT (c >> 4);
 400         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 401       }
 402     else
 403       *p2++ = *p1++;
 404
 405   *p2 = '\0';
 406   assert (p2 - newstr == newlen);
 407   return newstr;
 408 }
 409 \f
 410 /* Returns the scheme type if the scheme is supported, or
 411    SCHEME_INVALID if not.  */
 412
 413 enum url_scheme
 414 url_scheme (const char *url)
 415 {
 416   int i;
 417
 418   for (i = 0; supported_schemes[i].leading_string; i++)
 419     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 420                           strlen (supported_schemes[i].leading_string)))
 421       {
 422         if (!(supported_schemes[i].flags & scm_disabled))
 423           return (enum url_scheme) i;
 424         else
 425           return SCHEME_INVALID;
 426       }
 427
 428   return SCHEME_INVALID;
 429 }
 430
 431 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 432
 433 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 434    currently implemented, it returns true if URL begins with
 435    [-+a-zA-Z0-9]+: .  */
 436
 437 bool
 438 url_has_scheme (const char *url)
 439 {
 440   const char *p = url;
 441
 442   /* The first char must be a scheme char. */
 443   if (!*p || !SCHEME_CHAR (*p))
 444     return false;
 445   ++p;
 446   /* Followed by 0 or more scheme chars. */
 447   while (*p && SCHEME_CHAR (*p))
 448     ++p;
 449   /* Terminated by ':'. */
 450   return *p == ':';
 451 }
 452
 453 int
 454 scheme_default_port (enum url_scheme scheme)
 455 {
 456   return supported_schemes[scheme].default_port;
 457 }
 458
 459 void
 460 scheme_disable (enum url_scheme scheme)
 461 {
 462   supported_schemes[scheme].flags |= scm_disabled;
 463 }
 464
 465 /* Skip the username and password, if present in the URL.  The
 466    function should *not* be called with the complete URL, but with the
 467    portion after the scheme.
 468
 469    If no username and password are found, return URL.  */
 470
 471 static const char *
 472 url_skip_credentials (const char *url)
 473 {
 474   /* Look for '@' that comes before terminators, such as '/', '?',
 475      '#', or ';'.  */
 476   const char *p = (const char *)strpbrk (url, "@/?#;");
 477   if (!p || *p != '@')
 478     return url;
 479   return p + 1;
 480 }
 481
 482 /* Parse credentials contained in [BEG, END).  The region is expected
 483    to have come from a URL and is unescaped.  */
 484
 485 static bool
 486 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 487 {
 488   char *colon;
 489   const char *userend;
 490
 491   if (beg == end)
 492     return false;               /* empty user name */
 493
 494   colon = memchr (beg, ':', end - beg);
 495   if (colon == beg)
 496     return false;               /* again empty user name */
 497
 498   if (colon)
 499     {
 500       *passwd = strdupdelim (colon + 1, end);
 501       userend = colon;
 502       url_unescape (*passwd);
 503     }
 504   else
 505     {
 506       *passwd = NULL;
 507       userend = end;
 508     }
 509   *user = strdupdelim (beg, userend);
 510   url_unescape (*user);
 511   return true;
 512 }
 513
 514 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 515    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 516    like this:
 517
 518    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 519    www.foo.com[:port]            -> http://www.foo.com[:port]
 520
 521    FTP shorthands look like this:
 522
 523    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 524    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 525
 526    If the URL needs not or cannot be rewritten, return NULL.  */
 527
 528 char *
 529 rewrite_shorthand_url (const char *url)
 530 {
 531   const char *p;
 532   char *ret;
 533
 534   if (url_scheme (url) != SCHEME_INVALID)
 535     return NULL;
 536
 537   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 538      latter Netscape.  */
 539   p = strpbrk (url, ":/");
 540   if (p == url)
 541     return NULL;
 542
 543   /* If we're looking at "://", it means the URL uses a scheme we
 544      don't support, which may include "https" when compiled without
 545      SSL support.  Don't bogusly rewrite such URLs.  */
 546   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 547     return NULL;
 548
 549   if (p && *p == ':')
 550     {
 551       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 552          special case of http port number ("localhost:10000").  */
 553       int digits = strspn (p + 1, "0123456789");
 554       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 555         goto http;
 556
 557       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 558       ret = aprintf ("ftp://%s", url);
 559       ret[6 + (p - url)] = '/';
 560     }
 561   else
 562     {
 563     http:
 564       /* Just prepend "http://" to URL. */
 565       ret = aprintf ("http://%s", url);
 566     }
 567   return ret;
 568 }
 569 \f
 570 static void split_path (const char *, char **, char **);
 571
 572 /* Like strpbrk, with the exception that it returns the pointer to the
 573    terminating zero (end-of-string aka "eos") if no matching character
 574    is found.  */
 575
 576 static inline char *
 577 strpbrk_or_eos (const char *s, const char *accept)
 578 {
 579   char *p = strpbrk (s, accept);
 580   if (!p)
 581     p = strchr (s, '\0');
 582   return p;
 583 }
 584
 585 /* Turn STR into lowercase; return true if a character was actually
 586    changed. */
 587
 588 static bool
 589 lowercase_str (char *str)
 590 {
 591   bool changed = false;
 592   for (; *str; str++)
 593     if (c_isupper (*str))
 594       {
 595         changed = true;
 596         *str = c_tolower (*str);
 597       }
 598   return changed;
 599 }
 600
 601 static const char *
 602 init_seps (enum url_scheme scheme)
 603 {
 604   static char seps[8] = ":/";
 605   char *p = seps + 2;
 606   int flags = supported_schemes[scheme].flags;
 607
 608   if (flags & scm_has_params)
 609     *p++ = ';';
 610   if (flags & scm_has_query)
 611     *p++ = '?';
 612   if (flags & scm_has_fragment)
 613     *p++ = '#';
 614   *p++ = '\0';
 615   return seps;
 616 }
 617
/* Messages for the error codes url_parse reports, indexed by code.
   Each PE_* macro is defined right above the message it selects, so
   the two stay in step; url_error looks the message up and translates
   it.  When adding a code, append its macro and message as a pair.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error, struct iri *iri)
 644 {
 645   struct url *u;
 646   const char *p;
 647   bool path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650   const char *seps;
 651
 652   const char *uname_b,     *uname_e;
 653   const char *host_b,      *host_e;
 654   const char *path_b,      *path_e;
 655   const char *params_b,    *params_e;
 656   const char *query_b,     *query_e;
 657   const char *fragment_b,  *fragment_e;
 658
 659   int port;
 660   char *user = NULL, *passwd = NULL;
 661
 662   char *url_encoded = NULL, *new_url = NULL;
 663
 664   int error_code;
 665
 666   scheme = url_scheme (url);
 667   if (scheme == SCHEME_INVALID)
 668     {
 669       error_code = PE_UNSUPPORTED_SCHEME;
 670       goto error;
 671     }
 672
 673   if (iri && iri->utf8_encode)
 674     {
 675       url_unescape ((char *) url);
 676       iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
 677       if (!iri->utf8_encode)
 678         new_url = NULL;
 679     }
 680
 681   url_encoded = reencode_escapes (new_url ? new_url : url);
 682   p = url_encoded;
 683
 684   if (new_url && url_encoded != new_url)
 685     xfree (new_url);
 686
 687   p += strlen (supported_schemes[scheme].leading_string);
 688   uname_b = p;
 689   p = url_skip_credentials (p);
 690   uname_e = p;
 691
 692   /* scheme://user:pass@host[:port]... */
 693   /*                    ^              */
 694
 695   /* We attempt to break down the URL into the components path,
 696      params, query, and fragment.  They are ordered like this:
 697
 698        scheme://host[:port][/path][;params][?query][#fragment]  */
 699
 700   path_b     = path_e     = NULL;
 701   params_b   = params_e   = NULL;
 702   query_b    = query_e    = NULL;
 703   fragment_b = fragment_e = NULL;
 704
 705   /* Initialize separators for optional parts of URL, depending on the
 706      scheme.  For example, FTP has params, and HTTP and HTTPS have
 707      query string and fragment. */
 708   seps = init_seps (scheme);
 709
 710   host_b = p;
 711
 712   if (*p == '[')
 713     {
 714       /* Handle IPv6 address inside square brackets.  Ideally we'd
 715          just look for the terminating ']', but rfc2732 mandates
 716          rejecting invalid IPv6 addresses.  */
 717
 718       /* The address begins after '['. */
 719       host_b = p + 1;
 720       host_e = strchr (host_b, ']');
 721
 722       if (!host_e)
 723         {
 724           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 725           goto error;
 726         }
 727
 728 #ifdef ENABLE_IPV6
 729       /* Check if the IPv6 address is valid. */
 730       if (!is_valid_ipv6_address(host_b, host_e))
 731         {
 732           error_code = PE_INVALID_IPV6_ADDRESS;
 733           goto error;
 734         }
 735
 736       /* Continue parsing after the closing ']'. */
 737       p = host_e + 1;
 738 #else
 739       error_code = PE_IPV6_NOT_SUPPORTED;
 740       goto error;
 741 #endif
 742
 743       /* The closing bracket must be followed by a separator or by the
 744          null char.  */
 745       /* http://[::1]... */
 746       /*             ^   */
 747       if (!strchr (seps, *p))
 748         {
 749           /* Trailing garbage after []-delimited IPv6 address. */
 750           error_code = PE_INVALID_HOST_NAME;
 751           goto error;
 752         }
 753     }
 754   else
 755     {
 756       p = strpbrk_or_eos (p, seps);
 757       host_e = p;
 758     }
 759   ++seps;                       /* advance to '/' */
 760
 761   if (host_b == host_e)
 762     {
 763       error_code = PE_INVALID_HOST_NAME;
 764       goto error;
 765     }
 766
 767   port = scheme_default_port (scheme);
 768   if (*p == ':')
 769     {
 770       const char *port_b, *port_e, *pp;
 771
 772       /* scheme://host:port/tralala */
 773       /*              ^             */
 774       ++p;
 775       port_b = p;
 776       p = strpbrk_or_eos (p, seps);
 777       port_e = p;
 778
 779       /* Allow empty port, as per rfc2396. */
 780       if (port_b != port_e)
 781         for (port = 0, pp = port_b; pp < port_e; pp++)
 782           {
 783             if (!c_isdigit (*pp))
 784               {
 785                 /* http://host:12randomgarbage/blah */
 786                 /*               ^                  */
 787                 error_code = PE_BAD_PORT_NUMBER;
 788                 goto error;
 789               }
 790             port = 10 * port + (*pp - '0');
 791             /* Check for too large port numbers here, before we have
 792                a chance to overflow on bogus port values.  */
 793             if (port > 0xffff)
 794               {
 795                 error_code = PE_BAD_PORT_NUMBER;
 796                 goto error;
 797               }
 798           }
 799     }
 800   /* Advance to the first separator *after* '/' (either ';' or '?',
 801      depending on the scheme).  */
 802   ++seps;
 803
 804   /* Get the optional parts of URL, each part being delimited by
 805      current location and the position of the next separator.  */
 806 #define GET_URL_PART(sepchar, var) do {                         \
 807   if (*p == sepchar)                                            \
 808     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 809   ++seps;                                                       \
 810 } while (0)
 811
 812   GET_URL_PART ('/', path);
 813   if (supported_schemes[scheme].flags & scm_has_params)
 814     GET_URL_PART (';', params);
 815   if (supported_schemes[scheme].flags & scm_has_query)
 816     GET_URL_PART ('?', query);
 817   if (supported_schemes[scheme].flags & scm_has_fragment)
 818     GET_URL_PART ('#', fragment);
 819
 820 #undef GET_URL_PART
 821   assert (*p == 0);
 822
 823   if (uname_b != uname_e)
 824     {
 825       /* http://user:pass@host */
 826       /*        ^         ^    */
 827       /*     uname_b   uname_e */
 828       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 829         {
 830           error_code = PE_INVALID_USER_NAME;
 831           goto error;
 832         }
 833     }
 834
 835   u = xnew0 (struct url);
 836   u->scheme = scheme;
 837   u->host   = strdupdelim (host_b, host_e);
 838   u->port   = port;
 839   u->user   = user;
 840   u->passwd = passwd;
 841
 842   u->path = strdupdelim (path_b, path_e);
 843   path_modified = path_simplify (scheme, u->path);
 844   split_path (u->path, &u->dir, &u->file);
 845
 846   host_modified = lowercase_str (u->host);
 847
 848   /* Decode %HH sequences in host name.  This is important not so much
 849      to support %HH sequences in host names (which other browser
 850      don't), but to support binary characters (which will have been
 851      converted to %HH by reencode_escapes).  */
 852   if (strchr (u->host, '%'))
 853     {
 854       url_unescape (u->host);
 855       host_modified = true;
 856
 857       /* Apply IDNA regardless of iri->utf8_encode status */
 858       if (opt.enable_iri && iri)
 859         {
 860           char *new = idn_encode (iri, u->host);
 861           if (new)
 862             {
 863               xfree (u->host);
 864               u->host = new;
 865               host_modified = true;
 866             }
 867         }
 868     }
 869
 870   if (params_b)
 871     u->params = strdupdelim (params_b, params_e);
 872   if (query_b)
 873     u->query = strdupdelim (query_b, query_e);
 874   if (fragment_b)
 875     u->fragment = strdupdelim (fragment_b, fragment_e);
 876
 877   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 878     {
 879       /* If we suspect that a transformation has rendered what
 880          url_string might return different from URL_ENCODED, rebuild
 881          u->url using url_string.  */
 882       u->url = url_string (u, URL_AUTH_SHOW);
 883
 884       if (url_encoded != url)
 885         xfree ((char *) url_encoded);
 886     }
 887   else
 888     {
 889       if (url_encoded == url)
 890         u->url = xstrdup (url);
 891       else
 892         u->url = url_encoded;
 893     }
 894
 895   return u;
 896
 897  error:
 898   /* Cleanup in case of error: */
 899   if (url_encoded && url_encoded != url)
 900     xfree (url_encoded);
 901
 902   /* Transmit the error code to the caller, if the caller wants to
 903      know.  */
 904   if (error)
 905     *error = error_code;
 906   return NULL;
 907 }
 908
 909 /* Return the error message string from ERROR_CODE, which should have
 910    been retrieved from url_parse.  The error message is translated.  */
 911
 912 const char *
 913 url_error (int error_code)
 914 {
 915   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 916   return _(parse_errors[error_code]);
 917 }
 918
 919 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 920    expected to be URL-escaped.
 921
 922    The path is split into directory (the part up to the last slash)
 923    and file (the part after the last slash), which are subsequently
 924    unescaped.  Examples:
 925
 926    PATH                 DIR           FILE
 927    "foo/bar/baz"        "foo/bar"     "baz"
 928    "foo/bar/"           "foo/bar"     ""
 929    "foo"                ""            "foo"
 930    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 931
 932    DIR and FILE are freshly allocated.  */
 933
 934 static void
 935 split_path (const char *path, char **dir, char **file)
 936 {
 937   char *last_slash = strrchr (path, '/');
 938   if (!last_slash)
 939     {
 940       *dir = xstrdup ("");
 941       *file = xstrdup (path);
 942     }
 943   else
 944     {
 945       *dir = strdupdelim (path, last_slash);
 946       *file = xstrdup (last_slash + 1);
 947     }
 948   url_unescape (*dir);
 949   url_unescape (*file);
 950 }
 951
 952 /* Note: URL's "full path" is the path with the query string and
 953    params appended.  The "fragment" (#foo) is intentionally ignored,
 954    but that might be changed.  For example, if the original URL was
 955    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 956    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 957
 958 /* Return the length of the full path, without the terminating
 959    zero.  */
 960
 961 static int
 962 full_path_length (const struct url *url)
 963 {
 964   int len = 0;
 965
 966 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 967
 968   FROB (path);
 969   FROB (params);
 970   FROB (query);
 971
 972 #undef FROB
 973
 974   return len;
 975 }
 976
 977 /* Write out the full path. */
 978
 979 static void
 980 full_path_write (const struct url *url, char *where)
 981 {
 982 #define FROB(el, chr) do {                      \
 983   char *f_el = url->el;                         \
 984   if (f_el) {                                   \
 985     int l = strlen (f_el);                      \
 986     *where++ = chr;                             \
 987     memcpy (where, f_el, l);                    \
 988     where += l;                                 \
 989   }                                             \
 990 } while (0)
 991
 992   FROB (path, '/');
 993   FROB (params, ';');
 994   FROB (query, '?');
 995
 996 #undef FROB
 997 }
 998
 999 /* Public function for getting the "full path".  E.g. if u->path is
1000    "foo/bar" and u->query is "param=value", full_path will be
1001    "/foo/bar?param=value". */
1002
1003 char *
1004 url_full_path (const struct url *url)
1005 {
1006   int length = full_path_length (url);
1007   char *full_path = xmalloc (length + 1);
1008
1009   full_path_write (url, full_path);
1010   full_path[length] = '\0';
1011
1012   return full_path;
1013 }
1014
/* Undo the %XX escaping of one particular character, CHR, inside the
   otherwise escaped string STR.  Used to selectively keep characters
   such as "/" and ":" unescaped.

   STR is rewritten in place: every occurrence of '%' followed by the
   two characters XNUM_TO_DIGIT yields for the high and low nibble of
   CHR collapses to CHR itself; all other characters are kept as they
   are.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  const char *src = str;        /* read position */
  char *dst = str;              /* write position, never ahead of SRC */

  while (*src)
    {
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
        {
          *dst++ = chr;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
1038
1039 /* Escape unsafe and reserved characters, except for the slash
1040    characters.  */
1041
1042 static char *
1043 url_escape_dir (const char *dir)
1044 {
1045   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1046   if (newdir == dir)
1047     return (char *)dir;
1048
1049   unescape_single_char (newdir, '/');
1050   return newdir;
1051 }
1052
1053 /* Sync u->path and u->url with u->dir and u->file.  Called after
1054    u->file or u->dir have been changed, typically by the FTP code.  */
1055
1056 static void
1057 sync_path (struct url *u)
1058 {
1059   char *newpath, *efile, *edir;
1060
1061   xfree (u->path);
1062
1063   /* u->dir and u->file are not escaped.  URL-escape them before
1064      reassembling them into u->path.  That way, if they contain
1065      separators like '?' or even if u->file contains slashes, the
1066      path will be correctly assembled.  (u->file can contain slashes
1067      if the URL specifies it with %2f, or if an FTP server returns
1068      it.)  */
1069   edir = url_escape_dir (u->dir);
1070   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1071
1072   if (!*edir)
1073     newpath = xstrdup (efile);
1074   else
1075     {
1076       int dirlen = strlen (edir);
1077       int filelen = strlen (efile);
1078
1079       /* Copy "DIR/FILE" to newpath. */
1080       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1081       memcpy (p, edir, dirlen);
1082       p += dirlen;
1083       *p++ = '/';
1084       memcpy (p, efile, filelen);
1085       p += filelen;
1086       *p = '\0';
1087     }
1088
1089   u->path = newpath;
1090
1091   if (edir != u->dir)
1092     xfree (edir);
1093   if (efile != u->file)
1094     xfree (efile);
1095
1096   /* Regenerate u->url as well.  */
1097   xfree (u->url);
1098   u->url = url_string (u, URL_AUTH_SHOW);
1099 }
1100
1101 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1102    This way we can sync u->path and u->url when they get changed.  */
1103
1104 void
1105 url_set_dir (struct url *url, const char *newdir)
1106 {
1107   xfree (url->dir);
1108   url->dir = xstrdup (newdir);
1109   sync_path (url);
1110 }
1111
1112 void
1113 url_set_file (struct url *url, const char *newfile)
1114 {
1115   xfree (url->file);
1116   url->file = xstrdup (newfile);
1117   sync_path (url);
1118 }
1119
1120 void
1121 url_free (struct url *url)
1122 {
1123   xfree (url->host);
1124   xfree (url->path);
1125   xfree (url->url);
1126
1127   xfree_null (url->params);
1128   xfree_null (url->query);
1129   xfree_null (url->fragment);
1130   xfree_null (url->user);
1131   xfree_null (url->passwd);
1132
1133   xfree (url->dir);
1134   xfree (url->file);
1135
1136   xfree (url);
1137 }
1138 \f
1139 /* Create all the necessary directories for PATH (a file).  Calls
1140    make_directory internally.  */
1141 int
1142 mkalldirs (const char *path)
1143 {
1144   const char *p;
1145   char *t;
1146   struct_stat st;
1147   int res;
1148
1149   p = path + strlen (path);
1150   for (; *p != '/' && p != path; p--)
1151     ;
1152
1153   /* Don't create if it's just a file.  */
1154   if ((p == path) && (*p != '/'))
1155     return 0;
1156   t = strdupdelim (path, p);
1157
1158   /* Check whether the directory exists.  */
1159   if ((stat (t, &st) == 0))
1160     {
1161       if (S_ISDIR (st.st_mode))
1162         {
1163           xfree (t);
1164           return 0;
1165         }
1166       else
1167         {
1168           /* If the dir exists as a file name, remove it first.  This
1169              is *only* for Wget to work with buggy old CERN http
1170              servers.  Here is the scenario: When Wget tries to
1171              retrieve a directory without a slash, e.g.
1172              http://foo/bar (bar being a directory), CERN server will
1173              not redirect it too http://foo/bar/ -- it will generate a
1174              directory listing containing links to bar/file1,
1175              bar/file2, etc.  Wget will lose because it saves this
1176              HTML listing to a file `bar', so it cannot create the
1177              directory.  To work around this, if the file of the same
1178              name exists, we just remove it and create the directory
1179              anyway.  */
1180           DEBUGP (("Removing %s because of directory danger!\n", t));
1181           unlink (t);
1182         }
1183     }
1184   res = make_directory (t);
1185   if (res != 0)
1186     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1187   xfree (t);
1188   return res;
1189 }
1190 \f
1191 /* Functions for constructing the file name out of URL components.  */
1192
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* size tracked by DO_REALLOC */
  int tail;                     /* offset of the next character to write */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position, by handing base, size and
   tail + APPEND_SIZE to DO_REALLOC.  G is evaluated exactly once.
   Both arguments are parenthesized so that expressions such as
   "a ? b : c" can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1221
1222 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1223    terminated.  */
1224
1225 static void
1226 append_string (const char *str, struct growable *dest)
1227 {
1228   int l = strlen (str);
1229   GROW (dest, l);
1230   memcpy (TAIL (dest), str, l);
1231   TAIL_INCR (dest, l);
1232 }
1233
1234 /* Append CH to DEST.  For example, append_char (0, DEST)
1235    zero-terminates DEST.  */
1236
1237 static void
1238 append_char (char ch, struct growable *dest)
1239 {
1240   GROW (dest, 1);
1241   *TAIL (dest) = ch;
1242   TAIL_INCR (dest, 1);
1243 }
1244
/* Bit flags describing why a character is unsuitable in a file
   name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the flags in MASK.  C is
   converted to unsigned char before indexing the table.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
  /* 0-31: control characters; NUL is additionally unusable everywhere. */
  UWC, C,   C,   C,    C,   C,   C,   C,    /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,   C,   C,   C,    C,   C,   C,   C,    /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,   C,   C,   C,    C,   C,   C,   C,    /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,   C,   C,   C,    C,   C,   C,   C,    /* CAN EM  SUB ESC  FS  GS  RS  US  */

  /* 32-127: printable ASCII plus DEL. */
  0,   0,   W,   0,    0,   0,   0,   0,    /* SP  !   "   #    $   %   &   '   */
  0,   0,   W,   0,    0,   0,   0,   UW,   /* (   )   *   +    ,   -   .   /   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* 0   1   2   3    4   5   6   7   */
  0,   0,   W,   0,    W,   0,   W,   W,    /* 8   9   :   ;    <   =   >   ?   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* @   A   B   C    D   E   F   G   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* H   I   J   K    L   M   N   O   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* P   Q   R   S    T   U   V   W   */
  0,   0,   0,   0,    W,   0,   0,   0,    /* X   Y   Z   [    \   ]   ^   _   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* `   a   b   c    d   e   f   g   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* h   i   j   k    l   m   n   o   */
  0,   0,   0,   0,    0,   0,   0,   0,    /* p   q   r   s    t   u   v   w   */
  0,   0,   0,   0,    W,   0,   0,   C,    /* x   y   z   {    |   }   ~   DEL */

  /* 128-159: flagged as control characters. */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */

  /* 160-255: no restrictions. */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1302
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' unless opt.restrict_files_os is restrict_windows, in
   which case '+' is used because Windows can't handle ':' in file
   names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless opt.restrict_files_os is restrict_windows, in which case
   '@' is used because Windows cannot handle '?' as part of a file
   name.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1313
1314 /* Quote path element, characters in [b, e), as file name, and append
1315    the quoted string to DEST.  Each character is quoted as per
1316    file_unsafe_char and the corresponding table.
1317
1318    If ESCAPED is true, the path element is considered to be
1319    URL-escaped and will be unescaped prior to inspection.  */
1320
1321 static void
1322 append_uri_pathel (const char *b, const char *e, bool escaped,
1323                    struct growable *dest)
1324 {
1325   const char *p;
1326   int quoted, outlen;
1327
1328   int mask;
1329   if (opt.restrict_files_os == restrict_unix)
1330     mask = filechr_not_unix;
1331   else
1332     mask = filechr_not_windows;
1333   if (opt.restrict_files_ctrl)
1334     mask |= filechr_control;
1335
1336   /* Copy [b, e) to PATHEL and URL-unescape it. */
1337   if (escaped)
1338     {
1339       char *unescaped;
1340       BOUNDED_TO_ALLOCA (b, e, unescaped);
1341       url_unescape (unescaped);
1342       b = unescaped;
1343       e = unescaped + strlen (unescaped);
1344     }
1345
1346   /* Defang ".." when found as component of path.  Remember that path
1347      comes from the URL and might contain malicious input.  */
1348   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1349     {
1350       b = "%2E%2E";
1351       e = b + 6;
1352     }
1353
1354   /* Walk the PATHEL string and check how many characters we'll need
1355      to quote.  */
1356   quoted = 0;
1357   for (p = b; p < e; p++)
1358     if (FILE_CHAR_TEST (*p, mask))
1359       ++quoted;
1360
1361   /* Calculate the length of the output string.  e-b is the input
1362      string length.  Each quoted char introduces two additional
1363      characters in the string, hence 2*quoted.  */
1364   outlen = (e - b) + (2 * quoted);
1365   GROW (dest, outlen);
1366
1367   if (!quoted)
1368     {
1369       /* If there's nothing to quote, we can simply append the string
1370          without processing it again.  */
1371       memcpy (TAIL (dest), b, outlen);
1372     }
1373   else
1374     {
1375       char *q = TAIL (dest);
1376       for (p = b; p < e; p++)
1377         {
1378           if (!FILE_CHAR_TEST (*p, mask))
1379             *q++ = *p;
1380           else
1381             {
1382               unsigned char ch = *p;
1383               *q++ = '%';
1384               *q++ = XNUM_TO_DIGIT (ch >> 4);
1385               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1386             }
1387         }
1388       assert (q - TAIL (dest) == outlen);
1389     }
1390
1391   /* Perform inline case transformation if required.  */
1392   if (opt.restrict_files_case == restrict_lowercase
1393       || opt.restrict_files_case == restrict_uppercase)
1394     {
1395       char *q;
1396       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1397         {
1398           if (opt.restrict_files_case == restrict_lowercase)
1399             *q = c_tolower (*q);
1400           else
1401             *q = c_toupper (*q);
1402         }
1403     }
1404
1405   TAIL_INCR (dest, outlen);
1406 }
1407
1408 /* Append to DEST the directory structure that corresponds the
1409    directory part of URL's path.  For example, if the URL is
1410    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1411
1412    Each path element ("dir1" and "dir2" in the above example) is
1413    examined, url-unescaped, and re-escaped as file name element.
1414
1415    Additionally, it cuts as many directories from the path as
1416    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1417    will produce "bar" for the above example.  For 2 or more, it will
1418    produce "".
1419
1420    Each component of the path is quoted for use as file name.  */
1421
1422 static void
1423 append_dir_structure (const struct url *u, struct growable *dest)
1424 {
1425   char *pathel, *next;
1426   int cut = opt.cut_dirs;
1427
1428   /* Go through the path components, de-URL-quote them, and quote them
1429      (if necessary) as file names.  */
1430
1431   pathel = u->path;
1432   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1433     {
1434       if (cut-- > 0)
1435         continue;
1436       if (pathel == next)
1437         /* Ignore empty pathels.  */
1438         continue;
1439
1440       if (dest->tail)
1441         append_char ('/', dest);
1442       append_uri_pathel (pathel, next, true, dest);
1443     }
1444 }
1445
1446 /* Return a unique file name that matches the given URL as good as
1447    possible.  Does not create directories on the file system.  */
1448
1449 char *
1450 url_file_name (const struct url *u)
1451 {
1452   struct growable fnres;        /* stands for "file name result" */
1453
1454   const char *u_file, *u_query;
1455   char *fname, *unique;
1456
1457   fnres.base = NULL;
1458   fnres.size = 0;
1459   fnres.tail = 0;
1460
1461   /* Start with the directory prefix, if specified. */
1462   if (opt.dir_prefix)
1463     append_string (opt.dir_prefix, &fnres);
1464
1465   /* If "dirstruct" is turned on (typically the case with -r), add
1466      the host and port (unless those have been turned off) and
1467      directory structure.  */
1468   if (opt.dirstruct)
1469     {
1470       if (opt.protocol_directories)
1471         {
1472           if (fnres.tail)
1473             append_char ('/', &fnres);
1474           append_string (supported_schemes[u->scheme].name, &fnres);
1475         }
1476       if (opt.add_hostdir)
1477         {
1478           if (fnres.tail)
1479             append_char ('/', &fnres);
1480           if (0 != strcmp (u->host, ".."))
1481             append_string (u->host, &fnres);
1482           else
1483             /* Host name can come from the network; malicious DNS may
1484                allow ".." to be resolved, causing us to write to
1485                "../<file>".  Defang such host names.  */
1486             append_string ("%2E%2E", &fnres);
1487           if (u->port != scheme_default_port (u->scheme))
1488             {
1489               char portstr[24];
1490               number_to_string (portstr, u->port);
1491               append_char (FN_PORT_SEP, &fnres);
1492               append_string (portstr, &fnres);
1493             }
1494         }
1495
1496       append_dir_structure (u, &fnres);
1497     }
1498
1499   /* Add the file name. */
1500   if (fnres.tail)
1501     append_char ('/', &fnres);
1502   u_file = *u->file ? u->file : "index.html";
1503   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1504
1505   /* Append "?query" to the file name. */
1506   u_query = u->query && *u->query ? u->query : NULL;
1507   if (u_query)
1508     {
1509       append_char (FN_QUERY_SEP, &fnres);
1510       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1511     }
1512
1513   /* Zero-terminate the file name. */
1514   append_char ('\0', &fnres);
1515
1516   fname = fnres.base;
1517
1518   /* Check the cases in which the unique extensions are not used:
1519      1) Clobbering is turned off (-nc).
1520      2) Retrieval with regetting.
1521      3) Timestamping is used.
1522      4) Hierarchy is built.
1523
1524      The exception is the case when file does exist and is a
1525      directory (see `mkalldirs' for explanation).  */
1526
1527   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1528       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1529     return fname;
1530
1531   unique = unique_name (fname, true);
1532   if (unique != fname)
1533     xfree (fname);
1534   return unique;
1535 }
1536 \f
1537 /* Resolve "." and ".." elements of PATH by destructively modifying
1538    PATH and return true if PATH has been modified, false otherwise.
1539
1540    The algorithm is in spirit similar to the one described in rfc1808,
1541    although implemented differently, in one pass.  To recap, path
1542    elements containing only "." are removed, and ".." is taken to mean
1543    "back up one element".  Single leading and trailing slashes are
1544    preserved.
1545
1546    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1547    test examples are provided below.  If you change anything in this
1548    function, run test_path_simplify to make sure you haven't broken a
1549    test case.  */
1550
1551 static bool
1552 path_simplify (enum url_scheme scheme, char *path)
1553 {
1554   char *h = path;               /* hare */
1555   char *t = path;               /* tortoise */
1556   char *beg = path;
1557   char *end = strchr (path, '\0');
1558
1559   while (h < end)
1560     {
1561       /* Hare should be at the beginning of a path element. */
1562
1563       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1564         {
1565           /* Ignore "./". */
1566           h += 2;
1567         }
1568       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1569         {
1570           /* Handle "../" by retreating the tortoise by one path
1571              element -- but not past beggining.  */
1572           if (t > beg)
1573             {
1574               /* Move backwards until T hits the beginning of the
1575                  previous path element or the beginning of path. */
1576               for (--t; t > beg && t[-1] != '/'; t--)
1577                 ;
1578             }
1579           else if (scheme == SCHEME_FTP)
1580             {
1581               /* If we're at the beginning, copy the "../" literally
1582                  and move the beginning so a later ".." doesn't remove
1583                  it.  This violates RFC 3986; but we do it for FTP
1584                  anyway because there is otherwise no way to get at a
1585                  parent directory, when the FTP server drops us in a
1586                  non-root directory (which is not uncommon). */
1587               beg = t + 3;
1588               goto regular;
1589             }
1590           h += 3;
1591         }
1592       else
1593         {
1594         regular:
1595           /* A regular path element.  If H hasn't advanced past T,
1596              simply skip to the next path element.  Otherwise, copy
1597              the path element until the next slash.  */
1598           if (t == h)
1599             {
1600               /* Skip the path element, including the slash.  */
1601               while (h < end && *h != '/')
1602                 t++, h++;
1603               if (h < end)
1604                 t++, h++;
1605             }
1606           else
1607             {
1608               /* Copy the path element, including the final slash.  */
1609               while (h < end && *h != '/')
1610                 *t++ = *h++;
1611               if (h < end)
1612                 *t++ = *h++;
1613             }
1614         }
1615     }
1616
1617   if (t != h)
1618     *t = '\0';
1619
1620   return t != h;
1621 }
1622 \f
1623 /* Return the length of URL's path.  Path is considered to be
1624    terminated by one or more of the ?query or ;params or #fragment,
1625    depending on the scheme.  */
1626
1627 static const char *
1628 path_end (const char *url)
1629 {
1630   enum url_scheme scheme = url_scheme (url);
1631   const char *seps;
1632   if (scheme == SCHEME_INVALID)
1633     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1634   /* +2 to ignore the first two separators ':' and '/' */
1635   seps = init_seps (scheme) + 2;
1636   return strpbrk_or_eos (url, seps);
1637 }
1638
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Return a freshly allocated, zero-terminated string made of the
   first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  */

static char *
merge_prefix_with_link (const char *base, int span,
                        const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with its own scheme stands on its own. */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return merge_prefix_with_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return merge_prefix_with_link (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash in BASE starts a double slash, replace
         from there; otherwise replace from the beginning. */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      bool seen_slash_slash = false;
      const char *start_insert;
      const char *slash = memchr (base, '/', end - base);

      /* We're looking for the first slash, but want to ignore one
         double slash. */
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first / after "//", or the first slash
         altogether.  Keep in mind that LINK begins with '/'. */
      if (!seen_slash_slash)
        /* examples: "foo" and "foo/bar" -- replace everything. */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- insert at that slash. */
        start_insert = slash;
      else
        /* example: "http://foo" -- insert at the end. */
        start_insert = end;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything after the
     last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", our
     result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    char *merge;
    int span;

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      return merge_prefix_with_link (base, 0, link, linklength);

    if (last_slash >= base + 2
        && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the last slash belongs to "://",
           so keep all of BASE up to END and supply the slash that
           separates the host from LINK.  One extra byte is copied
           from BASE and then overwritten with that slash. */
        span = end + 1 - base;
        merge = merge_prefix_with_link (base, span, link, linklength);
        merge[span - 1] = '/';
        return merge;
      }

    /* example: "whatever/foo/bar" -- keep through the last slash. */
    span = last_slash + 1 - base;
    return merge_prefix_with_link (base, span, link, linklength);
  }
}
1833 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating NUL is written;
   the caller must have reserved enough room and must terminate the
   result itself.

   S is evaluated exactly once.  P is evaluated more than once and
   must be a modifiable pointer lvalue.  The macro-local names carry
   a trailing underscore so that they cannot capture an argument
   that happens to be called `len'.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Placeholder printed instead of the real password when the password
   is supposed to be hidden.  Unlike previous versions, we
   intentionally use a fixed, generic string, so that the number of
   characters in the password is not given away.  */
#define HIDDEN_PASSWORD "*password*"
1845
1846 /* Recreate the URL string from the data in URL.
1847
1848    If HIDE is true (as it is when we're calling this on a URL we plan
1849    to print, but not when calling it to canonicalize a URL for use
1850    within the program), password will be hidden.  Unsafe characters in
1851    the URL will be quoted.  */
1852
1853 char *
1854 url_string (const struct url *url, enum url_auth_mode auth_mode)
1855 {
1856   int size;
1857   char *result, *p;
1858   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1859
1860   int scheme_port = supported_schemes[url->scheme].default_port;
1861   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1862   int fplen = full_path_length (url);
1863
1864   bool brackets_around_host;
1865
1866   assert (scheme_str != NULL);
1867
1868   /* Make sure the user name and password are quoted. */
1869   if (url->user)
1870     {
1871       if (auth_mode != URL_AUTH_HIDE)
1872         {
1873           quoted_user = url_escape_allow_passthrough (url->user);
1874           if (url->passwd)
1875             {
1876               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1877                 quoted_passwd = HIDDEN_PASSWORD;
1878               else
1879                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1880             }
1881         }
1882     }
1883
1884   /* In the unlikely event that the host name contains non-printable
1885      characters, quote it for displaying to the user.  */
1886   quoted_host = url_escape_allow_passthrough (url->host);
1887
1888   /* Undo the quoting of colons that URL escaping performs.  IPv6
1889      addresses may legally contain colons, and in that case must be
1890      placed in square brackets.  */
1891   if (quoted_host != url->host)
1892     unescape_single_char (quoted_host, ':');
1893   brackets_around_host = strchr (quoted_host, ':') != NULL;
1894
1895   size = (strlen (scheme_str)
1896           + strlen (quoted_host)
1897           + (brackets_around_host ? 2 : 0)
1898           + fplen
1899           + 1);
1900   if (url->port != scheme_port)
1901     size += 1 + numdigit (url->port);
1902   if (quoted_user)
1903     {
1904       size += 1 + strlen (quoted_user);
1905       if (quoted_passwd)
1906         size += 1 + strlen (quoted_passwd);
1907     }
1908
1909   p = result = xmalloc (size);
1910
1911   APPEND (p, scheme_str);
1912   if (quoted_user)
1913     {
1914       APPEND (p, quoted_user);
1915       if (quoted_passwd)
1916         {
1917           *p++ = ':';
1918           APPEND (p, quoted_passwd);
1919         }
1920       *p++ = '@';
1921     }
1922
1923   if (brackets_around_host)
1924     *p++ = '[';
1925   APPEND (p, quoted_host);
1926   if (brackets_around_host)
1927     *p++ = ']';
1928   if (url->port != scheme_port)
1929     {
1930       *p++ = ':';
1931       p = number_to_string (p, url->port);
1932     }
1933
1934   full_path_write (url, p);
1935   p += fplen;
1936   *p++ = '\0';
1937
1938   assert (p - result == size);
1939
1940   if (quoted_user && quoted_user != url->user)
1941     xfree (quoted_user);
1942   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1943       && quoted_passwd != url->passwd)
1944     xfree (quoted_passwd);
1945   if (quoted_host != url->host)
1946     xfree (quoted_host);
1947
1948   return result;
1949 }
1950 \f
1951 /* Return true if scheme a is similar to scheme b.
1952
1953    Schemes are similar if they are equal.  If SSL is supported, schemes
1954    are also similar if one is http (SCHEME_HTTP) and the other is https
1955    (SCHEME_HTTPS).  */
1956 bool
1957 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1958 {
1959   if (a == b)
1960     return true;
1961 #ifdef HAVE_SSL
1962   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1963       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1964     return true;
1965 #endif
1966   return false;
1967 }
1968 \f
/* Read one logical character from the start of the non-empty string
   STR, store it in *C, and return the number of bytes of STR it
   occupied.

   A "%XX" sequence made of two hexadecimal digits whose value is not
   a reserved URL character is decoded to that value and occupies 3
   bytes.  Everything else occupies 1 byte: a '%' that is not followed
   by two hex digits, or that encodes a reserved character, is
   reported as a literal '%' (its digits are then seen as ordinary
   characters by the following calls), and any other byte is reported
   unchanged.  The return value is therefore always 1 or 3.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* NUL is not a hex digit, so the short-circuit below never looks
     at str[2] when the string ends right after the '%', and a
     truncated escape such as "%4" falls into this branch as well.
     No separate end-of-string test is needed afterwards.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  *c = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (*c))
    {
      /* Escaped reserved characters are not equivalent to their
         literal form; keep the escape and report just the '%'.  */
      *c = '%';
      return 1;
    }

  return 3;
}
2006
/* Return true if the URL strings U1 and U2 are equal.

   Both strings are walked in parallel, one logical character at a
   time as delivered by getchar_from_escaped_string, so a character
   and its percent-escaped form can match each other.  Characters are
   compared without regard to case (c_tolower).  The URLs are equal
   when both strings are exhausted at the same time.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert(u1 && u2);

  for (;;)
    {
      char left_ch, right_ch;
      int left_step, right_step;

      if (!*left || !*right)
        break;

      left_step = getchar_from_escaped_string (left, &left_ch);
      if (!left_step)
        break;

      right_step = getchar_from_escaped_string (right, &right_ch);
      if (!right_step)
        break;

      if (c_tolower(left_ch) != c_tolower(right_ch))
        break;

      left += left_step;
      right += right_step;
    }

  return *left == 0 && *right == 0;
}
2029 \f
2030 #ifdef TESTING
2031 /* Debugging and testing support for path_simplify. */
2032
#if 0
/* Debug: run path_simplify on a copy of PATH and return that copy,
   which is allocated with xstrdup.  Useful for calling from the
   debugger.

   path_simplify takes the URL scheme as its first argument (see
   run_test below).  SCHEME_HTTP is passed here; substitute another
   scheme when debugging scheme-specific simplification.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2044
2045 static const char *
2046 run_test (char *test, char *expected_result, enum url_scheme scheme,
2047           bool expected_change)
2048 {
2049   char *test_copy = xstrdup (test);
2050   bool modified = path_simplify (scheme, test_copy);
2051
2052   if (0 != strcmp (test_copy, expected_result))
2053     {
2054       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2055               test, expected_result, test_copy);
2056       mu_assert ("", 0);
2057     }
2058   if (modified != expected_change)
2059     {
2060       if (expected_change)
2061         printf ("Expected modification with path_simplify(\"%s\").\n",
2062                 test);
2063       else
2064         printf ("Expected no modification with path_simplify(\"%s\").\n",
2065                 test);
2066     }
2067   xfree (test_copy);
2068   mu_assert ("", modified == expected_change);
2069   return NULL;
2070 }
2071
2072 const char *
2073 test_path_simplify (void)
2074 {
2075   static struct {
2076     char *test, *result;
2077     enum url_scheme scheme;
2078     bool should_modify;
2079   } tests[] = {
2080     { "",                       "",             SCHEME_HTTP, false },
2081     { ".",                      "",             SCHEME_HTTP, true },
2082     { "./",                     "",             SCHEME_HTTP, true },
2083     { "..",                     "",             SCHEME_HTTP, true },
2084     { "../",                    "",             SCHEME_HTTP, true },
2085     { "..",                     "..",           SCHEME_FTP,  false },
2086     { "../",                    "../",          SCHEME_FTP,  false },
2087     { "foo",                    "foo",          SCHEME_HTTP, false },
2088     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2089     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2090     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2091     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2092     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2093     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2094     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2095     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2096     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2097     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2098     { "foo/..",                 "",             SCHEME_HTTP, true },
2099     { "foo/../..",              "",             SCHEME_HTTP, true },
2100     { "foo/../../..",           "",             SCHEME_HTTP, true },
2101     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2102     { "foo/../..",              "..",           SCHEME_FTP,  true },
2103     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2104     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2105     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2106     { "./a/../b",               "b",            SCHEME_HTTP, true }
2107   };
2108   int i;
2109
2110   for (i = 0; i < countof (tests); i++)
2111     {
2112       const char *message;
2113       char *test = tests[i].test;
2114       char *expected_result = tests[i].result;
2115       enum url_scheme scheme = tests[i].scheme;
2116       bool  expected_change = tests[i].should_modify;
2117       message = run_test (test, expected_result, scheme, expected_change);
2118       if (message) return message;
2119     }
2120   return NULL;
2121 }
2122
2123 const char *
2124 test_append_uri_pathel()
2125 {
2126   int i;
2127   struct {
2128     char *original_url;
2129     char *input;
2130     bool escaped;
2131     char *expected_result;
2132   } test_array[] = {
2133     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2134   };
2135
2136   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2137     {
2138       struct growable dest;
2139       const char *p = test_array[i].input;
2140
2141       memset (&dest, 0, sizeof (dest));
2142
2143       append_string (test_array[i].original_url, &dest);
2144       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2145       append_char ('\0', &dest);
2146
2147       mu_assert ("test_append_uri_pathel: wrong result",
2148                  strcmp (dest.base, test_array[i].expected_result) == 0);
2149     }
2150
2151   return NULL;
2152 }
2153
/* Test are_urls_equal against a table of URL pairs and the verdict
   expected for each pair.  Returns NULL when every pair yields the
   expected verdict, or the message produced by mu_assert for the
   first pair that does not.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  struct {
    const char *url1;
    const char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
    }

  return NULL;
}
2179
2180 #endif /* TESTING */
2181
2182 /*
2183  * vim: et ts=2 sw=2
2184  */
2185