sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 #ifdef TESTING
  47 #include "test.h"
  48 #endif
  49
  50 enum {
  51   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  52   scm_has_params = 2,           /* whether scheme has ;params */
  53   scm_has_query = 4,            /* whether scheme has ?query */
  54   scm_has_fragment = 8          /* whether scheme has #fragment */
  55 };
  56
  57 struct scheme_data
  58 {
  59   /* Short name of the scheme, such as "http" or "ftp". */
  60   const char *name;
  61   /* Leading string that identifies the scheme, such as "https://". */
  62   const char *leading_string;
  63   /* Default port of the scheme when none is specified. */
  64   int default_port;
  65   /* Various flags. */
  66   int flags;
  67 };
  68
  69 /* Supported schemes: */
  70 static struct scheme_data supported_schemes[] =
  71 {
  72   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  73 #ifdef HAVE_SSL
  74   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  75 #endif
  76   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  77
  78   /* SCHEME_INVALID */
  79   { NULL,       NULL,       -1,                 0 }
  80 };
  81
  82 /* Forward declarations: */
  83
  84 static bool path_simplify (enum url_scheme, char *);
  85 \f
  86 /* Support for escaping and unescaping of URL strings.  */
  87
  88 /* Table of "reserved" and "unsafe" characters.  Those terms are
  89    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  90    specs, but the general idea remains.
  91
  92    A reserved character is the one that you can't decode without
  93    changing the meaning of the URL.  For example, you can't decode
  94    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  95    path components is different.  Non-reserved characters can be
  96    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  97    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  98    as recommended by rfc2396, and minus "~", which is very frequently
  99    used (and sometimes unrecognized as %7E by broken servers).
 100
 101    An unsafe character is the one that should be encoded when URLs are
 102    placed in foreign environments.  E.g. space and newline are unsafe
 103    in HTTP contexts because HTTP uses them as separator and line
 104    terminator, so they must be encoded to %20 and %0A respectively.
 105    "*" is unsafe in shell context, etc.
 106
 107    We determine whether a character is unsafe through static table
 108    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 109
 110 enum {
 111   /* rfc1738 reserved chars + "$" and ",".  */
 112   urlchr_reserved = 1,
 113
 114   /* rfc1738 unsafe chars, plus non-printables.  */
 115   urlchr_unsafe   = 2
 116 };
 117
 118 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 119 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 120 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 121
 122 /* Shorthands for the table: */
 123 #define R  urlchr_reserved
 124 #define U  urlchr_unsafe
 125 #define RU R|U
 126
 127 static const unsigned char urlchr_table[256] =
 128 {
 129   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 130   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 131   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 133   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 134   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 136   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 137  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 140   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 141   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 142   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 143   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 144   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 145
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150
 151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 153   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155 };
 156 #undef R
 157 #undef U
 158 #undef RU
 159
 160 /* URL-unescape the string S.
 161
 162    This is done by transforming the sequences "%HH" to the character
 163    represented by the hexadecimal digits HH.  If % is not followed by
 164    two hexadecimal digits, it is inserted literally.
 165
 166    The transformation is done in place.  If you need the original
 167    string intact, make a copy before calling this function.  */
 168
 169 static void
 170 url_unescape (char *s)
 171 {
 172   char *t = s;                  /* t - tortoise */
 173   char *h = s;                  /* h - hare     */
 174
 175   for (; *h; h++, t++)
 176     {
 177       if (*h != '%')
 178         {
 179         copychar:
 180           *t = *h;
 181         }
 182       else
 183         {
 184           char c;
 185           /* Do nothing if '%' is not followed by two hex digits. */
 186           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 187             goto copychar;
 188           c = X2DIGITS_TO_NUM (h[1], h[2]);
 189           /* Don't unescape %00 because there is no way to insert it
 190              into a C string without effectively truncating it. */
 191           if (c == '\0')
 192             goto copychar;
 193           *t = c;
 194           h += 2;
 195         }
 196     }
 197   *t = '\0';
 198 }
 199
 200 /* The core of url_escape_* functions.  Escapes the characters that
 201    match the provided mask in urlchr_table.
 202
 203    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 204    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 205    allocated string will be returned in all cases.  */
 206
 207 static char *
 208 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 209 {
 210   const char *p1;
 211   char *p2, *newstr;
 212   int newlen;
 213   int addition = 0;
 214
 215   for (p1 = s; *p1; p1++)
 216     if (urlchr_test (*p1, mask))
 217       addition += 2;            /* Two more characters (hex digits) */
 218
 219   if (!addition)
 220     return allow_passthrough ? (char *)s : xstrdup (s);
 221
 222   newlen = (p1 - s) + addition;
 223   newstr = xmalloc (newlen + 1);
 224
 225   p1 = s;
 226   p2 = newstr;
 227   while (*p1)
 228     {
 229       /* Quote the characters that match the test mask. */
 230       if (urlchr_test (*p1, mask))
 231         {
 232           unsigned char c = *p1++;
 233           *p2++ = '%';
 234           *p2++ = XNUM_TO_DIGIT (c >> 4);
 235           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 236         }
 237       else
 238         *p2++ = *p1++;
 239     }
 240   assert (p2 - newstr == newlen);
 241   *p2 = '\0';
 242
 243   return newstr;
 244 }
 245
 246 /* URL-escape the unsafe characters (see urlchr_table) in a given
 247    string, returning a freshly allocated string.  */
 248
 249 char *
 250 url_escape (const char *s)
 251 {
 252   return url_escape_1 (s, urlchr_unsafe, false);
 253 }
 254
 255 /* URL-escape the unsafe characters (see urlchr_table) in a given
 256    string.  If no characters are unsafe, S is returned.  */
 257
 258 static char *
 259 url_escape_allow_passthrough (const char *s)
 260 {
 261   return url_escape_1 (s, urlchr_unsafe, true);
 262 }
 263 \f
 264 /* Decide whether the char at position P needs to be encoded.  (It is
 265    not enough to pass a single char *P because the function may need
 266    to inspect the surrounding context.)
 267
 268    Return true if the char should be escaped as %XX, false otherwise.  */
 269
 270 static inline bool
 271 char_needs_escaping (const char *p)
 272 {
 273   if (*p == '%')
 274     {
 275       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 276         return false;
 277       else
 278         /* Garbled %.. sequence: encode `%'. */
 279         return true;
 280     }
 281   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 282     return true;
 283   else
 284     return false;
 285 }
 286
 287 /* Translate a %-escaped (but possibly non-conformant) input string S
 288    into a %-escaped (and conformant) output string.  If no characters
 289    are encoded or decoded, return the same string S; otherwise, return
 290    a freshly allocated string with the new contents.
 291
 292    After a URL has been run through this function, the protocols that
 293    use `%' as the quote character can use the resulting string as-is,
 294    while those that don't can use url_unescape to get to the intended
 295    data.  This function is stable: once the input is transformed,
 296    further transformations of the result yield the same output.
 297
 298    Let's discuss why this function is needed.
 299
 300    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 301    a raw space character would mess up the HTTP request, it needs to
 302    be quoted, like this:
 303
 304        GET /abc%20def HTTP/1.0
 305
 306    It would appear that the unsafe chars need to be quoted, for
 307    example with url_escape.  But what if we're requested to download
 308    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 309    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 310    part of URL syntax, "%20" is the correct way to denote a literal
 311    space on the Wget command line.  This leads to the conclusion that
 312    in that case Wget should not call url_escape, but leave the `%20'
 313    as is.  This is clearly contradictory, but it only gets worse.
 314
 315    What if the requested URI is `abc%20 def'?  If we call url_escape,
 316    we end up with `/abc%2520%20def', which is almost certainly not
 317    intended.  If we don't call url_escape, we are left with the
 318    embedded space and cannot complete the request.  What the user
 319    meant was for Wget to request `/abc%20%20def', and this is where
 320    reencode_escapes kicks in.
 321
 322    Wget used to solve this by first decoding %-quotes, and then
 323    encoding all the "unsafe" characters found in the resulting string.
 324    This was wrong because it didn't preserve certain URL special
 325    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 326    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 327    whether we considered `+' reserved (it is).  One of these results
 328    is inevitable because by the second step we would lose information
 329    on whether the `+' was originally encoded or not.  Both results
 330    were wrong because in CGI parameters + means space, while %2B means
 331    literal plus.  reencode_escapes correctly translates the above to
 332    "a%2B+b", i.e. returns the original string.
 333
 334    This function uses a modified version of the algorithm originally
 335    proposed by Anon Sricharoenchai:
 336
 337    * Encode all "unsafe" characters, except those that are also
 338      "reserved", to %XX.  See urlchr_table for which characters are
 339      unsafe and reserved.
 340
 341    * Encode the "%" characters not followed by two hex digits to
 342      "%25".
 343
 344    * Pass through all other characters and %XX escapes as-is.  (Up to
 345      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 346      characters, but that was obtrusive and broke some servers.)
 347
 348    Anon's test case:
 349
 350    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 351    ->
 352    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 353
 354    Simpler test cases:
 355
 356    "foo bar"         -> "foo%20bar"
 357    "foo%20bar"       -> "foo%20bar"
 358    "foo %20bar"      -> "foo%20%20bar"
 359    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 360    "foo%25%20bar"    -> "foo%25%20bar"
 361    "foo%2%20bar"     -> "foo%252%20bar"
 362    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 363    "foo%2b+bar"      -> "foo%2b+bar"  */
 364
 365 static char *
 366 reencode_escapes (const char *s)
 367 {
 368   const char *p1;
 369   char *newstr, *p2;
 370   int oldlen, newlen;
 371
 372   int encode_count = 0;
 373
 374   /* First pass: inspect the string to see if there's anything to do,
 375      and to calculate the new length.  */
 376   for (p1 = s; *p1; p1++)
 377     if (char_needs_escaping (p1))
 378       ++encode_count;
 379
 380   if (!encode_count)
 381     /* The string is good as it is. */
 382     return (char *) s;          /* C const model sucks. */
 383
 384   oldlen = p1 - s;
 385   /* Each encoding adds two characters (hex digits).  */
 386   newlen = oldlen + 2 * encode_count;
 387   newstr = xmalloc (newlen + 1);
 388
 389   /* Second pass: copy the string to the destination address, encoding
 390      chars when needed.  */
 391   p1 = s;
 392   p2 = newstr;
 393
 394   while (*p1)
 395     if (char_needs_escaping (p1))
 396       {
 397         unsigned char c = *p1++;
 398         *p2++ = '%';
 399         *p2++ = XNUM_TO_DIGIT (c >> 4);
 400         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 401       }
 402     else
 403       *p2++ = *p1++;
 404
 405   *p2 = '\0';
 406   assert (p2 - newstr == newlen);
 407   return newstr;
 408 }
 409 \f
 410 /* Returns the scheme type if the scheme is supported, or
 411    SCHEME_INVALID if not.  */
 412
 413 enum url_scheme
 414 url_scheme (const char *url)
 415 {
 416   int i;
 417
 418   for (i = 0; supported_schemes[i].leading_string; i++)
 419     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 420                           strlen (supported_schemes[i].leading_string)))
 421       {
 422         if (!(supported_schemes[i].flags & scm_disabled))
 423           return (enum url_scheme) i;
 424         else
 425           return SCHEME_INVALID;
 426       }
 427
 428   return SCHEME_INVALID;
 429 }
 430
 431 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 432
 433 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 434    currently implemented, it returns true if URL begins with
 435    [-+a-zA-Z0-9]+: .  */
 436
 437 bool
 438 url_has_scheme (const char *url)
 439 {
 440   const char *p = url;
 441
 442   /* The first char must be a scheme char. */
 443   if (!*p || !SCHEME_CHAR (*p))
 444     return false;
 445   ++p;
 446   /* Followed by 0 or more scheme chars. */
 447   while (*p && SCHEME_CHAR (*p))
 448     ++p;
 449   /* Terminated by ':'. */
 450   return *p == ':';
 451 }
 452
 453 int
 454 scheme_default_port (enum url_scheme scheme)
 455 {
 456   return supported_schemes[scheme].default_port;
 457 }
 458
 459 void
 460 scheme_disable (enum url_scheme scheme)
 461 {
 462   supported_schemes[scheme].flags |= scm_disabled;
 463 }
 464
 465 /* Skip the username and password, if present in the URL.  The
 466    function should *not* be called with the complete URL, but with the
 467    portion after the scheme.
 468
 469    If no username and password are found, return URL.  */
 470
 471 static const char *
 472 url_skip_credentials (const char *url)
 473 {
 474   /* Look for '@' that comes before terminators, such as '/', '?',
 475      '#', or ';'.  */
 476   const char *p = (const char *)strpbrk (url, "@/?#;");
 477   if (!p || *p != '@')
 478     return url;
 479   return p + 1;
 480 }
 481
 482 /* Parse credentials contained in [BEG, END).  The region is expected
 483    to have come from a URL and is unescaped.  */
 484
 485 static bool
 486 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 487 {
 488   char *colon;
 489   const char *userend;
 490
 491   if (beg == end)
 492     return false;               /* empty user name */
 493
 494   colon = memchr (beg, ':', end - beg);
 495   if (colon == beg)
 496     return false;               /* again empty user name */
 497
 498   if (colon)
 499     {
 500       *passwd = strdupdelim (colon + 1, end);
 501       userend = colon;
 502       url_unescape (*passwd);
 503     }
 504   else
 505     {
 506       *passwd = NULL;
 507       userend = end;
 508     }
 509   *user = strdupdelim (beg, userend);
 510   url_unescape (*user);
 511   return true;
 512 }
 513
 514 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 515    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 516    like this:
 517
 518    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 519    www.foo.com[:port]            -> http://www.foo.com[:port]
 520
 521    FTP shorthands look like this:
 522
 523    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 524    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 525
 526    If the URL needs not or cannot be rewritten, return NULL.  */
 527
 528 char *
 529 rewrite_shorthand_url (const char *url)
 530 {
 531   const char *p;
 532   char *ret;
 533
 534   if (url_scheme (url) != SCHEME_INVALID)
 535     return NULL;
 536
 537   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 538      latter Netscape.  */
 539   p = strpbrk (url, ":/");
 540   if (p == url)
 541     return NULL;
 542
 543   /* If we're looking at "://", it means the URL uses a scheme we
 544      don't support, which may include "https" when compiled without
 545      SSL support.  Don't bogusly rewrite such URLs.  */
 546   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 547     return NULL;
 548
 549   if (p && *p == ':')
 550     {
 551       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 552          special case of http port number ("localhost:10000").  */
 553       int digits = strspn (p + 1, "0123456789");
 554       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 555         goto http;
 556
 557       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 558       ret = aprintf ("ftp://%s", url);
 559       ret[6 + (p - url)] = '/';
 560     }
 561   else
 562     {
 563     http:
 564       /* Just prepend "http://" to URL. */
 565       ret = aprintf ("http://%s", url);
 566     }
 567   return ret;
 568 }
 569 \f
 570 static void split_path (const char *, char **, char **);
 571
 572 /* Like strpbrk, with the exception that it returns the pointer to the
 573    terminating zero (end-of-string aka "eos") if no matching character
 574    is found.  */
 575
 576 static inline char *
 577 strpbrk_or_eos (const char *s, const char *accept)
 578 {
 579   char *p = strpbrk (s, accept);
 580   if (!p)
 581     p = strchr (s, '\0');
 582   return p;
 583 }
 584
 585 /* Turn STR into lowercase; return true if a character was actually
 586    changed. */
 587
 588 static bool
 589 lowercase_str (char *str)
 590 {
 591   bool changed = false;
 592   for (; *str; str++)
 593     if (c_isupper (*str))
 594       {
 595         changed = true;
 596         *str = c_tolower (*str);
 597       }
 598   return changed;
 599 }
 600
 601 static const char *
 602 init_seps (enum url_scheme scheme)
 603 {
 604   static char seps[8] = ":/";
 605   char *p = seps + 2;
 606   int flags = supported_schemes[scheme].flags;
 607
 608   if (flags & scm_has_params)
 609     *p++ = ';';
 610   if (flags & scm_has_query)
 611     *p++ = '?';
 612   if (flags & scm_has_fragment)
 613     *p++ = '#';
 614   *p++ = '\0';
 615   return seps;
 616 }
 617
 618 static const char *parse_errors[] = {
 619 #define PE_NO_ERROR                     0
 620   N_("No error"),
 621 #define PE_UNSUPPORTED_SCHEME           1
 622   N_("Unsupported scheme"),
 623 #define PE_INVALID_HOST_NAME            2
 624   N_("Invalid host name"),
 625 #define PE_BAD_PORT_NUMBER              3
 626   N_("Bad port number"),
 627 #define PE_INVALID_USER_NAME            4
 628   N_("Invalid user name"),
 629 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 630   N_("Unterminated IPv6 numeric address"),
 631 #define PE_IPV6_NOT_SUPPORTED           6
 632   N_("IPv6 addresses not supported"),
 633 #define PE_INVALID_IPV6_ADDRESS         7
 634   N_("Invalid IPv6 numeric address")
 635 };
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error)
 644 {
 645   struct url *u;
 646   const char *p;
 647   bool path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650   const char *seps;
 651
 652   const char *uname_b,     *uname_e;
 653   const char *host_b,      *host_e;
 654   const char *path_b,      *path_e;
 655   const char *params_b,    *params_e;
 656   const char *query_b,     *query_e;
 657   const char *fragment_b,  *fragment_e;
 658
 659   int port;
 660   char *user = NULL, *passwd = NULL;
 661
 662   char *url_encoded = NULL;
 663
 664   int error_code;
 665
 666   scheme = url_scheme (url);
 667   if (scheme == SCHEME_INVALID)
 668     {
 669       error_code = PE_UNSUPPORTED_SCHEME;
 670       goto error;
 671     }
 672
 673   url_encoded = reencode_escapes (url);
 674   p = url_encoded;
 675
 676   p += strlen (supported_schemes[scheme].leading_string);
 677   uname_b = p;
 678   p = url_skip_credentials (p);
 679   uname_e = p;
 680
 681   /* scheme://user:pass@host[:port]... */
 682   /*                    ^              */
 683
 684   /* We attempt to break down the URL into the components path,
 685      params, query, and fragment.  They are ordered like this:
 686
 687        scheme://host[:port][/path][;params][?query][#fragment]  */
 688
 689   path_b     = path_e     = NULL;
 690   params_b   = params_e   = NULL;
 691   query_b    = query_e    = NULL;
 692   fragment_b = fragment_e = NULL;
 693
 694   /* Initialize separators for optional parts of URL, depending on the
 695      scheme.  For example, FTP has params, and HTTP and HTTPS have
 696      query string and fragment. */
 697   seps = init_seps (scheme);
 698
 699   host_b = p;
 700
 701   if (*p == '[')
 702     {
 703       /* Handle IPv6 address inside square brackets.  Ideally we'd
 704          just look for the terminating ']', but rfc2732 mandates
 705          rejecting invalid IPv6 addresses.  */
 706
 707       /* The address begins after '['. */
 708       host_b = p + 1;
 709       host_e = strchr (host_b, ']');
 710
 711       if (!host_e)
 712         {
 713           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 714           goto error;
 715         }
 716
 717 #ifdef ENABLE_IPV6
 718       /* Check if the IPv6 address is valid. */
 719       if (!is_valid_ipv6_address(host_b, host_e))
 720         {
 721           error_code = PE_INVALID_IPV6_ADDRESS;
 722           goto error;
 723         }
 724
 725       /* Continue parsing after the closing ']'. */
 726       p = host_e + 1;
 727 #else
 728       error_code = PE_IPV6_NOT_SUPPORTED;
 729       goto error;
 730 #endif
 731
 732       /* The closing bracket must be followed by a separator or by the
 733          null char.  */
 734       /* http://[::1]... */
 735       /*             ^   */
 736       if (!strchr (seps, *p))
 737         {
 738           /* Trailing garbage after []-delimited IPv6 address. */
 739           error_code = PE_INVALID_HOST_NAME;
 740           goto error;
 741         }
 742     }
 743   else
 744     {
 745       p = strpbrk_or_eos (p, seps);
 746       host_e = p;
 747     }
 748   ++seps;                       /* advance to '/' */
 749
 750   if (host_b == host_e)
 751     {
 752       error_code = PE_INVALID_HOST_NAME;
 753       goto error;
 754     }
 755
 756   port = scheme_default_port (scheme);
 757   if (*p == ':')
 758     {
 759       const char *port_b, *port_e, *pp;
 760
 761       /* scheme://host:port/tralala */
 762       /*              ^             */
 763       ++p;
 764       port_b = p;
 765       p = strpbrk_or_eos (p, seps);
 766       port_e = p;
 767
 768       /* Allow empty port, as per rfc2396. */
 769       if (port_b != port_e)
 770         for (port = 0, pp = port_b; pp < port_e; pp++)
 771           {
 772             if (!c_isdigit (*pp))
 773               {
 774                 /* http://host:12randomgarbage/blah */
 775                 /*               ^                  */
 776                 error_code = PE_BAD_PORT_NUMBER;
 777                 goto error;
 778               }
 779             port = 10 * port + (*pp - '0');
 780             /* Check for too large port numbers here, before we have
 781                a chance to overflow on bogus port values.  */
 782             if (port > 0xffff)
 783               {
 784                 error_code = PE_BAD_PORT_NUMBER;
 785                 goto error;
 786               }
 787           }
 788     }
 789   /* Advance to the first separator *after* '/' (either ';' or '?',
 790      depending on the scheme).  */
 791   ++seps;
 792
 793   /* Get the optional parts of URL, each part being delimited by
 794      current location and the position of the next separator.  */
 795 #define GET_URL_PART(sepchar, var) do {                         \
 796   if (*p == sepchar)                                            \
 797     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 798   ++seps;                                                       \
 799 } while (0)
 800
 801   GET_URL_PART ('/', path);
 802   if (supported_schemes[scheme].flags & scm_has_params)
 803     GET_URL_PART (';', params);
 804   if (supported_schemes[scheme].flags & scm_has_query)
 805     GET_URL_PART ('?', query);
 806   if (supported_schemes[scheme].flags & scm_has_fragment)
 807     GET_URL_PART ('#', fragment);
 808
 809 #undef GET_URL_PART
 810   assert (*p == 0);
 811
 812   if (uname_b != uname_e)
 813     {
 814       /* http://user:pass@host */
 815       /*        ^         ^    */
 816       /*     uname_b   uname_e */
 817       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 818         {
 819           error_code = PE_INVALID_USER_NAME;
 820           goto error;
 821         }
 822     }
 823
 824   u = xnew0 (struct url);
 825   u->scheme = scheme;
 826   u->host   = strdupdelim (host_b, host_e);
 827   u->port   = port;
 828   u->user   = user;
 829   u->passwd = passwd;
 830
 831   u->path = strdupdelim (path_b, path_e);
 832   path_modified = path_simplify (scheme, u->path);
 833   split_path (u->path, &u->dir, &u->file);
 834
 835   host_modified = lowercase_str (u->host);
 836
 837   /* Decode %HH sequences in host name.  This is important not so much
 838      to support %HH sequences in host names (which other browser
 839      don't), but to support binary characters (which will have been
 840      converted to %HH by reencode_escapes).  */
 841   if (strchr (u->host, '%'))
 842     {
 843       url_unescape (u->host);
 844       host_modified = true;
 845     }
 846
 847   if (params_b)
 848     u->params = strdupdelim (params_b, params_e);
 849   if (query_b)
 850     u->query = strdupdelim (query_b, query_e);
 851   if (fragment_b)
 852     u->fragment = strdupdelim (fragment_b, fragment_e);
 853
 854   if (path_modified || u->fragment || host_modified || path_b == path_e)
 855     {
 856       /* If we suspect that a transformation has rendered what
 857          url_string might return different from URL_ENCODED, rebuild
 858          u->url using url_string.  */
 859       u->url = url_string (u, URL_AUTH_SHOW);
 860
 861       if (url_encoded != url)
 862         xfree ((char *) url_encoded);
 863     }
 864   else
 865     {
 866       if (url_encoded == url)
 867         u->url = xstrdup (url);
 868       else
 869         u->url = url_encoded;
 870     }
 871
 872   return u;
 873
 874  error:
 875   /* Cleanup in case of error: */
 876   if (url_encoded && url_encoded != url)
 877     xfree (url_encoded);
 878
 879   /* Transmit the error code to the caller, if the caller wants to
 880      know.  */
 881   if (error)
 882     *error = error_code;
 883   return NULL;
 884 }
 885
 886 /* Return the error message string from ERROR_CODE, which should have
 887    been retrieved from url_parse.  The error message is translated.  */
 888
 889 const char *
 890 url_error (int error_code)
 891 {
 892   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 893   return _(parse_errors[error_code]);
 894 }
 895
 896 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 897    expected to be URL-escaped.
 898
 899    The path is split into directory (the part up to the last slash)
 900    and file (the part after the last slash), which are subsequently
 901    unescaped.  Examples:
 902
 903    PATH                 DIR           FILE
 904    "foo/bar/baz"        "foo/bar"     "baz"
 905    "foo/bar/"           "foo/bar"     ""
 906    "foo"                ""            "foo"
 907    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 908
 909    DIR and FILE are freshly allocated.  */
 910
 911 static void
 912 split_path (const char *path, char **dir, char **file)
 913 {
 914   char *last_slash = strrchr (path, '/');
 915   if (!last_slash)
 916     {
 917       *dir = xstrdup ("");
 918       *file = xstrdup (path);
 919     }
 920   else
 921     {
 922       *dir = strdupdelim (path, last_slash);
 923       *file = xstrdup (last_slash + 1);
 924     }
 925   url_unescape (*dir);
 926   url_unescape (*file);
 927 }
 928
 929 /* Note: URL's "full path" is the path with the query string and
 930    params appended.  The "fragment" (#foo) is intentionally ignored,
 931    but that might be changed.  For example, if the original URL was
 932    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 933    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 934
 935 /* Return the length of the full path, without the terminating
 936    zero.  */
 937
 938 static int
 939 full_path_length (const struct url *url)
 940 {
 941   int len = 0;
 942
 943 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 944
 945   FROB (path);
 946   FROB (params);
 947   FROB (query);
 948
 949 #undef FROB
 950
 951   return len;
 952 }
 953
 954 /* Write out the full path. */
 955
 956 static void
 957 full_path_write (const struct url *url, char *where)
 958 {
 959 #define FROB(el, chr) do {                      \
 960   char *f_el = url->el;                         \
 961   if (f_el) {                                   \
 962     int l = strlen (f_el);                      \
 963     *where++ = chr;                             \
 964     memcpy (where, f_el, l);                    \
 965     where += l;                                 \
 966   }                                             \
 967 } while (0)
 968
 969   FROB (path, '/');
 970   FROB (params, ';');
 971   FROB (query, '?');
 972
 973 #undef FROB
 974 }
 975
 976 /* Public function for getting the "full path".  E.g. if u->path is
 977    "foo/bar" and u->query is "param=value", full_path will be
 978    "/foo/bar?param=value". */
 979
 980 char *
 981 url_full_path (const struct url *url)
 982 {
 983   int length = full_path_length (url);
 984   char *full_path = xmalloc (length + 1);
 985
 986   full_path_write (url, full_path);
 987   full_path[length] = '\0';
 988
 989   return full_path;
 990 }
 991
 992 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 993    escaping of certain characters, such as "/" and ":".  Returns a
 994    count of unescaped chars.  */
 995
 996 static void
 997 unescape_single_char (char *str, char chr)
 998 {
 999   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1000   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1001   char *h = str;                /* hare */
1002   char *t = str;                /* tortoise */
1003   for (; *h; h++, t++)
1004     {
1005       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1006         {
1007           *t = chr;
1008           h += 2;
1009         }
1010       else
1011         *t = *h;
1012     }
1013   *t = '\0';
1014 }
1015
1016 /* Escape unsafe and reserved characters, except for the slash
1017    characters.  */
1018
1019 static char *
1020 url_escape_dir (const char *dir)
1021 {
1022   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1023   if (newdir == dir)
1024     return (char *)dir;
1025
1026   unescape_single_char (newdir, '/');
1027   return newdir;
1028 }
1029
1030 /* Sync u->path and u->url with u->dir and u->file.  Called after
1031    u->file or u->dir have been changed, typically by the FTP code.  */
1032
1033 static void
1034 sync_path (struct url *u)
1035 {
1036   char *newpath, *efile, *edir;
1037
1038   xfree (u->path);
1039
1040   /* u->dir and u->file are not escaped.  URL-escape them before
1041      reassembling them into u->path.  That way, if they contain
1042      separators like '?' or even if u->file contains slashes, the
1043      path will be correctly assembled.  (u->file can contain slashes
1044      if the URL specifies it with %2f, or if an FTP server returns
1045      it.)  */
1046   edir = url_escape_dir (u->dir);
1047   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1048
1049   if (!*edir)
1050     newpath = xstrdup (efile);
1051   else
1052     {
1053       int dirlen = strlen (edir);
1054       int filelen = strlen (efile);
1055
1056       /* Copy "DIR/FILE" to newpath. */
1057       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1058       memcpy (p, edir, dirlen);
1059       p += dirlen;
1060       *p++ = '/';
1061       memcpy (p, efile, filelen);
1062       p += filelen;
1063       *p = '\0';
1064     }
1065
1066   u->path = newpath;
1067
1068   if (edir != u->dir)
1069     xfree (edir);
1070   if (efile != u->file)
1071     xfree (efile);
1072
1073   /* Regenerate u->url as well.  */
1074   xfree (u->url);
1075   u->url = url_string (u, URL_AUTH_SHOW);
1076 }
1077
1078 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1079    This way we can sync u->path and u->url when they get changed.  */
1080
1081 void
1082 url_set_dir (struct url *url, const char *newdir)
1083 {
1084   xfree (url->dir);
1085   url->dir = xstrdup (newdir);
1086   sync_path (url);
1087 }
1088
1089 void
1090 url_set_file (struct url *url, const char *newfile)
1091 {
1092   xfree (url->file);
1093   url->file = xstrdup (newfile);
1094   sync_path (url);
1095 }
1096
1097 void
1098 url_free (struct url *url)
1099 {
1100   xfree (url->host);
1101   xfree (url->path);
1102   xfree (url->url);
1103
1104   xfree_null (url->params);
1105   xfree_null (url->query);
1106   xfree_null (url->fragment);
1107   xfree_null (url->user);
1108   xfree_null (url->passwd);
1109
1110   xfree (url->dir);
1111   xfree (url->file);
1112
1113   xfree (url);
1114 }
1115 \f
1116 /* Create all the necessary directories for PATH (a file).  Calls
1117    make_directory internally.  */
1118 int
1119 mkalldirs (const char *path)
1120 {
1121   const char *p;
1122   char *t;
1123   struct_stat st;
1124   int res;
1125
1126   p = path + strlen (path);
1127   for (; *p != '/' && p != path; p--)
1128     ;
1129
1130   /* Don't create if it's just a file.  */
1131   if ((p == path) && (*p != '/'))
1132     return 0;
1133   t = strdupdelim (path, p);
1134
1135   /* Check whether the directory exists.  */
1136   if ((stat (t, &st) == 0))
1137     {
1138       if (S_ISDIR (st.st_mode))
1139         {
1140           xfree (t);
1141           return 0;
1142         }
1143       else
1144         {
1145           /* If the dir exists as a file name, remove it first.  This
1146              is *only* for Wget to work with buggy old CERN http
1147              servers.  Here is the scenario: When Wget tries to
1148              retrieve a directory without a slash, e.g.
1149              http://foo/bar (bar being a directory), CERN server will
1150              not redirect it too http://foo/bar/ -- it will generate a
1151              directory listing containing links to bar/file1,
1152              bar/file2, etc.  Wget will lose because it saves this
1153              HTML listing to a file `bar', so it cannot create the
1154              directory.  To work around this, if the file of the same
1155              name exists, we just remove it and create the directory
1156              anyway.  */
1157           DEBUGP (("Removing %s because of directory danger!\n", t));
1158           unlink (t);
1159         }
1160     }
1161   res = make_directory (t);
1162   if (res != 0)
1163     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1164   xfree (t);
1165   return res;
1166 }
1167 \f
1168 /* Functions for constructing the file name out of URL components.  */
1169
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   The contents are not zero-terminated unless a zero byte is appended
   explicitly.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size of the buffer */
  int tail;                     /* offset in BASE where the next
                                   append goes */
};
1183
/* Ensure that the string G can accept APPEND_SIZE more characters
   past the current tail position.  The actual resizing is delegated
   to DO_REALLOC, which is handed the buffer, its recorded size and
   the required size tail + APPEND_SIZE.  The tail itself is not
   moved; use TAIL_INCR for that once the data has been written.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string, i.e. the address at which
   the next character is to be stored. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1198
/* Copy the characters of STR (without its terminating zero) to the
   tail of DEST, growing DEST as needed.  NOTICE: DEST is left
   unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  const int len = strlen (str);

  GROW (dest, len);
  memcpy (TAIL (dest), str, len);
  TAIL_INCR (dest, len);
}
1210
1211 /* Append CH to DEST.  For example, append_char (0, DEST)
1212    zero-terminates DEST.  */
1213
1214 static void
1215 append_char (char ch, struct growable *dest)
1216 {
1217   GROW (dest, 1);
1218   *TAIL (dest) = ch;
1219   TAIL_INCR (dest, 1);
1220 }
1221
/* Bit flags classifying why a character may be unsuitable in a file
   name.  They are combined in filechr_table and tested with
   FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set.  C is
   converted to unsigned char before being used as the table index.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  /* The upper half: 128-159 are flagged as control characters, the
     remaining codes carry no flags.  */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
/* The shorthands are only meant for the table above.  */
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1279
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')
1285
/* FN_QUERY_SEP separates the file name from the URL query in
   generated file names.  It is '@' when file names are restricted
   for Windows, which cannot handle '?' as part of a file name, and
   '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1290
1291 /* Quote path element, characters in [b, e), as file name, and append
1292    the quoted string to DEST.  Each character is quoted as per
1293    file_unsafe_char and the corresponding table.
1294
1295    If ESCAPED is true, the path element is considered to be
1296    URL-escaped and will be unescaped prior to inspection.  */
1297
1298 static void
1299 append_uri_pathel (const char *b, const char *e, bool escaped,
1300                    struct growable *dest)
1301 {
1302   const char *p;
1303   int quoted, outlen;
1304
1305   int mask;
1306   if (opt.restrict_files_os == restrict_unix)
1307     mask = filechr_not_unix;
1308   else
1309     mask = filechr_not_windows;
1310   if (opt.restrict_files_ctrl)
1311     mask |= filechr_control;
1312
1313   /* Copy [b, e) to PATHEL and URL-unescape it. */
1314   if (escaped)
1315     {
1316       char *unescaped;
1317       BOUNDED_TO_ALLOCA (b, e, unescaped);
1318       url_unescape (unescaped);
1319       b = unescaped;
1320       e = unescaped + strlen (unescaped);
1321     }
1322
1323   /* Defang ".." when found as component of path.  Remember that path
1324      comes from the URL and might contain malicious input.  */
1325   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1326     {
1327       b = "%2E%2E";
1328       e = b + 6;
1329     }
1330
1331   /* Walk the PATHEL string and check how many characters we'll need
1332      to quote.  */
1333   quoted = 0;
1334   for (p = b; p < e; p++)
1335     if (FILE_CHAR_TEST (*p, mask))
1336       ++quoted;
1337
1338   /* Calculate the length of the output string.  e-b is the input
1339      string length.  Each quoted char introduces two additional
1340      characters in the string, hence 2*quoted.  */
1341   outlen = (e - b) + (2 * quoted);
1342   GROW (dest, outlen);
1343
1344   if (!quoted)
1345     {
1346       /* If there's nothing to quote, we can simply append the string
1347          without processing it again.  */
1348       memcpy (TAIL (dest), b, outlen);
1349     }
1350   else
1351     {
1352       char *q = TAIL (dest);
1353       for (p = b; p < e; p++)
1354         {
1355           if (!FILE_CHAR_TEST (*p, mask))
1356             *q++ = *p;
1357           else
1358             {
1359               unsigned char ch = *p;
1360               *q++ = '%';
1361               *q++ = XNUM_TO_DIGIT (ch >> 4);
1362               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1363             }
1364         }
1365       assert (q - TAIL (dest) == outlen);
1366     }
1367
1368   /* Perform inline case transformation if required.  */
1369   if (opt.restrict_files_case == restrict_lowercase
1370       || opt.restrict_files_case == restrict_uppercase)
1371     {
1372       char *q;
1373       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1374         {
1375           if (opt.restrict_files_case == restrict_lowercase)
1376             *q = c_tolower (*q);
1377           else
1378             *q = c_toupper (*q);
1379         }
1380     }
1381
1382   TAIL_INCR (dest, outlen);
1383 }
1384
1385 /* Append to DEST the directory structure that corresponds the
1386    directory part of URL's path.  For example, if the URL is
1387    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1388
1389    Each path element ("dir1" and "dir2" in the above example) is
1390    examined, url-unescaped, and re-escaped as file name element.
1391
1392    Additionally, it cuts as many directories from the path as
1393    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1394    will produce "bar" for the above example.  For 2 or more, it will
1395    produce "".
1396
1397    Each component of the path is quoted for use as file name.  */
1398
1399 static void
1400 append_dir_structure (const struct url *u, struct growable *dest)
1401 {
1402   char *pathel, *next;
1403   int cut = opt.cut_dirs;
1404
1405   /* Go through the path components, de-URL-quote them, and quote them
1406      (if necessary) as file names.  */
1407
1408   pathel = u->path;
1409   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1410     {
1411       if (cut-- > 0)
1412         continue;
1413       if (pathel == next)
1414         /* Ignore empty pathels.  */
1415         continue;
1416
1417       if (dest->tail)
1418         append_char ('/', dest);
1419       append_uri_pathel (pathel, next, true, dest);
1420     }
1421 }
1422
1423 /* Return a unique file name that matches the given URL as good as
1424    possible.  Does not create directories on the file system.  */
1425
1426 char *
1427 url_file_name (const struct url *u)
1428 {
1429   struct growable fnres;        /* stands for "file name result" */
1430
1431   const char *u_file, *u_query;
1432   char *fname, *unique;
1433
1434   fnres.base = NULL;
1435   fnres.size = 0;
1436   fnres.tail = 0;
1437
1438   /* Start with the directory prefix, if specified. */
1439   if (opt.dir_prefix)
1440     append_string (opt.dir_prefix, &fnres);
1441
1442   /* If "dirstruct" is turned on (typically the case with -r), add
1443      the host and port (unless those have been turned off) and
1444      directory structure.  */
1445   if (opt.dirstruct)
1446     {
1447       if (opt.protocol_directories)
1448         {
1449           if (fnres.tail)
1450             append_char ('/', &fnres);
1451           append_string (supported_schemes[u->scheme].name, &fnres);
1452         }
1453       if (opt.add_hostdir)
1454         {
1455           if (fnres.tail)
1456             append_char ('/', &fnres);
1457           if (0 != strcmp (u->host, ".."))
1458             append_string (u->host, &fnres);
1459           else
1460             /* Host name can come from the network; malicious DNS may
1461                allow ".." to be resolved, causing us to write to
1462                "../<file>".  Defang such host names.  */
1463             append_string ("%2E%2E", &fnres);
1464           if (u->port != scheme_default_port (u->scheme))
1465             {
1466               char portstr[24];
1467               number_to_string (portstr, u->port);
1468               append_char (FN_PORT_SEP, &fnres);
1469               append_string (portstr, &fnres);
1470             }
1471         }
1472
1473       append_dir_structure (u, &fnres);
1474     }
1475
1476   /* Add the file name. */
1477   if (fnres.tail)
1478     append_char ('/', &fnres);
1479   u_file = *u->file ? u->file : "index.html";
1480   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1481
1482   /* Append "?query" to the file name. */
1483   u_query = u->query && *u->query ? u->query : NULL;
1484   if (u_query)
1485     {
1486       append_char (FN_QUERY_SEP, &fnres);
1487       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1488     }
1489
1490   /* Zero-terminate the file name. */
1491   append_char ('\0', &fnres);
1492
1493   fname = fnres.base;
1494
1495   /* Check the cases in which the unique extensions are not used:
1496      1) Clobbering is turned off (-nc).
1497      2) Retrieval with regetting.
1498      3) Timestamping is used.
1499      4) Hierarchy is built.
1500
1501      The exception is the case when file does exist and is a
1502      directory (see `mkalldirs' for explanation).  */
1503
1504   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1505       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1506     return fname;
1507
1508   unique = unique_name (fname, true);
1509   if (unique != fname)
1510     xfree (fname);
1511   return unique;
1512 }
1513 \f
1514 /* Resolve "." and ".." elements of PATH by destructively modifying
1515    PATH and return true if PATH has been modified, false otherwise.
1516
1517    The algorithm is in spirit similar to the one described in rfc1808,
1518    although implemented differently, in one pass.  To recap, path
1519    elements containing only "." are removed, and ".." is taken to mean
1520    "back up one element".  Single leading and trailing slashes are
1521    preserved.
1522
1523    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1524    test examples are provided below.  If you change anything in this
1525    function, run test_path_simplify to make sure you haven't broken a
1526    test case.  */
1527
1528 static bool
1529 path_simplify (enum url_scheme scheme, char *path)
1530 {
1531   char *h = path;               /* hare */
1532   char *t = path;               /* tortoise */
1533   char *beg = path;
1534   char *end = strchr (path, '\0');
1535
1536   while (h < end)
1537     {
1538       /* Hare should be at the beginning of a path element. */
1539
1540       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1541         {
1542           /* Ignore "./". */
1543           h += 2;
1544         }
1545       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1546         {
1547           /* Handle "../" by retreating the tortoise by one path
1548              element -- but not past beggining.  */
1549           if (t > beg)
1550             {
1551               /* Move backwards until T hits the beginning of the
1552                  previous path element or the beginning of path. */
1553               for (--t; t > beg && t[-1] != '/'; t--)
1554                 ;
1555             }
1556           else if (scheme == SCHEME_FTP)
1557             {
1558               /* If we're at the beginning, copy the "../" literally
1559                  and move the beginning so a later ".." doesn't remove
1560                  it.  This violates RFC 3986; but we do it for FTP
1561                  anyway because there is otherwise no way to get at a
1562                  parent directory, when the FTP server drops us in a
1563                  non-root directory (which is not uncommon). */
1564               beg = t + 3;
1565               goto regular;
1566             }
1567           h += 3;
1568         }
1569       else
1570         {
1571         regular:
1572           /* A regular path element.  If H hasn't advanced past T,
1573              simply skip to the next path element.  Otherwise, copy
1574              the path element until the next slash.  */
1575           if (t == h)
1576             {
1577               /* Skip the path element, including the slash.  */
1578               while (h < end && *h != '/')
1579                 t++, h++;
1580               if (h < end)
1581                 t++, h++;
1582             }
1583           else
1584             {
1585               /* Copy the path element, including the final slash.  */
1586               while (h < end && *h != '/')
1587                 *t++ = *h++;
1588               if (h < end)
1589                 *t++ = *h++;
1590             }
1591         }
1592     }
1593
1594   if (t != h)
1595     *t = '\0';
1596
1597   return t != h;
1598 }
1599 \f
1600 /* Return the length of URL's path.  Path is considered to be
1601    terminated by one or more of the ?query or ;params or #fragment,
1602    depending on the scheme.  */
1603
1604 static const char *
1605 path_end (const char *url)
1606 {
1607   enum url_scheme scheme = url_scheme (url);
1608   const char *seps;
1609   if (scheme == SCHEME_INVALID)
1610     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1611   /* +2 to ignore the first two separators ':' and '/' */
1612   seps = init_seps (scheme) + 2;
1613   return strpbrk_or_eos (url, seps);
1614 }
1615
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr,
   so B and E must delimit one contiguous buffer with B <= E.  Note
   that B is evaluated twice.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1619
/* Return a freshly allocated string consisting of the first SPAN
   bytes of BASE followed by the LINKLENGTH bytes of LINK and a
   terminating zero.  Helper of uri_merge.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is deliberately not run through path_simplify, although
   rfc1738 seems to suggest that: it would complicate the code, and
   url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with a scheme of its own is already complete.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_concat (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Only the fragment changes; an existing query is kept.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return uri_merge_concat (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there, else replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert
        = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  if (link[0] == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      bool seen_slash_slash = false;

      /* We're looking for the first slash, but want to ignore a
         double slash, once.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first slash after "//", or the first slash
         altogether.  Keep in mind that LINK itself begins with '/':

         "foo"               -> insert at the beginning
         "foo/bar"           -> insert at the beginning
         "http://foo"        -> insert at END
         "http://something/" -> insert at that slash  */
      if (!seen_slash_slash)
        start_insert = base;
      else if (slash)
        start_insert = slash;
      else
        start_insert = end;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything
     after last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      return uri_merge_concat (base, 0, link, linklength);

    if (last_slash >= base + 2
        && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The last slash belongs to "://", as in "http://host":
           keep all of BASE up to END and put an explicit slash
           between it and LINK.  The extra byte copied from BASE at
           END is overwritten by that slash.  */
        int span = end + 1 - base;
        char *merge = uri_merge_concat (base, span, link, linklength);
        merge[span - 1] = '/';
        return merge;
      }

    /* example: "whatever/foo/bar" */
    /*                        ^    */
    return uri_merge_concat (base, last_slash + 1 - base, link, linklength);
  }
}
1810 \f
/* Copy the string S, without its terminating zero, to P and advance
   P past the copied characters.  P must be a modifiable pointer
   lvalue with enough room behind it.  S is evaluated exactly once,
   so it may be an expression with side effects.  */
#define APPEND(p, s) do {                       \
  const char *APPEND_src_ = (s);                \
  int APPEND_len_ = strlen (APPEND_src_);       \
  memcpy ((p), APPEND_src_, APPEND_len_);       \
  (p) += APPEND_len_;                           \
} while (0)
1816
1817 /* Use this instead of password when the actual password is supposed
1818    to be hidden.  We intentionally use a generic string without giving
1819    away the number of characters in the password, like previous
1820    versions did.  */
1821 #define HIDDEN_PASSWORD "*password*"
1822
1823 /* Recreate the URL string from the data in URL.
1824
1825    If HIDE is true (as it is when we're calling this on a URL we plan
1826    to print, but not when calling it to canonicalize a URL for use
1827    within the program), password will be hidden.  Unsafe characters in
1828    the URL will be quoted.  */
1829
1830 char *
1831 url_string (const struct url *url, enum url_auth_mode auth_mode)
1832 {
1833   int size;
1834   char *result, *p;
1835   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1836
1837   int scheme_port = supported_schemes[url->scheme].default_port;
1838   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1839   int fplen = full_path_length (url);
1840
1841   bool brackets_around_host;
1842
1843   assert (scheme_str != NULL);
1844
1845   /* Make sure the user name and password are quoted. */
1846   if (url->user)
1847     {
1848       if (auth_mode != URL_AUTH_HIDE)
1849         {
1850           quoted_user = url_escape_allow_passthrough (url->user);
1851           if (url->passwd)
1852             {
1853               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1854                 quoted_passwd = HIDDEN_PASSWORD;
1855               else
1856                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1857             }
1858         }
1859     }
1860
1861   /* In the unlikely event that the host name contains non-printable
1862      characters, quote it for displaying to the user.  */
1863   quoted_host = url_escape_allow_passthrough (url->host);
1864
1865   /* Undo the quoting of colons that URL escaping performs.  IPv6
1866      addresses may legally contain colons, and in that case must be
1867      placed in square brackets.  */
1868   if (quoted_host != url->host)
1869     unescape_single_char (quoted_host, ':');
1870   brackets_around_host = strchr (quoted_host, ':') != NULL;
1871
1872   size = (strlen (scheme_str)
1873           + strlen (quoted_host)
1874           + (brackets_around_host ? 2 : 0)
1875           + fplen
1876           + 1);
1877   if (url->port != scheme_port)
1878     size += 1 + numdigit (url->port);
1879   if (quoted_user)
1880     {
1881       size += 1 + strlen (quoted_user);
1882       if (quoted_passwd)
1883         size += 1 + strlen (quoted_passwd);
1884     }
1885
1886   p = result = xmalloc (size);
1887
1888   APPEND (p, scheme_str);
1889   if (quoted_user)
1890     {
1891       APPEND (p, quoted_user);
1892       if (quoted_passwd)
1893         {
1894           *p++ = ':';
1895           APPEND (p, quoted_passwd);
1896         }
1897       *p++ = '@';
1898     }
1899
1900   if (brackets_around_host)
1901     *p++ = '[';
1902   APPEND (p, quoted_host);
1903   if (brackets_around_host)
1904     *p++ = ']';
1905   if (url->port != scheme_port)
1906     {
1907       *p++ = ':';
1908       p = number_to_string (p, url->port);
1909     }
1910
1911   full_path_write (url, p);
1912   p += fplen;
1913   *p++ = '\0';
1914
1915   assert (p - result == size);
1916
1917   if (quoted_user && quoted_user != url->user)
1918     xfree (quoted_user);
1919   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1920       && quoted_passwd != url->passwd)
1921     xfree (quoted_passwd);
1922   if (quoted_host != url->host)
1923     xfree (quoted_host);
1924
1925   return result;
1926 }
1927 \f
1928 /* Return true if scheme a is similar to scheme b.
1929
1930    Schemes are similar if they are equal.  If SSL is supported, schemes
1931    are also similar if one is http (SCHEME_HTTP) and the other is https
1932    (SCHEME_HTTPS).  */
1933 bool
1934 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1935 {
1936   if (a == b)
1937     return true;
1938 #ifdef HAVE_SSL
1939   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1940       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1941     return true;
1942 #endif
1943   return false;
1944 }
1945 \f
/* Read one logical character from the possibly %-escaped string STR
   and store it in *C.  Return the number of bytes of STR that the
   character occupies: 3 when a "%XX" escape was decoded, 1 otherwise.

   A '%' is returned literally (consuming one byte) when it is not
   followed by two hexadecimal digits, or when the byte it encodes
   satisfies URL_RESERVED_CHAR; in the latter case the escape is
   deliberately left undecoded.

   STR must be non-NULL and non-empty, C must be non-NULL.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The terminating NUL is not a hex digit, so a truncated escape
     ("%" or "%X" at end of string) is caught here, and the
     short-circuit keeps us from reading past the terminator.  This
     also makes a separate end-of-string check on str[2] redundant.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  *c = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (*c))
    {
      /* Keep the escape as-is: report just the '%' and let the caller
         pick up the two hex digits as ordinary characters.  */
      *c = '%';
      return 1;
    }

  return 3;
}
1983
/* Compare the URL strings U1 and U2 and return true if they match.

   Both strings are walked one logical character at a time using
   getchar_from_escaped_string, so a %-escape that function decodes
   compares equal to the character it stands for.  Characters are
   compared after folding with c_tolower.  Neither argument may be
   NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *a = u1;
  const char *b = u2;

  assert(u1 && u2);

  for (;;)
    {
      char ca, cb;
      int na, nb;

      if (*a == '\0' || *b == '\0')
        break;

      na = getchar_from_escaped_string (a, &ca);
      if (na == 0)
        break;

      nb = getchar_from_escaped_string (b, &cb);
      if (nb == 0)
        break;

      if (c_tolower (ca) != c_tolower (cb))
        break;

      a += na;
      b += nb;
    }

  /* Equal only if both strings were consumed completely. */
  return *a == '\0' && *b == '\0';
}
2006 \f
2007 #ifdef TESTING
2008 /* Debugging and testing support for path_simplify. */
2009
#if 0
/* Debug: run path_simplify on a copy of PATH and return the result in
   a newly allocated string.  Useful for calling from the debugger.

   path_simplify takes the scheme as its first argument; SCHEME_HTTP
   is used here.  Pass SCHEME_FTP by hand instead if you want to see
   the FTP behavior, which differs for leading ".." components (see
   the table in test_path_simplify).  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2021
2022 static const char *
2023 run_test (char *test, char *expected_result, enum url_scheme scheme,
2024           bool expected_change)
2025 {
2026   char *test_copy = xstrdup (test);
2027   bool modified = path_simplify (scheme, test_copy);
2028
2029   if (0 != strcmp (test_copy, expected_result))
2030     {
2031       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2032               test, expected_result, test_copy);
2033       mu_assert ("", 0);
2034     }
2035   if (modified != expected_change)
2036     {
2037       if (expected_change)
2038         printf ("Expected modification with path_simplify(\"%s\").\n",
2039                 test);
2040       else
2041         printf ("Expected no modification with path_simplify(\"%s\").\n",
2042                 test);
2043     }
2044   xfree (test_copy);
2045   mu_assert ("", modified == expected_change);
2046   return NULL;
2047 }
2048
2049 const char *
2050 test_path_simplify (void)
2051 {
2052   static struct {
2053     char *test, *result;
2054     enum url_scheme scheme;
2055     bool should_modify;
2056   } tests[] = {
2057     { "",                       "",             SCHEME_HTTP, false },
2058     { ".",                      "",             SCHEME_HTTP, true },
2059     { "./",                     "",             SCHEME_HTTP, true },
2060     { "..",                     "",             SCHEME_HTTP, true },
2061     { "../",                    "",             SCHEME_HTTP, true },
2062     { "..",                     "..",           SCHEME_FTP,  false },
2063     { "../",                    "../",          SCHEME_FTP,  false },
2064     { "foo",                    "foo",          SCHEME_HTTP, false },
2065     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2066     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2067     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2068     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2069     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2070     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2071     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2072     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2073     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2074     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2075     { "foo/..",                 "",             SCHEME_HTTP, true },
2076     { "foo/../..",              "",             SCHEME_HTTP, true },
2077     { "foo/../../..",           "",             SCHEME_HTTP, true },
2078     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2079     { "foo/../..",              "..",           SCHEME_FTP,  true },
2080     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2081     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2082     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2083     { "./a/../b",               "b",            SCHEME_HTTP, true }
2084   };
2085   int i;
2086
2087   for (i = 0; i < countof (tests); i++)
2088     {
2089       const char *message;
2090       char *test = tests[i].test;
2091       char *expected_result = tests[i].result;
2092       enum url_scheme scheme = tests[i].scheme;
2093       bool  expected_change = tests[i].should_modify;
2094       message = run_test (test, expected_result, scheme, expected_change);
2095       if (message) return message;
2096     }
2097   return NULL;
2098 }
2099
2100 const char *
2101 test_append_uri_pathel()
2102 {
2103   int i;
2104   struct {
2105     char *original_url;
2106     char *input;
2107     bool escaped;
2108     char *expected_result;
2109   } test_array[] = {
2110     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2111   };
2112
2113   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2114     {
2115       struct growable dest;
2116       const char *p = test_array[i].input;
2117
2118       memset (&dest, 0, sizeof (dest));
2119
2120       append_string (test_array[i].original_url, &dest);
2121       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2122       append_char ('\0', &dest);
2123
2124       mu_assert ("test_append_uri_pathel: wrong result",
2125                  strcmp (dest.base, test_array[i].expected_result) == 0);
2126     }
2127
2128   return NULL;
2129 }
2130
/* Test are_urls_equal against a table of URL pairs and the expected
   verdict for each.  The table covers identical URLs, differing paths
   and hosts, an escaped character that is expected to compare equal
   to its literal form ("%7e" vs "~"), and an escaped slash that is
   expected NOT to compare equal to a literal '/'.  Returns NULL on
   success or the mu_assert message on the first wrong verdict.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
    }

  return NULL;
}
2156
2157 #endif /* TESTING */
2158
2159 /*
2160  * vim: et ts=2 sw=2
2161  */
2162