sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #define USE_GNULIB_ALLOC
  32
  33 #include "wget.h"
  34
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 #ifdef HAVE_UNISTD_H
  39 # include <unistd.h>
  40 #endif
  41 #include <errno.h>
  42 #include <assert.h>
  43
  44 #include "utils.h"
  45 #include "url.h"
  46 #include "host.h"  /* for is_valid_ipv6_address */
  47
  48 #ifdef TESTING
  49 #include "test.h"
  50 #endif
  51
/* Per-scheme flag bits kept in the `flags' member of struct
   scheme_data.  The values are distinct powers of two so that they
   can be OR-ed together and tested individually with `&'.  */
enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};
  58
/* Static description of a single URL scheme.  One instance per
   supported scheme is stored in the supported_schemes table.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://". */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Bitwise OR of the scm_* flags describing the scheme. */
  int flags;
};
  70
/* Supported schemes.

   The table is indexed by the numeric value of enum url_scheme:
   url_scheme returns the index of the matching entry cast to that
   type, and scheme_default_port / scheme_disable index the table
   directly with the scheme they are given.

   The final all-NULL entry terminates the table; its NULL
   leading_string is what stops the lookup loop in url_scheme.

   The array is not const because scheme_disable sets scm_disabled in
   an entry's flags at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  83
  84 /* Forward declarations: */
  85
  86 static bool path_simplify (char *);
  87 \f
  88 /* Support for escaping and unescaping of URL strings.  */
  89
  90 /* Table of "reserved" and "unsafe" characters.  Those terms are
  91    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  92    specs, but the general idea remains.
  93
  94    A reserved character is the one that you can't decode without
  95    changing the meaning of the URL.  For example, you can't decode
  96    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  97    path components is different.  Non-reserved characters can be
  98    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  99    unsafe characters are loosely based on rfc1738, plus "$" and ",",
 100    as recommended by rfc2396, and minus "~", which is very frequently
 101    used (and sometimes unrecognized as %7E by broken servers).
 102
 103    An unsafe character is the one that should be encoded when URLs are
 104    placed in foreign environments.  E.g. space and newline are unsafe
 105    in HTTP contexts because HTTP uses them as separator and line
 106    terminator, so they must be encoded to %20 and %0A respectively.
 107    "*" is unsafe in shell context, etc.
 108
 109    We determine whether a character is unsafe through static table
 110    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 111
/* Flag bits stored in urlchr_table, one byte of flags per character
   code.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

/* Test character C against MASK: the result is non-zero if any of the
   MASK bits are set for C in urlchr_table.  C is cast to unsigned
   char before indexing so that characters with the high bit set do
   not produce a negative index where plain char is signed.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Non-zero if C is flagged as reserved in urlchr_table.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Non-zero if C is flagged as unsafe in urlchr_table.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 123
/* Shorthands for the table.  They are #undef'd again right after the
   table so the one-letter names do not leak into the rest of the
   file.  RU expands without parentheses, which is fine only because
   it is always used as a complete initializer element.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Flags for each of the 256 possible byte values, indexed by
   character code (ASCII is assumed).  Each row of the first half
   covers eight consecutive codes, named in the trailing comment.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* Codes 128-255: every byte with the high bit set is unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
 161
 162 /* URL-unescape the string S.
 163
 164    This is done by transforming the sequences "%HH" to the character
 165    represented by the hexadecimal digits HH.  If % is not followed by
 166    two hexadecimal digits, it is inserted literally.
 167
 168    The transformation is done in place.  If you need the original
 169    string intact, make a copy before calling this function.  */
 170
 171 static void
 172 url_unescape (char *s)
 173 {
 174   char *t = s;                  /* t - tortoise */
 175   char *h = s;                  /* h - hare     */
 176
 177   for (; *h; h++, t++)
 178     {
 179       if (*h != '%')
 180         {
 181         copychar:
 182           *t = *h;
 183         }
 184       else
 185         {
 186           char c;
 187           /* Do nothing if '%' is not followed by two hex digits. */
 188           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 189             goto copychar;
 190           c = X2DIGITS_TO_NUM (h[1], h[2]);
 191           /* Don't unescape %00 because there is no way to insert it
 192              into a C string without effectively truncating it. */
 193           if (c == '\0')
 194             goto copychar;
 195           *t = c;
 196           h += 2;
 197         }
 198     }
 199   *t = '\0';
 200 }
 201
 202 /* The core of url_escape_* functions.  Escapes the characters that
 203    match the provided mask in urlchr_table.
 204
 205    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 206    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 207    allocated string will be returned in all cases.  */
 208
 209 static char *
 210 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 211 {
 212   const char *p1;
 213   char *p2, *newstr;
 214   int newlen;
 215   int addition = 0;
 216
 217   for (p1 = s; *p1; p1++)
 218     if (urlchr_test (*p1, mask))
 219       addition += 2;            /* Two more characters (hex digits) */
 220
 221   if (!addition)
 222     return allow_passthrough ? (char *)s : xstrdup (s);
 223
 224   newlen = (p1 - s) + addition;
 225   newstr = xmalloc (newlen + 1);
 226
 227   p1 = s;
 228   p2 = newstr;
 229   while (*p1)
 230     {
 231       /* Quote the characters that match the test mask. */
 232       if (urlchr_test (*p1, mask))
 233         {
 234           unsigned char c = *p1++;
 235           *p2++ = '%';
 236           *p2++ = XNUM_TO_DIGIT (c >> 4);
 237           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 238         }
 239       else
 240         *p2++ = *p1++;
 241     }
 242   assert (p2 - newstr == newlen);
 243   *p2 = '\0';
 244
 245   return newstr;
 246 }
 247
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.

   This calls url_escape_1 with the urlchr_unsafe mask and with
   passthrough disabled, so the result is a new allocation even when
   S contains nothing that needs escaping.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, false);
}
 256
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.

   Because the result may or may not be S itself, a caller that wants
   to free it has to compare the returned pointer against S first.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, true);
}
 265 \f
 266 /* Decide whether the char at position P needs to be encoded.  (It is
 267    not enough to pass a single char *P because the function may need
 268    to inspect the surrounding context.)
 269
 270    Return true if the char should be escaped as %XX, false otherwise.  */
 271
 272 static inline bool
 273 char_needs_escaping (const char *p)
 274 {
 275   if (*p == '%')
 276     {
 277       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 278         return false;
 279       else
 280         /* Garbled %.. sequence: encode `%'. */
 281         return true;
 282     }
 283   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 284     return true;
 285   else
 286     return false;
 287 }
 288
 289 /* Translate a %-escaped (but possibly non-conformant) input string S
 290    into a %-escaped (and conformant) output string.  If no characters
 291    are encoded or decoded, return the same string S; otherwise, return
 292    a freshly allocated string with the new contents.
 293
 294    After a URL has been run through this function, the protocols that
 295    use `%' as the quote character can use the resulting string as-is,
 296    while those that don't can use url_unescape to get to the intended
 297    data.  This function is stable: once the input is transformed,
 298    further transformations of the result yield the same output.
 299
 300    Let's discuss why this function is needed.
 301
 302    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 303    a raw space character would mess up the HTTP request, it needs to
 304    be quoted, like this:
 305
 306        GET /abc%20def HTTP/1.0
 307
 308    It would appear that the unsafe chars need to be quoted, for
 309    example with url_escape.  But what if we're requested to download
 310    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 311    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 312    part of URL syntax, "%20" is the correct way to denote a literal
 313    space on the Wget command line.  This leads to the conclusion that
 314    in that case Wget should not call url_escape, but leave the `%20'
 315    as is.  This is clearly contradictory, but it only gets worse.
 316
 317    What if the requested URI is `abc%20 def'?  If we call url_escape,
 318    we end up with `/abc%2520%20def', which is almost certainly not
 319    intended.  If we don't call url_escape, we are left with the
 320    embedded space and cannot complete the request.  What the user
 321    meant was for Wget to request `/abc%20%20def', and this is where
 322    reencode_escapes kicks in.
 323
 324    Wget used to solve this by first decoding %-quotes, and then
 325    encoding all the "unsafe" characters found in the resulting string.
 326    This was wrong because it didn't preserve certain URL special
 327    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 328    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 329    whether we considered `+' reserved (it is).  One of these results
 330    is inevitable because by the second step we would lose information
 331    on whether the `+' was originally encoded or not.  Both results
 332    were wrong because in CGI parameters + means space, while %2B means
 333    literal plus.  reencode_escapes correctly translates the above to
 334    "a%2B+b", i.e. returns the original string.
 335
 336    This function uses a modified version of the algorithm originally
 337    proposed by Anon Sricharoenchai:
 338
 339    * Encode all "unsafe" characters, except those that are also
 340      "reserved", to %XX.  See urlchr_table for which characters are
 341      unsafe and reserved.
 342
 343    * Encode the "%" characters not followed by two hex digits to
 344      "%25".
 345
 346    * Pass through all other characters and %XX escapes as-is.  (Up to
 347      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 348      characters, but that was obtrusive and broke some servers.)
 349
 350    Anon's test case:
 351
 352    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 353    ->
 354    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 355
 356    Simpler test cases:
 357
 358    "foo bar"         -> "foo%20bar"
 359    "foo%20bar"       -> "foo%20bar"
 360    "foo %20bar"      -> "foo%20%20bar"
 361    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 362    "foo%25%20bar"    -> "foo%25%20bar"
 363    "foo%2%20bar"     -> "foo%252%20bar"
 364    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 365    "foo%2b+bar"      -> "foo%2b+bar"  */
 366
 367 static char *
 368 reencode_escapes (const char *s)
 369 {
 370   const char *p1;
 371   char *newstr, *p2;
 372   int oldlen, newlen;
 373
 374   int encode_count = 0;
 375
 376   /* First pass: inspect the string to see if there's anything to do,
 377      and to calculate the new length.  */
 378   for (p1 = s; *p1; p1++)
 379     if (char_needs_escaping (p1))
 380       ++encode_count;
 381
 382   if (!encode_count)
 383     /* The string is good as it is. */
 384     return (char *) s;          /* C const model sucks. */
 385
 386   oldlen = p1 - s;
 387   /* Each encoding adds two characters (hex digits).  */
 388   newlen = oldlen + 2 * encode_count;
 389   newstr = xmalloc (newlen + 1);
 390
 391   /* Second pass: copy the string to the destination address, encoding
 392      chars when needed.  */
 393   p1 = s;
 394   p2 = newstr;
 395
 396   while (*p1)
 397     if (char_needs_escaping (p1))
 398       {
 399         unsigned char c = *p1++;
 400         *p2++ = '%';
 401         *p2++ = XNUM_TO_DIGIT (c >> 4);
 402         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 403       }
 404     else
 405       *p2++ = *p1++;
 406
 407   *p2 = '\0';
 408   assert (p2 - newstr == newlen);
 409   return newstr;
 410 }
 411 \f
 412 /* Returns the scheme type if the scheme is supported, or
 413    SCHEME_INVALID if not.  */
 414
 415 enum url_scheme
 416 url_scheme (const char *url)
 417 {
 418   int i;
 419
 420   for (i = 0; supported_schemes[i].leading_string; i++)
 421     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 422                           strlen (supported_schemes[i].leading_string)))
 423       {
 424         if (!(supported_schemes[i].flags & scm_disabled))
 425           return (enum url_scheme) i;
 426         else
 427           return SCHEME_INVALID;
 428       }
 429
 430   return SCHEME_INVALID;
 431 }
 432
 433 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 434
 435 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 436    currently implemented, it returns true if URL begins with
 437    [-+a-zA-Z0-9]+: .  */
 438
 439 bool
 440 url_has_scheme (const char *url)
 441 {
 442   const char *p = url;
 443
 444   /* The first char must be a scheme char. */
 445   if (!*p || !SCHEME_CHAR (*p))
 446     return false;
 447   ++p;
 448   /* Followed by 0 or more scheme chars. */
 449   while (*p && SCHEME_CHAR (*p))
 450     ++p;
 451   /* Terminated by ':'. */
 452   return *p == ':';
 453 }
 454
/* Return the default port recorded for SCHEME in supported_schemes.
   SCHEME is used directly as the table index, without a range check.
   NOTE(review): SCHEME_INVALID presumably selects the terminating
   entry, whose default_port is -1 -- confirm against the enum in
   url.h.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
 460
/* Mark SCHEME as disabled by setting scm_disabled in its
   supported_schemes entry.  From then on url_scheme reports
   SCHEME_INVALID for URLs that use it.  Nothing in this part of the
   file clears the flag again.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].flags |= scm_disabled;
}
 466
 467 /* Skip the username and password, if present in the URL.  The
 468    function should *not* be called with the complete URL, but with the
 469    portion after the scheme.
 470
 471    If no username and password are found, return URL.  */
 472
 473 static const char *
 474 url_skip_credentials (const char *url)
 475 {
 476   /* Look for '@' that comes before terminators, such as '/', '?',
 477      '#', or ';'.  */
 478   const char *p = (const char *)strpbrk (url, "@/?#;");
 479   if (!p || *p != '@')
 480     return url;
 481   return p + 1;
 482 }
 483
 484 /* Parse credentials contained in [BEG, END).  The region is expected
 485    to have come from a URL and is unescaped.  */
 486
 487 static bool
 488 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 489 {
 490   char *colon;
 491   const char *userend;
 492
 493   if (beg == end)
 494     return false;               /* empty user name */
 495
 496   colon = memchr (beg, ':', end - beg);
 497   if (colon == beg)
 498     return false;               /* again empty user name */
 499
 500   if (colon)
 501     {
 502       *passwd = strdupdelim (colon + 1, end);
 503       userend = colon;
 504       url_unescape (*passwd);
 505     }
 506   else
 507     {
 508       *passwd = NULL;
 509       userend = end;
 510     }
 511   *user = strdupdelim (beg, userend);
 512   url_unescape (*user);
 513   return true;
 514 }
 515
 516 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 517    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 518    like this:
 519
 520    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 521    www.foo.com[:port]            -> http://www.foo.com[:port]
 522
 523    FTP shorthands look like this:
 524
 525    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 526    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 527
 528    If the URL needs not or cannot be rewritten, return NULL.  */
 529
 530 char *
 531 rewrite_shorthand_url (const char *url)
 532 {
 533   const char *p;
 534   char *ret;
 535
 536   if (url_scheme (url) != SCHEME_INVALID)
 537     return NULL;
 538
 539   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 540      latter Netscape.  */
 541   p = strpbrk (url, ":/");
 542   if (p == url)
 543     return NULL;
 544
 545   /* If we're looking at "://", it means the URL uses a scheme we
 546      don't support, which may include "https" when compiled without
 547      SSL support.  Don't bogusly rewrite such URLs.  */
 548   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 549     return NULL;
 550
 551   if (p && *p == ':')
 552     {
 553       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 554          special case of http port number ("localhost:10000").  */
 555       int digits = strspn (p + 1, "0123456789");
 556       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 557         goto http;
 558
 559       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 560       ret = aprintf ("ftp://%s", url);
 561       ret[6 + (p - url)] = '/';
 562     }
 563   else
 564     {
 565     http:
 566       /* Just prepend "http://" to URL. */
 567       ret = aprintf ("http://%s", url);
 568     }
 569   return ret;
 570 }
 571 \f
 572 static void split_path (const char *, char **, char **);
 573
 574 /* Like strpbrk, with the exception that it returns the pointer to the
 575    terminating zero (end-of-string aka "eos") if no matching character
 576    is found.  */
 577
 578 static inline char *
 579 strpbrk_or_eos (const char *s, const char *accept)
 580 {
 581   char *p = strpbrk (s, accept);
 582   if (!p)
 583     p = strchr (s, '\0');
 584   return p;
 585 }
 586
 587 /* Turn STR into lowercase; return true if a character was actually
 588    changed. */
 589
 590 static bool
 591 lowercase_str (char *str)
 592 {
 593   bool changed = false;
 594   for (; *str; str++)
 595     if (c_isupper (*str))
 596       {
 597         changed = true;
 598         *str = c_tolower (*str);
 599       }
 600   return changed;
 601 }
 602
 603 static const char *
 604 init_seps (enum url_scheme scheme)
 605 {
 606   static char seps[8] = ":/";
 607   char *p = seps + 2;
 608   int flags = supported_schemes[scheme].flags;
 609
 610   if (flags & scm_has_params)
 611     *p++ = ';';
 612   if (flags & scm_has_query)
 613     *p++ = '?';
 614   if (flags & scm_has_fragment)
 615     *p++ = '#';
 616   *p++ = '\0';
 617   return seps;
 618 }
 619
/* Messages for the error codes that url_parse reports through its
   ERROR argument.  The array is indexed by error code; each PE_* code
   is #defined immediately above its message so that the two cannot
   drift apart.  The strings are wrapped in N_() here and passed
   through _() in url_error.  NOTE(review): N_ presumably only marks
   the string for translation without translating it -- confirm
   against the project's gettext setup.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 638
/* Parse a URL.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values, suitable for url_error).  On
   success *ERROR is left untouched.

   URL itself is never modified or freed.  The string is first passed
   through reencode_escapes, and all the pointers used during parsing
   point into that (possibly freshly allocated) copy.  */
struct url *
url_parse (const char *url, int *error)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Each component is recorded as a [begin, end) pair of pointers
     into URL_ENCODED.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* Result of reencode_escapes; equal to URL when nothing had to be
     encoded, otherwise a separate allocation owned by this
     function.  */
  char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      error_code = PE_UNSUPPORTED_SCHEME;
      goto error;
    }

  url_encoded = reencode_escapes (url);
  p = url_encoded;

  /* Skip the "scheme://" prefix, then the credentials.  If there are
     credentials, uname_e ends up just past the '@'; otherwise
     uname_b == uname_e.  */
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment.  SEPS is advanced one character at a
     time below, so that at every stage it holds only the separators
     that may still follow the component being parsed. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  (strchr also matches the terminating null of
         SEPS, which is what lets the end of the string through.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      /* A regular host name runs up to the first separator.  */
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396.  In that case the default
         port assigned above is kept. */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  SEPS is
     advanced whether or not the part was present, so it must be
     invoked exactly once for every separator init_seps put into the
     string -- hence the flag checks that mirror init_seps below.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every character of the URL must have been consumed by now.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e points past the '@', so uname_e - 1 excludes it.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* Nothing below can fail, so it is safe to start building the
     result.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  /* NOTE(review): when the URL has no '/' after the host[:port],
     path_b and path_e are both still NULL here -- confirm that
     strdupdelim copes with an empty NULL range.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* URL_ENCODED is no longer needed; free it unless it is the
         caller's own string.  */
      if (url_encoded != url)
        xfree ((char *) url_encoded);
    }
  else
    {
      /* URL_ENCODED is usable as-is.  u->url must own its storage, so
         the caller's string is copied, while a string allocated by
         reencode_escapes is simply handed over.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
 887
/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated.

   ERROR_CODE is used as an index into parse_errors.  The range check
   is an assert only, so a build that compiles assertions out performs
   no check at all on the index.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}
 897
 898 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 899    expected to be URL-escaped.
 900
 901    The path is split into directory (the part up to the last slash)
 902    and file (the part after the last slash), which are subsequently
 903    unescaped.  Examples:
 904
 905    PATH                 DIR           FILE
 906    "foo/bar/baz"        "foo/bar"     "baz"
 907    "foo/bar/"           "foo/bar"     ""
 908    "foo"                ""            "foo"
 909    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 910
 911    DIR and FILE are freshly allocated.  */
 912
 913 static void
 914 split_path (const char *path, char **dir, char **file)
 915 {
 916   char *last_slash = strrchr (path, '/');
 917   if (!last_slash)
 918     {
 919       *dir = xstrdup ("");
 920       *file = xstrdup (path);
 921     }
 922   else
 923     {
 924       *dir = strdupdelim (path, last_slash);
 925       *file = xstrdup (last_slash + 1);
 926     }
 927   url_unescape (*dir);
 928   url_unescape (*file);
 929 }
 930
 931 /* Note: URL's "full path" is the path with the query string and
 932    params appended.  The "fragment" (#foo) is intentionally ignored,
 933    but that might be changed.  For example, if the original URL was
 934    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 935    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 936
 937 /* Return the length of the full path, without the terminating
 938    zero.  */
 939
 940 static int
 941 full_path_length (const struct url *url)
 942 {
 943   int len = 0;
 944
 945 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 946
 947   FROB (path);
 948   FROB (params);
 949   FROB (query);
 950
 951 #undef FROB
 952
 953   return len;
 954 }
 955
 956 /* Write out the full path. */
 957
 958 static void
 959 full_path_write (const struct url *url, char *where)
 960 {
 961 #define FROB(el, chr) do {                      \
 962   char *f_el = url->el;                         \
 963   if (f_el) {                                   \
 964     int l = strlen (f_el);                      \
 965     *where++ = chr;                             \
 966     memcpy (where, f_el, l);                    \
 967     where += l;                                 \
 968   }                                             \
 969 } while (0)
 970
 971   FROB (path, '/');
 972   FROB (params, ';');
 973   FROB (query, '?');
 974
 975 #undef FROB
 976 }
 977
 978 /* Public function for getting the "full path".  E.g. if u->path is
 979    "foo/bar" and u->query is "param=value", full_path will be
 980    "/foo/bar?param=value". */
 981
 982 char *
 983 url_full_path (const struct url *url)
 984 {
 985   int length = full_path_length (url);
 986   char *full_path = xmalloc (length + 1);
 987
 988   full_path_write (url, full_path);
 989   full_path[length] = '\0';
 990
 991   return full_path;
 992 }
 993
 994 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 995    escaping of certain characters, such as "/" and ":".  Returns a
 996    count of unescaped chars.  */
 997
 998 static void
 999 unescape_single_char (char *str, char chr)
1000 {
1001   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1002   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1003   char *h = str;                /* hare */
1004   char *t = str;                /* tortoise */
1005   for (; *h; h++, t++)
1006     {
1007       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1008         {
1009           *t = chr;
1010           h += 2;
1011         }
1012       else
1013         *t = *h;
1014     }
1015   *t = '\0';
1016 }
1017
1018 /* Escape unsafe and reserved characters, except for the slash
1019    characters.  */
1020
1021 static char *
1022 url_escape_dir (const char *dir)
1023 {
1024   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1025   if (newdir == dir)
1026     return (char *)dir;
1027
1028   unescape_single_char (newdir, '/');
1029   return newdir;
1030 }
1031
1032 /* Sync u->path and u->url with u->dir and u->file.  Called after
1033    u->file or u->dir have been changed, typically by the FTP code.  */
1034
1035 static void
1036 sync_path (struct url *u)
1037 {
1038   char *newpath, *efile, *edir;
1039
1040   xfree (u->path);
1041
1042   /* u->dir and u->file are not escaped.  URL-escape them before
1043      reassembling them into u->path.  That way, if they contain
1044      separators like '?' or even if u->file contains slashes, the
1045      path will be correctly assembled.  (u->file can contain slashes
1046      if the URL specifies it with %2f, or if an FTP server returns
1047      it.)  */
1048   edir = url_escape_dir (u->dir);
1049   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1050
1051   if (!*edir)
1052     newpath = xstrdup (efile);
1053   else
1054     {
1055       int dirlen = strlen (edir);
1056       int filelen = strlen (efile);
1057
1058       /* Copy "DIR/FILE" to newpath. */
1059       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1060       memcpy (p, edir, dirlen);
1061       p += dirlen;
1062       *p++ = '/';
1063       memcpy (p, efile, filelen);
1064       p += filelen;
1065       *p = '\0';
1066     }
1067
1068   u->path = newpath;
1069
1070   if (edir != u->dir)
1071     xfree (edir);
1072   if (efile != u->file)
1073     xfree (efile);
1074
1075   /* Regenerate u->url as well.  */
1076   xfree (u->url);
1077   u->url = url_string (u, URL_AUTH_SHOW);
1078 }
1079
1080 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1081    This way we can sync u->path and u->url when they get changed.  */
1082
1083 void
1084 url_set_dir (struct url *url, const char *newdir)
1085 {
1086   xfree (url->dir);
1087   url->dir = xstrdup (newdir);
1088   sync_path (url);
1089 }
1090
1091 void
1092 url_set_file (struct url *url, const char *newfile)
1093 {
1094   xfree (url->file);
1095   url->file = xstrdup (newfile);
1096   sync_path (url);
1097 }
1098
1099 void
1100 url_free (struct url *url)
1101 {
1102   xfree (url->host);
1103   xfree (url->path);
1104   xfree (url->url);
1105
1106   xfree_null (url->params);
1107   xfree_null (url->query);
1108   xfree_null (url->fragment);
1109   xfree_null (url->user);
1110   xfree_null (url->passwd);
1111
1112   xfree (url->dir);
1113   xfree (url->file);
1114
1115   xfree (url);
1116 }
1117 \f
1118 /* Create all the necessary directories for PATH (a file).  Calls
1119    make_directory internally.  */
1120 int
1121 mkalldirs (const char *path)
1122 {
1123   const char *p;
1124   char *t;
1125   struct_stat st;
1126   int res;
1127
1128   p = path + strlen (path);
1129   for (; *p != '/' && p != path; p--)
1130     ;
1131
1132   /* Don't create if it's just a file.  */
1133   if ((p == path) && (*p != '/'))
1134     return 0;
1135   t = strdupdelim (path, p);
1136
1137   /* Check whether the directory exists.  */
1138   if ((stat (t, &st) == 0))
1139     {
1140       if (S_ISDIR (st.st_mode))
1141         {
1142           xfree (t);
1143           return 0;
1144         }
1145       else
1146         {
1147           /* If the dir exists as a file name, remove it first.  This
1148              is *only* for Wget to work with buggy old CERN http
1149              servers.  Here is the scenario: When Wget tries to
1150              retrieve a directory without a slash, e.g.
1151              http://foo/bar (bar being a directory), CERN server will
1152              not redirect it too http://foo/bar/ -- it will generate a
1153              directory listing containing links to bar/file1,
1154              bar/file2, etc.  Wget will lose because it saves this
1155              HTML listing to a file `bar', so it cannot create the
1156              directory.  To work around this, if the file of the same
1157              name exists, we just remove it and create the directory
1158              anyway.  */
1159           DEBUGP (("Removing %s because of directory danger!\n", t));
1160           unlink (t);
1161         }
1162     }
1163   res = make_directory (t);
1164   if (res != 0)
1165     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1166   xfree (t);
1167   return res;
1168 }
1169 \f
1170 /* Functions for constructing the file name out of URL components.  */
1171
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in questions, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* size variable handed to DO_REALLOC */
  int tail;                     /* offset in BASE where appending continues */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion, so any
   expression may be passed for them.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1200
/* Copy the characters of STR (without its terminating zero) to the
   tail of DEST, growing DEST first if needed.  NOTICE: the string in
   DEST is left unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  const int len = strlen (str);
  char *out;

  GROW (dest, len);
  /* Take the tail pointer only after GROW, which may have changed
     dest->base.  */
  out = TAIL (dest);
  memcpy (out, str, len);
  TAIL_INCR (dest, len);
}
1212
/* Store the single character CH at the tail of DEST, growing DEST
   first if needed.  Appending 0 this way is how callers
   zero-terminate DEST.  */

static void
append_char (char ch, struct growable *dest)
{
  char *out;

  GROW (dest, 1);
  /* Take the tail pointer only after GROW, which may have changed
     dest->base.  */
  out = TAIL (dest);
  *out = ch;
  TAIL_INCR (dest, 1);
}
1223
 1224 enum {
 1225   filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
 1226   filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
 1227   filechr_control     = 4       /* a control character, e.g. 0-31 */
 1228 };
 1229 /* Nonzero if the table entry for character C has any bit of MASK set.  */
 1230 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
 1231
 1232 /* Shorthands for the table: */
 1233 #define U filechr_not_unix
 1234 #define W filechr_not_windows
 1235 #define C filechr_control
 1236
 1237 #define UW U|W
 1238 #define UWC U|W|C
 1239
 1240 /* Table of characters unsafe under various conditions (see above),
 1241    indexed by character code.  Codes 128-159 are flagged as control
 1242    characters; codes 160-255 carry no flags.
 1243    Arguably we could also claim `%' to be unsafe, since we use it as
 1244    the escape character.  If we ever want to be able to reliably
 1245    translate file name back to URL, this would become crucial.
 1246    Right now, it's better to be minimal in escaping.  */
 1247 static const unsigned char filechr_table[256] =
 1248 {
 1249 UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 1250   C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 1251   C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 1252   C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 1253   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
 1254   0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
 1255   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 1256   0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
 1257   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 1258   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 1259   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 1260   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 1261   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 1262   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 1263   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 1264   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
 1265
 1266   C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
 1267   C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
 1268   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
 1269   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */
 1270
 1271   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
 1272   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
 1273   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
 1274   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
 1275 };
 1276 #undef U
 1277 #undef W
 1278 #undef C
 1279 #undef UW
 1280 #undef UWC
1281
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' unless opt.restrict_files_os is restrict_windows, in
   which case '+' is used because Windows can't handle ':' in file
   names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless opt.restrict_files_os is restrict_windows, in which
   case '@' is used because Windows cannot handle '?' as part of a
   file name.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1292
1293 /* Quote path element, characters in [b, e), as file name, and append
1294    the quoted string to DEST.  Each character is quoted as per
1295    file_unsafe_char and the corresponding table.
1296
1297    If ESCAPED is true, the path element is considered to be
1298    URL-escaped and will be unescaped prior to inspection.  */
1299
1300 static void
1301 append_uri_pathel (const char *b, const char *e, bool escaped,
1302                    struct growable *dest)
1303 {
1304   const char *p;
1305   int quoted, outlen;
1306
1307   int mask;
1308   if (opt.restrict_files_os == restrict_unix)
1309     mask = filechr_not_unix;
1310   else
1311     mask = filechr_not_windows;
1312   if (opt.restrict_files_ctrl)
1313     mask |= filechr_control;
1314
1315   /* Copy [b, e) to PATHEL and URL-unescape it. */
1316   if (escaped)
1317     {
1318       char *unescaped;
1319       BOUNDED_TO_ALLOCA (b, e, unescaped);
1320       url_unescape (unescaped);
1321       b = unescaped;
1322       e = unescaped + strlen (unescaped);
1323     }
1324
1325   /* Defang ".." when found as component of path.  Remember that path
1326      comes from the URL and might contain malicious input.  */
1327   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1328     {
1329       b = "%2E%2E";
1330       e = b + 6;
1331     }
1332
1333   /* Walk the PATHEL string and check how many characters we'll need
1334      to quote.  */
1335   quoted = 0;
1336   for (p = b; p < e; p++)
1337     if (FILE_CHAR_TEST (*p, mask))
1338       ++quoted;
1339
1340   /* Calculate the length of the output string.  e-b is the input
1341      string length.  Each quoted char introduces two additional
1342      characters in the string, hence 2*quoted.  */
1343   outlen = (e - b) + (2 * quoted);
1344   GROW (dest, outlen);
1345
1346   if (!quoted)
1347     {
1348       /* If there's nothing to quote, we can simply append the string
1349          without processing it again.  */
1350       memcpy (TAIL (dest), b, outlen);
1351     }
1352   else
1353     {
1354       char *q = TAIL (dest);
1355       for (p = b; p < e; p++)
1356         {
1357           if (!FILE_CHAR_TEST (*p, mask))
1358             *q++ = *p;
1359           else
1360             {
1361               unsigned char ch = *p;
1362               *q++ = '%';
1363               *q++ = XNUM_TO_DIGIT (ch >> 4);
1364               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1365             }
1366         }
1367       assert (q - TAIL (dest) == outlen);
1368     }
1369
1370   /* Perform inline case transformation if required.  */
1371   if (opt.restrict_files_case == restrict_lowercase
1372       || opt.restrict_files_case == restrict_uppercase)
1373     {
1374       char *q;
1375       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1376         {
1377           if (opt.restrict_files_case == restrict_lowercase)
1378             *q = c_tolower (*q);
1379           else
1380             *q = c_toupper (*q);
1381         }
1382     }
1383
1384   TAIL_INCR (dest, outlen);
1385 }
1386
1387 /* Append to DEST the directory structure that corresponds the
1388    directory part of URL's path.  For example, if the URL is
1389    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1390
1391    Each path element ("dir1" and "dir2" in the above example) is
1392    examined, url-unescaped, and re-escaped as file name element.
1393
1394    Additionally, it cuts as many directories from the path as
1395    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1396    will produce "bar" for the above example.  For 2 or more, it will
1397    produce "".
1398
1399    Each component of the path is quoted for use as file name.  */
1400
1401 static void
1402 append_dir_structure (const struct url *u, struct growable *dest)
1403 {
1404   char *pathel, *next;
1405   int cut = opt.cut_dirs;
1406
1407   /* Go through the path components, de-URL-quote them, and quote them
1408      (if necessary) as file names.  */
1409
1410   pathel = u->path;
1411   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1412     {
1413       if (cut-- > 0)
1414         continue;
1415       if (pathel == next)
1416         /* Ignore empty pathels.  */
1417         continue;
1418
1419       if (dest->tail)
1420         append_char ('/', dest);
1421       append_uri_pathel (pathel, next, true, dest);
1422     }
1423 }
1424
1425 /* Return a unique file name that matches the given URL as good as
1426    possible.  Does not create directories on the file system.  */
1427
1428 char *
1429 url_file_name (const struct url *u)
1430 {
1431   struct growable fnres;        /* stands for "file name result" */
1432
1433   const char *u_file, *u_query;
1434   char *fname, *unique;
1435
1436   fnres.base = NULL;
1437   fnres.size = 0;
1438   fnres.tail = 0;
1439
1440   /* Start with the directory prefix, if specified. */
1441   if (opt.dir_prefix)
1442     append_string (opt.dir_prefix, &fnres);
1443
1444   /* If "dirstruct" is turned on (typically the case with -r), add
1445      the host and port (unless those have been turned off) and
1446      directory structure.  */
1447   if (opt.dirstruct)
1448     {
1449       if (opt.protocol_directories)
1450         {
1451           if (fnres.tail)
1452             append_char ('/', &fnres);
1453           append_string (supported_schemes[u->scheme].name, &fnres);
1454         }
1455       if (opt.add_hostdir)
1456         {
1457           if (fnres.tail)
1458             append_char ('/', &fnres);
1459           if (0 != strcmp (u->host, ".."))
1460             append_string (u->host, &fnres);
1461           else
1462             /* Host name can come from the network; malicious DNS may
1463                allow ".." to be resolved, causing us to write to
1464                "../<file>".  Defang such host names.  */
1465             append_string ("%2E%2E", &fnres);
1466           if (u->port != scheme_default_port (u->scheme))
1467             {
1468               char portstr[24];
1469               number_to_string (portstr, u->port);
1470               append_char (FN_PORT_SEP, &fnres);
1471               append_string (portstr, &fnres);
1472             }
1473         }
1474
1475       append_dir_structure (u, &fnres);
1476     }
1477
1478   /* Add the file name. */
1479   if (fnres.tail)
1480     append_char ('/', &fnres);
1481   u_file = *u->file ? u->file : "index.html";
1482   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1483
1484   /* Append "?query" to the file name. */
1485   u_query = u->query && *u->query ? u->query : NULL;
1486   if (u_query)
1487     {
1488       append_char (FN_QUERY_SEP, &fnres);
1489       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1490     }
1491
1492   /* Zero-terminate the file name. */
1493   append_char ('\0', &fnres);
1494
1495   fname = fnres.base;
1496
1497   /* Check the cases in which the unique extensions are not used:
1498      1) Clobbering is turned off (-nc).
1499      2) Retrieval with regetting.
1500      3) Timestamping is used.
1501      4) Hierarchy is built.
1502
1503      The exception is the case when file does exist and is a
1504      directory (see `mkalldirs' for explanation).  */
1505
1506   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1507       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1508     return fname;
1509
1510   unique = unique_name (fname, true);
1511   if (unique != fname)
1512     xfree (fname);
1513   return unique;
1514 }
1515 \f
/* Resolve the "." and ".." elements of PATH in place.  Returns true
   if PATH was changed, false if it was left exactly as it was.

   The approach is similar in spirit to the one described in rfc1808
   but works in a single pass: an element consisting only of "." is
   dropped, and ".." drops the element before it (or nothing, when
   there is no element left to drop).  Single leading and trailing
   slashes are preserved.

   For example, "a/b/c/./../d/.." becomes "a/b/".  If you change
   anything here, run test_path_simplify to make sure no test case
   got broken.  */

static bool
path_simplify (char *path)
{
  char *src = path;             /* hare: next unread input */
  char *dst = path;             /* tortoise: end of the output so far */
  char *const end = path + strlen (path);

  while (src < end)
    {
      /* SRC is at the beginning of a path element here.  */

      if (src[0] == '.' && (src[1] == '/' || src[1] == '\0'))
        {
          /* "." or "./": drop it.  */
          src += 2;
          continue;
        }

      if (src[0] == '.' && src[1] == '.'
          && (src[2] == '/' || src[2] == '\0'))
        {
          /* ".." or "../": take back the previous output element,
             but never move before the beginning of PATH.  */
          if (dst > path)
            {
              dst--;
              while (dst > path && dst[-1] != '/')
                dst--;
            }
          src += 3;
          continue;
        }

      /* A regular path element, up to and including its slash.  */
      if (dst == src)
        {
          /* Nothing has been dropped so far, so the element is
             already in place; just step over it without writing.  */
          while (src < end && *src != '/')
            src++;
          if (src < end)
            src++;
          dst = src;
        }
      else
        {
          /* Output lags behind input: move the element down.  */
          while (src < end && *src != '/')
            *dst++ = *src++;
          if (src < end)
            *dst++ = *src++;
        }
    }

  /* Equal pointers mean nothing was dropped and nothing was moved.  */
  if (dst == src)
    return false;

  *dst = '\0';
  return true;
}
1588 \f
1589 /* Return the length of URL's path.  Path is considered to be
1590    terminated by one or more of the ?query or ;params or #fragment,
1591    depending on the scheme.  */
1592
1593 static const char *
1594 path_end (const char *url)
1595 {
1596   enum url_scheme scheme = url_scheme (url);
1597   const char *seps;
1598   if (scheme == SCHEME_INVALID)
1599     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1600   /* +2 to ignore the first two separators ':' and '/' */
1601   seps = init_seps (scheme) + 2;
1602   return strpbrk_or_eos (url, seps);
1603 }
1604
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Helper for uri_merge: return a newly allocated, zero-terminated
   string made of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_splice (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';

  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a newly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  All foreseeable cases are meant to be
   handled reasonably, using only minimal URL parsing and no
   knowledge of the specifics of schemes.

   path_simplify is deliberately not called on the result, although
   rfc1738 seems to suggest it: it would complicate the code, and
   url_parse has to simplify the path anyway.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with a scheme of its own replaces BASE entirely.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_splice (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Only the fragment changes; BASE is kept up to its own '#',
         or in full if it has none.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return uri_merge_splice (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: everything from
         (and including) BASE's double slash on is replaced with
         LINK.  */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: everything from (and including)
         the FIRST slash of BASE's path on is replaced with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      bool seen_slash_slash = false;
      const char *start_insert;

      /* Find the first slash; if it opens a double slash, skip that
         pair once and look for the next slash after it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first slash after "//", or the first slash
         altogether.  Keep in mind that LINK begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo" and "foo/bar" */
        /*            ^         ^        */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;
      else
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;

      return uri_merge_splice (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: everything after the last slash of BASE
     (possibly nothing) is replaced with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    bool need_explicit_slash = false;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');
    char *merge;
    int span;

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      start_insert = base;
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host"  */
        /*                      ^  */
        /* The last slash belongs to "://", so BASE has no path.
           Keep all of BASE plus one more byte, which is turned into
           the missing slash below.  */
        start_insert = end + 1;
        need_explicit_slash = true;
      }
    else
      /* example: "whatever/foo/bar" */
      /*                        ^    */
      start_insert = last_slash + 1;

    span = start_insert - base;
    merge = uri_merge_splice (base, span, link, linklength);
    if (need_explicit_slash)
      merge[span - 1] = '/';

    return merge;
  }
}
1799 \f
/* Copy the characters of the string S (without its terminating zero)
   to P and advance P past them.  P must be a modifiable pointer
   lvalue with enough room behind it.

   S is evaluated exactly once, and the temporaries carry an APPEND_
   prefix so that they cannot capture identifiers used in the
   arguments.  */
#define APPEND(p, s) do {                               \
  const char *APPEND_src_ = (s);                        \
  int APPEND_len_ = strlen (APPEND_src_);               \
  memcpy ((p), APPEND_src_, APPEND_len_);               \
  (p) += APPEND_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1811
1812 /* Recreate the URL string from the data in URL.
1813
1814    If HIDE is true (as it is when we're calling this on a URL we plan
1815    to print, but not when calling it to canonicalize a URL for use
1816    within the program), password will be hidden.  Unsafe characters in
1817    the URL will be quoted.  */
1818
1819 char *
1820 url_string (const struct url *url, enum url_auth_mode auth_mode)
1821 {
1822   int size;
1823   char *result, *p;
1824   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1825
1826   int scheme_port = supported_schemes[url->scheme].default_port;
1827   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1828   int fplen = full_path_length (url);
1829
1830   bool brackets_around_host;
1831
1832   assert (scheme_str != NULL);
1833
1834   /* Make sure the user name and password are quoted. */
1835   if (url->user)
1836     {
1837       if (auth_mode != URL_AUTH_HIDE)
1838         {
1839           quoted_user = url_escape_allow_passthrough (url->user);
1840           if (url->passwd)
1841             {
1842               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1843                 quoted_passwd = HIDDEN_PASSWORD;
1844               else
1845                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1846             }
1847         }
1848     }
1849
1850   /* In the unlikely event that the host name contains non-printable
1851      characters, quote it for displaying to the user.  */
1852   quoted_host = url_escape_allow_passthrough (url->host);
1853
1854   /* Undo the quoting of colons that URL escaping performs.  IPv6
1855      addresses may legally contain colons, and in that case must be
1856      placed in square brackets.  */
1857   if (quoted_host != url->host)
1858     unescape_single_char (quoted_host, ':');
1859   brackets_around_host = strchr (quoted_host, ':') != NULL;
1860
1861   size = (strlen (scheme_str)
1862           + strlen (quoted_host)
1863           + (brackets_around_host ? 2 : 0)
1864           + fplen
1865           + 1);
1866   if (url->port != scheme_port)
1867     size += 1 + numdigit (url->port);
1868   if (quoted_user)
1869     {
1870       size += 1 + strlen (quoted_user);
1871       if (quoted_passwd)
1872         size += 1 + strlen (quoted_passwd);
1873     }
1874
1875   p = result = xmalloc (size);
1876
1877   APPEND (p, scheme_str);
1878   if (quoted_user)
1879     {
1880       APPEND (p, quoted_user);
1881       if (quoted_passwd)
1882         {
1883           *p++ = ':';
1884           APPEND (p, quoted_passwd);
1885         }
1886       *p++ = '@';
1887     }
1888
1889   if (brackets_around_host)
1890     *p++ = '[';
1891   APPEND (p, quoted_host);
1892   if (brackets_around_host)
1893     *p++ = ']';
1894   if (url->port != scheme_port)
1895     {
1896       *p++ = ':';
1897       p = number_to_string (p, url->port);
1898     }
1899
1900   full_path_write (url, p);
1901   p += fplen;
1902   *p++ = '\0';
1903
1904   assert (p - result == size);
1905
1906   if (quoted_user && quoted_user != url->user)
1907     xfree (quoted_user);
1908   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1909       && quoted_passwd != url->passwd)
1910     xfree (quoted_passwd);
1911   if (quoted_host != url->host)
1912     xfree (quoted_host);
1913
1914   return result;
1915 }
1916 \f
1917 /* Return true if scheme a is similar to scheme b.
1918
1919    Schemes are similar if they are equal.  If SSL is supported, schemes
1920    are also similar if one is http (SCHEME_HTTP) and the other is https
1921    (SCHEME_HTTPS).  */
1922 bool
1923 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1924 {
1925   if (a == b)
1926     return true;
1927 #ifdef HAVE_SSL
1928   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1929       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1930     return true;
1931 #endif
1932   return false;
1933 }
1934 \f
/* Read one logical character from the start of the non-empty string
   STR and store it in *C.  Return the number of bytes of STR that the
   character occupies.

   A "%XX" sequence with two hexadecimal digits is decoded and counts
   as 3 bytes, unless the decoded character satisfies
   URL_RESERVED_CHAR; in that case the '%' itself is stored and only 1
   byte is consumed, so the escape is compared literally by the
   caller.  A '%' that is not followed by two hexadecimal digits is
   likewise stored as a plain '%'.  Any other character is stored
   as-is.

   The return value is always 1 or 3.  An earlier "return 0" error
   path was unreachable: it tested for STR[2] being NUL only after
   STR[2] had been found to be a hexadecimal digit.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The && stops at the first non-digit, so a NUL right after the '%'
     prevents reading past the end of the string.  */
  if (c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);
      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
    }

  /* Malformed escape, or an escaped reserved character: hand back the
     '%' alone.  */
  *c = '%';
  return 1;
}
1972
/* Compare the URL strings U1 and U2 character by character, decoding
   each side with getchar_from_escaped_string and ignoring case via
   c_tolower.  Return true only when both strings are exhausted at
   the same time without a mismatch.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *a, *b;
  assert(u1 && u2);

  a = u1;
  b = u2;

  while (*a != '\0' && *b != '\0')
    {
      char ca, cb;
      int na, nb;

      na = getchar_from_escaped_string (a, &ca);
      if (na == 0)
        break;
      nb = getchar_from_escaped_string (b, &cb);
      if (nb == 0)
        break;
      if (c_tolower (ca) != c_tolower (cb))
        break;

      a += na;
      b += nb;
    }

  /* Equal only if neither string has anything left over. */
  return *a == '\0' && *b == '\0';
}
1995 \f
1996 #if 0
1997 /* Debugging and testing support for path_simplify. */
1998
/* Debugger helper: duplicate PATH, run path_simplify on the
   duplicate, and return it.  PATH itself is left untouched.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
2008
/* Run path_simplify on a copy of TEST and print a diagnostic if the
   outcome differs from EXPECTED_RESULT, or if the "was modified"
   return value differs from EXPECTED_CHANGE.  Nothing is printed
   when both match.  */
static void
run_test (char *test, char *expected_result, bool expected_change)
{
  char *work = xstrdup (test);
  bool changed = path_simplify (work);
  bool same_text = (strcmp (work, expected_result) == 0);

  if (!same_text)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, work);

  if (changed != expected_change)
    {
      if (!expected_change)
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
    }

  xfree (work);
}
2031
/* Feed a fixed table of paths through run_test.  Each entry gives
   the input path, the simplified path that is expected, and whether
   path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *simplified;
    bool changes;
  } cases[] = {
    { "",                       "",             false },
    { ".",                      "",             true },
    { "./",                     "",             true },
    { "..",                     "",             true },
    { "../",                    "",             true },
    { "foo",                    "foo",          false },
    { "foo/bar",                "foo/bar",      false },
    { "foo///bar",              "foo///bar",    false },
    { "foo/.",                  "foo/",         true },
    { "foo/./",                 "foo/",         true },
    { "foo./",                  "foo./",        false },
    { "foo/../bar",             "bar",          true },
    { "foo/../bar/",            "bar/",         true },
    { "foo/bar/..",             "foo/",         true },
    { "foo/bar/../x",           "foo/x",        true },
    { "foo/bar/../x/",          "foo/x/",       true },
    { "foo/..",                 "",             true },
    { "foo/../..",              "",             true },
    { "foo/../../..",           "",             true },
    { "foo/../../bar/../../baz", "baz",         true },
    { "a/b/../../c",            "c",            true },
    { "./a/../b",               "b",            true }
  };
  int k;

  for (k = 0; k < countof (cases); k++)
    run_test (cases[k].input, cases[k].simplified, cases[k].changes);
}
2072 #endif
2073 \f
2074 #ifdef TESTING
2075
2076 const char *
2077 test_append_uri_pathel()
2078 {
2079   int i;
2080   struct {
2081     char *original_url;
2082     char *input;
2083     bool escaped;
2084     char *expected_result;
2085   } test_array[] = {
2086     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2087   };
2088
2089   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2090     {
2091       struct growable dest;
2092       const char *p = test_array[i].input;
2093
2094       memset (&dest, 0, sizeof (dest));
2095
2096       append_string (test_array[i].original_url, &dest);
2097       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2098       append_char ('\0', &dest);
2099
2100       mu_assert ("test_append_uri_pathel: wrong result",
2101                  strcmp (dest.base, test_array[i].expected_result) == 0);
2102     }
2103
2104   return NULL;
2105 }
2106
/* Unit test for are_urls_equal: each table entry holds two URL
   strings and the result the comparison is expected to give.  Every
   entry is checked with mu_assert; NULL is returned once all entries
   have been checked.  */
const char*
test_are_urls_equal()
{
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } cases[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };
  const size_t ncases = sizeof (cases) / sizeof (cases[0]);
  size_t k;

  for (k = 0; k < ncases; ++k)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (cases[k].url1, cases[k].url2) == cases[k].expected_result);
    }

  return NULL;
}
2132
2133 #endif /* TESTING */
2134
2135 /*
2136  * vim: et ts=2 sw=2
2137  */
2138