sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 #ifdef TESTING
  47 #include "test.h"
  48 #endif
  49
  50 enum {
  51   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  52   scm_has_params = 2,           /* whether scheme has ;params */
  53   scm_has_query = 4,            /* whether scheme has ?query */
  54   scm_has_fragment = 8          /* whether scheme has #fragment */
  55 };
  56
/* Static description of one URL scheme; one entry per scheme lives in
   the supported_schemes table.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme matches it case-insensitively against the start of a
     URL, and url_parse skips strlen (leading_string) bytes to reach
     the authority part.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Bitwise OR of the scm_* flags.  Not constant: scheme_disable sets
     scm_disabled here at run time.  */
  int flags;
};
  68
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, and scheme_default_port/scheme_disable
   index this array by that enum, so the entry order presumably has to
   mirror the enum declared in url.h -- confirm before reordering.

   The table is terminated by an entry whose leading_string is NULL.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  81
  82 /* Forward declarations: */
  83
  84 static bool path_simplify (enum url_scheme, char *);
  85 \f
  86 /* Support for escaping and unescaping of URL strings.  */
  87
  88 /* Table of "reserved" and "unsafe" characters.  Those terms are
  89    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  90    specs, but the general idea remains.
  91
  92    A reserved character is the one that you can't decode without
  93    changing the meaning of the URL.  For example, you can't decode
  94    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  95    path components is different.  Non-reserved characters can be
  96    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  97    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  98    as recommended by rfc2396, and minus "~", which is very frequently
  99    used (and sometimes unrecognized as %7E by broken servers).
 100
 101    An unsafe character is the one that should be encoded when URLs are
 102    placed in foreign environments.  E.g. space and newline are unsafe
 103    in HTTP contexts because HTTP uses them as separator and line
 104    terminator, so they must be encoded to %20 and %0A respectively.
 105    "*" is unsafe in shell context, etc.
 106
 107    We determine whether a character is unsafe through static table
 108    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 109
 110 enum {
 111   /* rfc1738 reserved chars + "$" and ",".  */
 112   urlchr_reserved = 1,
 113
 114   /* rfc1738 unsafe chars, plus non-printables.  */
 115   urlchr_unsafe   = 2
 116 };
 117
 118 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 119 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 120 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 121
 122 /* Shorthands for the table: */
 123 #define R  urlchr_reserved
 124 #define U  urlchr_unsafe
 125 #define RU R|U
 126
 127 static const unsigned char urlchr_table[256] =
 128 {
 129   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 130   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 131   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 133   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 134   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 136   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 137  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 140   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 141   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 142   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 143   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 144   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 145
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150
 151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 153   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155 };
 156 #undef R
 157 #undef U
 158 #undef RU
 159
 160 /* URL-unescape the string S.
 161
 162    This is done by transforming the sequences "%HH" to the character
 163    represented by the hexadecimal digits HH.  If % is not followed by
 164    two hexadecimal digits, it is inserted literally.
 165
 166    The transformation is done in place.  If you need the original
 167    string intact, make a copy before calling this function.  */
 168
 169 static void
 170 url_unescape (char *s)
 171 {
 172   char *t = s;                  /* t - tortoise */
 173   char *h = s;                  /* h - hare     */
 174
 175   for (; *h; h++, t++)
 176     {
 177       if (*h != '%')
 178         {
 179         copychar:
 180           *t = *h;
 181         }
 182       else
 183         {
 184           char c;
 185           /* Do nothing if '%' is not followed by two hex digits. */
 186           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 187             goto copychar;
 188           c = X2DIGITS_TO_NUM (h[1], h[2]);
 189           /* Don't unescape %00 because there is no way to insert it
 190              into a C string without effectively truncating it. */
 191           if (c == '\0')
 192             goto copychar;
 193           *t = c;
 194           h += 2;
 195         }
 196     }
 197   *t = '\0';
 198 }
 199
 200 /* The core of url_escape_* functions.  Escapes the characters that
 201    match the provided mask in urlchr_table.
 202
 203    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 204    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 205    allocated string will be returned in all cases.  */
 206
 207 static char *
 208 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 209 {
 210   const char *p1;
 211   char *p2, *newstr;
 212   int newlen;
 213   int addition = 0;
 214
 215   for (p1 = s; *p1; p1++)
 216     if (urlchr_test (*p1, mask))
 217       addition += 2;            /* Two more characters (hex digits) */
 218
 219   if (!addition)
 220     return allow_passthrough ? (char *)s : xstrdup (s);
 221
 222   newlen = (p1 - s) + addition;
 223   newstr = xmalloc (newlen + 1);
 224
 225   p1 = s;
 226   p2 = newstr;
 227   while (*p1)
 228     {
 229       /* Quote the characters that match the test mask. */
 230       if (urlchr_test (*p1, mask))
 231         {
 232           unsigned char c = *p1++;
 233           *p2++ = '%';
 234           *p2++ = XNUM_TO_DIGIT (c >> 4);
 235           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 236         }
 237       else
 238         *p2++ = *p1++;
 239     }
 240   assert (p2 - newstr == newlen);
 241   *p2 = '\0';
 242
 243   return newstr;
 244 }
 245
 246 /* URL-escape the unsafe characters (see urlchr_table) in a given
 247    string, returning a freshly allocated string.  */
 248
 249 char *
 250 url_escape (const char *s)
 251 {
 252   return url_escape_1 (s, urlchr_unsafe, false);
 253 }
 254
 255 /* URL-escape the unsafe characters (see urlchr_table) in a given
 256    string.  If no characters are unsafe, S is returned.  */
 257
 258 static char *
 259 url_escape_allow_passthrough (const char *s)
 260 {
 261   return url_escape_1 (s, urlchr_unsafe, true);
 262 }
 263 \f
 264 /* Decide whether the char at position P needs to be encoded.  (It is
 265    not enough to pass a single char *P because the function may need
 266    to inspect the surrounding context.)
 267
 268    Return true if the char should be escaped as %XX, false otherwise.  */
 269
 270 static inline bool
 271 char_needs_escaping (const char *p)
 272 {
 273   if (*p == '%')
 274     {
 275       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 276         return false;
 277       else
 278         /* Garbled %.. sequence: encode `%'. */
 279         return true;
 280     }
 281   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 282     return true;
 283   else
 284     return false;
 285 }
 286
 287 /* Translate a %-escaped (but possibly non-conformant) input string S
 288    into a %-escaped (and conformant) output string.  If no characters
 289    are encoded or decoded, return the same string S; otherwise, return
 290    a freshly allocated string with the new contents.
 291
 292    After a URL has been run through this function, the protocols that
 293    use `%' as the quote character can use the resulting string as-is,
 294    while those that don't can use url_unescape to get to the intended
 295    data.  This function is stable: once the input is transformed,
 296    further transformations of the result yield the same output.
 297
 298    Let's discuss why this function is needed.
 299
 300    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 301    a raw space character would mess up the HTTP request, it needs to
 302    be quoted, like this:
 303
 304        GET /abc%20def HTTP/1.0
 305
 306    It would appear that the unsafe chars need to be quoted, for
 307    example with url_escape.  But what if we're requested to download
 308    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 309    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 310    part of URL syntax, "%20" is the correct way to denote a literal
 311    space on the Wget command line.  This leads to the conclusion that
 312    in that case Wget should not call url_escape, but leave the `%20'
 313    as is.  This is clearly contradictory, but it only gets worse.
 314
 315    What if the requested URI is `abc%20 def'?  If we call url_escape,
 316    we end up with `/abc%2520%20def', which is almost certainly not
 317    intended.  If we don't call url_escape, we are left with the
 318    embedded space and cannot complete the request.  What the user
 319    meant was for Wget to request `/abc%20%20def', and this is where
 320    reencode_escapes kicks in.
 321
 322    Wget used to solve this by first decoding %-quotes, and then
 323    encoding all the "unsafe" characters found in the resulting string.
 324    This was wrong because it didn't preserve certain URL special
 325    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 326    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 327    whether we considered `+' reserved (it is).  One of these results
 328    is inevitable because by the second step we would lose information
 329    on whether the `+' was originally encoded or not.  Both results
 330    were wrong because in CGI parameters + means space, while %2B means
 331    literal plus.  reencode_escapes correctly translates the above to
 332    "a%2B+b", i.e. returns the original string.
 333
 334    This function uses a modified version of the algorithm originally
 335    proposed by Anon Sricharoenchai:
 336
 337    * Encode all "unsafe" characters, except those that are also
 338      "reserved", to %XX.  See urlchr_table for which characters are
 339      unsafe and reserved.
 340
 341    * Encode the "%" characters not followed by two hex digits to
 342      "%25".
 343
 344    * Pass through all other characters and %XX escapes as-is.  (Up to
 345      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 346      characters, but that was obtrusive and broke some servers.)
 347
 348    Anon's test case:
 349
 350    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 351    ->
 352    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 353
 354    Simpler test cases:
 355
 356    "foo bar"         -> "foo%20bar"
 357    "foo%20bar"       -> "foo%20bar"
 358    "foo %20bar"      -> "foo%20%20bar"
 359    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 360    "foo%25%20bar"    -> "foo%25%20bar"
 361    "foo%2%20bar"     -> "foo%252%20bar"
 362    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 363    "foo%2b+bar"      -> "foo%2b+bar"  */
 364
 365 static char *
 366 reencode_escapes (const char *s)
 367 {
 368   const char *p1;
 369   char *newstr, *p2;
 370   int oldlen, newlen;
 371
 372   int encode_count = 0;
 373
 374   /* First pass: inspect the string to see if there's anything to do,
 375      and to calculate the new length.  */
 376   for (p1 = s; *p1; p1++)
 377     if (char_needs_escaping (p1))
 378       ++encode_count;
 379
 380   if (!encode_count)
 381     /* The string is good as it is. */
 382     return (char *) s;          /* C const model sucks. */
 383
 384   oldlen = p1 - s;
 385   /* Each encoding adds two characters (hex digits).  */
 386   newlen = oldlen + 2 * encode_count;
 387   newstr = xmalloc (newlen + 1);
 388
 389   /* Second pass: copy the string to the destination address, encoding
 390      chars when needed.  */
 391   p1 = s;
 392   p2 = newstr;
 393
 394   while (*p1)
 395     if (char_needs_escaping (p1))
 396       {
 397         unsigned char c = *p1++;
 398         *p2++ = '%';
 399         *p2++ = XNUM_TO_DIGIT (c >> 4);
 400         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 401       }
 402     else
 403       *p2++ = *p1++;
 404
 405   *p2 = '\0';
 406   assert (p2 - newstr == newlen);
 407   return newstr;
 408 }
 409 \f
 410 /* Returns the scheme type if the scheme is supported, or
 411    SCHEME_INVALID if not.  */
 412
 413 enum url_scheme
 414 url_scheme (const char *url)
 415 {
 416   int i;
 417
 418   for (i = 0; supported_schemes[i].leading_string; i++)
 419     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 420                           strlen (supported_schemes[i].leading_string)))
 421       {
 422         if (!(supported_schemes[i].flags & scm_disabled))
 423           return (enum url_scheme) i;
 424         else
 425           return SCHEME_INVALID;
 426       }
 427
 428   return SCHEME_INVALID;
 429 }
 430
 431 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 432
 433 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 434    currently implemented, it returns true if URL begins with
 435    [-+a-zA-Z0-9]+: .  */
 436
 437 bool
 438 url_has_scheme (const char *url)
 439 {
 440   const char *p = url;
 441
 442   /* The first char must be a scheme char. */
 443   if (!*p || !SCHEME_CHAR (*p))
 444     return false;
 445   ++p;
 446   /* Followed by 0 or more scheme chars. */
 447   while (*p && SCHEME_CHAR (*p))
 448     ++p;
 449   /* Terminated by ':'. */
 450   return *p == ':';
 451 }
 452
 453 int
 454 scheme_default_port (enum url_scheme scheme)
 455 {
 456   return supported_schemes[scheme].default_port;
 457 }
 458
 459 void
 460 scheme_disable (enum url_scheme scheme)
 461 {
 462   supported_schemes[scheme].flags |= scm_disabled;
 463 }
 464
 465 /* Skip the username and password, if present in the URL.  The
 466    function should *not* be called with the complete URL, but with the
 467    portion after the scheme.
 468
 469    If no username and password are found, return URL.  */
 470
 471 static const char *
 472 url_skip_credentials (const char *url)
 473 {
 474   /* Look for '@' that comes before terminators, such as '/', '?',
 475      '#', or ';'.  */
 476   const char *p = (const char *)strpbrk (url, "@/?#;");
 477   if (!p || *p != '@')
 478     return url;
 479   return p + 1;
 480 }
 481
 482 /* Parse credentials contained in [BEG, END).  The region is expected
 483    to have come from a URL and is unescaped.  */
 484
 485 static bool
 486 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 487 {
 488   char *colon;
 489   const char *userend;
 490
 491   if (beg == end)
 492     return false;               /* empty user name */
 493
 494   colon = memchr (beg, ':', end - beg);
 495   if (colon == beg)
 496     return false;               /* again empty user name */
 497
 498   if (colon)
 499     {
 500       *passwd = strdupdelim (colon + 1, end);
 501       userend = colon;
 502       url_unescape (*passwd);
 503     }
 504   else
 505     {
 506       *passwd = NULL;
 507       userend = end;
 508     }
 509   *user = strdupdelim (beg, userend);
 510   url_unescape (*user);
 511   return true;
 512 }
 513
 514 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 515    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 516    like this:
 517
 518    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 519    www.foo.com[:port]            -> http://www.foo.com[:port]
 520
 521    FTP shorthands look like this:
 522
 523    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 524    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 525
 526    If the URL needs not or cannot be rewritten, return NULL.  */
 527
 528 char *
 529 rewrite_shorthand_url (const char *url)
 530 {
 531   const char *p;
 532   char *ret;
 533
 534   if (url_scheme (url) != SCHEME_INVALID)
 535     return NULL;
 536
 537   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 538      latter Netscape.  */
 539   p = strpbrk (url, ":/");
 540   if (p == url)
 541     return NULL;
 542
 543   /* If we're looking at "://", it means the URL uses a scheme we
 544      don't support, which may include "https" when compiled without
 545      SSL support.  Don't bogusly rewrite such URLs.  */
 546   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 547     return NULL;
 548
 549   if (p && *p == ':')
 550     {
 551       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 552          special case of http port number ("localhost:10000").  */
 553       int digits = strspn (p + 1, "0123456789");
 554       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 555         goto http;
 556
 557       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 558       ret = aprintf ("ftp://%s", url);
 559       ret[6 + (p - url)] = '/';
 560     }
 561   else
 562     {
 563     http:
 564       /* Just prepend "http://" to URL. */
 565       ret = aprintf ("http://%s", url);
 566     }
 567   return ret;
 568 }
 569 \f
 570 static void split_path (const char *, char **, char **);
 571
 572 /* Like strpbrk, with the exception that it returns the pointer to the
 573    terminating zero (end-of-string aka "eos") if no matching character
 574    is found.  */
 575
 576 static inline char *
 577 strpbrk_or_eos (const char *s, const char *accept)
 578 {
 579   char *p = strpbrk (s, accept);
 580   if (!p)
 581     p = strchr (s, '\0');
 582   return p;
 583 }
 584
 585 /* Turn STR into lowercase; return true if a character was actually
 586    changed. */
 587
 588 static bool
 589 lowercase_str (char *str)
 590 {
 591   bool changed = false;
 592   for (; *str; str++)
 593     if (c_isupper (*str))
 594       {
 595         changed = true;
 596         *str = c_tolower (*str);
 597       }
 598   return changed;
 599 }
 600
 601 static const char *
 602 init_seps (enum url_scheme scheme)
 603 {
 604   static char seps[8] = ":/";
 605   char *p = seps + 2;
 606   int flags = supported_schemes[scheme].flags;
 607
 608   if (flags & scm_has_params)
 609     *p++ = ';';
 610   if (flags & scm_has_query)
 611     *p++ = '?';
 612   if (flags & scm_has_fragment)
 613     *p++ = '#';
 614   *p++ = '\0';
 615   return seps;
 616 }
 617
/* Messages for the error codes reported by url_parse.  Each PE_*
   constant is defined immediately before its message and equals the
   index of that message in the array, so a new error must be added as
   a #define/message pair with the next number.  url_error asserts
   that a code lies within the array before using it.

   The strings are wrapped in N_() here and passed through _() when
   url_error looks them up.  The PE_UNSUPPORTED_SCHEME message is used
   as a format string there, with the scheme name as argument for its
   %s.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 636
 637 /* Parse a URL.
 638
 639    Return a new struct url if successful, NULL on error.  In case of
 640    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 641    error code. */
 642 struct url *
 643 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 644 {
 645   struct url *u;
 646   const char *p;
 647   bool path_modified, host_modified;
 648
 649   enum url_scheme scheme;
 650   const char *seps;
 651
 652   const char *uname_b,     *uname_e;
 653   const char *host_b,      *host_e;
 654   const char *path_b,      *path_e;
 655   const char *params_b,    *params_e;
 656   const char *query_b,     *query_e;
 657   const char *fragment_b,  *fragment_e;
 658
 659   int port;
 660   char *user = NULL, *passwd = NULL;
 661
 662   char *url_encoded = NULL, *new_url = NULL;
 663
 664   int error_code;
 665
 666   scheme = url_scheme (url);
 667   if (scheme == SCHEME_INVALID)
 668     {
 669       error_code = PE_UNSUPPORTED_SCHEME;
 670       goto error;
 671     }
 672
 673   if (iri && iri->utf8_encode)
 674     {
 675       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
 676       if (!iri->utf8_encode)
 677         new_url = NULL;
 678       else
 679         iri->orig_url = xstrdup (url);
 680     }
 681
 682   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
 683   if (percent_encode)
 684     url_encoded = reencode_escapes (new_url ? new_url : url);
 685   else
 686      url_encoded = new_url ? new_url : url;
 687
 688   p = url_encoded;
 689
 690   if (new_url && url_encoded != new_url)
 691     xfree (new_url);
 692
 693   p += strlen (supported_schemes[scheme].leading_string);
 694   uname_b = p;
 695   p = url_skip_credentials (p);
 696   uname_e = p;
 697
 698   /* scheme://user:pass@host[:port]... */
 699   /*                    ^              */
 700
 701   /* We attempt to break down the URL into the components path,
 702      params, query, and fragment.  They are ordered like this:
 703
 704        scheme://host[:port][/path][;params][?query][#fragment]  */
 705
 706   path_b     = path_e     = NULL;
 707   params_b   = params_e   = NULL;
 708   query_b    = query_e    = NULL;
 709   fragment_b = fragment_e = NULL;
 710
 711   /* Initialize separators for optional parts of URL, depending on the
 712      scheme.  For example, FTP has params, and HTTP and HTTPS have
 713      query string and fragment. */
 714   seps = init_seps (scheme);
 715
 716   host_b = p;
 717
 718   if (*p == '[')
 719     {
 720       /* Handle IPv6 address inside square brackets.  Ideally we'd
 721          just look for the terminating ']', but rfc2732 mandates
 722          rejecting invalid IPv6 addresses.  */
 723
 724       /* The address begins after '['. */
 725       host_b = p + 1;
 726       host_e = strchr (host_b, ']');
 727
 728       if (!host_e)
 729         {
 730           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 731           goto error;
 732         }
 733
 734 #ifdef ENABLE_IPV6
 735       /* Check if the IPv6 address is valid. */
 736       if (!is_valid_ipv6_address(host_b, host_e))
 737         {
 738           error_code = PE_INVALID_IPV6_ADDRESS;
 739           goto error;
 740         }
 741
 742       /* Continue parsing after the closing ']'. */
 743       p = host_e + 1;
 744 #else
 745       error_code = PE_IPV6_NOT_SUPPORTED;
 746       goto error;
 747 #endif
 748
 749       /* The closing bracket must be followed by a separator or by the
 750          null char.  */
 751       /* http://[::1]... */
 752       /*             ^   */
 753       if (!strchr (seps, *p))
 754         {
 755           /* Trailing garbage after []-delimited IPv6 address. */
 756           error_code = PE_INVALID_HOST_NAME;
 757           goto error;
 758         }
 759     }
 760   else
 761     {
 762       p = strpbrk_or_eos (p, seps);
 763       host_e = p;
 764     }
 765   ++seps;                       /* advance to '/' */
 766
 767   if (host_b == host_e)
 768     {
 769       error_code = PE_INVALID_HOST_NAME;
 770       goto error;
 771     }
 772
 773   port = scheme_default_port (scheme);
 774   if (*p == ':')
 775     {
 776       const char *port_b, *port_e, *pp;
 777
 778       /* scheme://host:port/tralala */
 779       /*              ^             */
 780       ++p;
 781       port_b = p;
 782       p = strpbrk_or_eos (p, seps);
 783       port_e = p;
 784
 785       /* Allow empty port, as per rfc2396. */
 786       if (port_b != port_e)
 787         for (port = 0, pp = port_b; pp < port_e; pp++)
 788           {
 789             if (!c_isdigit (*pp))
 790               {
 791                 /* http://host:12randomgarbage/blah */
 792                 /*               ^                  */
 793                 error_code = PE_BAD_PORT_NUMBER;
 794                 goto error;
 795               }
 796             port = 10 * port + (*pp - '0');
 797             /* Check for too large port numbers here, before we have
 798                a chance to overflow on bogus port values.  */
 799             if (port > 0xffff)
 800               {
 801                 error_code = PE_BAD_PORT_NUMBER;
 802                 goto error;
 803               }
 804           }
 805     }
 806   /* Advance to the first separator *after* '/' (either ';' or '?',
 807      depending on the scheme).  */
 808   ++seps;
 809
 810   /* Get the optional parts of URL, each part being delimited by
 811      current location and the position of the next separator.  */
 812 #define GET_URL_PART(sepchar, var) do {                         \
 813   if (*p == sepchar)                                            \
 814     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 815   ++seps;                                                       \
 816 } while (0)
 817
 818   GET_URL_PART ('/', path);
 819   if (supported_schemes[scheme].flags & scm_has_params)
 820     GET_URL_PART (';', params);
 821   if (supported_schemes[scheme].flags & scm_has_query)
 822     GET_URL_PART ('?', query);
 823   if (supported_schemes[scheme].flags & scm_has_fragment)
 824     GET_URL_PART ('#', fragment);
 825
 826 #undef GET_URL_PART
 827   assert (*p == 0);
 828
 829   if (uname_b != uname_e)
 830     {
 831       /* http://user:pass@host */
 832       /*        ^         ^    */
 833       /*     uname_b   uname_e */
 834       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 835         {
 836           error_code = PE_INVALID_USER_NAME;
 837           goto error;
 838         }
 839     }
 840
 841   u = xnew0 (struct url);
 842   u->scheme = scheme;
 843   u->host   = strdupdelim (host_b, host_e);
 844   u->port   = port;
 845   u->user   = user;
 846   u->passwd = passwd;
 847
 848   u->path = strdupdelim (path_b, path_e);
 849   path_modified = path_simplify (scheme, u->path);
 850   split_path (u->path, &u->dir, &u->file);
 851
 852   host_modified = lowercase_str (u->host);
 853
 854   /* Decode %HH sequences in host name.  This is important not so much
 855      to support %HH sequences in host names (which other browser
 856      don't), but to support binary characters (which will have been
 857      converted to %HH by reencode_escapes).  */
 858   if (strchr (u->host, '%'))
 859     {
 860       url_unescape (u->host);
 861       host_modified = true;
 862
 863       /* Apply IDNA regardless of iri->utf8_encode status */
 864       if (opt.enable_iri && iri)
 865         {
 866           char *new = idn_encode (iri, u->host);
 867           if (new)
 868             {
 869               xfree (u->host);
 870               u->host = new;
 871               host_modified = true;
 872             }
 873         }
 874     }
 875
 876   if (params_b)
 877     u->params = strdupdelim (params_b, params_e);
 878   if (query_b)
 879     u->query = strdupdelim (query_b, query_e);
 880   if (fragment_b)
 881     u->fragment = strdupdelim (fragment_b, fragment_e);
 882
 883   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 884     {
 885       /* If we suspect that a transformation has rendered what
 886          url_string might return different from URL_ENCODED, rebuild
 887          u->url using url_string.  */
 888       u->url = url_string (u, URL_AUTH_SHOW);
 889
 890       if (url_encoded != url)
 891         xfree ((char *) url_encoded);
 892     }
 893   else
 894     {
 895       if (url_encoded == url)
 896         u->url = xstrdup (url);
 897       else
 898         u->url = url_encoded;
 899     }
 900
 901   return u;
 902
 903  error:
 904   /* Cleanup in case of error: */
 905   if (url_encoded && url_encoded != url)
 906     xfree (url_encoded);
 907
 908   /* Transmit the error code to the caller, if the caller wants to
 909      know.  */
 910   if (error)
 911     *error = error_code;
 912   return NULL;
 913 }
 914
 915 /* Return the error message string from ERROR_CODE, which should have
 916    been retrieved from url_parse.  The error message is translated.  */
 917
 918 char *
 919 url_error (const char *url, int error_code)
 920 {
 921   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 922
 923   if (error_code == PE_UNSUPPORTED_SCHEME)
 924     {
 925       char *error, *p;
 926       char *scheme = xstrdup (url);
 927       assert (url_has_scheme (url));
 928
 929       if ((p = strchr (scheme, ':')))
 930         *p = '\0';
 931       if (!strcasecmp (scheme, "https"))
 932         asprintf (&error, _("HTTPS support not compiled in"));
 933       else
 934         asprintf (&error, _(parse_errors[error_code]), quote (scheme));
 935       xfree (scheme);
 936
 937       return error;
 938     }
 939   else
 940     return xstrdup (_(parse_errors[error_code]));
 941 }
 942
 943 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 944    expected to be URL-escaped.
 945
 946    The path is split into directory (the part up to the last slash)
 947    and file (the part after the last slash), which are subsequently
 948    unescaped.  Examples:
 949
 950    PATH                 DIR           FILE
 951    "foo/bar/baz"        "foo/bar"     "baz"
 952    "foo/bar/"           "foo/bar"     ""
 953    "foo"                ""            "foo"
 954    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 955
 956    DIR and FILE are freshly allocated.  */
 957
 958 static void
 959 split_path (const char *path, char **dir, char **file)
 960 {
 961   char *last_slash = strrchr (path, '/');
 962   if (!last_slash)
 963     {
 964       *dir = xstrdup ("");
 965       *file = xstrdup (path);
 966     }
 967   else
 968     {
 969       *dir = strdupdelim (path, last_slash);
 970       *file = xstrdup (last_slash + 1);
 971     }
 972   url_unescape (*dir);
 973   url_unescape (*file);
 974 }
 975
 976 /* Note: URL's "full path" is the path with the query string and
 977    params appended.  The "fragment" (#foo) is intentionally ignored,
 978    but that might be changed.  For example, if the original URL was
 979    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 980    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 981
 982 /* Return the length of the full path, without the terminating
 983    zero.  */
 984
 985 static int
 986 full_path_length (const struct url *url)
 987 {
 988   int len = 0;
 989
 990 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 991
 992   FROB (path);
 993   FROB (params);
 994   FROB (query);
 995
 996 #undef FROB
 997
 998   return len;
 999 }
1000
1001 /* Write out the full path. */
1002
1003 static void
1004 full_path_write (const struct url *url, char *where)
1005 {
1006 #define FROB(el, chr) do {                      \
1007   char *f_el = url->el;                         \
1008   if (f_el) {                                   \
1009     int l = strlen (f_el);                      \
1010     *where++ = chr;                             \
1011     memcpy (where, f_el, l);                    \
1012     where += l;                                 \
1013   }                                             \
1014 } while (0)
1015
1016   FROB (path, '/');
1017   FROB (params, ';');
1018   FROB (query, '?');
1019
1020 #undef FROB
1021 }
1022
/* Public entry point for obtaining the "full path" as a freshly
   allocated, zero-terminated string.  For instance, with u->path
   "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value". */

char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  /* full_path_write does not terminate what it writes.  */
  full_path_write (url, buf);
  buf[len] = '\0';

  return buf;
}
1038
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   STR is rewritten in place: every "%XY" whose two digits match what
   XNUM_TO_DIGIT yields for CHR is replaced by CHR itself, and
   everything else is left alone.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Split CHR into nibbles via unsigned char, the same way
     append_uri_pathel does, so that a CHR with the high bit set
     cannot hand XNUM_TO_DIGIT a negative value where plain char is
     signed.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */
  for (; *h; h++, t++)
    {
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          /* Skip the two digits; the loop increment skips the '%'.  */
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1062
1063 /* Escape unsafe and reserved characters, except for the slash
1064    characters.  */
1065
1066 static char *
1067 url_escape_dir (const char *dir)
1068 {
1069   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1070   if (newdir == dir)
1071     return (char *)dir;
1072
1073   unescape_single_char (newdir, '/');
1074   return newdir;
1075 }
1076
1077 /* Sync u->path and u->url with u->dir and u->file.  Called after
1078    u->file or u->dir have been changed, typically by the FTP code.  */
1079
1080 static void
1081 sync_path (struct url *u)
1082 {
1083   char *newpath, *efile, *edir;
1084
1085   xfree (u->path);
1086
1087   /* u->dir and u->file are not escaped.  URL-escape them before
1088      reassembling them into u->path.  That way, if they contain
1089      separators like '?' or even if u->file contains slashes, the
1090      path will be correctly assembled.  (u->file can contain slashes
1091      if the URL specifies it with %2f, or if an FTP server returns
1092      it.)  */
1093   edir = url_escape_dir (u->dir);
1094   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1095
1096   if (!*edir)
1097     newpath = xstrdup (efile);
1098   else
1099     {
1100       int dirlen = strlen (edir);
1101       int filelen = strlen (efile);
1102
1103       /* Copy "DIR/FILE" to newpath. */
1104       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1105       memcpy (p, edir, dirlen);
1106       p += dirlen;
1107       *p++ = '/';
1108       memcpy (p, efile, filelen);
1109       p += filelen;
1110       *p = '\0';
1111     }
1112
1113   u->path = newpath;
1114
1115   if (edir != u->dir)
1116     xfree (edir);
1117   if (efile != u->file)
1118     xfree (efile);
1119
1120   /* Regenerate u->url as well.  */
1121   xfree (u->url);
1122   u->url = url_string (u, URL_AUTH_SHOW);
1123 }
1124
1125 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1126    This way we can sync u->path and u->url when they get changed.  */
1127
1128 void
1129 url_set_dir (struct url *url, const char *newdir)
1130 {
1131   xfree (url->dir);
1132   url->dir = xstrdup (newdir);
1133   sync_path (url);
1134 }
1135
1136 void
1137 url_set_file (struct url *url, const char *newfile)
1138 {
1139   xfree (url->file);
1140   url->file = xstrdup (newfile);
1141   sync_path (url);
1142 }
1143
1144 void
1145 url_free (struct url *url)
1146 {
1147   xfree (url->host);
1148   xfree (url->path);
1149   xfree (url->url);
1150
1151   xfree_null (url->params);
1152   xfree_null (url->query);
1153   xfree_null (url->fragment);
1154   xfree_null (url->user);
1155   xfree_null (url->passwd);
1156
1157   xfree (url->dir);
1158   xfree (url->file);
1159
1160   xfree (url);
1161 }
1162 \f
1163 /* Create all the necessary directories for PATH (a file).  Calls
1164    make_directory internally.  */
1165 int
1166 mkalldirs (const char *path)
1167 {
1168   const char *p;
1169   char *t;
1170   struct_stat st;
1171   int res;
1172
1173   p = path + strlen (path);
1174   for (; *p != '/' && p != path; p--)
1175     ;
1176
1177   /* Don't create if it's just a file.  */
1178   if ((p == path) && (*p != '/'))
1179     return 0;
1180   t = strdupdelim (path, p);
1181
1182   /* Check whether the directory exists.  */
1183   if ((stat (t, &st) == 0))
1184     {
1185       if (S_ISDIR (st.st_mode))
1186         {
1187           xfree (t);
1188           return 0;
1189         }
1190       else
1191         {
1192           /* If the dir exists as a file name, remove it first.  This
1193              is *only* for Wget to work with buggy old CERN http
1194              servers.  Here is the scenario: When Wget tries to
1195              retrieve a directory without a slash, e.g.
1196              http://foo/bar (bar being a directory), CERN server will
1197              not redirect it too http://foo/bar/ -- it will generate a
1198              directory listing containing links to bar/file1,
1199              bar/file2, etc.  Wget will lose because it saves this
1200              HTML listing to a file `bar', so it cannot create the
1201              directory.  To work around this, if the file of the same
1202              name exists, we just remove it and create the directory
1203              anyway.  */
1204           DEBUGP (("Removing %s because of directory danger!\n", t));
1205           unlink (t);
1206         }
1207     }
1208   res = make_directory (t);
1209   if (res != 0)
1210     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1211   xfree (t);
1212   return res;
1213 }
1214 \f
1215 /* Functions for constructing the file name out of URL components.  */
1216
/* A growable string, used by url_file_name and friends.  This should
   perhaps be moved to utils.c.

   It offers a convenient and efficient way to build up a string by
   letting several functions append to it in turn.  Rather than
   handing the obligatory BASEVAR, SIZEVAR and TAILPOS to each of
   those functions, we hand them a pointer to this struct.  */

struct growable
{
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset at which the next append goes */
};
1230
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   G is evaluated once.  APPEND_SIZE is parenthesized so that an
   argument such as `a ? b : c' is added to the tail as a whole.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1245
1246 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1247    terminated.  */
1248
1249 static void
1250 append_string (const char *str, struct growable *dest)
1251 {
1252   int l = strlen (str);
1253   GROW (dest, l);
1254   memcpy (TAIL (dest), str, l);
1255   TAIL_INCR (dest, l);
1256 }
1257
1258 /* Append CH to DEST.  For example, append_char (0, DEST)
1259    zero-terminates DEST.  */
1260
1261 static void
1262 append_char (char ch, struct growable *dest)
1263 {
1264   GROW (dest, 1);
1265   *TAIL (dest) = ch;
1266   TAIL_INCR (dest, 1);
1267 }
1268
/* Reasons a character may be unsuitable for use in a file name.  The
   values are bits and may be or-ed together into a mask.  */
enum {
  filechr_not_unix    = 1 << 0, /* unusable on Unix, / and \0 */
  filechr_not_windows = 1 << 1, /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 1 << 2  /* a control character, e.g. 0-31 */
};

/* Non-zero if character CH has any of the bits in MASK set in
   filechr_table.  */
#define FILE_CHAR_TEST(ch, mask) \
  (filechr_table[(unsigned char)(ch)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
  /* 0x00 */ UWC,  C,  C,  C,   C,  C,  C,  C, /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  /* 0x08 */   C,  C,  C,  C,   C,  C,  C,  C, /* BS  HT  LF  VT   FF  CR  SO  SI  */
  /* 0x10 */   C,  C,  C,  C,   C,  C,  C,  C, /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  /* 0x18 */   C,  C,  C,  C,   C,  C,  C,  C, /* CAN EM  SUB ESC  FS  GS  RS  US  */
  /* 0x20 */   0,  0,  W,  0,   0,  0,  0,  0, /* SP  !   "   #    $   %   &   '   */
  /* 0x28 */   0,  0,  W,  0,   0,  0,  0, UW, /* (   )   *   +    ,   -   .   /   */
  /* 0x30 */   0,  0,  0,  0,   0,  0,  0,  0, /* 0   1   2   3    4   5   6   7   */
  /* 0x38 */   0,  0,  W,  0,   W,  0,  W,  W, /* 8   9   :   ;    <   =   >   ?   */
  /* 0x40 */   0,  0,  0,  0,   0,  0,  0,  0, /* @   A   B   C    D   E   F   G   */
  /* 0x48 */   0,  0,  0,  0,   0,  0,  0,  0, /* H   I   J   K    L   M   N   O   */
  /* 0x50 */   0,  0,  0,  0,   0,  0,  0,  0, /* P   Q   R   S    T   U   V   W   */
  /* 0x58 */   0,  0,  0,  0,   W,  0,  0,  0, /* X   Y   Z   [    \   ]   ^   _   */
  /* 0x60 */   0,  0,  0,  0,   0,  0,  0,  0, /* `   a   b   c    d   e   f   g   */
  /* 0x68 */   0,  0,  0,  0,   0,  0,  0,  0, /* h   i   j   k    l   m   n   o   */
  /* 0x70 */   0,  0,  0,  0,   0,  0,  0,  0, /* p   q   r   s    t   u   v   w   */
  /* 0x78 */   0,  0,  0,  0,   W,  0,  0,  C, /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 */ C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  /* 0x90 */ C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  /* 0xa0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0xb0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  /* 0xc0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0xd0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0xe0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  /* 0xf0 */ 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef UWC
#undef UW
#undef C
#undef W
#undef U
1326
/* FN_PORT_SEP separates host and port in file names when the port is
   not the standard one.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Windows can't handle ':' in
   file names, so '+' is used there instead.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query, normally
   with '?'.  Windows cannot handle '?' in a file name, so '@' is
   used there instead.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1337
1338 /* Quote path element, characters in [b, e), as file name, and append
1339    the quoted string to DEST.  Each character is quoted as per
1340    file_unsafe_char and the corresponding table.
1341
1342    If ESCAPED is true, the path element is considered to be
1343    URL-escaped and will be unescaped prior to inspection.  */
1344
1345 static void
1346 append_uri_pathel (const char *b, const char *e, bool escaped,
1347                    struct growable *dest)
1348 {
1349   const char *p;
1350   int quoted, outlen;
1351
1352   int mask;
1353   if (opt.restrict_files_os == restrict_unix)
1354     mask = filechr_not_unix;
1355   else
1356     mask = filechr_not_windows;
1357   if (opt.restrict_files_ctrl)
1358     mask |= filechr_control;
1359
1360   /* Copy [b, e) to PATHEL and URL-unescape it. */
1361   if (escaped)
1362     {
1363       char *unescaped;
1364       BOUNDED_TO_ALLOCA (b, e, unescaped);
1365       url_unescape (unescaped);
1366       b = unescaped;
1367       e = unescaped + strlen (unescaped);
1368     }
1369
1370   /* Defang ".." when found as component of path.  Remember that path
1371      comes from the URL and might contain malicious input.  */
1372   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1373     {
1374       b = "%2E%2E";
1375       e = b + 6;
1376     }
1377
1378   /* Walk the PATHEL string and check how many characters we'll need
1379      to quote.  */
1380   quoted = 0;
1381   for (p = b; p < e; p++)
1382     if (FILE_CHAR_TEST (*p, mask))
1383       ++quoted;
1384
1385   /* Calculate the length of the output string.  e-b is the input
1386      string length.  Each quoted char introduces two additional
1387      characters in the string, hence 2*quoted.  */
1388   outlen = (e - b) + (2 * quoted);
1389   GROW (dest, outlen);
1390
1391   if (!quoted)
1392     {
1393       /* If there's nothing to quote, we can simply append the string
1394          without processing it again.  */
1395       memcpy (TAIL (dest), b, outlen);
1396     }
1397   else
1398     {
1399       char *q = TAIL (dest);
1400       for (p = b; p < e; p++)
1401         {
1402           if (!FILE_CHAR_TEST (*p, mask))
1403             *q++ = *p;
1404           else
1405             {
1406               unsigned char ch = *p;
1407               *q++ = '%';
1408               *q++ = XNUM_TO_DIGIT (ch >> 4);
1409               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1410             }
1411         }
1412       assert (q - TAIL (dest) == outlen);
1413     }
1414
1415   /* Perform inline case transformation if required.  */
1416   if (opt.restrict_files_case == restrict_lowercase
1417       || opt.restrict_files_case == restrict_uppercase)
1418     {
1419       char *q;
1420       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1421         {
1422           if (opt.restrict_files_case == restrict_lowercase)
1423             *q = c_tolower (*q);
1424           else
1425             *q = c_toupper (*q);
1426         }
1427     }
1428
1429   TAIL_INCR (dest, outlen);
1430 }
1431
1432 /* Append to DEST the directory structure that corresponds the
1433    directory part of URL's path.  For example, if the URL is
1434    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1435
1436    Each path element ("dir1" and "dir2" in the above example) is
1437    examined, url-unescaped, and re-escaped as file name element.
1438
1439    Additionally, it cuts as many directories from the path as
1440    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1441    will produce "bar" for the above example.  For 2 or more, it will
1442    produce "".
1443
1444    Each component of the path is quoted for use as file name.  */
1445
1446 static void
1447 append_dir_structure (const struct url *u, struct growable *dest)
1448 {
1449   char *pathel, *next;
1450   int cut = opt.cut_dirs;
1451
1452   /* Go through the path components, de-URL-quote them, and quote them
1453      (if necessary) as file names.  */
1454
1455   pathel = u->path;
1456   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1457     {
1458       if (cut-- > 0)
1459         continue;
1460       if (pathel == next)
1461         /* Ignore empty pathels.  */
1462         continue;
1463
1464       if (dest->tail)
1465         append_char ('/', dest);
1466       append_uri_pathel (pathel, next, true, dest);
1467     }
1468 }
1469
1470 /* Return a unique file name that matches the given URL as good as
1471    possible.  Does not create directories on the file system.  */
1472
1473 char *
1474 url_file_name (const struct url *u)
1475 {
1476   struct growable fnres;        /* stands for "file name result" */
1477
1478   const char *u_file, *u_query;
1479   char *fname, *unique;
1480   char *index_filename = "index.html"; /* The default index file is index.html */
1481
1482   fnres.base = NULL;
1483   fnres.size = 0;
1484   fnres.tail = 0;
1485
1486   /* If an alternative index file was defined, change index_filename */
1487   if (opt.default_page)
1488     index_filename = opt.default_page;
1489
1490
1491   /* Start with the directory prefix, if specified. */
1492   if (opt.dir_prefix)
1493     append_string (opt.dir_prefix, &fnres);
1494
1495   /* If "dirstruct" is turned on (typically the case with -r), add
1496      the host and port (unless those have been turned off) and
1497      directory structure.  */
1498   if (opt.dirstruct)
1499     {
1500       if (opt.protocol_directories)
1501         {
1502           if (fnres.tail)
1503             append_char ('/', &fnres);
1504           append_string (supported_schemes[u->scheme].name, &fnres);
1505         }
1506       if (opt.add_hostdir)
1507         {
1508           if (fnres.tail)
1509             append_char ('/', &fnres);
1510           if (0 != strcmp (u->host, ".."))
1511             append_string (u->host, &fnres);
1512           else
1513             /* Host name can come from the network; malicious DNS may
1514                allow ".." to be resolved, causing us to write to
1515                "../<file>".  Defang such host names.  */
1516             append_string ("%2E%2E", &fnres);
1517           if (u->port != scheme_default_port (u->scheme))
1518             {
1519               char portstr[24];
1520               number_to_string (portstr, u->port);
1521               append_char (FN_PORT_SEP, &fnres);
1522               append_string (portstr, &fnres);
1523             }
1524         }
1525
1526       append_dir_structure (u, &fnres);
1527     }
1528
1529   /* Add the file name. */
1530   if (fnres.tail)
1531     append_char ('/', &fnres);
1532   u_file = *u->file ? u->file : index_filename;
1533   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1534
1535   /* Append "?query" to the file name. */
1536   u_query = u->query && *u->query ? u->query : NULL;
1537   if (u_query)
1538     {
1539       append_char (FN_QUERY_SEP, &fnres);
1540       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1541     }
1542
1543   /* Zero-terminate the file name. */
1544   append_char ('\0', &fnres);
1545
1546   fname = fnres.base;
1547
1548   /* Check the cases in which the unique extensions are not used:
1549      1) Clobbering is turned off (-nc).
1550      2) Retrieval with regetting.
1551      3) Timestamping is used.
1552      4) Hierarchy is built.
1553
1554      The exception is the case when file does exist and is a
1555      directory (see `mkalldirs' for explanation).  */
1556
1557   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1558       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1559     return fname;
1560
1561   unique = unique_name (fname, true);
1562   if (unique != fname)
1563     xfree (fname);
1564   return unique;
1565 }
1566 \f
1567 /* Resolve "." and ".." elements of PATH by destructively modifying
1568    PATH and return true if PATH has been modified, false otherwise.
1569
1570    The algorithm is in spirit similar to the one described in rfc1808,
1571    although implemented differently, in one pass.  To recap, path
1572    elements containing only "." are removed, and ".." is taken to mean
1573    "back up one element".  Single leading and trailing slashes are
1574    preserved.
1575
1576    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1577    test examples are provided below.  If you change anything in this
1578    function, run test_path_simplify to make sure you haven't broken a
1579    test case.  */
1580
1581 static bool
1582 path_simplify (enum url_scheme scheme, char *path)
1583 {
1584   char *h = path;               /* hare */
1585   char *t = path;               /* tortoise */
1586   char *beg = path;
1587   char *end = strchr (path, '\0');
1588
1589   while (h < end)
1590     {
1591       /* Hare should be at the beginning of a path element. */
1592
1593       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1594         {
1595           /* Ignore "./". */
1596           h += 2;
1597         }
1598       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1599         {
1600           /* Handle "../" by retreating the tortoise by one path
1601              element -- but not past beggining.  */
1602           if (t > beg)
1603             {
1604               /* Move backwards until T hits the beginning of the
1605                  previous path element or the beginning of path. */
1606               for (--t; t > beg && t[-1] != '/'; t--)
1607                 ;
1608             }
1609           else if (scheme == SCHEME_FTP)
1610             {
1611               /* If we're at the beginning, copy the "../" literally
1612                  and move the beginning so a later ".." doesn't remove
1613                  it.  This violates RFC 3986; but we do it for FTP
1614                  anyway because there is otherwise no way to get at a
1615                  parent directory, when the FTP server drops us in a
1616                  non-root directory (which is not uncommon). */
1617               beg = t + 3;
1618               goto regular;
1619             }
1620           h += 3;
1621         }
1622       else
1623         {
1624         regular:
1625           /* A regular path element.  If H hasn't advanced past T,
1626              simply skip to the next path element.  Otherwise, copy
1627              the path element until the next slash.  */
1628           if (t == h)
1629             {
1630               /* Skip the path element, including the slash.  */
1631               while (h < end && *h != '/')
1632                 t++, h++;
1633               if (h < end)
1634                 t++, h++;
1635             }
1636           else
1637             {
1638               /* Copy the path element, including the final slash.  */
1639               while (h < end && *h != '/')
1640                 *t++ = *h++;
1641               if (h < end)
1642                 *t++ = *h++;
1643             }
1644         }
1645     }
1646
1647   if (t != h)
1648     *t = '\0';
1649
1650   return t != h;
1651 }
1652 \f
1653 /* Return the length of URL's path.  Path is considered to be
1654    terminated by one or more of the ?query or ;params or #fragment,
1655    depending on the scheme.  */
1656
1657 static const char *
1658 path_end (const char *url)
1659 {
1660   enum url_scheme scheme = url_scheme (url);
1661   const char *seps;
1662   if (scheme == SCHEME_INVALID)
1663     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1664   /* +2 to ignore the first two separators ':' and '/' */
1665   seps = init_seps (scheme) + 2;
1666   return strpbrk_or_eos (url, seps);
1667 }
1668
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Return a freshly allocated, zero-terminated string made of the
   first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  */

static char *
splice_link (const char *base, int span, const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with a scheme of its own stands alone.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  /* Empty LINK points back to BASE, query string and all. */
  if (!*link)
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return splice_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return splice_link (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert
        = (slash && slash[1] == '/') ? slash : base;

      return splice_link (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      const char *slash = memchr (base, '/', end - base);
      bool seen_slash_slash = false;

      /* We're looking for the first slash, but want to ignore a
         double slash: if the first slash starts one, search again
         just past it.  */
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first / after "//", or the first slash
         altogether.  Keep in mind that LINK begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo" and "foo/bar" -- replace everything */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" -- replace from that slash */
        start_insert = slash;
      else
        /* example: "http://foo" -- append at the end */
        start_insert = end;

      return splice_link (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything after
     last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      /* No slash found at all.  Replace what we have with LINK. */
      return splice_link (base, 0, link, linklength);

    if (last_slash >= base + 2
        && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host" -- the last slash belongs to the
           "://", so LINK goes after END with a slash supplied in
           between.  One extra byte of BASE is copied to make room
           for that slash and is then overwritten.  */
        char *merge = splice_link (base, end + 1 - base, link, linklength);
        merge[end - base] = '/';
        return merge;
      }

    /* example: "whatever/foo/bar" -- keep up to and including the
       last slash.  */
    return splice_link (base, last_slash + 1 - base, link, linklength);
  }
}
1863 \f
/* Copy the NUL-terminated string S, without its terminator, to the
   location P points at and advance P past the copied bytes.

   P must be a modifiable pointer lvalue with enough room behind it;
   it is necessarily referenced more than once.  S is evaluated
   exactly once, so passing an expression with side effects is safe.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  size_t append_len_ = strlen (append_src_);    \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)
1869
/* Placeholder written in place of the real password when the
   password is supposed to stay hidden.  The string is intentionally
   generic and of fixed length, so it does not give away the number
   of characters in the actual password (previous versions did give
   that away).  */
#define HIDDEN_PASSWORD "*password*"
1875
1876 /* Recreate the URL string from the data in URL.
1877
1878    If HIDE is true (as it is when we're calling this on a URL we plan
1879    to print, but not when calling it to canonicalize a URL for use
1880    within the program), password will be hidden.  Unsafe characters in
1881    the URL will be quoted.  */
1882
1883 char *
1884 url_string (const struct url *url, enum url_auth_mode auth_mode)
1885 {
1886   int size;
1887   char *result, *p;
1888   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1889
1890   int scheme_port = supported_schemes[url->scheme].default_port;
1891   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1892   int fplen = full_path_length (url);
1893
1894   bool brackets_around_host;
1895
1896   assert (scheme_str != NULL);
1897
1898   /* Make sure the user name and password are quoted. */
1899   if (url->user)
1900     {
1901       if (auth_mode != URL_AUTH_HIDE)
1902         {
1903           quoted_user = url_escape_allow_passthrough (url->user);
1904           if (url->passwd)
1905             {
1906               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1907                 quoted_passwd = HIDDEN_PASSWORD;
1908               else
1909                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1910             }
1911         }
1912     }
1913
1914   /* In the unlikely event that the host name contains non-printable
1915      characters, quote it for displaying to the user.  */
1916   quoted_host = url_escape_allow_passthrough (url->host);
1917
1918   /* Undo the quoting of colons that URL escaping performs.  IPv6
1919      addresses may legally contain colons, and in that case must be
1920      placed in square brackets.  */
1921   if (quoted_host != url->host)
1922     unescape_single_char (quoted_host, ':');
1923   brackets_around_host = strchr (quoted_host, ':') != NULL;
1924
1925   size = (strlen (scheme_str)
1926           + strlen (quoted_host)
1927           + (brackets_around_host ? 2 : 0)
1928           + fplen
1929           + 1);
1930   if (url->port != scheme_port)
1931     size += 1 + numdigit (url->port);
1932   if (quoted_user)
1933     {
1934       size += 1 + strlen (quoted_user);
1935       if (quoted_passwd)
1936         size += 1 + strlen (quoted_passwd);
1937     }
1938
1939   p = result = xmalloc (size);
1940
1941   APPEND (p, scheme_str);
1942   if (quoted_user)
1943     {
1944       APPEND (p, quoted_user);
1945       if (quoted_passwd)
1946         {
1947           *p++ = ':';
1948           APPEND (p, quoted_passwd);
1949         }
1950       *p++ = '@';
1951     }
1952
1953   if (brackets_around_host)
1954     *p++ = '[';
1955   APPEND (p, quoted_host);
1956   if (brackets_around_host)
1957     *p++ = ']';
1958   if (url->port != scheme_port)
1959     {
1960       *p++ = ':';
1961       p = number_to_string (p, url->port);
1962     }
1963
1964   full_path_write (url, p);
1965   p += fplen;
1966   *p++ = '\0';
1967
1968   assert (p - result == size);
1969
1970   if (quoted_user && quoted_user != url->user)
1971     xfree (quoted_user);
1972   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1973       && quoted_passwd != url->passwd)
1974     xfree (quoted_passwd);
1975   if (quoted_host != url->host)
1976     xfree (quoted_host);
1977
1978   return result;
1979 }
1980 \f
1981 /* Return true if scheme a is similar to scheme b.
1982
1983    Schemes are similar if they are equal.  If SSL is supported, schemes
1984    are also similar if one is http (SCHEME_HTTP) and the other is https
1985    (SCHEME_HTTPS).  */
1986 bool
1987 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1988 {
1989   if (a == b)
1990     return true;
1991 #ifdef HAVE_SSL
1992   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1993       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1994     return true;
1995 #endif
1996   return false;
1997 }
1998 \f
/* Read one logical character from the start of the possibly
   %-escaped string STR and store it in *C.

   Returns the number of bytes of STR the stored character stands for:

     3  STR starts with "%XX", XX being two hex digits, and the
        decoded value is not matched by URL_RESERVED_CHAR; *C is the
        decoded value.
     1  in every other case.  An ordinary character is stored as is.
        A '%' that is not followed by two hex digits, or whose escape
        decodes to a character matched by URL_RESERVED_CHAR, is
        stored as a literal '%', so the caller goes on to see the
        rest of the escape as ordinary characters.

   STR must be non-NULL and non-empty; C must be non-NULL.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  char decoded;

  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The || short-circuits, so str[2] is examined only when str[1] is
     a hex digit and hence not the terminating NUL.  NUL is not a hex
     digit either, so a truncated escape at the end of the string
     ("%" or "%4") ends up here; once this test passes, str[2] cannot
     be NUL and no separate end-of-string check is needed.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  decoded = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (decoded))
    {
      /* Keep the escape undecoded: hand out the '%' by itself.  */
      *c = '%';
      return 1;
    }

  *c = decoded;
  return 3;
}
2036
/* Compare the URL strings U1 and U2.

   Both strings are walked one logical character at a time, as
   produced by getchar_from_escaped_string, and the characters are
   compared after folding them with c_tolower.  Returns true only if
   both strings are exhausted without a mismatch.  Neither argument
   may be NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left, *right;

  assert(u1 && u2);

  left = u1;
  right = u2;

  for (;;)
    {
      char lc, rc;
      int lstep, rstep;

      if (!*left || !*right)
        break;

      lstep = getchar_from_escaped_string (left, &lc);
      if (!lstep)
        break;

      rstep = getchar_from_escaped_string (right, &rc);
      if (!rstep)
        break;

      if (c_tolower(lc) != c_tolower(rc))
        break;

      left += lstep;
      right += rstep;
    }

  /* Equal only when both strings ended at the same time.  */
  return *left == 0 && *right == 0;
}
2059 \f
2060 #ifdef TESTING
2061 /* Debugging and testing support for path_simplify. */
2062
2063 #if 0
/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.

   PATH itself is left untouched; the work is done on a copy made
   with xstrdup, and that copy is what gets returned.

   NOTE(review): this helper sits inside "#if 0".  It calls
   path_simplify with a single argument, whereas run_test in this
   file calls path_simplify (scheme, path) -- the call here looks
   stale and would need a scheme argument before being re-enabled.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
2073 #endif
2074
2075 static const char *
2076 run_test (char *test, char *expected_result, enum url_scheme scheme,
2077           bool expected_change)
2078 {
2079   char *test_copy = xstrdup (test);
2080   bool modified = path_simplify (scheme, test_copy);
2081
2082   if (0 != strcmp (test_copy, expected_result))
2083     {
2084       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2085               test, expected_result, test_copy);
2086       mu_assert ("", 0);
2087     }
2088   if (modified != expected_change)
2089     {
2090       if (expected_change)
2091         printf ("Expected modification with path_simplify(\"%s\").\n",
2092                 test);
2093       else
2094         printf ("Expected no modification with path_simplify(\"%s\").\n",
2095                 test);
2096     }
2097   xfree (test_copy);
2098   mu_assert ("", modified == expected_change);
2099   return NULL;
2100 }
2101
2102 const char *
2103 test_path_simplify (void)
2104 {
2105   static struct {
2106     char *test, *result;
2107     enum url_scheme scheme;
2108     bool should_modify;
2109   } tests[] = {
2110     { "",                       "",             SCHEME_HTTP, false },
2111     { ".",                      "",             SCHEME_HTTP, true },
2112     { "./",                     "",             SCHEME_HTTP, true },
2113     { "..",                     "",             SCHEME_HTTP, true },
2114     { "../",                    "",             SCHEME_HTTP, true },
2115     { "..",                     "..",           SCHEME_FTP,  false },
2116     { "../",                    "../",          SCHEME_FTP,  false },
2117     { "foo",                    "foo",          SCHEME_HTTP, false },
2118     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2119     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2120     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2121     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2122     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2123     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2124     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2125     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2126     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2127     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2128     { "foo/..",                 "",             SCHEME_HTTP, true },
2129     { "foo/../..",              "",             SCHEME_HTTP, true },
2130     { "foo/../../..",           "",             SCHEME_HTTP, true },
2131     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2132     { "foo/../..",              "..",           SCHEME_FTP,  true },
2133     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2134     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2135     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2136     { "./a/../b",               "b",            SCHEME_HTTP, true }
2137   };
2138   int i;
2139
2140   for (i = 0; i < countof (tests); i++)
2141     {
2142       const char *message;
2143       char *test = tests[i].test;
2144       char *expected_result = tests[i].result;
2145       enum url_scheme scheme = tests[i].scheme;
2146       bool  expected_change = tests[i].should_modify;
2147       message = run_test (test, expected_result, scheme, expected_change);
2148       if (message) return message;
2149     }
2150   return NULL;
2151 }
2152
2153 const char *
2154 test_append_uri_pathel()
2155 {
2156   int i;
2157   struct {
2158     char *original_url;
2159     char *input;
2160     bool escaped;
2161     char *expected_result;
2162   } test_array[] = {
2163     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2164   };
2165
2166   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2167     {
2168       struct growable dest;
2169       const char *p = test_array[i].input;
2170
2171       memset (&dest, 0, sizeof (dest));
2172
2173       append_string (test_array[i].original_url, &dest);
2174       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2175       append_char ('\0', &dest);
2176
2177       mu_assert ("test_append_uri_pathel: wrong result",
2178                  strcmp (dest.base, test_array[i].expected_result) == 0);
2179     }
2180
2181   return NULL;
2182 }
2183
2184 const char*
2185 test_are_urls_equal()
2186 {
2187   int i;
2188   struct {
2189     char *url1;
2190     char *url2;
2191     bool expected_result;
2192   } test_array[] = {
2193     { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
2194     { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2195     { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
2196     { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
2197     { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
2198     { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
2199   };
2200
2201   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2202     {
2203       mu_assert ("test_are_urls_equal: wrong result",
2204                  are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2205     }
2206
2207   return NULL;
2208 }
2209
2210 #endif /* TESTING */
2211
2212 /*
2213  * vim: et ts=2 sw=2
2214  */
2215