sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45 #include "iri.h"
  46
  47 #ifdef TESTING
  48 #include "test.h"
  49 #endif
  50
  51 enum {
  52   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  53   scm_has_params = 2,           /* whether scheme has ;params */
  54   scm_has_query = 4,            /* whether scheme has ?query */
  55   scm_has_fragment = 8          /* whether scheme has #fragment */
  56 };
  57
  58 struct scheme_data
  59 {
  60   /* Short name of the scheme, such as "http" or "ftp". */
  61   const char *name;
  62   /* Leading string that identifies the scheme, such as "https://". */
  63   const char *leading_string;
  64   /* Default port of the scheme when none is specified. */
  65   int default_port;
  66   /* Various flags. */
  67   int flags;
  68 };
  69
  70 /* Supported schemes: */
  71 static struct scheme_data supported_schemes[] =
  72 {
  73   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  74 #ifdef HAVE_SSL
  75   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  76 #endif
  77   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  78
  79   /* SCHEME_INVALID */
  80   { NULL,       NULL,       -1,                 0 }
  81 };
  82
  83 /* Forward declarations: */
  84
  85 static bool path_simplify (enum url_scheme, char *);
  86 \f
  87 /* Support for escaping and unescaping of URL strings.  */
  88
  89 /* Table of "reserved" and "unsafe" characters.  Those terms are
  90    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  91    specs, but the general idea remains.
  92
  93    A reserved character is the one that you can't decode without
  94    changing the meaning of the URL.  For example, you can't decode
  95    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  96    path components is different.  Non-reserved characters can be
  97    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  98    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  99    as recommended by rfc2396, and minus "~", which is very frequently
 100    used (and sometimes unrecognized as %7E by broken servers).
 101
 102    An unsafe character is the one that should be encoded when URLs are
 103    placed in foreign environments.  E.g. space and newline are unsafe
 104    in HTTP contexts because HTTP uses them as separator and line
 105    terminator, so they must be encoded to %20 and %0A respectively.
 106    "*" is unsafe in shell context, etc.
 107
 108    We determine whether a character is unsafe through static table
 109    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 110
 111 enum {
 112   /* rfc1738 reserved chars + "$" and ",".  */
 113   urlchr_reserved = 1,
 114
 115   /* rfc1738 unsafe chars, plus non-printables.  */
 116   urlchr_unsafe   = 2
 117 };
 118
 119 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 120 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 121 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 122
 123 /* Shorthands for the table: */
 124 #define R  urlchr_reserved
 125 #define U  urlchr_unsafe
 126 #define RU R|U
 127
 128 static const unsigned char urlchr_table[256] =
 129 {
 130   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 131   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 133   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 134   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 135   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 136   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 137   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 138  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 140   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 141   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 142   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 143   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 144   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 145   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 146
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 151
 152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 153   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 156 };
 157 #undef R
 158 #undef U
 159 #undef RU
 160
 161 /* URL-unescape the string S.
 162
 163    This is done by transforming the sequences "%HH" to the character
 164    represented by the hexadecimal digits HH.  If % is not followed by
 165    two hexadecimal digits, it is inserted literally.
 166
 167    The transformation is done in place.  If you need the original
 168    string intact, make a copy before calling this function.  */
 169
 170 static void
 171 url_unescape (char *s)
 172 {
 173   char *t = s;                  /* t - tortoise */
 174   char *h = s;                  /* h - hare     */
 175
 176   for (; *h; h++, t++)
 177     {
 178       if (*h != '%')
 179         {
 180         copychar:
 181           *t = *h;
 182         }
 183       else
 184         {
 185           char c;
 186           /* Do nothing if '%' is not followed by two hex digits. */
 187           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 188             goto copychar;
 189           c = X2DIGITS_TO_NUM (h[1], h[2]);
 190           /* Don't unescape %00 because there is no way to insert it
 191              into a C string without effectively truncating it. */
 192           if (c == '\0')
 193             goto copychar;
 194           *t = c;
 195           h += 2;
 196         }
 197     }
 198   *t = '\0';
 199 }
 200
 201 /* The core of url_escape_* functions.  Escapes the characters that
 202    match the provided mask in urlchr_table.
 203
 204    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 205    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 206    allocated string will be returned in all cases.  */
 207
 208 static char *
 209 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 210 {
 211   const char *p1;
 212   char *p2, *newstr;
 213   int newlen;
 214   int addition = 0;
 215
 216   for (p1 = s; *p1; p1++)
 217     if (urlchr_test (*p1, mask))
 218       addition += 2;            /* Two more characters (hex digits) */
 219
 220   if (!addition)
 221     return allow_passthrough ? (char *)s : xstrdup (s);
 222
 223   newlen = (p1 - s) + addition;
 224   newstr = xmalloc (newlen + 1);
 225
 226   p1 = s;
 227   p2 = newstr;
 228   while (*p1)
 229     {
 230       /* Quote the characters that match the test mask. */
 231       if (urlchr_test (*p1, mask))
 232         {
 233           unsigned char c = *p1++;
 234           *p2++ = '%';
 235           *p2++ = XNUM_TO_DIGIT (c >> 4);
 236           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 237         }
 238       else
 239         *p2++ = *p1++;
 240     }
 241   assert (p2 - newstr == newlen);
 242   *p2 = '\0';
 243
 244   return newstr;
 245 }
 246
 247 /* URL-escape the unsafe characters (see urlchr_table) in a given
 248    string, returning a freshly allocated string.  */
 249
 250 char *
 251 url_escape (const char *s)
 252 {
 253   return url_escape_1 (s, urlchr_unsafe, false);
 254 }
 255
 256 /* URL-escape the unsafe characters (see urlchr_table) in a given
 257    string.  If no characters are unsafe, S is returned.  */
 258
 259 static char *
 260 url_escape_allow_passthrough (const char *s)
 261 {
 262   return url_escape_1 (s, urlchr_unsafe, true);
 263 }
 264 \f
 265 /* Decide whether the char at position P needs to be encoded.  (It is
 266    not enough to pass a single char *P because the function may need
 267    to inspect the surrounding context.)
 268
 269    Return true if the char should be escaped as %XX, false otherwise.  */
 270
 271 static inline bool
 272 char_needs_escaping (const char *p)
 273 {
 274   if (*p == '%')
 275     {
 276       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 277         return false;
 278       else
 279         /* Garbled %.. sequence: encode `%'. */
 280         return true;
 281     }
 282   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 283     return true;
 284   else
 285     return false;
 286 }
 287
 288 /* Translate a %-escaped (but possibly non-conformant) input string S
 289    into a %-escaped (and conformant) output string.  If no characters
 290    are encoded or decoded, return the same string S; otherwise, return
 291    a freshly allocated string with the new contents.
 292
 293    After a URL has been run through this function, the protocols that
 294    use `%' as the quote character can use the resulting string as-is,
 295    while those that don't can use url_unescape to get to the intended
 296    data.  This function is stable: once the input is transformed,
 297    further transformations of the result yield the same output.
 298
 299    Let's discuss why this function is needed.
 300
 301    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 302    a raw space character would mess up the HTTP request, it needs to
 303    be quoted, like this:
 304
 305        GET /abc%20def HTTP/1.0
 306
 307    It would appear that the unsafe chars need to be quoted, for
 308    example with url_escape.  But what if we're requested to download
 309    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 310    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 311    part of URL syntax, "%20" is the correct way to denote a literal
 312    space on the Wget command line.  This leads to the conclusion that
 313    in that case Wget should not call url_escape, but leave the `%20'
 314    as is.  This is clearly contradictory, but it only gets worse.
 315
 316    What if the requested URI is `abc%20 def'?  If we call url_escape,
 317    we end up with `/abc%2520%20def', which is almost certainly not
 318    intended.  If we don't call url_escape, we are left with the
 319    embedded space and cannot complete the request.  What the user
 320    meant was for Wget to request `/abc%20%20def', and this is where
 321    reencode_escapes kicks in.
 322
 323    Wget used to solve this by first decoding %-quotes, and then
 324    encoding all the "unsafe" characters found in the resulting string.
 325    This was wrong because it didn't preserve certain URL special
 326    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 327    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 328    whether we considered `+' reserved (it is).  One of these results
 329    is inevitable because by the second step we would lose information
 330    on whether the `+' was originally encoded or not.  Both results
 331    were wrong because in CGI parameters + means space, while %2B means
 332    literal plus.  reencode_escapes correctly translates the above to
 333    "a%2B+b", i.e. returns the original string.
 334
 335    This function uses a modified version of the algorithm originally
 336    proposed by Anon Sricharoenchai:
 337
 338    * Encode all "unsafe" characters, except those that are also
 339      "reserved", to %XX.  See urlchr_table for which characters are
 340      unsafe and reserved.
 341
 342    * Encode the "%" characters not followed by two hex digits to
 343      "%25".
 344
 345    * Pass through all other characters and %XX escapes as-is.  (Up to
 346      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 347      characters, but that was obtrusive and broke some servers.)
 348
 349    Anon's test case:
 350
 351    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 352    ->
 353    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 354
 355    Simpler test cases:
 356
 357    "foo bar"         -> "foo%20bar"
 358    "foo%20bar"       -> "foo%20bar"
 359    "foo %20bar"      -> "foo%20%20bar"
 360    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 361    "foo%25%20bar"    -> "foo%25%20bar"
 362    "foo%2%20bar"     -> "foo%252%20bar"
 363    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 364    "foo%2b+bar"      -> "foo%2b+bar"  */
 365
 366 static char *
 367 reencode_escapes (const char *s)
 368 {
 369   const char *p1;
 370   char *newstr, *p2;
 371   int oldlen, newlen;
 372
 373   int encode_count = 0;
 374
 375   /* First pass: inspect the string to see if there's anything to do,
 376      and to calculate the new length.  */
 377   for (p1 = s; *p1; p1++)
 378     if (char_needs_escaping (p1))
 379       ++encode_count;
 380
 381   if (!encode_count)
 382     /* The string is good as it is. */
 383     return (char *) s;          /* C const model sucks. */
 384
 385   oldlen = p1 - s;
 386   /* Each encoding adds two characters (hex digits).  */
 387   newlen = oldlen + 2 * encode_count;
 388   newstr = xmalloc (newlen + 1);
 389
 390   /* Second pass: copy the string to the destination address, encoding
 391      chars when needed.  */
 392   p1 = s;
 393   p2 = newstr;
 394
 395   while (*p1)
 396     if (char_needs_escaping (p1))
 397       {
 398         unsigned char c = *p1++;
 399         *p2++ = '%';
 400         *p2++ = XNUM_TO_DIGIT (c >> 4);
 401         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 402       }
 403     else
 404       *p2++ = *p1++;
 405
 406   *p2 = '\0';
 407   assert (p2 - newstr == newlen);
 408   return newstr;
 409 }
 410 \f
 411 /* Returns the scheme type if the scheme is supported, or
 412    SCHEME_INVALID if not.  */
 413
 414 enum url_scheme
 415 url_scheme (const char *url)
 416 {
 417   int i;
 418
 419   for (i = 0; supported_schemes[i].leading_string; i++)
 420     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 421                           strlen (supported_schemes[i].leading_string)))
 422       {
 423         if (!(supported_schemes[i].flags & scm_disabled))
 424           return (enum url_scheme) i;
 425         else
 426           return SCHEME_INVALID;
 427       }
 428
 429   return SCHEME_INVALID;
 430 }
 431
 432 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 433
 434 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 435    currently implemented, it returns true if URL begins with
 436    [-+a-zA-Z0-9]+: .  */
 437
 438 bool
 439 url_has_scheme (const char *url)
 440 {
 441   const char *p = url;
 442
 443   /* The first char must be a scheme char. */
 444   if (!*p || !SCHEME_CHAR (*p))
 445     return false;
 446   ++p;
 447   /* Followed by 0 or more scheme chars. */
 448   while (*p && SCHEME_CHAR (*p))
 449     ++p;
 450   /* Terminated by ':'. */
 451   return *p == ':';
 452 }
 453
 454 int
 455 scheme_default_port (enum url_scheme scheme)
 456 {
 457   return supported_schemes[scheme].default_port;
 458 }
 459
 460 void
 461 scheme_disable (enum url_scheme scheme)
 462 {
 463   supported_schemes[scheme].flags |= scm_disabled;
 464 }
 465
 466 /* Skip the username and password, if present in the URL.  The
 467    function should *not* be called with the complete URL, but with the
 468    portion after the scheme.
 469
 470    If no username and password are found, return URL.  */
 471
 472 static const char *
 473 url_skip_credentials (const char *url)
 474 {
 475   /* Look for '@' that comes before terminators, such as '/', '?',
 476      '#', or ';'.  */
 477   const char *p = (const char *)strpbrk (url, "@/?#;");
 478   if (!p || *p != '@')
 479     return url;
 480   return p + 1;
 481 }
 482
 483 /* Parse credentials contained in [BEG, END).  The region is expected
 484    to have come from a URL and is unescaped.  */
 485
 486 static bool
 487 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 488 {
 489   char *colon;
 490   const char *userend;
 491
 492   if (beg == end)
 493     return false;               /* empty user name */
 494
 495   colon = memchr (beg, ':', end - beg);
 496   if (colon == beg)
 497     return false;               /* again empty user name */
 498
 499   if (colon)
 500     {
 501       *passwd = strdupdelim (colon + 1, end);
 502       userend = colon;
 503       url_unescape (*passwd);
 504     }
 505   else
 506     {
 507       *passwd = NULL;
 508       userend = end;
 509     }
 510   *user = strdupdelim (beg, userend);
 511   url_unescape (*user);
 512   return true;
 513 }
 514
 515 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 516    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 517    like this:
 518
 519    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 520    www.foo.com[:port]            -> http://www.foo.com[:port]
 521
 522    FTP shorthands look like this:
 523
 524    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 525    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 526
 527    If the URL needs not or cannot be rewritten, return NULL.  */
 528
 529 char *
 530 rewrite_shorthand_url (const char *url)
 531 {
 532   const char *p;
 533   char *ret;
 534
 535   if (url_scheme (url) != SCHEME_INVALID)
 536     return NULL;
 537
 538   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 539      latter Netscape.  */
 540   p = strpbrk (url, ":/");
 541   if (p == url)
 542     return NULL;
 543
 544   /* If we're looking at "://", it means the URL uses a scheme we
 545      don't support, which may include "https" when compiled without
 546      SSL support.  Don't bogusly rewrite such URLs.  */
 547   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 548     return NULL;
 549
 550   if (p && *p == ':')
 551     {
 552       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 553          special case of http port number ("localhost:10000").  */
 554       int digits = strspn (p + 1, "0123456789");
 555       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 556         goto http;
 557
 558       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 559       ret = aprintf ("ftp://%s", url);
 560       ret[6 + (p - url)] = '/';
 561     }
 562   else
 563     {
 564     http:
 565       /* Just prepend "http://" to URL. */
 566       ret = aprintf ("http://%s", url);
 567     }
 568   return ret;
 569 }
 570 \f
 571 static void split_path (const char *, char **, char **);
 572
 573 /* Like strpbrk, with the exception that it returns the pointer to the
 574    terminating zero (end-of-string aka "eos") if no matching character
 575    is found.  */
 576
 577 static inline char *
 578 strpbrk_or_eos (const char *s, const char *accept)
 579 {
 580   char *p = strpbrk (s, accept);
 581   if (!p)
 582     p = strchr (s, '\0');
 583   return p;
 584 }
 585
 586 /* Turn STR into lowercase; return true if a character was actually
 587    changed. */
 588
 589 static bool
 590 lowercase_str (char *str)
 591 {
 592   bool changed = false;
 593   for (; *str; str++)
 594     if (c_isupper (*str))
 595       {
 596         changed = true;
 597         *str = c_tolower (*str);
 598       }
 599   return changed;
 600 }
 601
 602 static const char *
 603 init_seps (enum url_scheme scheme)
 604 {
 605   static char seps[8] = ":/";
 606   char *p = seps + 2;
 607   int flags = supported_schemes[scheme].flags;
 608
 609   if (flags & scm_has_params)
 610     *p++ = ';';
 611   if (flags & scm_has_query)
 612     *p++ = '?';
 613   if (flags & scm_has_fragment)
 614     *p++ = '#';
 615   *p++ = '\0';
 616   return seps;
 617 }
 618
 619 static const char *parse_errors[] = {
 620 #define PE_NO_ERROR                     0
 621   N_("No error"),
 622 #define PE_UNSUPPORTED_SCHEME           1
 623   N_("Unsupported scheme"),
 624 #define PE_INVALID_HOST_NAME            2
 625   N_("Invalid host name"),
 626 #define PE_BAD_PORT_NUMBER              3
 627   N_("Bad port number"),
 628 #define PE_INVALID_USER_NAME            4
 629   N_("Invalid user name"),
 630 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 631   N_("Unterminated IPv6 numeric address"),
 632 #define PE_IPV6_NOT_SUPPORTED           6
 633   N_("IPv6 addresses not supported"),
 634 #define PE_INVALID_IPV6_ADDRESS         7
 635   N_("Invalid IPv6 numeric address")
 636 };
 637
 638 /* Parse a URL.
 639
 640    Return a new struct url if successful, NULL on error.  In case of
 641    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 642    error code. */
 643 struct url *
 644 url_parse (const char *url, int *error, bool *utf8_encode)
 645 {
 646   struct url *u;
 647   const char *p;
 648   bool path_modified, host_modified;
 649
 650   enum url_scheme scheme;
 651   const char *seps;
 652
 653   const char *uname_b,     *uname_e;
 654   const char *host_b,      *host_e;
 655   const char *path_b,      *path_e;
 656   const char *params_b,    *params_e;
 657   const char *query_b,     *query_e;
 658   const char *fragment_b,  *fragment_e;
 659
 660   int port;
 661   char *user = NULL, *passwd = NULL;
 662
 663   char *url_encoded = NULL;
 664
 665   int error_code;
 666
 667   scheme = url_scheme (url);
 668   if (scheme == SCHEME_INVALID)
 669     {
 670       error_code = PE_UNSUPPORTED_SCHEME;
 671       goto error;
 672     }
 673
 674   if (opt.enable_iri && *utf8_encode)
 675     {
 676       const char *new;
 677       url_unescape ((char *) url);
 678       *utf8_encode = remote_to_utf8 (url, &new);
 679       if (*utf8_encode)
 680         url = new;
 681     }
 682
 683   url_encoded = reencode_escapes (url);
 684   p = url_encoded;
 685
 686   p += strlen (supported_schemes[scheme].leading_string);
 687   uname_b = p;
 688   p = url_skip_credentials (p);
 689   uname_e = p;
 690
 691   /* scheme://user:pass@host[:port]... */
 692   /*                    ^              */
 693
 694   /* We attempt to break down the URL into the components path,
 695      params, query, and fragment.  They are ordered like this:
 696
 697        scheme://host[:port][/path][;params][?query][#fragment]  */
 698
 699   path_b     = path_e     = NULL;
 700   params_b   = params_e   = NULL;
 701   query_b    = query_e    = NULL;
 702   fragment_b = fragment_e = NULL;
 703
 704   /* Initialize separators for optional parts of URL, depending on the
 705      scheme.  For example, FTP has params, and HTTP and HTTPS have
 706      query string and fragment. */
 707   seps = init_seps (scheme);
 708
 709   host_b = p;
 710
 711   if (*p == '[')
 712     {
 713       /* Handle IPv6 address inside square brackets.  Ideally we'd
 714          just look for the terminating ']', but rfc2732 mandates
 715          rejecting invalid IPv6 addresses.  */
 716
 717       /* The address begins after '['. */
 718       host_b = p + 1;
 719       host_e = strchr (host_b, ']');
 720
 721       if (!host_e)
 722         {
 723           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 724           goto error;
 725         }
 726
 727 #ifdef ENABLE_IPV6
 728       /* Check if the IPv6 address is valid. */
 729       if (!is_valid_ipv6_address(host_b, host_e))
 730         {
 731           error_code = PE_INVALID_IPV6_ADDRESS;
 732           goto error;
 733         }
 734
 735       /* Continue parsing after the closing ']'. */
 736       p = host_e + 1;
 737 #else
 738       error_code = PE_IPV6_NOT_SUPPORTED;
 739       goto error;
 740 #endif
 741
 742       /* The closing bracket must be followed by a separator or by the
 743          null char.  */
 744       /* http://[::1]... */
 745       /*             ^   */
 746       if (!strchr (seps, *p))
 747         {
 748           /* Trailing garbage after []-delimited IPv6 address. */
 749           error_code = PE_INVALID_HOST_NAME;
 750           goto error;
 751         }
 752     }
 753   else
 754     {
 755       p = strpbrk_or_eos (p, seps);
 756       host_e = p;
 757     }
 758   ++seps;                       /* advance to '/' */
 759
 760   if (host_b == host_e)
 761     {
 762       error_code = PE_INVALID_HOST_NAME;
 763       goto error;
 764     }
 765
 766   port = scheme_default_port (scheme);
 767   if (*p == ':')
 768     {
 769       const char *port_b, *port_e, *pp;
 770
 771       /* scheme://host:port/tralala */
 772       /*              ^             */
 773       ++p;
 774       port_b = p;
 775       p = strpbrk_or_eos (p, seps);
 776       port_e = p;
 777
 778       /* Allow empty port, as per rfc2396. */
 779       if (port_b != port_e)
 780         for (port = 0, pp = port_b; pp < port_e; pp++)
 781           {
 782             if (!c_isdigit (*pp))
 783               {
 784                 /* http://host:12randomgarbage/blah */
 785                 /*               ^                  */
 786                 error_code = PE_BAD_PORT_NUMBER;
 787                 goto error;
 788               }
 789             port = 10 * port + (*pp - '0');
 790             /* Check for too large port numbers here, before we have
 791                a chance to overflow on bogus port values.  */
 792             if (port > 0xffff)
 793               {
 794                 error_code = PE_BAD_PORT_NUMBER;
 795                 goto error;
 796               }
 797           }
 798     }
 799   /* Advance to the first separator *after* '/' (either ';' or '?',
 800      depending on the scheme).  */
 801   ++seps;
 802
 803   /* Get the optional parts of URL, each part being delimited by
 804      current location and the position of the next separator.  */
 805 #define GET_URL_PART(sepchar, var) do {                         \
 806   if (*p == sepchar)                                            \
 807     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 808   ++seps;                                                       \
 809 } while (0)
 810
 811   GET_URL_PART ('/', path);
 812   if (supported_schemes[scheme].flags & scm_has_params)
 813     GET_URL_PART (';', params);
 814   if (supported_schemes[scheme].flags & scm_has_query)
 815     GET_URL_PART ('?', query);
 816   if (supported_schemes[scheme].flags & scm_has_fragment)
 817     GET_URL_PART ('#', fragment);
 818
 819 #undef GET_URL_PART
 820   assert (*p == 0);
 821
 822   if (uname_b != uname_e)
 823     {
 824       /* http://user:pass@host */
 825       /*        ^         ^    */
 826       /*     uname_b   uname_e */
 827       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 828         {
 829           error_code = PE_INVALID_USER_NAME;
 830           goto error;
 831         }
 832     }
 833
 834   u = xnew0 (struct url);
 835   u->scheme = scheme;
 836   u->host   = strdupdelim (host_b, host_e);
 837   u->port   = port;
 838   u->user   = user;
 839   u->passwd = passwd;
 840
 841   u->path = strdupdelim (path_b, path_e);
 842   path_modified = path_simplify (scheme, u->path);
 843   split_path (u->path, &u->dir, &u->file);
 844
 845   host_modified = lowercase_str (u->host);
 846
 847   /* Decode %HH sequences in host name.  This is important not so much
 848      to support %HH sequences in host names (which other browser
 849      don't), but to support binary characters (which will have been
 850      converted to %HH by reencode_escapes).  */
 851   if (strchr (u->host, '%'))
 852     {
 853       url_unescape (u->host);
 854       host_modified = true;
 855     }
 856
 857   if (opt.enable_iri)
 858     {
 859       char *new = idn_encode (u->host, *utf8_encode);
 860       if (new)
 861         {
 862           xfree (u->host);
 863           u->host = new;
 864           host_modified = true;
 865         }
 866     }
 867
 868   if (params_b)
 869     u->params = strdupdelim (params_b, params_e);
 870   if (query_b)
 871     u->query = strdupdelim (query_b, query_e);
 872   if (fragment_b)
 873     u->fragment = strdupdelim (fragment_b, fragment_e);
 874
 875   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 876     {
 877       /* If we suspect that a transformation has rendered what
 878          url_string might return different from URL_ENCODED, rebuild
 879          u->url using url_string.  */
 880       u->url = url_string (u, URL_AUTH_SHOW);
 881
 882       if (url_encoded != url)
 883         xfree ((char *) url_encoded);
 884     }
 885   else
 886     {
 887       if (url_encoded == url)
 888         u->url = xstrdup (url);
 889       else
 890         u->url = url_encoded;
 891     }
 892
 893   return u;
 894
 895  error:
 896   /* Cleanup in case of error: */
 897   if (url_encoded && url_encoded != url)
 898     xfree (url_encoded);
 899
 900   /* Transmit the error code to the caller, if the caller wants to
 901      know.  */
 902   if (error)
 903     *error = error_code;
 904   return NULL;
 905 }
 906
 907 /* Return the error message string from ERROR_CODE, which should have
 908    been retrieved from url_parse.  The error message is translated.  */
 909
 910 const char *
 911 url_error (int error_code)
 912 {
 913   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 914   return _(parse_errors[error_code]);
 915 }
 916
 917 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 918    expected to be URL-escaped.
 919
 920    The path is split into directory (the part up to the last slash)
 921    and file (the part after the last slash), which are subsequently
 922    unescaped.  Examples:
 923
 924    PATH                 DIR           FILE
 925    "foo/bar/baz"        "foo/bar"     "baz"
 926    "foo/bar/"           "foo/bar"     ""
 927    "foo"                ""            "foo"
 928    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 929
 930    DIR and FILE are freshly allocated.  */
 931
 932 static void
 933 split_path (const char *path, char **dir, char **file)
 934 {
 935   char *last_slash = strrchr (path, '/');
 936   if (!last_slash)
 937     {
 938       *dir = xstrdup ("");
 939       *file = xstrdup (path);
 940     }
 941   else
 942     {
 943       *dir = strdupdelim (path, last_slash);
 944       *file = xstrdup (last_slash + 1);
 945     }
 946   url_unescape (*dir);
 947   url_unescape (*file);
 948 }
 949
 950 /* Note: URL's "full path" is the path with the query string and
 951    params appended.  The "fragment" (#foo) is intentionally ignored,
 952    but that might be changed.  For example, if the original URL was
 953    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 954    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 955
 956 /* Return the length of the full path, without the terminating
 957    zero.  */
 958
 959 static int
 960 full_path_length (const struct url *url)
 961 {
 962   int len = 0;
 963
 964 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 965
 966   FROB (path);
 967   FROB (params);
 968   FROB (query);
 969
 970 #undef FROB
 971
 972   return len;
 973 }
 974
 975 /* Write out the full path. */
 976
 977 static void
 978 full_path_write (const struct url *url, char *where)
 979 {
 980 #define FROB(el, chr) do {                      \
 981   char *f_el = url->el;                         \
 982   if (f_el) {                                   \
 983     int l = strlen (f_el);                      \
 984     *where++ = chr;                             \
 985     memcpy (where, f_el, l);                    \
 986     where += l;                                 \
 987   }                                             \
 988 } while (0)
 989
 990   FROB (path, '/');
 991   FROB (params, ';');
 992   FROB (query, '?');
 993
 994 #undef FROB
 995 }
 996
 997 /* Public function for getting the "full path".  E.g. if u->path is
 998    "foo/bar" and u->query is "param=value", full_path will be
 999    "/foo/bar?param=value". */
1000
1001 char *
1002 url_full_path (const struct url *url)
1003 {
1004   int length = full_path_length (url);
1005   char *full_path = xmalloc (length + 1);
1006
1007   full_path_write (url, full_path);
1008   full_path[length] = '\0';
1009
1010   return full_path;
1011 }
1012
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   Every "%XY" in STR, where X and Y are the two digits that
   XNUM_TO_DIGIT yields for the high and low nibble of CHR, is
   replaced in place by CHR itself.  The comparison is exact, so an
   escape spelled with different digit characters is left untouched.
   STR only ever shrinks.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so both nibbles are always in 0..15,
     even where plain char is signed and CHR is negative.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */

  for (; *h; h++, t++)
    {
      /* The short-circuit keeps us from reading past the terminating
         zero: h[2] is examined only if h[1] matched C1.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* skip the two digits as well */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1036
1037 /* Escape unsafe and reserved characters, except for the slash
1038    characters.  */
1039
1040 static char *
1041 url_escape_dir (const char *dir)
1042 {
1043   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1044   if (newdir == dir)
1045     return (char *)dir;
1046
1047   unescape_single_char (newdir, '/');
1048   return newdir;
1049 }
1050
1051 /* Sync u->path and u->url with u->dir and u->file.  Called after
1052    u->file or u->dir have been changed, typically by the FTP code.  */
1053
1054 static void
1055 sync_path (struct url *u)
1056 {
1057   char *newpath, *efile, *edir;
1058
1059   xfree (u->path);
1060
1061   /* u->dir and u->file are not escaped.  URL-escape them before
1062      reassembling them into u->path.  That way, if they contain
1063      separators like '?' or even if u->file contains slashes, the
1064      path will be correctly assembled.  (u->file can contain slashes
1065      if the URL specifies it with %2f, or if an FTP server returns
1066      it.)  */
1067   edir = url_escape_dir (u->dir);
1068   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1069
1070   if (!*edir)
1071     newpath = xstrdup (efile);
1072   else
1073     {
1074       int dirlen = strlen (edir);
1075       int filelen = strlen (efile);
1076
1077       /* Copy "DIR/FILE" to newpath. */
1078       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1079       memcpy (p, edir, dirlen);
1080       p += dirlen;
1081       *p++ = '/';
1082       memcpy (p, efile, filelen);
1083       p += filelen;
1084       *p = '\0';
1085     }
1086
1087   u->path = newpath;
1088
1089   if (edir != u->dir)
1090     xfree (edir);
1091   if (efile != u->file)
1092     xfree (efile);
1093
1094   /* Regenerate u->url as well.  */
1095   xfree (u->url);
1096   u->url = url_string (u, URL_AUTH_SHOW);
1097 }
1098
1099 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1100    This way we can sync u->path and u->url when they get changed.  */
1101
1102 void
1103 url_set_dir (struct url *url, const char *newdir)
1104 {
1105   xfree (url->dir);
1106   url->dir = xstrdup (newdir);
1107   sync_path (url);
1108 }
1109
1110 void
1111 url_set_file (struct url *url, const char *newfile)
1112 {
1113   xfree (url->file);
1114   url->file = xstrdup (newfile);
1115   sync_path (url);
1116 }
1117
1118 void
1119 url_free (struct url *url)
1120 {
1121   xfree (url->host);
1122   xfree (url->path);
1123   xfree (url->url);
1124
1125   xfree_null (url->params);
1126   xfree_null (url->query);
1127   xfree_null (url->fragment);
1128   xfree_null (url->user);
1129   xfree_null (url->passwd);
1130
1131   xfree (url->dir);
1132   xfree (url->file);
1133
1134   xfree (url);
1135 }
1136 \f
1137 /* Create all the necessary directories for PATH (a file).  Calls
1138    make_directory internally.  */
1139 int
1140 mkalldirs (const char *path)
1141 {
1142   const char *p;
1143   char *t;
1144   struct_stat st;
1145   int res;
1146
1147   p = path + strlen (path);
1148   for (; *p != '/' && p != path; p--)
1149     ;
1150
1151   /* Don't create if it's just a file.  */
1152   if ((p == path) && (*p != '/'))
1153     return 0;
1154   t = strdupdelim (path, p);
1155
1156   /* Check whether the directory exists.  */
1157   if ((stat (t, &st) == 0))
1158     {
1159       if (S_ISDIR (st.st_mode))
1160         {
1161           xfree (t);
1162           return 0;
1163         }
1164       else
1165         {
1166           /* If the dir exists as a file name, remove it first.  This
1167              is *only* for Wget to work with buggy old CERN http
1168              servers.  Here is the scenario: When Wget tries to
1169              retrieve a directory without a slash, e.g.
1170              http://foo/bar (bar being a directory), CERN server will
1171              not redirect it too http://foo/bar/ -- it will generate a
1172              directory listing containing links to bar/file1,
1173              bar/file2, etc.  Wget will lose because it saves this
1174              HTML listing to a file `bar', so it cannot create the
1175              directory.  To work around this, if the file of the same
1176              name exists, we just remove it and create the directory
1177              anyway.  */
1178           DEBUGP (("Removing %s because of directory danger!\n", t));
1179           unlink (t);
1180         }
1181     }
1182   res = make_directory (t);
1183   if (res != 0)
1184     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1185   xfree (t);
1186   return res;
1187 }
1188 \f
1189 /* Functions for constructing the file name out of URL components.  */
1190
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   The contents are not kept zero-terminated; whoever needs a C
   string appends the terminator explicitly.  */

struct growable {
  char *base;                   /* start of the buffer; NULL before first use */
  int size;                     /* allocated size, as tracked by DO_REALLOC */
  int tail;                     /* offset from BASE of the next write */
};
1204
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  The request is handed to DO_REALLOC
   together with the current base and size, so that the buffer can be
   enlarged when it is too small.  G is evaluated exactly once.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1219
1220 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1221    terminated.  */
1222
1223 static void
1224 append_string (const char *str, struct growable *dest)
1225 {
1226   int l = strlen (str);
1227   GROW (dest, l);
1228   memcpy (TAIL (dest), str, l);
1229   TAIL_INCR (dest, l);
1230 }
1231
1232 /* Append CH to DEST.  For example, append_char (0, DEST)
1233    zero-terminates DEST.  */
1234
1235 static void
1236 append_char (char ch, struct growable *dest)
1237 {
1238   GROW (dest, 1);
1239   *TAIL (dest) = ch;
1240   TAIL_INCR (dest, 1);
1241 }
1242
/* Reasons a byte may be unusable in a file name.  The values are
   distinct bits so that they can be or-ed into a mask.  */
enum {
  filechr_not_unix    = 1 << 0, /* unusable on Unix, / and \0 */
  filechr_not_windows = 1 << 1, /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 1 << 2  /* a control character, e.g. 0-31 */
};

/* Nonzero if byte C carries any of the flags in MASK.  C is converted
   to unsigned char before it indexes the table.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Short names used only to keep the table below readable; they are
   undefined again right after it.  */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW (U|W)
#define UWC (U|W|C)

/* Per-byte flags saying under which conditions the byte is unsafe in
   a file name (see the enum above).

   Arguably `%' could be flagged as well, since it serves as the
   escape character.  That would become crucial if file names ever
   had to be translated back to URLs reliably.  For now the table
   escapes as little as possible.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1300
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers, as in "www.xemacs.org:4001/index.html".
   It is ':' except when file names are restricted for Windows, which
   can't handle ':' in file names; then it is '+'.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' except when file names are restricted for Windows, which
   cannot handle '?' in file names; then it is '@'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1311
1312 /* Quote path element, characters in [b, e), as file name, and append
1313    the quoted string to DEST.  Each character is quoted as per
1314    file_unsafe_char and the corresponding table.
1315
1316    If ESCAPED is true, the path element is considered to be
1317    URL-escaped and will be unescaped prior to inspection.  */
1318
1319 static void
1320 append_uri_pathel (const char *b, const char *e, bool escaped,
1321                    struct growable *dest)
1322 {
1323   const char *p;
1324   int quoted, outlen;
1325
1326   int mask;
1327   if (opt.restrict_files_os == restrict_unix)
1328     mask = filechr_not_unix;
1329   else
1330     mask = filechr_not_windows;
1331   if (opt.restrict_files_ctrl)
1332     mask |= filechr_control;
1333
1334   /* Copy [b, e) to PATHEL and URL-unescape it. */
1335   if (escaped)
1336     {
1337       char *unescaped;
1338       BOUNDED_TO_ALLOCA (b, e, unescaped);
1339       url_unescape (unescaped);
1340       b = unescaped;
1341       e = unescaped + strlen (unescaped);
1342     }
1343
1344   /* Defang ".." when found as component of path.  Remember that path
1345      comes from the URL and might contain malicious input.  */
1346   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1347     {
1348       b = "%2E%2E";
1349       e = b + 6;
1350     }
1351
1352   /* Walk the PATHEL string and check how many characters we'll need
1353      to quote.  */
1354   quoted = 0;
1355   for (p = b; p < e; p++)
1356     if (FILE_CHAR_TEST (*p, mask))
1357       ++quoted;
1358
1359   /* Calculate the length of the output string.  e-b is the input
1360      string length.  Each quoted char introduces two additional
1361      characters in the string, hence 2*quoted.  */
1362   outlen = (e - b) + (2 * quoted);
1363   GROW (dest, outlen);
1364
1365   if (!quoted)
1366     {
1367       /* If there's nothing to quote, we can simply append the string
1368          without processing it again.  */
1369       memcpy (TAIL (dest), b, outlen);
1370     }
1371   else
1372     {
1373       char *q = TAIL (dest);
1374       for (p = b; p < e; p++)
1375         {
1376           if (!FILE_CHAR_TEST (*p, mask))
1377             *q++ = *p;
1378           else
1379             {
1380               unsigned char ch = *p;
1381               *q++ = '%';
1382               *q++ = XNUM_TO_DIGIT (ch >> 4);
1383               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1384             }
1385         }
1386       assert (q - TAIL (dest) == outlen);
1387     }
1388
1389   /* Perform inline case transformation if required.  */
1390   if (opt.restrict_files_case == restrict_lowercase
1391       || opt.restrict_files_case == restrict_uppercase)
1392     {
1393       char *q;
1394       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1395         {
1396           if (opt.restrict_files_case == restrict_lowercase)
1397             *q = c_tolower (*q);
1398           else
1399             *q = c_toupper (*q);
1400         }
1401     }
1402
1403   TAIL_INCR (dest, outlen);
1404 }
1405
1406 /* Append to DEST the directory structure that corresponds the
1407    directory part of URL's path.  For example, if the URL is
1408    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1409
1410    Each path element ("dir1" and "dir2" in the above example) is
1411    examined, url-unescaped, and re-escaped as file name element.
1412
1413    Additionally, it cuts as many directories from the path as
1414    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1415    will produce "bar" for the above example.  For 2 or more, it will
1416    produce "".
1417
1418    Each component of the path is quoted for use as file name.  */
1419
1420 static void
1421 append_dir_structure (const struct url *u, struct growable *dest)
1422 {
1423   char *pathel, *next;
1424   int cut = opt.cut_dirs;
1425
1426   /* Go through the path components, de-URL-quote them, and quote them
1427      (if necessary) as file names.  */
1428
1429   pathel = u->path;
1430   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1431     {
1432       if (cut-- > 0)
1433         continue;
1434       if (pathel == next)
1435         /* Ignore empty pathels.  */
1436         continue;
1437
1438       if (dest->tail)
1439         append_char ('/', dest);
1440       append_uri_pathel (pathel, next, true, dest);
1441     }
1442 }
1443
1444 /* Return a unique file name that matches the given URL as good as
1445    possible.  Does not create directories on the file system.  */
1446
1447 char *
1448 url_file_name (const struct url *u)
1449 {
1450   struct growable fnres;        /* stands for "file name result" */
1451
1452   const char *u_file, *u_query;
1453   char *fname, *unique;
1454
1455   fnres.base = NULL;
1456   fnres.size = 0;
1457   fnres.tail = 0;
1458
1459   /* Start with the directory prefix, if specified. */
1460   if (opt.dir_prefix)
1461     append_string (opt.dir_prefix, &fnres);
1462
1463   /* If "dirstruct" is turned on (typically the case with -r), add
1464      the host and port (unless those have been turned off) and
1465      directory structure.  */
1466   if (opt.dirstruct)
1467     {
1468       if (opt.protocol_directories)
1469         {
1470           if (fnres.tail)
1471             append_char ('/', &fnres);
1472           append_string (supported_schemes[u->scheme].name, &fnres);
1473         }
1474       if (opt.add_hostdir)
1475         {
1476           if (fnres.tail)
1477             append_char ('/', &fnres);
1478           if (0 != strcmp (u->host, ".."))
1479             append_string (u->host, &fnres);
1480           else
1481             /* Host name can come from the network; malicious DNS may
1482                allow ".." to be resolved, causing us to write to
1483                "../<file>".  Defang such host names.  */
1484             append_string ("%2E%2E", &fnres);
1485           if (u->port != scheme_default_port (u->scheme))
1486             {
1487               char portstr[24];
1488               number_to_string (portstr, u->port);
1489               append_char (FN_PORT_SEP, &fnres);
1490               append_string (portstr, &fnres);
1491             }
1492         }
1493
1494       append_dir_structure (u, &fnres);
1495     }
1496
1497   /* Add the file name. */
1498   if (fnres.tail)
1499     append_char ('/', &fnres);
1500   u_file = *u->file ? u->file : "index.html";
1501   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1502
1503   /* Append "?query" to the file name. */
1504   u_query = u->query && *u->query ? u->query : NULL;
1505   if (u_query)
1506     {
1507       append_char (FN_QUERY_SEP, &fnres);
1508       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1509     }
1510
1511   /* Zero-terminate the file name. */
1512   append_char ('\0', &fnres);
1513
1514   fname = fnres.base;
1515
1516   /* Check the cases in which the unique extensions are not used:
1517      1) Clobbering is turned off (-nc).
1518      2) Retrieval with regetting.
1519      3) Timestamping is used.
1520      4) Hierarchy is built.
1521
1522      The exception is the case when file does exist and is a
1523      directory (see `mkalldirs' for explanation).  */
1524
1525   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1526       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1527     return fname;
1528
1529   unique = unique_name (fname, true);
1530   if (unique != fname)
1531     xfree (fname);
1532   return unique;
1533 }
1534 \f
1535 /* Resolve "." and ".." elements of PATH by destructively modifying
1536    PATH and return true if PATH has been modified, false otherwise.
1537
1538    The algorithm is in spirit similar to the one described in rfc1808,
1539    although implemented differently, in one pass.  To recap, path
1540    elements containing only "." are removed, and ".." is taken to mean
1541    "back up one element".  Single leading and trailing slashes are
1542    preserved.
1543
1544    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1545    test examples are provided below.  If you change anything in this
1546    function, run test_path_simplify to make sure you haven't broken a
1547    test case.  */
1548
1549 static bool
1550 path_simplify (enum url_scheme scheme, char *path)
1551 {
1552   char *h = path;               /* hare */
1553   char *t = path;               /* tortoise */
1554   char *beg = path;
1555   char *end = strchr (path, '\0');
1556
1557   while (h < end)
1558     {
1559       /* Hare should be at the beginning of a path element. */
1560
1561       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1562         {
1563           /* Ignore "./". */
1564           h += 2;
1565         }
1566       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1567         {
1568           /* Handle "../" by retreating the tortoise by one path
1569              element -- but not past beggining.  */
1570           if (t > beg)
1571             {
1572               /* Move backwards until T hits the beginning of the
1573                  previous path element or the beginning of path. */
1574               for (--t; t > beg && t[-1] != '/'; t--)
1575                 ;
1576             }
1577           else if (scheme == SCHEME_FTP)
1578             {
1579               /* If we're at the beginning, copy the "../" literally
1580                  and move the beginning so a later ".." doesn't remove
1581                  it.  This violates RFC 3986; but we do it for FTP
1582                  anyway because there is otherwise no way to get at a
1583                  parent directory, when the FTP server drops us in a
1584                  non-root directory (which is not uncommon). */
1585               beg = t + 3;
1586               goto regular;
1587             }
1588           h += 3;
1589         }
1590       else
1591         {
1592         regular:
1593           /* A regular path element.  If H hasn't advanced past T,
1594              simply skip to the next path element.  Otherwise, copy
1595              the path element until the next slash.  */
1596           if (t == h)
1597             {
1598               /* Skip the path element, including the slash.  */
1599               while (h < end && *h != '/')
1600                 t++, h++;
1601               if (h < end)
1602                 t++, h++;
1603             }
1604           else
1605             {
1606               /* Copy the path element, including the final slash.  */
1607               while (h < end && *h != '/')
1608                 *t++ = *h++;
1609               if (h < end)
1610                 *t++ = *h++;
1611             }
1612         }
1613     }
1614
1615   if (t != h)
1616     *t = '\0';
1617
1618   return t != h;
1619 }
1620 \f
1621 /* Return the length of URL's path.  Path is considered to be
1622    terminated by one or more of the ?query or ;params or #fragment,
1623    depending on the scheme.  */
1624
1625 static const char *
1626 path_end (const char *url)
1627 {
1628   enum url_scheme scheme = url_scheme (url);
1629   const char *seps;
1630   if (scheme == SCHEME_INVALID)
1631     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1632   /* +2 to ignore the first two separators ':' and '/' */
1633   seps = init_seps (scheme) + 2;
1634   return strpbrk_or_eos (url, seps);
1635 }
1636
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))

/* Return a freshly allocated, zero-terminated string consisting of
   the first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  */

static char *
splice_link (const char *base, int span, const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI, freshly
   allocated.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is deliberately not run through path_simplify:
   url_parse has to simplify the path anyway, so doing it here would
   only add code and duplicate work.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with a scheme of its own stands alone.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return splice_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* Keep BASE up to its own fragment, if any.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *cut = strchr (base, '#');
      if (!cut)
        cut = base + strlen (base);
      return splice_link (base, cut - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE starts a double slash, replace
         from there; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return splice_link (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      bool seen_slash_slash = false;

      /* We're looking for the first slash, but want to skip over one
         leading double slash.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* SLASH is now the first / after "//", or the first slash
         altogether.  START_INSERT is where LINK goes; keep in mind
         that LINK begins with '/'.

           "foo"               no slash, no "//"  -> from BASE
           "foo/bar"           slash, no "//"     -> from BASE
           "http://foo"        "//", no slash     -> at END
           "http://something/" "//" and slash     -> at SLASH  */
      if (!seen_slash_slash)
        start_insert = base;
      else if (slash)
        start_insert = slash;
      else
        start_insert = end;

      return splice_link (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything
     after last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    bool need_explicit_slash = false;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');
    char *merge;
    int span;

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* The only slashes are those of "://", as in "http://host".
           LINK goes one past END, and the character at END's
           position is overwritten with a slash below.  */
        start_insert = end + 1;
        need_explicit_slash = true;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = splice_link (base, span, link, linklength);
    if (need_explicit_slash)
      merge[span - 1] = '/';
    return merge;
  }
}
1831 \f
/* Copy the NUL-terminated string S to the location P points at, then
   advance P past the copied bytes.  No terminating NUL is written and
   no bounds are checked: the caller must have reserved enough room.

   P must be a modifiable char pointer lvalue; it is evaluated twice.
   S is evaluated exactly once.  The temporaries carry an "append_"
   prefix and a trailing underscore so that an argument which happens
   to be called `len' (or similar) is not captured by the macro.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1843
1844 /* Recreate the URL string from the data in URL.
1845
1846    If HIDE is true (as it is when we're calling this on a URL we plan
1847    to print, but not when calling it to canonicalize a URL for use
1848    within the program), password will be hidden.  Unsafe characters in
1849    the URL will be quoted.  */
1850
1851 char *
1852 url_string (const struct url *url, enum url_auth_mode auth_mode)
1853 {
1854   int size;
1855   char *result, *p;
1856   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1857
1858   int scheme_port = supported_schemes[url->scheme].default_port;
1859   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1860   int fplen = full_path_length (url);
1861
1862   bool brackets_around_host;
1863
1864   assert (scheme_str != NULL);
1865
1866   /* Make sure the user name and password are quoted. */
1867   if (url->user)
1868     {
1869       if (auth_mode != URL_AUTH_HIDE)
1870         {
1871           quoted_user = url_escape_allow_passthrough (url->user);
1872           if (url->passwd)
1873             {
1874               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1875                 quoted_passwd = HIDDEN_PASSWORD;
1876               else
1877                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1878             }
1879         }
1880     }
1881
1882   /* In the unlikely event that the host name contains non-printable
1883      characters, quote it for displaying to the user.  */
1884   quoted_host = url_escape_allow_passthrough (url->host);
1885
1886   /* Undo the quoting of colons that URL escaping performs.  IPv6
1887      addresses may legally contain colons, and in that case must be
1888      placed in square brackets.  */
1889   if (quoted_host != url->host)
1890     unescape_single_char (quoted_host, ':');
1891   brackets_around_host = strchr (quoted_host, ':') != NULL;
1892
1893   size = (strlen (scheme_str)
1894           + strlen (quoted_host)
1895           + (brackets_around_host ? 2 : 0)
1896           + fplen
1897           + 1);
1898   if (url->port != scheme_port)
1899     size += 1 + numdigit (url->port);
1900   if (quoted_user)
1901     {
1902       size += 1 + strlen (quoted_user);
1903       if (quoted_passwd)
1904         size += 1 + strlen (quoted_passwd);
1905     }
1906
1907   p = result = xmalloc (size);
1908
1909   APPEND (p, scheme_str);
1910   if (quoted_user)
1911     {
1912       APPEND (p, quoted_user);
1913       if (quoted_passwd)
1914         {
1915           *p++ = ':';
1916           APPEND (p, quoted_passwd);
1917         }
1918       *p++ = '@';
1919     }
1920
1921   if (brackets_around_host)
1922     *p++ = '[';
1923   APPEND (p, quoted_host);
1924   if (brackets_around_host)
1925     *p++ = ']';
1926   if (url->port != scheme_port)
1927     {
1928       *p++ = ':';
1929       p = number_to_string (p, url->port);
1930     }
1931
1932   full_path_write (url, p);
1933   p += fplen;
1934   *p++ = '\0';
1935
1936   assert (p - result == size);
1937
1938   if (quoted_user && quoted_user != url->user)
1939     xfree (quoted_user);
1940   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1941       && quoted_passwd != url->passwd)
1942     xfree (quoted_passwd);
1943   if (quoted_host != url->host)
1944     xfree (quoted_host);
1945
1946   return result;
1947 }
1948 \f
1949 /* Return true if scheme a is similar to scheme b.
1950
1951    Schemes are similar if they are equal.  If SSL is supported, schemes
1952    are also similar if one is http (SCHEME_HTTP) and the other is https
1953    (SCHEME_HTTPS).  */
1954 bool
1955 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1956 {
1957   if (a == b)
1958     return true;
1959 #ifdef HAVE_SSL
1960   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1961       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1962     return true;
1963 #endif
1964   return false;
1965 }
1966 \f
/* Read one logical character from the start of the URL-escaped
   string STR.

   The character is stored in *C and the number of bytes of STR it
   occupies is returned:

     3 - STR starts with "%XX" (two hexadecimal digits) and the byte
         it encodes is not flagged by URL_RESERVED_CHAR; *C receives
         the decoded byte.
     1 - in every other case; *C receives STR[0] unchanged.  This
         covers ordinary characters, a '%' not followed by two hex
         digits, and escapes of reserved characters, which are left
         escaped so that they never compare equal to their literal
         form.

   STR must be non-NULL and non-empty, and C must be non-NULL.  The
   return value is never 0.  A string that ends in the middle of an
   escape is handled by the hex-digit test itself: the terminating
   NUL is not a hex digit, and the && chain stops before reading
   past it.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
      /* Reserved character: fall through and hand back the '%'.  */
    }

  *c = str[0];
  return 1;
}
2004
/* Compare the URL strings U1 and U2 for equivalence.

   Both strings are walked in parallel, one logical character at a
   time as delivered by getchar_from_escaped_string, so a "%XX"
   escape of a non-reserved character matches its literal form.
   Characters are compared after folding with c_tolower.  Returns
   true only when both strings are exhausted at the same time without
   a mismatch.  Neither argument may be NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  char left, right;
  int left_len, right_len;

  assert(u1 && u2);

  for (;;)
    {
      if (*u1 == '\0' || *u2 == '\0')
        break;

      left_len = getchar_from_escaped_string (u1, &left);
      if (left_len == 0)
        break;

      right_len = getchar_from_escaped_string (u2, &right);
      if (right_len == 0)
        break;

      if (c_tolower (left) != c_tolower (right))
        break;

      u1 += left_len;
      u2 += right_len;
    }

  /* Equal only if nothing is left over on either side.  */
  return *u1 == '\0' && *u2 == '\0';
}
2027 \f
2028 #ifdef TESTING
2029 /* Debugging and testing support for path_simplify. */
2030
#if 0
/* Debug: run path_simplify on a copy of PATH and return the copy.
   Useful for calling from the debugger.

   path_simplify takes the URL scheme as its first argument; this
   helper has no scheme parameter, so it always simplifies with
   SCHEME_HTTP rules.  The returned string is allocated with xstrdup
   and is never freed here.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2042
2043 static const char *
2044 run_test (char *test, char *expected_result, enum url_scheme scheme,
2045           bool expected_change)
2046 {
2047   char *test_copy = xstrdup (test);
2048   bool modified = path_simplify (scheme, test_copy);
2049
2050   if (0 != strcmp (test_copy, expected_result))
2051     {
2052       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2053               test, expected_result, test_copy);
2054       mu_assert ("", 0);
2055     }
2056   if (modified != expected_change)
2057     {
2058       if (expected_change)
2059         printf ("Expected modification with path_simplify(\"%s\").\n",
2060                 test);
2061       else
2062         printf ("Expected no modification with path_simplify(\"%s\").\n",
2063                 test);
2064     }
2065   xfree (test_copy);
2066   mu_assert ("", modified == expected_change);
2067   return NULL;
2068 }
2069
2070 const char *
2071 test_path_simplify (void)
2072 {
2073   static struct {
2074     char *test, *result;
2075     enum url_scheme scheme;
2076     bool should_modify;
2077   } tests[] = {
2078     { "",                       "",             SCHEME_HTTP, false },
2079     { ".",                      "",             SCHEME_HTTP, true },
2080     { "./",                     "",             SCHEME_HTTP, true },
2081     { "..",                     "",             SCHEME_HTTP, true },
2082     { "../",                    "",             SCHEME_HTTP, true },
2083     { "..",                     "..",           SCHEME_FTP,  false },
2084     { "../",                    "../",          SCHEME_FTP,  false },
2085     { "foo",                    "foo",          SCHEME_HTTP, false },
2086     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2087     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2088     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2089     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2090     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2091     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2092     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2093     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2094     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2095     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2096     { "foo/..",                 "",             SCHEME_HTTP, true },
2097     { "foo/../..",              "",             SCHEME_HTTP, true },
2098     { "foo/../../..",           "",             SCHEME_HTTP, true },
2099     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2100     { "foo/../..",              "..",           SCHEME_FTP,  true },
2101     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2102     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2103     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2104     { "./a/../b",               "b",            SCHEME_HTTP, true }
2105   };
2106   int i;
2107
2108   for (i = 0; i < countof (tests); i++)
2109     {
2110       const char *message;
2111       char *test = tests[i].test;
2112       char *expected_result = tests[i].result;
2113       enum url_scheme scheme = tests[i].scheme;
2114       bool  expected_change = tests[i].should_modify;
2115       message = run_test (test, expected_result, scheme, expected_change);
2116       if (message) return message;
2117     }
2118   return NULL;
2119 }
2120
2121 const char *
2122 test_append_uri_pathel()
2123 {
2124   int i;
2125   struct {
2126     char *original_url;
2127     char *input;
2128     bool escaped;
2129     char *expected_result;
2130   } test_array[] = {
2131     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2132   };
2133
2134   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2135     {
2136       struct growable dest;
2137       const char *p = test_array[i].input;
2138
2139       memset (&dest, 0, sizeof (dest));
2140
2141       append_string (test_array[i].original_url, &dest);
2142       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2143       append_char ('\0', &dest);
2144
2145       mu_assert ("test_append_uri_pathel: wrong result",
2146                  strcmp (dest.base, test_array[i].expected_result) == 0);
2147     }
2148
2149   return NULL;
2150 }
2151
2152 const char*
2153 test_are_urls_equal()
2154 {
2155   int i;
2156   struct {
2157     char *url1;
2158     char *url2;
2159     bool expected_result;
2160   } test_array[] = {
2161     { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
2162     { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2163     { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
2164     { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
2165     { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
2166     { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
2167   };
2168
2169   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2170     {
2171       mu_assert ("test_are_urls_equal: wrong result",
2172                  are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2173     }
2174
2175   return NULL;
2176 }
2177
2178 #endif /* TESTING */
2179
2180 /*
2181  * vim: et ts=2 sw=2
2182  */
2183