sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 #ifdef TESTING
  47 #include "test.h"
  48 #endif
  49
/* Flag bits stored in scheme_data.flags.  Each value is a distinct
   power of two, so the flags are combined with `|' and tested with
   `&'.  */
enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};
  56
/* Static description of one URL scheme; see supported_schemes for
   the instances.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme matches it case-insensitively against the start of a
     URL.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags: a bitwise OR of the scm_* values. */
  int flags;
};
  68
/* Supported schemes.  The table is indexed by enum url_scheme values
   (presumably in the same order as the enumerators declared in
   url.h -- confirm when adding a scheme).  It is not const because
   scheme_disable sets scm_disabled in the flags at run time.  The
   final entry has a NULL leading_string, which is what terminates
   the search loop in url_scheme.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
  81
  82 /* Forward declarations: */
  83
  84 static bool path_simplify (enum url_scheme, char *);
  85 \f
  86 /* Support for escaping and unescaping of URL strings.  */
  87
  88 /* Table of "reserved" and "unsafe" characters.  Those terms are
  89    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  90    specs, but the general idea remains.
  91
  92    A reserved character is the one that you can't decode without
  93    changing the meaning of the URL.  For example, you can't decode
  94    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  95    path components is different.  Non-reserved characters can be
  96    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  97    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  98    as recommended by rfc2396, and minus "~", which is very frequently
  99    used (and sometimes unrecognized as %7E by broken servers).
 100
 101    An unsafe character is the one that should be encoded when URLs are
 102    placed in foreign environments.  E.g. space and newline are unsafe
 103    in HTTP contexts because HTTP uses them as separator and line
 104    terminator, so they must be encoded to %20 and %0A respectively.
 105    "*" is unsafe in shell context, etc.
 106
 107    We determine whether a character is unsafe through static table
 108    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 109
/* Classification bits stored in urlchr_table.  A character may carry
   neither, either, or both of them.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};
 117
/* Test character C against MASK (a combination of urlchr_* bits).
   C is cast to unsigned char before indexing, so a negative `char'
   value selects an entry in the upper half of the table instead of
   indexing out of bounds.  The result is non-zero if any bit of MASK
   is set for C.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* One entry per 8-bit character code.  The first sixteen rows cover
   codes 0-127, eight per row, with the corresponding characters named
   in the trailing comment; the remaining rows mark every code from
   128 to 255 as unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* Codes 128-191. */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* Codes 192-255. */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
/* The shorthands are only meant for the initializer above. */
#undef R
#undef U
#undef RU
 159
 160 /* URL-unescape the string S.
 161
 162    This is done by transforming the sequences "%HH" to the character
 163    represented by the hexadecimal digits HH.  If % is not followed by
 164    two hexadecimal digits, it is inserted literally.
 165
 166    The transformation is done in place.  If you need the original
 167    string intact, make a copy before calling this function.  */
 168
 169 static void
 170 url_unescape (char *s)
 171 {
 172   char *t = s;                  /* t - tortoise */
 173   char *h = s;                  /* h - hare     */
 174
 175   for (; *h; h++, t++)
 176     {
 177       if (*h != '%')
 178         {
 179         copychar:
 180           *t = *h;
 181         }
 182       else
 183         {
 184           char c;
 185           /* Do nothing if '%' is not followed by two hex digits. */
 186           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 187             goto copychar;
 188           c = X2DIGITS_TO_NUM (h[1], h[2]);
 189           /* Don't unescape %00 because there is no way to insert it
 190              into a C string without effectively truncating it. */
 191           if (c == '\0')
 192             goto copychar;
 193           *t = c;
 194           h += 2;
 195         }
 196     }
 197   *t = '\0';
 198 }
 199
 200 /* The core of url_escape_* functions.  Escapes the characters that
 201    match the provided mask in urlchr_table.
 202
 203    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 204    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 205    allocated string will be returned in all cases.  */
 206
 207 static char *
 208 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 209 {
 210   const char *p1;
 211   char *p2, *newstr;
 212   int newlen;
 213   int addition = 0;
 214
 215   for (p1 = s; *p1; p1++)
 216     if (urlchr_test (*p1, mask))
 217       addition += 2;            /* Two more characters (hex digits) */
 218
 219   if (!addition)
 220     return allow_passthrough ? (char *)s : xstrdup (s);
 221
 222   newlen = (p1 - s) + addition;
 223   newstr = xmalloc (newlen + 1);
 224
 225   p1 = s;
 226   p2 = newstr;
 227   while (*p1)
 228     {
 229       /* Quote the characters that match the test mask. */
 230       if (urlchr_test (*p1, mask))
 231         {
 232           unsigned char c = *p1++;
 233           *p2++ = '%';
 234           *p2++ = XNUM_TO_DIGIT (c >> 4);
 235           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 236         }
 237       else
 238         *p2++ = *p1++;
 239     }
 240   assert (p2 - newstr == newlen);
 241   *p2 = '\0';
 242
 243   return newstr;
 244 }
 245
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  Because passthrough
   is disabled in the call below, the result is a new allocation even
   when S contains nothing that needs escaping.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, false);
}
 254
/* URL-escape the unsafe and reserved characters (see urlchr_table) in
   a given string, returning a freshly allocated string.  As with
   url_escape, a new allocation is returned even when S contains
   nothing that needs escaping.  */

char *
url_escape_unsafe_and_reserved (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
}
 263
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.  Callers must
   therefore compare the result against S before freeing it: only a
   result different from S is a fresh allocation.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, true);
}
 272 \f
 273 /* Decide whether the char at position P needs to be encoded.  (It is
 274    not enough to pass a single char *P because the function may need
 275    to inspect the surrounding context.)
 276
 277    Return true if the char should be escaped as %XX, false otherwise.  */
 278
 279 static inline bool
 280 char_needs_escaping (const char *p)
 281 {
 282   if (*p == '%')
 283     {
 284       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 285         return false;
 286       else
 287         /* Garbled %.. sequence: encode `%'. */
 288         return true;
 289     }
 290   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 291     return true;
 292   else
 293     return false;
 294 }
 295
 296 /* Translate a %-escaped (but possibly non-conformant) input string S
 297    into a %-escaped (and conformant) output string.  If no characters
 298    are encoded or decoded, return the same string S; otherwise, return
 299    a freshly allocated string with the new contents.
 300
 301    After a URL has been run through this function, the protocols that
 302    use `%' as the quote character can use the resulting string as-is,
 303    while those that don't can use url_unescape to get to the intended
 304    data.  This function is stable: once the input is transformed,
 305    further transformations of the result yield the same output.
 306
 307    Let's discuss why this function is needed.
 308
 309    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 310    a raw space character would mess up the HTTP request, it needs to
 311    be quoted, like this:
 312
 313        GET /abc%20def HTTP/1.0
 314
 315    It would appear that the unsafe chars need to be quoted, for
 316    example with url_escape.  But what if we're requested to download
 317    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 318    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 319    part of URL syntax, "%20" is the correct way to denote a literal
 320    space on the Wget command line.  This leads to the conclusion that
 321    in that case Wget should not call url_escape, but leave the `%20'
 322    as is.  This is clearly contradictory, but it only gets worse.
 323
 324    What if the requested URI is `abc%20 def'?  If we call url_escape,
 325    we end up with `/abc%2520%20def', which is almost certainly not
 326    intended.  If we don't call url_escape, we are left with the
 327    embedded space and cannot complete the request.  What the user
 328    meant was for Wget to request `/abc%20%20def', and this is where
 329    reencode_escapes kicks in.
 330
 331    Wget used to solve this by first decoding %-quotes, and then
 332    encoding all the "unsafe" characters found in the resulting string.
 333    This was wrong because it didn't preserve certain URL special
 334    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 335    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 336    whether we considered `+' reserved (it is).  One of these results
 337    is inevitable because by the second step we would lose information
 338    on whether the `+' was originally encoded or not.  Both results
 339    were wrong because in CGI parameters + means space, while %2B means
 340    literal plus.  reencode_escapes correctly translates the above to
 341    "a%2B+b", i.e. returns the original string.
 342
 343    This function uses a modified version of the algorithm originally
 344    proposed by Anon Sricharoenchai:
 345
 346    * Encode all "unsafe" characters, except those that are also
 347      "reserved", to %XX.  See urlchr_table for which characters are
 348      unsafe and reserved.
 349
 350    * Encode the "%" characters not followed by two hex digits to
 351      "%25".
 352
 353    * Pass through all other characters and %XX escapes as-is.  (Up to
 354      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 355      characters, but that was obtrusive and broke some servers.)
 356
 357    Anon's test case:
 358
 359    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 360    ->
 361    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 362
 363    Simpler test cases:
 364
 365    "foo bar"         -> "foo%20bar"
 366    "foo%20bar"       -> "foo%20bar"
 367    "foo %20bar"      -> "foo%20%20bar"
 368    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 369    "foo%25%20bar"    -> "foo%25%20bar"
 370    "foo%2%20bar"     -> "foo%252%20bar"
 371    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 372    "foo%2b+bar"      -> "foo%2b+bar"  */
 373
 374 static char *
 375 reencode_escapes (const char *s)
 376 {
 377   const char *p1;
 378   char *newstr, *p2;
 379   int oldlen, newlen;
 380
 381   int encode_count = 0;
 382
 383   /* First pass: inspect the string to see if there's anything to do,
 384      and to calculate the new length.  */
 385   for (p1 = s; *p1; p1++)
 386     if (char_needs_escaping (p1))
 387       ++encode_count;
 388
 389   if (!encode_count)
 390     /* The string is good as it is. */
 391     return (char *) s;          /* C const model sucks. */
 392
 393   oldlen = p1 - s;
 394   /* Each encoding adds two characters (hex digits).  */
 395   newlen = oldlen + 2 * encode_count;
 396   newstr = xmalloc (newlen + 1);
 397
 398   /* Second pass: copy the string to the destination address, encoding
 399      chars when needed.  */
 400   p1 = s;
 401   p2 = newstr;
 402
 403   while (*p1)
 404     if (char_needs_escaping (p1))
 405       {
 406         unsigned char c = *p1++;
 407         *p2++ = '%';
 408         *p2++ = XNUM_TO_DIGIT (c >> 4);
 409         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 410       }
 411     else
 412       *p2++ = *p1++;
 413
 414   *p2 = '\0';
 415   assert (p2 - newstr == newlen);
 416   return newstr;
 417 }
 418 \f
 419 /* Returns the scheme type if the scheme is supported, or
 420    SCHEME_INVALID if not.  */
 421
 422 enum url_scheme
 423 url_scheme (const char *url)
 424 {
 425   int i;
 426
 427   for (i = 0; supported_schemes[i].leading_string; i++)
 428     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 429                           strlen (supported_schemes[i].leading_string)))
 430       {
 431         if (!(supported_schemes[i].flags & scm_disabled))
 432           return (enum url_scheme) i;
 433         else
 434           return SCHEME_INVALID;
 435       }
 436
 437   return SCHEME_INVALID;
 438 }
 439
 440 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 441
 442 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 443    currently implemented, it returns true if URL begins with
 444    [-+a-zA-Z0-9]+: .  */
 445
 446 bool
 447 url_has_scheme (const char *url)
 448 {
 449   const char *p = url;
 450
 451   /* The first char must be a scheme char. */
 452   if (!*p || !SCHEME_CHAR (*p))
 453     return false;
 454   ++p;
 455   /* Followed by 0 or more scheme chars. */
 456   while (*p && SCHEME_CHAR (*p))
 457     ++p;
 458   /* Terminated by ':'. */
 459   return *p == ':';
 460 }
 461
/* Return the default port recorded for SCHEME in supported_schemes.
   SCHEME is used directly as the table index, without a range
   check.  */
int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}
 467
/* Mark SCHEME as disabled by setting scm_disabled in its table entry.
   From then on url_scheme returns SCHEME_INVALID for URLs that use
   it.  Nothing in this file clears the flag again.  */
void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].flags |= scm_disabled;
}
 473
 474 /* Skip the username and password, if present in the URL.  The
 475    function should *not* be called with the complete URL, but with the
 476    portion after the scheme.
 477
 478    If no username and password are found, return URL.  */
 479
 480 static const char *
 481 url_skip_credentials (const char *url)
 482 {
 483   /* Look for '@' that comes before terminators, such as '/', '?',
 484      '#', or ';'.  */
 485   const char *p = (const char *)strpbrk (url, "@/?#;");
 486   if (!p || *p != '@')
 487     return url;
 488   return p + 1;
 489 }
 490
 491 /* Parse credentials contained in [BEG, END).  The region is expected
 492    to have come from a URL and is unescaped.  */
 493
 494 static bool
 495 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 496 {
 497   char *colon;
 498   const char *userend;
 499
 500   if (beg == end)
 501     return false;               /* empty user name */
 502
 503   colon = memchr (beg, ':', end - beg);
 504   if (colon == beg)
 505     return false;               /* again empty user name */
 506
 507   if (colon)
 508     {
 509       *passwd = strdupdelim (colon + 1, end);
 510       userend = colon;
 511       url_unescape (*passwd);
 512     }
 513   else
 514     {
 515       *passwd = NULL;
 516       userend = end;
 517     }
 518   *user = strdupdelim (beg, userend);
 519   url_unescape (*user);
 520   return true;
 521 }
 522
 523 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 524    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 525    like this:
 526
 527    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 528    www.foo.com[:port]            -> http://www.foo.com[:port]
 529
 530    FTP shorthands look like this:
 531
 532    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 533    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 534
 535    If the URL needs not or cannot be rewritten, return NULL.  */
 536
 537 char *
 538 rewrite_shorthand_url (const char *url)
 539 {
 540   const char *p;
 541   char *ret;
 542
 543   if (url_scheme (url) != SCHEME_INVALID)
 544     return NULL;
 545
 546   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 547      latter Netscape.  */
 548   p = strpbrk (url, ":/");
 549   if (p == url)
 550     return NULL;
 551
 552   /* If we're looking at "://", it means the URL uses a scheme we
 553      don't support, which may include "https" when compiled without
 554      SSL support.  Don't bogusly rewrite such URLs.  */
 555   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 556     return NULL;
 557
 558   if (p && *p == ':')
 559     {
 560       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 561          special case of http port number ("localhost:10000").  */
 562       int digits = strspn (p + 1, "0123456789");
 563       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 564         goto http;
 565
 566       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 567       ret = aprintf ("ftp://%s", url);
 568       ret[6 + (p - url)] = '/';
 569     }
 570   else
 571     {
 572     http:
 573       /* Just prepend "http://" to URL. */
 574       ret = aprintf ("http://%s", url);
 575     }
 576   return ret;
 577 }
 578 \f
 579 static void split_path (const char *, char **, char **);
 580
 581 /* Like strpbrk, with the exception that it returns the pointer to the
 582    terminating zero (end-of-string aka "eos") if no matching character
 583    is found.  */
 584
 585 static inline char *
 586 strpbrk_or_eos (const char *s, const char *accept)
 587 {
 588   char *p = strpbrk (s, accept);
 589   if (!p)
 590     p = strchr (s, '\0');
 591   return p;
 592 }
 593
 594 /* Turn STR into lowercase; return true if a character was actually
 595    changed. */
 596
 597 static bool
 598 lowercase_str (char *str)
 599 {
 600   bool changed = false;
 601   for (; *str; str++)
 602     if (c_isupper (*str))
 603       {
 604         changed = true;
 605         *str = c_tolower (*str);
 606       }
 607   return changed;
 608 }
 609
 610 static const char *
 611 init_seps (enum url_scheme scheme)
 612 {
 613   static char seps[8] = ":/";
 614   char *p = seps + 2;
 615   int flags = supported_schemes[scheme].flags;
 616
 617   if (flags & scm_has_params)
 618     *p++ = ';';
 619   if (flags & scm_has_query)
 620     *p++ = '?';
 621   if (flags & scm_has_fragment)
 622     *p++ = '#';
 623   *p++ = '\0';
 624   return seps;
 625 }
 626
/* Messages for the error codes produced by url_parse.  Each PE_*
   code is defined right above its message so that the code and the
   array index stay in step; a new error must be appended with the
   next free number.  The strings are marked with N_() here and passed
   through _() in url_error.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"), /* support for format token only here */
#define PE_MISSING_SCHEME               2
  N_("Scheme missing"),
#define PE_INVALID_HOST_NAME            3
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              4
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            5
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    6
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           7
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         8
  N_("Invalid IPv6 numeric address")
};
 647
 648 /* Parse a URL.
 649
 650    Return a new struct url if successful, NULL on error.  In case of
 651    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 652    error code. */
 653 struct url *
 654 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 655 {
 656   struct url *u;
 657   const char *p;
 658   bool path_modified, host_modified;
 659
 660   enum url_scheme scheme;
 661   const char *seps;
 662
 663   const char *uname_b,     *uname_e;
 664   const char *host_b,      *host_e;
 665   const char *path_b,      *path_e;
 666   const char *params_b,    *params_e;
 667   const char *query_b,     *query_e;
 668   const char *fragment_b,  *fragment_e;
 669
 670   int port;
 671   char *user = NULL, *passwd = NULL;
 672
 673   const char *url_encoded = NULL;
 674   char *new_url = NULL;
 675
 676   int error_code;
 677
 678   scheme = url_scheme (url);
 679   if (scheme == SCHEME_INVALID)
 680     {
 681       if (url_has_scheme (url))
 682         error_code = PE_UNSUPPORTED_SCHEME;
 683       else
 684         error_code = PE_MISSING_SCHEME;
 685       goto error;
 686     }
 687
 688   if (iri && iri->utf8_encode)
 689     {
 690       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
 691       if (!iri->utf8_encode)
 692         new_url = NULL;
 693       else
 694         iri->orig_url = xstrdup (url);
 695     }
 696
 697   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
 698   if (percent_encode)
 699     url_encoded = reencode_escapes (new_url ? new_url : url);
 700   else
 701     url_encoded = new_url ? new_url : url;
 702
 703   p = url_encoded;
 704
 705   if (new_url && url_encoded != new_url)
 706     xfree (new_url);
 707
 708   p += strlen (supported_schemes[scheme].leading_string);
 709   uname_b = p;
 710   p = url_skip_credentials (p);
 711   uname_e = p;
 712
 713   /* scheme://user:pass@host[:port]... */
 714   /*                    ^              */
 715
 716   /* We attempt to break down the URL into the components path,
 717      params, query, and fragment.  They are ordered like this:
 718
 719        scheme://host[:port][/path][;params][?query][#fragment]  */
 720
 721   path_b     = path_e     = NULL;
 722   params_b   = params_e   = NULL;
 723   query_b    = query_e    = NULL;
 724   fragment_b = fragment_e = NULL;
 725
 726   /* Initialize separators for optional parts of URL, depending on the
 727      scheme.  For example, FTP has params, and HTTP and HTTPS have
 728      query string and fragment. */
 729   seps = init_seps (scheme);
 730
 731   host_b = p;
 732
 733   if (*p == '[')
 734     {
 735       /* Handle IPv6 address inside square brackets.  Ideally we'd
 736          just look for the terminating ']', but rfc2732 mandates
 737          rejecting invalid IPv6 addresses.  */
 738
 739       /* The address begins after '['. */
 740       host_b = p + 1;
 741       host_e = strchr (host_b, ']');
 742
 743       if (!host_e)
 744         {
 745           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 746           goto error;
 747         }
 748
 749 #ifdef ENABLE_IPV6
 750       /* Check if the IPv6 address is valid. */
 751       if (!is_valid_ipv6_address(host_b, host_e))
 752         {
 753           error_code = PE_INVALID_IPV6_ADDRESS;
 754           goto error;
 755         }
 756
 757       /* Continue parsing after the closing ']'. */
 758       p = host_e + 1;
 759 #else
 760       error_code = PE_IPV6_NOT_SUPPORTED;
 761       goto error;
 762 #endif
 763
 764       /* The closing bracket must be followed by a separator or by the
 765          null char.  */
 766       /* http://[::1]... */
 767       /*             ^   */
 768       if (!strchr (seps, *p))
 769         {
 770           /* Trailing garbage after []-delimited IPv6 address. */
 771           error_code = PE_INVALID_HOST_NAME;
 772           goto error;
 773         }
 774     }
 775   else
 776     {
 777       p = strpbrk_or_eos (p, seps);
 778       host_e = p;
 779     }
 780   ++seps;                       /* advance to '/' */
 781
 782   if (host_b == host_e)
 783     {
 784       error_code = PE_INVALID_HOST_NAME;
 785       goto error;
 786     }
 787
 788   port = scheme_default_port (scheme);
 789   if (*p == ':')
 790     {
 791       const char *port_b, *port_e, *pp;
 792
 793       /* scheme://host:port/tralala */
 794       /*              ^             */
 795       ++p;
 796       port_b = p;
 797       p = strpbrk_or_eos (p, seps);
 798       port_e = p;
 799
 800       /* Allow empty port, as per rfc2396. */
 801       if (port_b != port_e)
 802         for (port = 0, pp = port_b; pp < port_e; pp++)
 803           {
 804             if (!c_isdigit (*pp))
 805               {
 806                 /* http://host:12randomgarbage/blah */
 807                 /*               ^                  */
 808                 error_code = PE_BAD_PORT_NUMBER;
 809                 goto error;
 810               }
 811             port = 10 * port + (*pp - '0');
 812             /* Check for too large port numbers here, before we have
 813                a chance to overflow on bogus port values.  */
 814             if (port > 0xffff)
 815               {
 816                 error_code = PE_BAD_PORT_NUMBER;
 817                 goto error;
 818               }
 819           }
 820     }
 821   /* Advance to the first separator *after* '/' (either ';' or '?',
 822      depending on the scheme).  */
 823   ++seps;
 824
 825   /* Get the optional parts of URL, each part being delimited by
 826      current location and the position of the next separator.  */
 827 #define GET_URL_PART(sepchar, var) do {                         \
 828   if (*p == sepchar)                                            \
 829     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 830   ++seps;                                                       \
 831 } while (0)
 832
 833   GET_URL_PART ('/', path);
 834   if (supported_schemes[scheme].flags & scm_has_params)
 835     GET_URL_PART (';', params);
 836   if (supported_schemes[scheme].flags & scm_has_query)
 837     GET_URL_PART ('?', query);
 838   if (supported_schemes[scheme].flags & scm_has_fragment)
 839     GET_URL_PART ('#', fragment);
 840
 841 #undef GET_URL_PART
 842   assert (*p == 0);
 843
 844   if (uname_b != uname_e)
 845     {
 846       /* http://user:pass@host */
 847       /*        ^         ^    */
 848       /*     uname_b   uname_e */
 849       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 850         {
 851           error_code = PE_INVALID_USER_NAME;
 852           goto error;
 853         }
 854     }
 855
 856   u = xnew0 (struct url);
 857   u->scheme = scheme;
 858   u->host   = strdupdelim (host_b, host_e);
 859   u->port   = port;
 860   u->user   = user;
 861   u->passwd = passwd;
 862
 863   u->path = strdupdelim (path_b, path_e);
 864   path_modified = path_simplify (scheme, u->path);
 865   split_path (u->path, &u->dir, &u->file);
 866
 867   host_modified = lowercase_str (u->host);
 868
 869   /* Decode %HH sequences in host name.  This is important not so much
 870      to support %HH sequences in host names (which other browser
 871      don't), but to support binary characters (which will have been
 872      converted to %HH by reencode_escapes).  */
 873   if (strchr (u->host, '%'))
 874     {
 875       url_unescape (u->host);
 876       host_modified = true;
 877
 878       /* Apply IDNA regardless of iri->utf8_encode status */
 879       if (opt.enable_iri && iri)
 880         {
 881           char *new = idn_encode (iri, u->host);
 882           if (new)
 883             {
 884               xfree (u->host);
 885               u->host = new;
 886               host_modified = true;
 887             }
 888         }
 889     }
 890
 891   if (params_b)
 892     u->params = strdupdelim (params_b, params_e);
 893   if (query_b)
 894     u->query = strdupdelim (query_b, query_e);
 895   if (fragment_b)
 896     u->fragment = strdupdelim (fragment_b, fragment_e);
 897
 898   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 899     {
 900       /* If we suspect that a transformation has rendered what
 901          url_string might return different from URL_ENCODED, rebuild
 902          u->url using url_string.  */
 903       u->url = url_string (u, URL_AUTH_SHOW);
 904
 905       if (url_encoded != url)
 906         xfree ((char *) url_encoded);
 907     }
 908   else
 909     {
 910       if (url_encoded == url)
 911         u->url = xstrdup (url);
 912       else
 913         u->url = (char *) url_encoded;
 914     }
 915
 916   return u;
 917
 918  error:
 919   /* Cleanup in case of error: */
 920   if (url_encoded && url_encoded != url)
 921     xfree ((char *) url_encoded);
 922
 923   /* Transmit the error code to the caller, if the caller wants to
 924      know.  */
 925   if (error)
 926     *error = error_code;
 927   return NULL;
 928 }
 929
 930 /* Return the error message string from ERROR_CODE, which should have
 931    been retrieved from url_parse.  The error message is translated.  */
 932
 933 char *
 934 url_error (const char *url, int error_code)
 935 {
 936   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 937
 938   if (error_code == PE_UNSUPPORTED_SCHEME)
 939     {
 940       char *error, *p;
 941       char *scheme = xstrdup (url);
 942       assert (url_has_scheme (url));
 943
 944       if ((p = strchr (scheme, ':')))
 945         *p = '\0';
 946       if (!strcasecmp (scheme, "https"))
 947         error = aprintf (_("HTTPS support not compiled in"));
 948       else
 949         error = aprintf (_(parse_errors[error_code]), quote (scheme));
 950       xfree (scheme);
 951
 952       return error;
 953     }
 954   else
 955     return xstrdup (_(parse_errors[error_code]));
 956 }
 957
 958 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 959    expected to be URL-escaped.
 960
 961    The path is split into directory (the part up to the last slash)
 962    and file (the part after the last slash), which are subsequently
 963    unescaped.  Examples:
 964
 965    PATH                 DIR           FILE
 966    "foo/bar/baz"        "foo/bar"     "baz"
 967    "foo/bar/"           "foo/bar"     ""
 968    "foo"                ""            "foo"
 969    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 970
 971    DIR and FILE are freshly allocated.  */
 972
 973 static void
 974 split_path (const char *path, char **dir, char **file)
 975 {
 976   char *last_slash = strrchr (path, '/');
 977   if (!last_slash)
 978     {
 979       *dir = xstrdup ("");
 980       *file = xstrdup (path);
 981     }
 982   else
 983     {
 984       *dir = strdupdelim (path, last_slash);
 985       *file = xstrdup (last_slash + 1);
 986     }
 987   url_unescape (*dir);
 988   url_unescape (*file);
 989 }
 990
 991 /* Note: URL's "full path" is the path with the query string and
 992    params appended.  The "fragment" (#foo) is intentionally ignored,
 993    but that might be changed.  For example, if the original URL was
 994    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 995    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 996
 997 /* Return the length of the full path, without the terminating
 998    zero.  */
 999
1000 static int
1001 full_path_length (const struct url *url)
1002 {
1003   int len = 0;
1004
1005 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1006
1007   FROB (path);
1008   FROB (params);
1009   FROB (query);
1010
1011 #undef FROB
1012
1013   return len;
1014 }
1015
1016 /* Write out the full path. */
1017
1018 static void
1019 full_path_write (const struct url *url, char *where)
1020 {
1021 #define FROB(el, chr) do {                      \
1022   char *f_el = url->el;                         \
1023   if (f_el) {                                   \
1024     int l = strlen (f_el);                      \
1025     *where++ = chr;                             \
1026     memcpy (where, f_el, l);                    \
1027     where += l;                                 \
1028   }                                             \
1029 } while (0)
1030
1031   FROB (path, '/');
1032   FROB (params, ';');
1033   FROB (query, '?');
1034
1035 #undef FROB
1036 }
1037
/* Build the "full path" of URL in a buffer obtained from xmalloc and
   return it, zero-terminated.  For instance, with u->path "foo/bar"
   and u->query "param=value" the result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *result = xmalloc (len + 1);

  /* full_path_write fills exactly LEN characters and leaves the
     terminator to us.  */
  result[len] = '\0';
  full_path_write (url, result);

  return result;
}
1053
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   Every "%XY" in STR, where X and Y are the digits XNUM_TO_DIGIT
   yields for the high and low nibble of CHR, is replaced in place by
   CHR itself; everything else is copied through unchanged.  STR can
   only get shorter.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set
     cannot hand a negative value to XNUM_TO_DIGIT on platforms where
     plain char is signed.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */
  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating zero, so h[2] is only
         read when h[1] is a real character.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1077
1078 /* Escape unsafe and reserved characters, except for the slash
1079    characters.  */
1080
1081 static char *
1082 url_escape_dir (const char *dir)
1083 {
1084   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1085   if (newdir == dir)
1086     return (char *)dir;
1087
1088   unescape_single_char (newdir, '/');
1089   return newdir;
1090 }
1091
1092 /* Sync u->path and u->url with u->dir and u->file.  Called after
1093    u->file or u->dir have been changed, typically by the FTP code.  */
1094
1095 static void
1096 sync_path (struct url *u)
1097 {
1098   char *newpath, *efile, *edir;
1099
1100   xfree (u->path);
1101
1102   /* u->dir and u->file are not escaped.  URL-escape them before
1103      reassembling them into u->path.  That way, if they contain
1104      separators like '?' or even if u->file contains slashes, the
1105      path will be correctly assembled.  (u->file can contain slashes
1106      if the URL specifies it with %2f, or if an FTP server returns
1107      it.)  */
1108   edir = url_escape_dir (u->dir);
1109   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1110
1111   if (!*edir)
1112     newpath = xstrdup (efile);
1113   else
1114     {
1115       int dirlen = strlen (edir);
1116       int filelen = strlen (efile);
1117
1118       /* Copy "DIR/FILE" to newpath. */
1119       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1120       memcpy (p, edir, dirlen);
1121       p += dirlen;
1122       *p++ = '/';
1123       memcpy (p, efile, filelen);
1124       p += filelen;
1125       *p = '\0';
1126     }
1127
1128   u->path = newpath;
1129
1130   if (edir != u->dir)
1131     xfree (edir);
1132   if (efile != u->file)
1133     xfree (efile);
1134
1135   /* Regenerate u->url as well.  */
1136   xfree (u->url);
1137   u->url = url_string (u, URL_AUTH_SHOW);
1138 }
1139
1140 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1141    This way we can sync u->path and u->url when they get changed.  */
1142
1143 void
1144 url_set_dir (struct url *url, const char *newdir)
1145 {
1146   xfree (url->dir);
1147   url->dir = xstrdup (newdir);
1148   sync_path (url);
1149 }
1150
1151 void
1152 url_set_file (struct url *url, const char *newfile)
1153 {
1154   xfree (url->file);
1155   url->file = xstrdup (newfile);
1156   sync_path (url);
1157 }
1158
1159 void
1160 url_free (struct url *url)
1161 {
1162   xfree (url->host);
1163   xfree (url->path);
1164   xfree (url->url);
1165
1166   xfree_null (url->params);
1167   xfree_null (url->query);
1168   xfree_null (url->fragment);
1169   xfree_null (url->user);
1170   xfree_null (url->passwd);
1171
1172   xfree (url->dir);
1173   xfree (url->file);
1174
1175   xfree (url);
1176 }
1177 \f
1178 /* Create all the necessary directories for PATH (a file).  Calls
1179    make_directory internally.  */
1180 int
1181 mkalldirs (const char *path)
1182 {
1183   const char *p;
1184   char *t;
1185   struct_stat st;
1186   int res;
1187
1188   p = path + strlen (path);
1189   for (; *p != '/' && p != path; p--)
1190     ;
1191
1192   /* Don't create if it's just a file.  */
1193   if ((p == path) && (*p != '/'))
1194     return 0;
1195   t = strdupdelim (path, p);
1196
1197   /* Check whether the directory exists.  */
1198   if ((stat (t, &st) == 0))
1199     {
1200       if (S_ISDIR (st.st_mode))
1201         {
1202           xfree (t);
1203           return 0;
1204         }
1205       else
1206         {
1207           /* If the dir exists as a file name, remove it first.  This
1208              is *only* for Wget to work with buggy old CERN http
1209              servers.  Here is the scenario: When Wget tries to
1210              retrieve a directory without a slash, e.g.
1211              http://foo/bar (bar being a directory), CERN server will
1212              not redirect it too http://foo/bar/ -- it will generate a
1213              directory listing containing links to bar/file1,
1214              bar/file2, etc.  Wget will lose because it saves this
1215              HTML listing to a file `bar', so it cannot create the
1216              directory.  To work around this, if the file of the same
1217              name exists, we just remove it and create the directory
1218              anyway.  */
1219           DEBUGP (("Removing %s because of directory danger!\n", t));
1220           unlink (t);
1221         }
1222     }
1223   res = make_directory (t);
1224   if (res != 0)
1225     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1226   xfree (t);
1227   return res;
1228 }
1229 \f
1230 /* Functions for constructing the file name out of URL components.  */
1231
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset in BASE where appending continues */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion so that compound
   expressions (e.g. a conditional) are grouped as the caller wrote
   them.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1260
1261 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1262    terminated.  */
1263
1264 static void
1265 append_string (const char *str, struct growable *dest)
1266 {
1267   int l = strlen (str);
1268   GROW (dest, l);
1269   memcpy (TAIL (dest), str, l);
1270   TAIL_INCR (dest, l);
1271 }
1272
1273 /* Append CH to DEST.  For example, append_char (0, DEST)
1274    zero-terminates DEST.  */
1275
1276 static void
1277 append_char (char ch, struct growable *dest)
1278 {
1279   GROW (dest, 1);
1280   *TAIL (dest) = ch;
1281   TAIL_INCR (dest, 1);
1282 }
1283
/* Bit flags describing why a character may not appear verbatim in a
   file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the flags in MASK.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Abbreviations that keep the table rows short: */
#define UX (filechr_not_unix)
#define WN (filechr_not_windows)
#define CT (filechr_control)

#define UX_WN (UX | WN)
#define UX_WN_CT (UX | WN | CT)

/* Table of characters unsafe under various conditions (see above),
   indexed by character code.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
  UX_WN_CT, CT, CT, CT,   CT, CT, CT, CT,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  CT, CT, CT, CT,   CT, CT, CT, CT,         /* BS  HT  LF  VT   FF  CR  SO  SI  */
  CT, CT, CT, CT,   CT, CT, CT, CT,         /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  CT, CT, CT, CT,   CT, CT, CT, CT,         /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  WN, 0,    0,  0,  0,  0,          /* SP  !   "   #    $   %   &   '   */
  0,  0,  WN, 0,    0,  0,  0,  UX_WN,      /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* 0   1   2   3    4   5   6   7   */
  0,  0,  WN, 0,    WN, 0,  WN, WN,         /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,    WN, 0,  0,  0,          /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,    0,  0,  0,  0,          /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,    WN, 0,  0,  CT,         /* x   y   z   {    |   }   ~   DEL */

  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT, /* 128-143 */
  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,                 /* 240-255 */
};
#undef UX
#undef WN
#undef CT
#undef UX_WN
#undef UX_WN_CT
1341
/* FN_PORT_SEP goes between host and port in file names when the port
   is not the scheme's default.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP goes between the file name and the URL query.  It is
   '@' when file names are restricted for Windows, which cannot
   handle '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1352
1353 /* Quote path element, characters in [b, e), as file name, and append
1354    the quoted string to DEST.  Each character is quoted as per
1355    file_unsafe_char and the corresponding table.
1356
1357    If ESCAPED is true, the path element is considered to be
1358    URL-escaped and will be unescaped prior to inspection.  */
1359
1360 static void
1361 append_uri_pathel (const char *b, const char *e, bool escaped,
1362                    struct growable *dest)
1363 {
1364   const char *p;
1365   int quoted, outlen;
1366
1367   int mask;
1368   if (opt.restrict_files_os == restrict_unix)
1369     mask = filechr_not_unix;
1370   else
1371     mask = filechr_not_windows;
1372   if (opt.restrict_files_ctrl)
1373     mask |= filechr_control;
1374
1375   /* Copy [b, e) to PATHEL and URL-unescape it. */
1376   if (escaped)
1377     {
1378       char *unescaped;
1379       BOUNDED_TO_ALLOCA (b, e, unescaped);
1380       url_unescape (unescaped);
1381       b = unescaped;
1382       e = unescaped + strlen (unescaped);
1383     }
1384
1385   /* Defang ".." when found as component of path.  Remember that path
1386      comes from the URL and might contain malicious input.  */
1387   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1388     {
1389       b = "%2E%2E";
1390       e = b + 6;
1391     }
1392
1393   /* Walk the PATHEL string and check how many characters we'll need
1394      to quote.  */
1395   quoted = 0;
1396   for (p = b; p < e; p++)
1397     if (FILE_CHAR_TEST (*p, mask))
1398       ++quoted;
1399
1400   /* Calculate the length of the output string.  e-b is the input
1401      string length.  Each quoted char introduces two additional
1402      characters in the string, hence 2*quoted.  */
1403   outlen = (e - b) + (2 * quoted);
1404   GROW (dest, outlen);
1405
1406   if (!quoted)
1407     {
1408       /* If there's nothing to quote, we can simply append the string
1409          without processing it again.  */
1410       memcpy (TAIL (dest), b, outlen);
1411     }
1412   else
1413     {
1414       char *q = TAIL (dest);
1415       for (p = b; p < e; p++)
1416         {
1417           if (!FILE_CHAR_TEST (*p, mask))
1418             *q++ = *p;
1419           else
1420             {
1421               unsigned char ch = *p;
1422               *q++ = '%';
1423               *q++ = XNUM_TO_DIGIT (ch >> 4);
1424               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1425             }
1426         }
1427       assert (q - TAIL (dest) == outlen);
1428     }
1429
1430   /* Perform inline case transformation if required.  */
1431   if (opt.restrict_files_case == restrict_lowercase
1432       || opt.restrict_files_case == restrict_uppercase)
1433     {
1434       char *q;
1435       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1436         {
1437           if (opt.restrict_files_case == restrict_lowercase)
1438             *q = c_tolower (*q);
1439           else
1440             *q = c_toupper (*q);
1441         }
1442     }
1443
1444   TAIL_INCR (dest, outlen);
1445 }
1446
1447 /* Append to DEST the directory structure that corresponds the
1448    directory part of URL's path.  For example, if the URL is
1449    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1450
1451    Each path element ("dir1" and "dir2" in the above example) is
1452    examined, url-unescaped, and re-escaped as file name element.
1453
1454    Additionally, it cuts as many directories from the path as
1455    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1456    will produce "bar" for the above example.  For 2 or more, it will
1457    produce "".
1458
1459    Each component of the path is quoted for use as file name.  */
1460
1461 static void
1462 append_dir_structure (const struct url *u, struct growable *dest)
1463 {
1464   char *pathel, *next;
1465   int cut = opt.cut_dirs;
1466
1467   /* Go through the path components, de-URL-quote them, and quote them
1468      (if necessary) as file names.  */
1469
1470   pathel = u->path;
1471   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1472     {
1473       if (cut-- > 0)
1474         continue;
1475       if (pathel == next)
1476         /* Ignore empty pathels.  */
1477         continue;
1478
1479       if (dest->tail)
1480         append_char ('/', dest);
1481       append_uri_pathel (pathel, next, true, dest);
1482     }
1483 }
1484
1485 /* Return a unique file name that matches the given URL as good as
1486    possible.  Does not create directories on the file system.  */
1487
1488 char *
1489 url_file_name (const struct url *u)
1490 {
1491   struct growable fnres;        /* stands for "file name result" */
1492
1493   const char *u_file, *u_query;
1494   char *fname, *unique;
1495   char *index_filename = "index.html"; /* The default index file is index.html */
1496
1497   fnres.base = NULL;
1498   fnres.size = 0;
1499   fnres.tail = 0;
1500
1501   /* If an alternative index file was defined, change index_filename */
1502   if (opt.default_page)
1503     index_filename = opt.default_page;
1504
1505
1506   /* Start with the directory prefix, if specified. */
1507   if (opt.dir_prefix)
1508     append_string (opt.dir_prefix, &fnres);
1509
1510   /* If "dirstruct" is turned on (typically the case with -r), add
1511      the host and port (unless those have been turned off) and
1512      directory structure.  */
1513   if (opt.dirstruct)
1514     {
1515       if (opt.protocol_directories)
1516         {
1517           if (fnres.tail)
1518             append_char ('/', &fnres);
1519           append_string (supported_schemes[u->scheme].name, &fnres);
1520         }
1521       if (opt.add_hostdir)
1522         {
1523           if (fnres.tail)
1524             append_char ('/', &fnres);
1525           if (0 != strcmp (u->host, ".."))
1526             append_string (u->host, &fnres);
1527           else
1528             /* Host name can come from the network; malicious DNS may
1529                allow ".." to be resolved, causing us to write to
1530                "../<file>".  Defang such host names.  */
1531             append_string ("%2E%2E", &fnres);
1532           if (u->port != scheme_default_port (u->scheme))
1533             {
1534               char portstr[24];
1535               number_to_string (portstr, u->port);
1536               append_char (FN_PORT_SEP, &fnres);
1537               append_string (portstr, &fnres);
1538             }
1539         }
1540
1541       append_dir_structure (u, &fnres);
1542     }
1543
1544   /* Add the file name. */
1545   if (fnres.tail)
1546     append_char ('/', &fnres);
1547   u_file = *u->file ? u->file : index_filename;
1548   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1549
1550   /* Append "?query" to the file name. */
1551   u_query = u->query && *u->query ? u->query : NULL;
1552   if (u_query)
1553     {
1554       append_char (FN_QUERY_SEP, &fnres);
1555       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1556     }
1557
1558   /* Zero-terminate the file name. */
1559   append_char ('\0', &fnres);
1560
1561   fname = fnres.base;
1562
1563   /* Check the cases in which the unique extensions are not used:
1564      1) Clobbering is turned off (-nc).
1565      2) Retrieval with regetting.
1566      3) Timestamping is used.
1567      4) Hierarchy is built.
1568
1569      The exception is the case when file does exist and is a
1570      directory (see `mkalldirs' for explanation).  */
1571
1572   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1573       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1574     return fname;
1575
1576   unique = unique_name (fname, true);
1577   if (unique != fname)
1578     xfree (fname);
1579   return unique;
1580 }
1581 \f
1582 /* Resolve "." and ".." elements of PATH by destructively modifying
1583    PATH and return true if PATH has been modified, false otherwise.
1584
1585    The algorithm is in spirit similar to the one described in rfc1808,
1586    although implemented differently, in one pass.  To recap, path
1587    elements containing only "." are removed, and ".." is taken to mean
1588    "back up one element".  Single leading and trailing slashes are
1589    preserved.
1590
1591    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1592    test examples are provided below.  If you change anything in this
1593    function, run test_path_simplify to make sure you haven't broken a
1594    test case.  */
1595
1596 static bool
1597 path_simplify (enum url_scheme scheme, char *path)
1598 {
1599   char *h = path;               /* hare */
1600   char *t = path;               /* tortoise */
1601   char *beg = path;
1602   char *end = strchr (path, '\0');
1603
1604   while (h < end)
1605     {
1606       /* Hare should be at the beginning of a path element. */
1607
1608       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1609         {
1610           /* Ignore "./". */
1611           h += 2;
1612         }
1613       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1614         {
1615           /* Handle "../" by retreating the tortoise by one path
1616              element -- but not past beggining.  */
1617           if (t > beg)
1618             {
1619               /* Move backwards until T hits the beginning of the
1620                  previous path element or the beginning of path. */
1621               for (--t; t > beg && t[-1] != '/'; t--)
1622                 ;
1623             }
1624           else if (scheme == SCHEME_FTP)
1625             {
1626               /* If we're at the beginning, copy the "../" literally
1627                  and move the beginning so a later ".." doesn't remove
1628                  it.  This violates RFC 3986; but we do it for FTP
1629                  anyway because there is otherwise no way to get at a
1630                  parent directory, when the FTP server drops us in a
1631                  non-root directory (which is not uncommon). */
1632               beg = t + 3;
1633               goto regular;
1634             }
1635           h += 3;
1636         }
1637       else
1638         {
1639         regular:
1640           /* A regular path element.  If H hasn't advanced past T,
1641              simply skip to the next path element.  Otherwise, copy
1642              the path element until the next slash.  */
1643           if (t == h)
1644             {
1645               /* Skip the path element, including the slash.  */
1646               while (h < end && *h != '/')
1647                 t++, h++;
1648               if (h < end)
1649                 t++, h++;
1650             }
1651           else
1652             {
1653               /* Copy the path element, including the final slash.  */
1654               while (h < end && *h != '/')
1655                 *t++ = *h++;
1656               if (h < end)
1657                 *t++ = *h++;
1658             }
1659         }
1660     }
1661
1662   if (t != h)
1663     *t = '\0';
1664
1665   return t != h;
1666 }
1667 \f
1668 /* Return the length of URL's path.  Path is considered to be
1669    terminated by one or more of the ?query or ;params or #fragment,
1670    depending on the scheme.  */
1671
1672 static const char *
1673 path_end (const char *url)
1674 {
1675   enum url_scheme scheme = url_scheme (url);
1676   const char *seps;
1677   if (scheme == SCHEME_INVALID)
1678     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1679   /* +2 to ignore the first two separators ':' and '/' */
1680   seps = init_seps (scheme) + 2;
1681   return strpbrk_or_eos (url, seps);
1682 }
1683
/* Yield a pointer to the last occurrence of character C within the
   range [b, e), or NULL when C does not occur there.  */
#define find_last_char(b, e, c) (memrchr ((b), (c), (e) - (b)))
1687
1688 /* Merge BASE with LINK and return the resulting URI.
1689
1690    Either of the URIs may be absolute or relative, complete with the
1691    host name, or path only.  This tries to reasonably handle all
1692    foreseeable cases.  It only employs minimal URL parsing, without
1693    knowledge of the specifics of schemes.
1694
1695    I briefly considered making this function call path_simplify after
1696    the merging process, as rfc1738 seems to suggest.  This is a bad
1697    idea for several reasons: 1) it complexifies the code, and 2)
1698    url_parse has to simplify path anyway, so it's wasteful to boot.  */
1699
1700 char *
1701 uri_merge (const char *base, const char *link)
1702 {
1703   int linklength;
1704   const char *end;
1705   char *merge;
1706
1707   if (url_has_scheme (link))
1708     return xstrdup (link);
1709
1710   /* We may not examine BASE past END. */
1711   end = path_end (base);
1712   linklength = strlen (link);
1713
1714   if (!*link)
1715     {
1716       /* Empty LINK points back to BASE, query string and all. */
1717       return xstrdup (base);
1718     }
1719   else if (*link == '?')
1720     {
1721       /* LINK points to the same location, but changes the query
1722          string.  Examples: */
1723       /* uri_merge("path",         "?new") -> "path?new"     */
1724       /* uri_merge("path?foo",     "?new") -> "path?new"     */
1725       /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1726       /* uri_merge("path#foo",     "?new") -> "path?new"     */
1727       int baselength = end - base;
1728       merge = xmalloc (baselength + linklength + 1);
1729       memcpy (merge, base, baselength);
1730       memcpy (merge + baselength, link, linklength);
1731       merge[baselength + linklength] = '\0';
1732     }
1733   else if (*link == '#')
1734     {
1735       /* uri_merge("path",         "#new") -> "path#new"     */
1736       /* uri_merge("path#foo",     "#new") -> "path#new"     */
1737       /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1738       /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1739       int baselength;
1740       const char *end1 = strchr (base, '#');
1741       if (!end1)
1742         end1 = base + strlen (base);
1743       baselength = end1 - base;
1744       merge = xmalloc (baselength + linklength + 1);
1745       memcpy (merge, base, baselength);
1746       memcpy (merge + baselength, link, linklength);
1747       merge[baselength + linklength] = '\0';
1748     }
1749   else if (*link == '/' && *(link + 1) == '/')
1750     {
1751       /* LINK begins with "//" and so is a net path: we need to
1752          replace everything after (and including) the double slash
1753          with LINK. */
1754
1755       /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1756       /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1757       /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1758
1759       int span;
1760       const char *slash;
1761       const char *start_insert;
1762
1763       /* Look for first slash. */
1764       slash = memchr (base, '/', end - base);
1765       /* If found slash and it is a double slash, then replace
1766          from this point, else default to replacing from the
1767          beginning.  */
1768       if (slash && *(slash + 1) == '/')
1769         start_insert = slash;
1770       else
1771         start_insert = base;
1772
1773       span = start_insert - base;
1774       merge = xmalloc (span + linklength + 1);
1775       if (span)
1776         memcpy (merge, base, span);
1777       memcpy (merge + span, link, linklength);
1778       merge[span + linklength] = '\0';
1779     }
1780   else if (*link == '/')
1781     {
1782       /* LINK is an absolute path: we need to replace everything
1783          after (and including) the FIRST slash with LINK.
1784
1785          So, if BASE is "http://host/whatever/foo/bar", and LINK is
1786          "/qux/xyzzy", our result should be
1787          "http://host/qux/xyzzy".  */
1788       int span;
1789       const char *slash;
1790       const char *start_insert = NULL; /* for gcc to shut up. */
1791       const char *pos = base;
1792       bool seen_slash_slash = false;
1793       /* We're looking for the first slash, but want to ignore
1794          double slash. */
1795     again:
1796       slash = memchr (pos, '/', end - pos);
1797       if (slash && !seen_slash_slash)
1798         if (*(slash + 1) == '/')
1799           {
1800             pos = slash + 2;
1801             seen_slash_slash = true;
1802             goto again;
1803           }
1804
1805       /* At this point, SLASH is the location of the first / after
1806          "//", or the first slash altogether.  START_INSERT is the
1807          pointer to the location where LINK will be inserted.  When
1808          examining the last two examples, keep in mind that LINK
1809          begins with '/'. */
1810
1811       if (!slash && !seen_slash_slash)
1812         /* example: "foo" */
1813         /*           ^    */
1814         start_insert = base;
1815       else if (!slash && seen_slash_slash)
1816         /* example: "http://foo" */
1817         /*                     ^ */
1818         start_insert = end;
1819       else if (slash && !seen_slash_slash)
1820         /* example: "foo/bar" */
1821         /*           ^        */
1822         start_insert = base;
1823       else if (slash && seen_slash_slash)
1824         /* example: "http://something/" */
1825         /*                           ^  */
1826         start_insert = slash;
1827
1828       span = start_insert - base;
1829       merge = xmalloc (span + linklength + 1);
1830       if (span)
1831         memcpy (merge, base, span);
1832       memcpy (merge + span, link, linklength);
1833       merge[span + linklength] = '\0';
1834     }
1835   else
1836     {
1837       /* LINK is a relative URL: we need to replace everything
1838          after last slash (possibly empty) with LINK.
1839
1840          So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1841          our result should be "whatever/foo/qux/xyzzy".  */
1842       bool need_explicit_slash = false;
1843       int span;
1844       const char *start_insert;
1845       const char *last_slash = find_last_char (base, end, '/');
1846       if (!last_slash)
1847         {
1848           /* No slash found at all.  Replace what we have with LINK. */
1849           start_insert = base;
1850         }
1851       else if (last_slash && last_slash >= base + 2
1852                && last_slash[-2] == ':' && last_slash[-1] == '/')
1853         {
1854           /* example: http://host"  */
1855           /*                      ^ */
1856           start_insert = end + 1;
1857           need_explicit_slash = true;
1858         }
1859       else
1860         {
1861           /* example: "whatever/foo/bar" */
1862           /*                        ^    */
1863           start_insert = last_slash + 1;
1864         }
1865
1866       span = start_insert - base;
1867       merge = xmalloc (span + linklength + 1);
1868       if (span)
1869         memcpy (merge, base, span);
1870       if (need_explicit_slash)
1871         merge[span - 1] = '/';
1872       memcpy (merge + span, link, linklength);
1873       merge[span + linklength] = '\0';
1874     }
1875
1876   return merge;
1877 }
1878 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating NUL is written.

   S is evaluated exactly once and P must be a modifiable pointer
   lvalue.  The length is kept in a size_t so it is not truncated, and
   the locals carry a trailing underscore so they cannot capture a
   caller's variable.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1890
1891 /* Recreate the URL string from the data in URL.
1892
1893    If HIDE is true (as it is when we're calling this on a URL we plan
1894    to print, but not when calling it to canonicalize a URL for use
1895    within the program), password will be hidden.  Unsafe characters in
1896    the URL will be quoted.  */
1897
1898 char *
1899 url_string (const struct url *url, enum url_auth_mode auth_mode)
1900 {
1901   int size;
1902   char *result, *p;
1903   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1904
1905   int scheme_port = supported_schemes[url->scheme].default_port;
1906   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1907   int fplen = full_path_length (url);
1908
1909   bool brackets_around_host;
1910
1911   assert (scheme_str != NULL);
1912
1913   /* Make sure the user name and password are quoted. */
1914   if (url->user)
1915     {
1916       if (auth_mode != URL_AUTH_HIDE)
1917         {
1918           quoted_user = url_escape_allow_passthrough (url->user);
1919           if (url->passwd)
1920             {
1921               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1922                 quoted_passwd = HIDDEN_PASSWORD;
1923               else
1924                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1925             }
1926         }
1927     }
1928
1929   /* In the unlikely event that the host name contains non-printable
1930      characters, quote it for displaying to the user.  */
1931   quoted_host = url_escape_allow_passthrough (url->host);
1932
1933   /* Undo the quoting of colons that URL escaping performs.  IPv6
1934      addresses may legally contain colons, and in that case must be
1935      placed in square brackets.  */
1936   if (quoted_host != url->host)
1937     unescape_single_char (quoted_host, ':');
1938   brackets_around_host = strchr (quoted_host, ':') != NULL;
1939
1940   size = (strlen (scheme_str)
1941           + strlen (quoted_host)
1942           + (brackets_around_host ? 2 : 0)
1943           + fplen
1944           + 1);
1945   if (url->port != scheme_port)
1946     size += 1 + numdigit (url->port);
1947   if (quoted_user)
1948     {
1949       size += 1 + strlen (quoted_user);
1950       if (quoted_passwd)
1951         size += 1 + strlen (quoted_passwd);
1952     }
1953
1954   p = result = xmalloc (size);
1955
1956   APPEND (p, scheme_str);
1957   if (quoted_user)
1958     {
1959       APPEND (p, quoted_user);
1960       if (quoted_passwd)
1961         {
1962           *p++ = ':';
1963           APPEND (p, quoted_passwd);
1964         }
1965       *p++ = '@';
1966     }
1967
1968   if (brackets_around_host)
1969     *p++ = '[';
1970   APPEND (p, quoted_host);
1971   if (brackets_around_host)
1972     *p++ = ']';
1973   if (url->port != scheme_port)
1974     {
1975       *p++ = ':';
1976       p = number_to_string (p, url->port);
1977     }
1978
1979   full_path_write (url, p);
1980   p += fplen;
1981   *p++ = '\0';
1982
1983   assert (p - result == size);
1984
1985   if (quoted_user && quoted_user != url->user)
1986     xfree (quoted_user);
1987   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1988       && quoted_passwd != url->passwd)
1989     xfree (quoted_passwd);
1990   if (quoted_host != url->host)
1991     xfree (quoted_host);
1992
1993   return result;
1994 }
1995 \f
1996 /* Return true if scheme a is similar to scheme b.
1997
1998    Schemes are similar if they are equal.  If SSL is supported, schemes
1999    are also similar if one is http (SCHEME_HTTP) and the other is https
2000    (SCHEME_HTTPS).  */
2001 bool
2002 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2003 {
2004   if (a == b)
2005     return true;
2006 #ifdef HAVE_SSL
2007   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2008       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2009     return true;
2010 #endif
2011   return false;
2012 }
2013 \f
/* Fetch one logical character from the possibly %-escaped string STR.

   The character is stored in *C and the number of bytes of STR that
   it occupies is returned:

     - "%XY", with X and Y hex digits, whose decoded value does not
       satisfy URL_RESERVED_CHAR: the decoded character, length 3;
     - a '%' that is not followed by two hex digits, or whose decoded
       value is a reserved character: a literal '%', length 1 (the
       characters after it are then consumed one by one by later
       calls);
     - anything else: the character itself, length 1.

   STR must be non-NULL and non-empty, and C must be non-NULL.  The
   return value is always 1 or 3, never 0.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The hex-digit tests double as the end-of-string guard: NUL is not
     a hex digit, and && stops before str[2] is read when str[1] fails
     the test.  No separate check for a truncated escape is needed.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
      /* An escaped reserved character is left escaped: fall through
         and hand back the '%' itself.  */
    }

  *c = str[0];
  return 1;
}
2051
/* Compare the URL strings U1 and U2 one logical character at a time.

   Each side is read with getchar_from_escaped_string, so a %-escape
   may count as a single character, and the two characters are
   compared after c_tolower.  The walk stops at the first mismatch, at
   the end of either string, or if the reader reports a length of 0.
   Returns true only when both strings have been consumed completely.
   Both arguments must be non-NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert(u1 && u2);

  while (*left && *right)
    {
      char lch, rch;
      int lstep, rstep;

      lstep = getchar_from_escaped_string (left, &lch);
      if (!lstep)
        break;

      rstep = getchar_from_escaped_string (right, &rch);
      if (!rstep)
        break;

      if (c_tolower (lch) != c_tolower (rch))
        break;

      left += lstep;
      right += rstep;
    }

  return *left == 0 && *right == 0;
}
2074 \f
2075 #ifdef TESTING
/* Debugging and testing support for path_simplify, append_uri_pathel
   and are_urls_equal.  */
2077
#if 0
/* Debug: run path_simplify on a copy of PATH made with xstrdup and
   return that copy; PATH itself is not modified.  Useful for calling
   from the debugger.

   path_simplify takes the scheme as its first argument; SCHEME_HTTP is
   used here.  Edit the call by hand to inspect another scheme.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2089
2090 static const char *
2091 run_test (char *test, char *expected_result, enum url_scheme scheme,
2092           bool expected_change)
2093 {
2094   char *test_copy = xstrdup (test);
2095   bool modified = path_simplify (scheme, test_copy);
2096
2097   if (0 != strcmp (test_copy, expected_result))
2098     {
2099       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2100               test, expected_result, test_copy);
2101       mu_assert ("", 0);
2102     }
2103   if (modified != expected_change)
2104     {
2105       if (expected_change)
2106         printf ("Expected modification with path_simplify(\"%s\").\n",
2107                 test);
2108       else
2109         printf ("Expected no modification with path_simplify(\"%s\").\n",
2110                 test);
2111     }
2112   xfree (test_copy);
2113   mu_assert ("", modified == expected_change);
2114   return NULL;
2115 }
2116
2117 const char *
2118 test_path_simplify (void)
2119 {
2120   static struct {
2121     char *test, *result;
2122     enum url_scheme scheme;
2123     bool should_modify;
2124   } tests[] = {
2125     { "",                       "",             SCHEME_HTTP, false },
2126     { ".",                      "",             SCHEME_HTTP, true },
2127     { "./",                     "",             SCHEME_HTTP, true },
2128     { "..",                     "",             SCHEME_HTTP, true },
2129     { "../",                    "",             SCHEME_HTTP, true },
2130     { "..",                     "..",           SCHEME_FTP,  false },
2131     { "../",                    "../",          SCHEME_FTP,  false },
2132     { "foo",                    "foo",          SCHEME_HTTP, false },
2133     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2134     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2135     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2136     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2137     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2138     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2139     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2140     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2141     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2142     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2143     { "foo/..",                 "",             SCHEME_HTTP, true },
2144     { "foo/../..",              "",             SCHEME_HTTP, true },
2145     { "foo/../../..",           "",             SCHEME_HTTP, true },
2146     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2147     { "foo/../..",              "..",           SCHEME_FTP,  true },
2148     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2149     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2150     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2151     { "./a/../b",               "b",            SCHEME_HTTP, true }
2152   };
2153   int i;
2154
2155   for (i = 0; i < countof (tests); i++)
2156     {
2157       const char *message;
2158       char *test = tests[i].test;
2159       char *expected_result = tests[i].result;
2160       enum url_scheme scheme = tests[i].scheme;
2161       bool  expected_change = tests[i].should_modify;
2162       message = run_test (test, expected_result, scheme, expected_change);
2163       if (message) return message;
2164     }
2165   return NULL;
2166 }
2167
2168 const char *
2169 test_append_uri_pathel()
2170 {
2171   int i;
2172   struct {
2173     char *original_url;
2174     char *input;
2175     bool escaped;
2176     char *expected_result;
2177   } test_array[] = {
2178     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2179   };
2180
2181   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2182     {
2183       struct growable dest;
2184       const char *p = test_array[i].input;
2185
2186       memset (&dest, 0, sizeof (dest));
2187
2188       append_string (test_array[i].original_url, &dest);
2189       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2190       append_char ('\0', &dest);
2191
2192       mu_assert ("test_append_uri_pathel: wrong result",
2193                  strcmp (dest.base, test_array[i].expected_result) == 0);
2194     }
2195
2196   return NULL;
2197 }
2198
/* Unit test for are_urls_equal.

   Each table row holds two URL strings and the verdict expected from
   are_urls_equal.  The rows cover identical URLs, differing paths and
   hosts, an escaped character that must match its literal form ("%7e"
   versus "~"), paths of different length, and an escaped slash
   ("%2f") that must NOT match a literal '/'.  Returns NULL on success;
   a wrong verdict makes mu_assert return its message.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
    {
      bool actual = are_urls_equal (test_array[i].url1, test_array[i].url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 actual == test_array[i].expected_result);
    }

  return NULL;
}
2224
2225 #endif /* TESTING */
2226
2227 /*
2228  * vim: et ts=2 sw=2
2229  */
2230