sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 #ifdef TESTING
  47 #include "test.h"
  48 #endif
  49
  50 enum {
  51   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  52   scm_has_params = 2,           /* whether scheme has ;params */
  53   scm_has_query = 4,            /* whether scheme has ?query */
  54   scm_has_fragment = 8          /* whether scheme has #fragment */
  55 };
  56
  57 struct scheme_data
  58 {
  59   /* Short name of the scheme, such as "http" or "ftp". */
  60   const char *name;
  61   /* Leading string that identifies the scheme, such as "https://". */
  62   const char *leading_string;
  63   /* Default port of the scheme when none is specified. */
  64   int default_port;
  65   /* Various flags. */
  66   int flags;
  67 };
  68
  69 /* Supported schemes: */
  70 static struct scheme_data supported_schemes[] =
  71 {
  72   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  73 #ifdef HAVE_SSL
  74   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  75 #endif
  76   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  77
  78   /* SCHEME_INVALID */
  79   { NULL,       NULL,       -1,                 0 }
  80 };
  81
  82 /* Forward declarations: */
  83
  84 static bool path_simplify (enum url_scheme, char *);
  85 \f
  86 /* Support for escaping and unescaping of URL strings.  */
  87
  88 /* Table of "reserved" and "unsafe" characters.  Those terms are
  89    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  90    specs, but the general idea remains.
  91
  92    A reserved character is the one that you can't decode without
  93    changing the meaning of the URL.  For example, you can't decode
  94    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  95    path components is different.  Non-reserved characters can be
  96    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  97    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  98    as recommended by rfc2396, and minus "~", which is very frequently
  99    used (and sometimes unrecognized as %7E by broken servers).
 100
 101    An unsafe character is the one that should be encoded when URLs are
 102    placed in foreign environments.  E.g. space and newline are unsafe
 103    in HTTP contexts because HTTP uses them as separator and line
 104    terminator, so they must be encoded to %20 and %0A respectively.
 105    "*" is unsafe in shell context, etc.
 106
 107    We determine whether a character is unsafe through static table
 108    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 109
 110 enum {
 111   /* rfc1738 reserved chars + "$" and ",".  */
 112   urlchr_reserved = 1,
 113
 114   /* rfc1738 unsafe chars, plus non-printables.  */
 115   urlchr_unsafe   = 2
 116 };
 117
 118 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 119 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 120 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 121
 122 /* Shorthands for the table: */
 123 #define R  urlchr_reserved
 124 #define U  urlchr_unsafe
 125 #define RU R|U
 126
 127 static const unsigned char urlchr_table[256] =
 128 {
 129   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 130   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 131   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 133   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 134   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 136   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 137  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 140   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 141   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 142   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 143   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 144   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 145
 146   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150
 151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 153   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155 };
 156 #undef R
 157 #undef U
 158 #undef RU
 159
 160 /* URL-unescape the string S.
 161
 162    This is done by transforming the sequences "%HH" to the character
 163    represented by the hexadecimal digits HH.  If % is not followed by
 164    two hexadecimal digits, it is inserted literally.
 165
 166    The transformation is done in place.  If you need the original
 167    string intact, make a copy before calling this function.  */
 168
 169 static void
 170 url_unescape (char *s)
 171 {
 172   char *t = s;                  /* t - tortoise */
 173   char *h = s;                  /* h - hare     */
 174
 175   for (; *h; h++, t++)
 176     {
 177       if (*h != '%')
 178         {
 179         copychar:
 180           *t = *h;
 181         }
 182       else
 183         {
 184           char c;
 185           /* Do nothing if '%' is not followed by two hex digits. */
 186           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 187             goto copychar;
 188           c = X2DIGITS_TO_NUM (h[1], h[2]);
 189           /* Don't unescape %00 because there is no way to insert it
 190              into a C string without effectively truncating it. */
 191           if (c == '\0')
 192             goto copychar;
 193           *t = c;
 194           h += 2;
 195         }
 196     }
 197   *t = '\0';
 198 }
 199
 200 /* The core of url_escape_* functions.  Escapes the characters that
 201    match the provided mask in urlchr_table.
 202
 203    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 204    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 205    allocated string will be returned in all cases.  */
 206
 207 static char *
 208 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 209 {
 210   const char *p1;
 211   char *p2, *newstr;
 212   int newlen;
 213   int addition = 0;
 214
 215   for (p1 = s; *p1; p1++)
 216     if (urlchr_test (*p1, mask))
 217       addition += 2;            /* Two more characters (hex digits) */
 218
 219   if (!addition)
 220     return allow_passthrough ? (char *)s : xstrdup (s);
 221
 222   newlen = (p1 - s) + addition;
 223   newstr = xmalloc (newlen + 1);
 224
 225   p1 = s;
 226   p2 = newstr;
 227   while (*p1)
 228     {
 229       /* Quote the characters that match the test mask. */
 230       if (urlchr_test (*p1, mask))
 231         {
 232           unsigned char c = *p1++;
 233           *p2++ = '%';
 234           *p2++ = XNUM_TO_DIGIT (c >> 4);
 235           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 236         }
 237       else
 238         *p2++ = *p1++;
 239     }
 240   assert (p2 - newstr == newlen);
 241   *p2 = '\0';
 242
 243   return newstr;
 244 }
 245
 246 /* URL-escape the unsafe characters (see urlchr_table) in a given
 247    string, returning a freshly allocated string.  */
 248
 249 char *
 250 url_escape (const char *s)
 251 {
 252   return url_escape_1 (s, urlchr_unsafe, false);
 253 }
 254
 255 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
 256    a given string, returning a freshly allocated string.  */
 257
 258 char *
 259 url_escape_unsafe_and_reserved (const char *s)
 260 {
 261   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
 262 }
 263
 264 /* URL-escape the unsafe characters (see urlchr_table) in a given
 265    string.  If no characters are unsafe, S is returned.  */
 266
 267 static char *
 268 url_escape_allow_passthrough (const char *s)
 269 {
 270   return url_escape_1 (s, urlchr_unsafe, true);
 271 }
 272 \f
 273 /* Decide whether the char at position P needs to be encoded.  (It is
 274    not enough to pass a single char *P because the function may need
 275    to inspect the surrounding context.)
 276
 277    Return true if the char should be escaped as %XX, false otherwise.  */
 278
 279 static inline bool
 280 char_needs_escaping (const char *p)
 281 {
 282   if (*p == '%')
 283     {
 284       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 285         return false;
 286       else
 287         /* Garbled %.. sequence: encode `%'. */
 288         return true;
 289     }
 290   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 291     return true;
 292   else
 293     return false;
 294 }
 295
 296 /* Translate a %-escaped (but possibly non-conformant) input string S
 297    into a %-escaped (and conformant) output string.  If no characters
 298    are encoded or decoded, return the same string S; otherwise, return
 299    a freshly allocated string with the new contents.
 300
 301    After a URL has been run through this function, the protocols that
 302    use `%' as the quote character can use the resulting string as-is,
 303    while those that don't can use url_unescape to get to the intended
 304    data.  This function is stable: once the input is transformed,
 305    further transformations of the result yield the same output.
 306
 307    Let's discuss why this function is needed.
 308
 309    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 310    a raw space character would mess up the HTTP request, it needs to
 311    be quoted, like this:
 312
 313        GET /abc%20def HTTP/1.0
 314
 315    It would appear that the unsafe chars need to be quoted, for
 316    example with url_escape.  But what if we're requested to download
 317    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 318    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 319    part of URL syntax, "%20" is the correct way to denote a literal
 320    space on the Wget command line.  This leads to the conclusion that
 321    in that case Wget should not call url_escape, but leave the `%20'
 322    as is.  This is clearly contradictory, but it only gets worse.
 323
 324    What if the requested URI is `abc%20 def'?  If we call url_escape,
 325    we end up with `/abc%2520%20def', which is almost certainly not
 326    intended.  If we don't call url_escape, we are left with the
 327    embedded space and cannot complete the request.  What the user
 328    meant was for Wget to request `/abc%20%20def', and this is where
 329    reencode_escapes kicks in.
 330
 331    Wget used to solve this by first decoding %-quotes, and then
 332    encoding all the "unsafe" characters found in the resulting string.
 333    This was wrong because it didn't preserve certain URL special
 334    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 335    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 336    whether we considered `+' reserved (it is).  One of these results
 337    is inevitable because by the second step we would lose information
 338    on whether the `+' was originally encoded or not.  Both results
 339    were wrong because in CGI parameters + means space, while %2B means
 340    literal plus.  reencode_escapes correctly translates the above to
 341    "a%2B+b", i.e. returns the original string.
 342
 343    This function uses a modified version of the algorithm originally
 344    proposed by Anon Sricharoenchai:
 345
 346    * Encode all "unsafe" characters, except those that are also
 347      "reserved", to %XX.  See urlchr_table for which characters are
 348      unsafe and reserved.
 349
 350    * Encode the "%" characters not followed by two hex digits to
 351      "%25".
 352
 353    * Pass through all other characters and %XX escapes as-is.  (Up to
 354      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 355      characters, but that was obtrusive and broke some servers.)
 356
 357    Anon's test case:
 358
 359    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 360    ->
 361    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 362
 363    Simpler test cases:
 364
 365    "foo bar"         -> "foo%20bar"
 366    "foo%20bar"       -> "foo%20bar"
 367    "foo %20bar"      -> "foo%20%20bar"
 368    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 369    "foo%25%20bar"    -> "foo%25%20bar"
 370    "foo%2%20bar"     -> "foo%252%20bar"
 371    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 372    "foo%2b+bar"      -> "foo%2b+bar"  */
 373
 374 static char *
 375 reencode_escapes (const char *s)
 376 {
 377   const char *p1;
 378   char *newstr, *p2;
 379   int oldlen, newlen;
 380
 381   int encode_count = 0;
 382
 383   /* First pass: inspect the string to see if there's anything to do,
 384      and to calculate the new length.  */
 385   for (p1 = s; *p1; p1++)
 386     if (char_needs_escaping (p1))
 387       ++encode_count;
 388
 389   if (!encode_count)
 390     /* The string is good as it is. */
 391     return (char *) s;          /* C const model sucks. */
 392
 393   oldlen = p1 - s;
 394   /* Each encoding adds two characters (hex digits).  */
 395   newlen = oldlen + 2 * encode_count;
 396   newstr = xmalloc (newlen + 1);
 397
 398   /* Second pass: copy the string to the destination address, encoding
 399      chars when needed.  */
 400   p1 = s;
 401   p2 = newstr;
 402
 403   while (*p1)
 404     if (char_needs_escaping (p1))
 405       {
 406         unsigned char c = *p1++;
 407         *p2++ = '%';
 408         *p2++ = XNUM_TO_DIGIT (c >> 4);
 409         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 410       }
 411     else
 412       *p2++ = *p1++;
 413
 414   *p2 = '\0';
 415   assert (p2 - newstr == newlen);
 416   return newstr;
 417 }
 418 \f
 419 /* Returns the scheme type if the scheme is supported, or
 420    SCHEME_INVALID if not.  */
 421
 422 enum url_scheme
 423 url_scheme (const char *url)
 424 {
 425   int i;
 426
 427   for (i = 0; supported_schemes[i].leading_string; i++)
 428     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 429                           strlen (supported_schemes[i].leading_string)))
 430       {
 431         if (!(supported_schemes[i].flags & scm_disabled))
 432           return (enum url_scheme) i;
 433         else
 434           return SCHEME_INVALID;
 435       }
 436
 437   return SCHEME_INVALID;
 438 }
 439
 440 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 441
 442 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 443    currently implemented, it returns true if URL begins with
 444    [-+a-zA-Z0-9]+: .  */
 445
 446 bool
 447 url_has_scheme (const char *url)
 448 {
 449   const char *p = url;
 450
 451   /* The first char must be a scheme char. */
 452   if (!*p || !SCHEME_CHAR (*p))
 453     return false;
 454   ++p;
 455   /* Followed by 0 or more scheme chars. */
 456   while (*p && SCHEME_CHAR (*p))
 457     ++p;
 458   /* Terminated by ':'. */
 459   return *p == ':';
 460 }
 461
 462 int
 463 scheme_default_port (enum url_scheme scheme)
 464 {
 465   return supported_schemes[scheme].default_port;
 466 }
 467
 468 void
 469 scheme_disable (enum url_scheme scheme)
 470 {
 471   supported_schemes[scheme].flags |= scm_disabled;
 472 }
 473
 474 /* Skip the username and password, if present in the URL.  The
 475    function should *not* be called with the complete URL, but with the
 476    portion after the scheme.
 477
 478    If no username and password are found, return URL.  */
 479
 480 static const char *
 481 url_skip_credentials (const char *url)
 482 {
 483   /* Look for '@' that comes before terminators, such as '/', '?',
 484      '#', or ';'.  */
 485   const char *p = (const char *)strpbrk (url, "@/?#;");
 486   if (!p || *p != '@')
 487     return url;
 488   return p + 1;
 489 }
 490
 491 /* Parse credentials contained in [BEG, END).  The region is expected
 492    to have come from a URL and is unescaped.  */
 493
 494 static bool
 495 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 496 {
 497   char *colon;
 498   const char *userend;
 499
 500   if (beg == end)
 501     return false;               /* empty user name */
 502
 503   colon = memchr (beg, ':', end - beg);
 504   if (colon == beg)
 505     return false;               /* again empty user name */
 506
 507   if (colon)
 508     {
 509       *passwd = strdupdelim (colon + 1, end);
 510       userend = colon;
 511       url_unescape (*passwd);
 512     }
 513   else
 514     {
 515       *passwd = NULL;
 516       userend = end;
 517     }
 518   *user = strdupdelim (beg, userend);
 519   url_unescape (*user);
 520   return true;
 521 }
 522
 523 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 524    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 525    like this:
 526
 527    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 528    www.foo.com[:port]            -> http://www.foo.com[:port]
 529
 530    FTP shorthands look like this:
 531
 532    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 533    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 534
 535    If the URL needs not or cannot be rewritten, return NULL.  */
 536
 537 char *
 538 rewrite_shorthand_url (const char *url)
 539 {
 540   const char *p;
 541   char *ret;
 542
 543   if (url_scheme (url) != SCHEME_INVALID)
 544     return NULL;
 545
 546   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 547      latter Netscape.  */
 548   p = strpbrk (url, ":/");
 549   if (p == url)
 550     return NULL;
 551
 552   /* If we're looking at "://", it means the URL uses a scheme we
 553      don't support, which may include "https" when compiled without
 554      SSL support.  Don't bogusly rewrite such URLs.  */
 555   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 556     return NULL;
 557
 558   if (p && *p == ':')
 559     {
 560       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 561          special case of http port number ("localhost:10000").  */
 562       int digits = strspn (p + 1, "0123456789");
 563       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 564         goto http;
 565
 566       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 567       ret = aprintf ("ftp://%s", url);
 568       ret[6 + (p - url)] = '/';
 569     }
 570   else
 571     {
 572     http:
 573       /* Just prepend "http://" to URL. */
 574       ret = aprintf ("http://%s", url);
 575     }
 576   return ret;
 577 }
 578 \f
 579 static void split_path (const char *, char **, char **);
 580
 581 /* Like strpbrk, with the exception that it returns the pointer to the
 582    terminating zero (end-of-string aka "eos") if no matching character
 583    is found.  */
 584
 585 static inline char *
 586 strpbrk_or_eos (const char *s, const char *accept)
 587 {
 588   char *p = strpbrk (s, accept);
 589   if (!p)
 590     p = strchr (s, '\0');
 591   return p;
 592 }
 593
 594 /* Turn STR into lowercase; return true if a character was actually
 595    changed. */
 596
 597 static bool
 598 lowercase_str (char *str)
 599 {
 600   bool changed = false;
 601   for (; *str; str++)
 602     if (c_isupper (*str))
 603       {
 604         changed = true;
 605         *str = c_tolower (*str);
 606       }
 607   return changed;
 608 }
 609
 610 static const char *
 611 init_seps (enum url_scheme scheme)
 612 {
 613   static char seps[8] = ":/";
 614   char *p = seps + 2;
 615   int flags = supported_schemes[scheme].flags;
 616
 617   if (flags & scm_has_params)
 618     *p++ = ';';
 619   if (flags & scm_has_query)
 620     *p++ = '?';
 621   if (flags & scm_has_fragment)
 622     *p++ = '#';
 623   *p++ = '\0';
 624   return seps;
 625 }
 626
 627 static const char *parse_errors[] = {
 628 #define PE_NO_ERROR                     0
 629   N_("No error"),
 630 #define PE_UNSUPPORTED_SCHEME           1
 631   N_("Unsupported scheme %s"),
 632 #define PE_INVALID_HOST_NAME            2
 633   N_("Invalid host name"),
 634 #define PE_BAD_PORT_NUMBER              3
 635   N_("Bad port number"),
 636 #define PE_INVALID_USER_NAME            4
 637   N_("Invalid user name"),
 638 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 639   N_("Unterminated IPv6 numeric address"),
 640 #define PE_IPV6_NOT_SUPPORTED           6
 641   N_("IPv6 addresses not supported"),
 642 #define PE_INVALID_IPV6_ADDRESS         7
 643   N_("Invalid IPv6 numeric address")
 644 };
 645
 646 /* Parse a URL.
 647
 648    Return a new struct url if successful, NULL on error.  In case of
 649    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 650    error code. */
 651 struct url *
 652 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 653 {
 654   struct url *u;
 655   const char *p;
 656   bool path_modified, host_modified;
 657
 658   enum url_scheme scheme;
 659   const char *seps;
 660
 661   const char *uname_b,     *uname_e;
 662   const char *host_b,      *host_e;
 663   const char *path_b,      *path_e;
 664   const char *params_b,    *params_e;
 665   const char *query_b,     *query_e;
 666   const char *fragment_b,  *fragment_e;
 667
 668   int port;
 669   char *user = NULL, *passwd = NULL;
 670
 671   const char *url_encoded = NULL;
 672   char *new_url = NULL;
 673
 674   int error_code;
 675
 676   scheme = url_scheme (url);
 677   if (scheme == SCHEME_INVALID)
 678     {
 679       error_code = PE_UNSUPPORTED_SCHEME;
 680       goto error;
 681     }
 682
 683   if (iri && iri->utf8_encode)
 684     {
 685       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
 686       if (!iri->utf8_encode)
 687         new_url = NULL;
 688       else
 689         iri->orig_url = xstrdup (url);
 690     }
 691
 692   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
 693   if (percent_encode)
 694     url_encoded = reencode_escapes (new_url ? new_url : url);
 695   else
 696      url_encoded = new_url ? new_url : url;
 697
 698   p = url_encoded;
 699
 700   if (new_url && url_encoded != new_url)
 701     xfree (new_url);
 702
 703   p += strlen (supported_schemes[scheme].leading_string);
 704   uname_b = p;
 705   p = url_skip_credentials (p);
 706   uname_e = p;
 707
 708   /* scheme://user:pass@host[:port]... */
 709   /*                    ^              */
 710
 711   /* We attempt to break down the URL into the components path,
 712      params, query, and fragment.  They are ordered like this:
 713
 714        scheme://host[:port][/path][;params][?query][#fragment]  */
 715
 716   path_b     = path_e     = NULL;
 717   params_b   = params_e   = NULL;
 718   query_b    = query_e    = NULL;
 719   fragment_b = fragment_e = NULL;
 720
 721   /* Initialize separators for optional parts of URL, depending on the
 722      scheme.  For example, FTP has params, and HTTP and HTTPS have
 723      query string and fragment. */
 724   seps = init_seps (scheme);
 725
 726   host_b = p;
 727
 728   if (*p == '[')
 729     {
 730       /* Handle IPv6 address inside square brackets.  Ideally we'd
 731          just look for the terminating ']', but rfc2732 mandates
 732          rejecting invalid IPv6 addresses.  */
 733
 734       /* The address begins after '['. */
 735       host_b = p + 1;
 736       host_e = strchr (host_b, ']');
 737
 738       if (!host_e)
 739         {
 740           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 741           goto error;
 742         }
 743
 744 #ifdef ENABLE_IPV6
 745       /* Check if the IPv6 address is valid. */
 746       if (!is_valid_ipv6_address(host_b, host_e))
 747         {
 748           error_code = PE_INVALID_IPV6_ADDRESS;
 749           goto error;
 750         }
 751
 752       /* Continue parsing after the closing ']'. */
 753       p = host_e + 1;
 754 #else
 755       error_code = PE_IPV6_NOT_SUPPORTED;
 756       goto error;
 757 #endif
 758
 759       /* The closing bracket must be followed by a separator or by the
 760          null char.  */
 761       /* http://[::1]... */
 762       /*             ^   */
 763       if (!strchr (seps, *p))
 764         {
 765           /* Trailing garbage after []-delimited IPv6 address. */
 766           error_code = PE_INVALID_HOST_NAME;
 767           goto error;
 768         }
 769     }
 770   else
 771     {
 772       p = strpbrk_or_eos (p, seps);
 773       host_e = p;
 774     }
 775   ++seps;                       /* advance to '/' */
 776
 777   if (host_b == host_e)
 778     {
 779       error_code = PE_INVALID_HOST_NAME;
 780       goto error;
 781     }
 782
 783   port = scheme_default_port (scheme);
 784   if (*p == ':')
 785     {
 786       const char *port_b, *port_e, *pp;
 787
 788       /* scheme://host:port/tralala */
 789       /*              ^             */
 790       ++p;
 791       port_b = p;
 792       p = strpbrk_or_eos (p, seps);
 793       port_e = p;
 794
 795       /* Allow empty port, as per rfc2396. */
 796       if (port_b != port_e)
 797         for (port = 0, pp = port_b; pp < port_e; pp++)
 798           {
 799             if (!c_isdigit (*pp))
 800               {
 801                 /* http://host:12randomgarbage/blah */
 802                 /*               ^                  */
 803                 error_code = PE_BAD_PORT_NUMBER;
 804                 goto error;
 805               }
 806             port = 10 * port + (*pp - '0');
 807             /* Check for too large port numbers here, before we have
 808                a chance to overflow on bogus port values.  */
 809             if (port > 0xffff)
 810               {
 811                 error_code = PE_BAD_PORT_NUMBER;
 812                 goto error;
 813               }
 814           }
 815     }
 816   /* Advance to the first separator *after* '/' (either ';' or '?',
 817      depending on the scheme).  */
 818   ++seps;
 819
 820   /* Get the optional parts of URL, each part being delimited by
 821      current location and the position of the next separator.  */
 822 #define GET_URL_PART(sepchar, var) do {                         \
 823   if (*p == sepchar)                                            \
 824     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 825   ++seps;                                                       \
 826 } while (0)
 827
 828   GET_URL_PART ('/', path);
 829   if (supported_schemes[scheme].flags & scm_has_params)
 830     GET_URL_PART (';', params);
 831   if (supported_schemes[scheme].flags & scm_has_query)
 832     GET_URL_PART ('?', query);
 833   if (supported_schemes[scheme].flags & scm_has_fragment)
 834     GET_URL_PART ('#', fragment);
 835
 836 #undef GET_URL_PART
 837   assert (*p == 0);
 838
 839   if (uname_b != uname_e)
 840     {
 841       /* http://user:pass@host */
 842       /*        ^         ^    */
 843       /*     uname_b   uname_e */
 844       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 845         {
 846           error_code = PE_INVALID_USER_NAME;
 847           goto error;
 848         }
 849     }
 850
 851   u = xnew0 (struct url);
 852   u->scheme = scheme;
 853   u->host   = strdupdelim (host_b, host_e);
 854   u->port   = port;
 855   u->user   = user;
 856   u->passwd = passwd;
 857
 858   u->path = strdupdelim (path_b, path_e);
 859   path_modified = path_simplify (scheme, u->path);
 860   split_path (u->path, &u->dir, &u->file);
 861
 862   host_modified = lowercase_str (u->host);
 863
 864   /* Decode %HH sequences in host name.  This is important not so much
 865      to support %HH sequences in host names (which other browser
 866      don't), but to support binary characters (which will have been
 867      converted to %HH by reencode_escapes).  */
 868   if (strchr (u->host, '%'))
 869     {
 870       url_unescape (u->host);
 871       host_modified = true;
 872
 873       /* Apply IDNA regardless of iri->utf8_encode status */
 874       if (opt.enable_iri && iri)
 875         {
 876           char *new = idn_encode (iri, u->host);
 877           if (new)
 878             {
 879               xfree (u->host);
 880               u->host = new;
 881               host_modified = true;
 882             }
 883         }
 884     }
 885
 886   if (params_b)
 887     u->params = strdupdelim (params_b, params_e);
 888   if (query_b)
 889     u->query = strdupdelim (query_b, query_e);
 890   if (fragment_b)
 891     u->fragment = strdupdelim (fragment_b, fragment_e);
 892
 893   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 894     {
 895       /* If we suspect that a transformation has rendered what
 896          url_string might return different from URL_ENCODED, rebuild
 897          u->url using url_string.  */
 898       u->url = url_string (u, URL_AUTH_SHOW);
 899
 900       if (url_encoded != url)
 901         xfree ((char *) url_encoded);
 902     }
 903   else
 904     {
 905       if (url_encoded == url)
 906         u->url = xstrdup (url);
 907       else
 908         u->url = (char *) url_encoded;
 909     }
 910
 911   return u;
 912
 913  error:
 914   /* Cleanup in case of error: */
 915   if (url_encoded && url_encoded != url)
 916     xfree ((char *) url_encoded);
 917
 918   /* Transmit the error code to the caller, if the caller wants to
 919      know.  */
 920   if (error)
 921     *error = error_code;
 922   return NULL;
 923 }
 924
 925 /* Return the error message string from ERROR_CODE, which should have
 926    been retrieved from url_parse.  The error message is translated.  */
 927
 928 char *
 929 url_error (const char *url, int error_code)
 930 {
 931   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 932
 933   if (error_code == PE_UNSUPPORTED_SCHEME)
 934     {
 935       char *error, *p;
 936       char *scheme = xstrdup (url);
 937       assert (url_has_scheme (url));
 938
 939       if ((p = strchr (scheme, ':')))
 940         *p = '\0';
 941       if (!strcasecmp (scheme, "https"))
 942         error = aprintf (_("HTTPS support not compiled in"));
 943       else
 944         error = aprintf (_(parse_errors[error_code]), quote (scheme));
 945       xfree (scheme);
 946
 947       return error;
 948     }
 949   else
 950     return xstrdup (_(parse_errors[error_code]));
 951 }
 952
 953 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 954    expected to be URL-escaped.
 955
 956    The path is split into directory (the part up to the last slash)
 957    and file (the part after the last slash), which are subsequently
 958    unescaped.  Examples:
 959
 960    PATH                 DIR           FILE
 961    "foo/bar/baz"        "foo/bar"     "baz"
 962    "foo/bar/"           "foo/bar"     ""
 963    "foo"                ""            "foo"
 964    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 965
 966    DIR and FILE are freshly allocated.  */
 967
 968 static void
 969 split_path (const char *path, char **dir, char **file)
 970 {
 971   char *last_slash = strrchr (path, '/');
 972   if (!last_slash)
 973     {
 974       *dir = xstrdup ("");
 975       *file = xstrdup (path);
 976     }
 977   else
 978     {
 979       *dir = strdupdelim (path, last_slash);
 980       *file = xstrdup (last_slash + 1);
 981     }
 982   url_unescape (*dir);
 983   url_unescape (*file);
 984 }
 985
 986 /* Note: URL's "full path" is the path with the query string and
 987    params appended.  The "fragment" (#foo) is intentionally ignored,
 988    but that might be changed.  For example, if the original URL was
 989    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 990    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 991
 992 /* Return the length of the full path, without the terminating
 993    zero.  */
 994
 995 static int
 996 full_path_length (const struct url *url)
 997 {
 998   int len = 0;
 999
1000 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1001
1002   FROB (path);
1003   FROB (params);
1004   FROB (query);
1005
1006 #undef FROB
1007
1008   return len;
1009 }
1010
1011 /* Write out the full path. */
1012
1013 static void
1014 full_path_write (const struct url *url, char *where)
1015 {
1016 #define FROB(el, chr) do {                      \
1017   char *f_el = url->el;                         \
1018   if (f_el) {                                   \
1019     int l = strlen (f_el);                      \
1020     *where++ = chr;                             \
1021     memcpy (where, f_el, l);                    \
1022     where += l;                                 \
1023   }                                             \
1024 } while (0)
1025
1026   FROB (path, '/');
1027   FROB (params, ';');
1028   FROB (query, '?');
1029
1030 #undef FROB
1031 }
1032
/* Return the "full path" of URL as a newly allocated, zero-terminated
   string.  For instance, with u->path "foo/bar" and u->query
   "param=value" the result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  /* full_path_write does not terminate the string, so do it here.  */
  full_path_write (url, buf);
  buf[len] = '\0';

  return buf;
}
1048
/* Unescape CHR in an otherwise escaped STR.  Used to selectively undo
   the escaping of certain characters, such as "/" and ":".

   STR is rewritten in place: every "%XY" triplet whose two digits are
   exactly the ones XNUM_TO_DIGIT produces for the high and low nibble
   of CHR is replaced by CHR; everything else is copied unchanged.
   The comparison is an exact character match, so a triplet written
   with hex digits of the other case is left alone.  Returns nothing.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Go through unsigned char so that a CHR with the high bit set never
     yields a negative nibble index on platforms where char is signed.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */
  for (; *h; h++, t++)
    {
      /* h[1] and h[2] are only inspected when the preceding character
         matched, so the terminating zero is never stepped over.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1072
1073 /* Escape unsafe and reserved characters, except for the slash
1074    characters.  */
1075
1076 static char *
1077 url_escape_dir (const char *dir)
1078 {
1079   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1080   if (newdir == dir)
1081     return (char *)dir;
1082
1083   unescape_single_char (newdir, '/');
1084   return newdir;
1085 }
1086
1087 /* Sync u->path and u->url with u->dir and u->file.  Called after
1088    u->file or u->dir have been changed, typically by the FTP code.  */
1089
1090 static void
1091 sync_path (struct url *u)
1092 {
1093   char *newpath, *efile, *edir;
1094
1095   xfree (u->path);
1096
1097   /* u->dir and u->file are not escaped.  URL-escape them before
1098      reassembling them into u->path.  That way, if they contain
1099      separators like '?' or even if u->file contains slashes, the
1100      path will be correctly assembled.  (u->file can contain slashes
1101      if the URL specifies it with %2f, or if an FTP server returns
1102      it.)  */
1103   edir = url_escape_dir (u->dir);
1104   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1105
1106   if (!*edir)
1107     newpath = xstrdup (efile);
1108   else
1109     {
1110       int dirlen = strlen (edir);
1111       int filelen = strlen (efile);
1112
1113       /* Copy "DIR/FILE" to newpath. */
1114       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1115       memcpy (p, edir, dirlen);
1116       p += dirlen;
1117       *p++ = '/';
1118       memcpy (p, efile, filelen);
1119       p += filelen;
1120       *p = '\0';
1121     }
1122
1123   u->path = newpath;
1124
1125   if (edir != u->dir)
1126     xfree (edir);
1127   if (efile != u->file)
1128     xfree (efile);
1129
1130   /* Regenerate u->url as well.  */
1131   xfree (u->url);
1132   u->url = url_string (u, URL_AUTH_SHOW);
1133 }
1134
1135 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1136    This way we can sync u->path and u->url when they get changed.  */
1137
1138 void
1139 url_set_dir (struct url *url, const char *newdir)
1140 {
1141   xfree (url->dir);
1142   url->dir = xstrdup (newdir);
1143   sync_path (url);
1144 }
1145
1146 void
1147 url_set_file (struct url *url, const char *newfile)
1148 {
1149   xfree (url->file);
1150   url->file = xstrdup (newfile);
1151   sync_path (url);
1152 }
1153
1154 void
1155 url_free (struct url *url)
1156 {
1157   xfree (url->host);
1158   xfree (url->path);
1159   xfree (url->url);
1160
1161   xfree_null (url->params);
1162   xfree_null (url->query);
1163   xfree_null (url->fragment);
1164   xfree_null (url->user);
1165   xfree_null (url->passwd);
1166
1167   xfree (url->dir);
1168   xfree (url->file);
1169
1170   xfree (url);
1171 }
1172 \f
1173 /* Create all the necessary directories for PATH (a file).  Calls
1174    make_directory internally.  */
1175 int
1176 mkalldirs (const char *path)
1177 {
1178   const char *p;
1179   char *t;
1180   struct_stat st;
1181   int res;
1182
1183   p = path + strlen (path);
1184   for (; *p != '/' && p != path; p--)
1185     ;
1186
1187   /* Don't create if it's just a file.  */
1188   if ((p == path) && (*p != '/'))
1189     return 0;
1190   t = strdupdelim (path, p);
1191
1192   /* Check whether the directory exists.  */
1193   if ((stat (t, &st) == 0))
1194     {
1195       if (S_ISDIR (st.st_mode))
1196         {
1197           xfree (t);
1198           return 0;
1199         }
1200       else
1201         {
1202           /* If the dir exists as a file name, remove it first.  This
1203              is *only* for Wget to work with buggy old CERN http
1204              servers.  Here is the scenario: When Wget tries to
1205              retrieve a directory without a slash, e.g.
1206              http://foo/bar (bar being a directory), CERN server will
1207              not redirect it too http://foo/bar/ -- it will generate a
1208              directory listing containing links to bar/file1,
1209              bar/file2, etc.  Wget will lose because it saves this
1210              HTML listing to a file `bar', so it cannot create the
1211              directory.  To work around this, if the file of the same
1212              name exists, we just remove it and create the directory
1213              anyway.  */
1214           DEBUGP (("Removing %s because of directory danger!\n", t));
1215           unlink (t);
1216         }
1217     }
1218   res = make_directory (t);
1219   if (res != 0)
1220     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1221   xfree (t);
1222   return res;
1223 }
1224 \f
1225 /* Functions for constructing the file name out of URL components.  */
1226
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size of the buffer */
  int tail;                     /* offset of the first unused character */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   The buffer may move, so re-read TAIL afterwards.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1255
1256 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1257    terminated.  */
1258
1259 static void
1260 append_string (const char *str, struct growable *dest)
1261 {
1262   int l = strlen (str);
1263   GROW (dest, l);
1264   memcpy (TAIL (dest), str, l);
1265   TAIL_INCR (dest, l);
1266 }
1267
1268 /* Append CH to DEST.  For example, append_char (0, DEST)
1269    zero-terminates DEST.  */
1270
1271 static void
1272 append_char (char ch, struct growable *dest)
1273 {
1274   GROW (dest, 1);
1275   *TAIL (dest) = ch;
1276   TAIL_INCR (dest, 1);
1277 }
1278
/* Bit flags describing why a byte may be unsuitable in a file name.
   They are OR-ed together in the entries of filechr_table.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the flags in MASK.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands used only while spelling out the table: */
#define NU filechr_not_unix
#define NW filechr_not_windows
#define CT filechr_control

/* Table of characters unsafe under various conditions (see above),
   indexed by byte value.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
  /* 0x00  NUL SOH STX ETX EOT ENQ ACK BEL */
  NU|NW|CT, CT, CT, CT,  CT, CT, CT, CT,
  /* 0x08  BS  HT  LF  VT  FF  CR  SO  SI  */
  CT, CT, CT, CT,  CT, CT, CT, CT,
  /* 0x10  DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  CT, CT, CT, CT,  CT, CT, CT, CT,
  /* 0x18  CAN EM  SUB ESC FS  GS  RS  US  */
  CT, CT, CT, CT,  CT, CT, CT, CT,
  /* 0x20  SP  !   "   #   $   %   &   '   */
  0,  0,  NW, 0,   0,  0,  0,  0,
  /* 0x28  (   )   *   +   ,   -   .   /   */
  0,  0,  NW, 0,   0,  0,  0,  NU|NW,
  /* 0x30  0   1   2   3   4   5   6   7   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x38  8   9   :   ;   <   =   >   ?   */
  0,  0,  NW, 0,   NW, 0,  NW, NW,
  /* 0x40  @   A   B   C   D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x48  H   I   J   K   L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x50  P   Q   R   S   T   U   V   W   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x58  X   Y   Z   [   \   ]   ^   _   */
  0,  0,  0,  0,   NW, 0,  0,  0,
  /* 0x60  `   a   b   c   d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x68  h   i   j   k   l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x70  p   q   r   s   t   u   v   w   */
  0,  0,  0,  0,   0,  0,  0,  0,
  /* 0x78  x   y   z   {   |   }   ~   DEL */
  0,  0,  0,  0,   NW, 0,  0,  CT,

  /* 0x80 - 0x8F (128-143) */
  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,
  /* 0x90 - 0x9F (144-159) */
  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,  CT, CT, CT, CT,

  /* 0xA0 - 0xFF (160-255): no restrictions */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef NU
#undef NW
#undef CT
1336
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers.  It is ':' (as in
   "www.xemacs.org:4001/index.html") unless file names are restricted
   to what Windows accepts, in which case '+' is used because Windows
   can't handle ':' in file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless file names are restricted to what Windows accepts, in
   which case '@' is used because Windows cannot handle '?' as part
   of a file name.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1347
1348 /* Quote path element, characters in [b, e), as file name, and append
1349    the quoted string to DEST.  Each character is quoted as per
1350    file_unsafe_char and the corresponding table.
1351
1352    If ESCAPED is true, the path element is considered to be
1353    URL-escaped and will be unescaped prior to inspection.  */
1354
1355 static void
1356 append_uri_pathel (const char *b, const char *e, bool escaped,
1357                    struct growable *dest)
1358 {
1359   const char *p;
1360   int quoted, outlen;
1361
1362   int mask;
1363   if (opt.restrict_files_os == restrict_unix)
1364     mask = filechr_not_unix;
1365   else
1366     mask = filechr_not_windows;
1367   if (opt.restrict_files_ctrl)
1368     mask |= filechr_control;
1369
1370   /* Copy [b, e) to PATHEL and URL-unescape it. */
1371   if (escaped)
1372     {
1373       char *unescaped;
1374       BOUNDED_TO_ALLOCA (b, e, unescaped);
1375       url_unescape (unescaped);
1376       b = unescaped;
1377       e = unescaped + strlen (unescaped);
1378     }
1379
1380   /* Defang ".." when found as component of path.  Remember that path
1381      comes from the URL and might contain malicious input.  */
1382   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1383     {
1384       b = "%2E%2E";
1385       e = b + 6;
1386     }
1387
1388   /* Walk the PATHEL string and check how many characters we'll need
1389      to quote.  */
1390   quoted = 0;
1391   for (p = b; p < e; p++)
1392     if (FILE_CHAR_TEST (*p, mask))
1393       ++quoted;
1394
1395   /* Calculate the length of the output string.  e-b is the input
1396      string length.  Each quoted char introduces two additional
1397      characters in the string, hence 2*quoted.  */
1398   outlen = (e - b) + (2 * quoted);
1399   GROW (dest, outlen);
1400
1401   if (!quoted)
1402     {
1403       /* If there's nothing to quote, we can simply append the string
1404          without processing it again.  */
1405       memcpy (TAIL (dest), b, outlen);
1406     }
1407   else
1408     {
1409       char *q = TAIL (dest);
1410       for (p = b; p < e; p++)
1411         {
1412           if (!FILE_CHAR_TEST (*p, mask))
1413             *q++ = *p;
1414           else
1415             {
1416               unsigned char ch = *p;
1417               *q++ = '%';
1418               *q++ = XNUM_TO_DIGIT (ch >> 4);
1419               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1420             }
1421         }
1422       assert (q - TAIL (dest) == outlen);
1423     }
1424
1425   /* Perform inline case transformation if required.  */
1426   if (opt.restrict_files_case == restrict_lowercase
1427       || opt.restrict_files_case == restrict_uppercase)
1428     {
1429       char *q;
1430       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1431         {
1432           if (opt.restrict_files_case == restrict_lowercase)
1433             *q = c_tolower (*q);
1434           else
1435             *q = c_toupper (*q);
1436         }
1437     }
1438
1439   TAIL_INCR (dest, outlen);
1440 }
1441
1442 /* Append to DEST the directory structure that corresponds the
1443    directory part of URL's path.  For example, if the URL is
1444    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1445
1446    Each path element ("dir1" and "dir2" in the above example) is
1447    examined, url-unescaped, and re-escaped as file name element.
1448
1449    Additionally, it cuts as many directories from the path as
1450    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1451    will produce "bar" for the above example.  For 2 or more, it will
1452    produce "".
1453
1454    Each component of the path is quoted for use as file name.  */
1455
1456 static void
1457 append_dir_structure (const struct url *u, struct growable *dest)
1458 {
1459   char *pathel, *next;
1460   int cut = opt.cut_dirs;
1461
1462   /* Go through the path components, de-URL-quote them, and quote them
1463      (if necessary) as file names.  */
1464
1465   pathel = u->path;
1466   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1467     {
1468       if (cut-- > 0)
1469         continue;
1470       if (pathel == next)
1471         /* Ignore empty pathels.  */
1472         continue;
1473
1474       if (dest->tail)
1475         append_char ('/', dest);
1476       append_uri_pathel (pathel, next, true, dest);
1477     }
1478 }
1479
1480 /* Return a unique file name that matches the given URL as good as
1481    possible.  Does not create directories on the file system.  */
1482
1483 char *
1484 url_file_name (const struct url *u)
1485 {
1486   struct growable fnres;        /* stands for "file name result" */
1487
1488   const char *u_file, *u_query;
1489   char *fname, *unique;
1490   char *index_filename = "index.html"; /* The default index file is index.html */
1491
1492   fnres.base = NULL;
1493   fnres.size = 0;
1494   fnres.tail = 0;
1495
1496   /* If an alternative index file was defined, change index_filename */
1497   if (opt.default_page)
1498     index_filename = opt.default_page;
1499
1500
1501   /* Start with the directory prefix, if specified. */
1502   if (opt.dir_prefix)
1503     append_string (opt.dir_prefix, &fnres);
1504
1505   /* If "dirstruct" is turned on (typically the case with -r), add
1506      the host and port (unless those have been turned off) and
1507      directory structure.  */
1508   if (opt.dirstruct)
1509     {
1510       if (opt.protocol_directories)
1511         {
1512           if (fnres.tail)
1513             append_char ('/', &fnres);
1514           append_string (supported_schemes[u->scheme].name, &fnres);
1515         }
1516       if (opt.add_hostdir)
1517         {
1518           if (fnres.tail)
1519             append_char ('/', &fnres);
1520           if (0 != strcmp (u->host, ".."))
1521             append_string (u->host, &fnres);
1522           else
1523             /* Host name can come from the network; malicious DNS may
1524                allow ".." to be resolved, causing us to write to
1525                "../<file>".  Defang such host names.  */
1526             append_string ("%2E%2E", &fnres);
1527           if (u->port != scheme_default_port (u->scheme))
1528             {
1529               char portstr[24];
1530               number_to_string (portstr, u->port);
1531               append_char (FN_PORT_SEP, &fnres);
1532               append_string (portstr, &fnres);
1533             }
1534         }
1535
1536       append_dir_structure (u, &fnres);
1537     }
1538
1539   /* Add the file name. */
1540   if (fnres.tail)
1541     append_char ('/', &fnres);
1542   u_file = *u->file ? u->file : index_filename;
1543   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1544
1545   /* Append "?query" to the file name. */
1546   u_query = u->query && *u->query ? u->query : NULL;
1547   if (u_query)
1548     {
1549       append_char (FN_QUERY_SEP, &fnres);
1550       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1551     }
1552
1553   /* Zero-terminate the file name. */
1554   append_char ('\0', &fnres);
1555
1556   fname = fnres.base;
1557
1558   /* Check the cases in which the unique extensions are not used:
1559      1) Clobbering is turned off (-nc).
1560      2) Retrieval with regetting.
1561      3) Timestamping is used.
1562      4) Hierarchy is built.
1563
1564      The exception is the case when file does exist and is a
1565      directory (see `mkalldirs' for explanation).  */
1566
1567   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1568       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1569     return fname;
1570
1571   unique = unique_name (fname, true);
1572   if (unique != fname)
1573     xfree (fname);
1574   return unique;
1575 }
1576 \f
1577 /* Resolve "." and ".." elements of PATH by destructively modifying
1578    PATH and return true if PATH has been modified, false otherwise.
1579
1580    The algorithm is in spirit similar to the one described in rfc1808,
1581    although implemented differently, in one pass.  To recap, path
1582    elements containing only "." are removed, and ".." is taken to mean
1583    "back up one element".  Single leading and trailing slashes are
1584    preserved.
1585
1586    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1587    test examples are provided below.  If you change anything in this
1588    function, run test_path_simplify to make sure you haven't broken a
1589    test case.  */
1590
1591 static bool
1592 path_simplify (enum url_scheme scheme, char *path)
1593 {
1594   char *h = path;               /* hare */
1595   char *t = path;               /* tortoise */
1596   char *beg = path;
1597   char *end = strchr (path, '\0');
1598
1599   while (h < end)
1600     {
1601       /* Hare should be at the beginning of a path element. */
1602
1603       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1604         {
1605           /* Ignore "./". */
1606           h += 2;
1607         }
1608       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1609         {
1610           /* Handle "../" by retreating the tortoise by one path
1611              element -- but not past beggining.  */
1612           if (t > beg)
1613             {
1614               /* Move backwards until T hits the beginning of the
1615                  previous path element or the beginning of path. */
1616               for (--t; t > beg && t[-1] != '/'; t--)
1617                 ;
1618             }
1619           else if (scheme == SCHEME_FTP)
1620             {
1621               /* If we're at the beginning, copy the "../" literally
1622                  and move the beginning so a later ".." doesn't remove
1623                  it.  This violates RFC 3986; but we do it for FTP
1624                  anyway because there is otherwise no way to get at a
1625                  parent directory, when the FTP server drops us in a
1626                  non-root directory (which is not uncommon). */
1627               beg = t + 3;
1628               goto regular;
1629             }
1630           h += 3;
1631         }
1632       else
1633         {
1634         regular:
1635           /* A regular path element.  If H hasn't advanced past T,
1636              simply skip to the next path element.  Otherwise, copy
1637              the path element until the next slash.  */
1638           if (t == h)
1639             {
1640               /* Skip the path element, including the slash.  */
1641               while (h < end && *h != '/')
1642                 t++, h++;
1643               if (h < end)
1644                 t++, h++;
1645             }
1646           else
1647             {
1648               /* Copy the path element, including the final slash.  */
1649               while (h < end && *h != '/')
1650                 *t++ = *h++;
1651               if (h < end)
1652                 *t++ = *h++;
1653             }
1654         }
1655     }
1656
1657   if (t != h)
1658     *t = '\0';
1659
1660   return t != h;
1661 }
1662 \f
1663 /* Return the length of URL's path.  Path is considered to be
1664    terminated by one or more of the ?query or ;params or #fragment,
1665    depending on the scheme.  */
1666
1667 static const char *
1668 path_end (const char *url)
1669 {
1670   enum url_scheme scheme = url_scheme (url);
1671   const char *seps;
1672   if (scheme == SCHEME_INVALID)
1673     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1674   /* +2 to ignore the first two separators ':' and '/' */
1675   seps = init_seps (scheme) + 2;
1676   return strpbrk_or_eos (url, seps);
1677 }
1678
/* Evaluate to a pointer to the last occurrence of character C in the
   range [b, e), or NULL if the range does not contain C.  */
#define find_last_char(b, e, c) (memrchr ((b), (c), (e) - (b)))
1682
1683 /* Merge BASE with LINK and return the resulting URI.
1684
1685    Either of the URIs may be absolute or relative, complete with the
1686    host name, or path only.  This tries to reasonably handle all
1687    foreseeable cases.  It only employs minimal URL parsing, without
1688    knowledge of the specifics of schemes.
1689
1690    I briefly considered making this function call path_simplify after
1691    the merging process, as rfc1738 seems to suggest.  This is a bad
1692    idea for several reasons: 1) it complexifies the code, and 2)
1693    url_parse has to simplify path anyway, so it's wasteful to boot.  */
1694
1695 char *
1696 uri_merge (const char *base, const char *link)
1697 {
1698   int linklength;
1699   const char *end;
1700   char *merge;
1701
1702   if (url_has_scheme (link))
1703     return xstrdup (link);
1704
1705   /* We may not examine BASE past END. */
1706   end = path_end (base);
1707   linklength = strlen (link);
1708
1709   if (!*link)
1710     {
1711       /* Empty LINK points back to BASE, query string and all. */
1712       return xstrdup (base);
1713     }
1714   else if (*link == '?')
1715     {
1716       /* LINK points to the same location, but changes the query
1717          string.  Examples: */
1718       /* uri_merge("path",         "?new") -> "path?new"     */
1719       /* uri_merge("path?foo",     "?new") -> "path?new"     */
1720       /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1721       /* uri_merge("path#foo",     "?new") -> "path?new"     */
1722       int baselength = end - base;
1723       merge = xmalloc (baselength + linklength + 1);
1724       memcpy (merge, base, baselength);
1725       memcpy (merge + baselength, link, linklength);
1726       merge[baselength + linklength] = '\0';
1727     }
1728   else if (*link == '#')
1729     {
1730       /* uri_merge("path",         "#new") -> "path#new"     */
1731       /* uri_merge("path#foo",     "#new") -> "path#new"     */
1732       /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1733       /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1734       int baselength;
1735       const char *end1 = strchr (base, '#');
1736       if (!end1)
1737         end1 = base + strlen (base);
1738       baselength = end1 - base;
1739       merge = xmalloc (baselength + linklength + 1);
1740       memcpy (merge, base, baselength);
1741       memcpy (merge + baselength, link, linklength);
1742       merge[baselength + linklength] = '\0';
1743     }
1744   else if (*link == '/' && *(link + 1) == '/')
1745     {
1746       /* LINK begins with "//" and so is a net path: we need to
1747          replace everything after (and including) the double slash
1748          with LINK. */
1749
1750       /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1751       /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1752       /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1753
1754       int span;
1755       const char *slash;
1756       const char *start_insert;
1757
1758       /* Look for first slash. */
1759       slash = memchr (base, '/', end - base);
1760       /* If found slash and it is a double slash, then replace
1761          from this point, else default to replacing from the
1762          beginning.  */
1763       if (slash && *(slash + 1) == '/')
1764         start_insert = slash;
1765       else
1766         start_insert = base;
1767
1768       span = start_insert - base;
1769       merge = xmalloc (span + linklength + 1);
1770       if (span)
1771         memcpy (merge, base, span);
1772       memcpy (merge + span, link, linklength);
1773       merge[span + linklength] = '\0';
1774     }
1775   else if (*link == '/')
1776     {
1777       /* LINK is an absolute path: we need to replace everything
1778          after (and including) the FIRST slash with LINK.
1779
1780          So, if BASE is "http://host/whatever/foo/bar", and LINK is
1781          "/qux/xyzzy", our result should be
1782          "http://host/qux/xyzzy".  */
1783       int span;
1784       const char *slash;
1785       const char *start_insert = NULL; /* for gcc to shut up. */
1786       const char *pos = base;
1787       bool seen_slash_slash = false;
1788       /* We're looking for the first slash, but want to ignore
1789          double slash. */
1790     again:
1791       slash = memchr (pos, '/', end - pos);
1792       if (slash && !seen_slash_slash)
1793         if (*(slash + 1) == '/')
1794           {
1795             pos = slash + 2;
1796             seen_slash_slash = true;
1797             goto again;
1798           }
1799
1800       /* At this point, SLASH is the location of the first / after
1801          "//", or the first slash altogether.  START_INSERT is the
1802          pointer to the location where LINK will be inserted.  When
1803          examining the last two examples, keep in mind that LINK
1804          begins with '/'. */
1805
1806       if (!slash && !seen_slash_slash)
1807         /* example: "foo" */
1808         /*           ^    */
1809         start_insert = base;
1810       else if (!slash && seen_slash_slash)
1811         /* example: "http://foo" */
1812         /*                     ^ */
1813         start_insert = end;
1814       else if (slash && !seen_slash_slash)
1815         /* example: "foo/bar" */
1816         /*           ^        */
1817         start_insert = base;
1818       else if (slash && seen_slash_slash)
1819         /* example: "http://something/" */
1820         /*                           ^  */
1821         start_insert = slash;
1822
1823       span = start_insert - base;
1824       merge = xmalloc (span + linklength + 1);
1825       if (span)
1826         memcpy (merge, base, span);
1827       memcpy (merge + span, link, linklength);
1828       merge[span + linklength] = '\0';
1829     }
1830   else
1831     {
1832       /* LINK is a relative URL: we need to replace everything
1833          after last slash (possibly empty) with LINK.
1834
1835          So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1836          our result should be "whatever/foo/qux/xyzzy".  */
1837       bool need_explicit_slash = false;
1838       int span;
1839       const char *start_insert;
1840       const char *last_slash = find_last_char (base, end, '/');
1841       if (!last_slash)
1842         {
1843           /* No slash found at all.  Replace what we have with LINK. */
1844           start_insert = base;
1845         }
1846       else if (last_slash && last_slash >= base + 2
1847                && last_slash[-2] == ':' && last_slash[-1] == '/')
1848         {
1849           /* example: http://host"  */
1850           /*                      ^ */
1851           start_insert = end + 1;
1852           need_explicit_slash = true;
1853         }
1854       else
1855         {
1856           /* example: "whatever/foo/bar" */
1857           /*                        ^    */
1858           start_insert = last_slash + 1;
1859         }
1860
1861       span = start_insert - base;
1862       merge = xmalloc (span + linklength + 1);
1863       if (span)
1864         memcpy (merge, base, span);
1865       if (need_explicit_slash)
1866         merge[span - 1] = '/';
1867       memcpy (merge + span, link, linklength);
1868       merge[span + linklength] = '\0';
1869     }
1870
1871   return merge;
1872 }
1873 \f
/* Copy the NUL-terminated string S, without its terminator, to the
   location P points to, then advance P past the copied bytes.

   P must be a modifiable pointer lvalue with room for strlen (S)
   more bytes.  S is evaluated exactly once.  The temporaries carry
   suffixed names so that an argument that happens to be called
   `len' is not captured by the macro's own locals, and the length is
   kept in a size_t so that it is not truncated.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1879
/* Use this instead of the password when the actual password is
   supposed to be hidden.  We intentionally use a fixed, generic
   string so that, unlike previous versions, we do not give away the
   number of characters in the password.  */
1884 #define HIDDEN_PASSWORD "*password*"
1885
1886 /* Recreate the URL string from the data in URL.
1887
1888    If HIDE is true (as it is when we're calling this on a URL we plan
1889    to print, but not when calling it to canonicalize a URL for use
1890    within the program), password will be hidden.  Unsafe characters in
1891    the URL will be quoted.  */
1892
1893 char *
1894 url_string (const struct url *url, enum url_auth_mode auth_mode)
1895 {
1896   int size;
1897   char *result, *p;
1898   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1899
1900   int scheme_port = supported_schemes[url->scheme].default_port;
1901   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1902   int fplen = full_path_length (url);
1903
1904   bool brackets_around_host;
1905
1906   assert (scheme_str != NULL);
1907
1908   /* Make sure the user name and password are quoted. */
1909   if (url->user)
1910     {
1911       if (auth_mode != URL_AUTH_HIDE)
1912         {
1913           quoted_user = url_escape_allow_passthrough (url->user);
1914           if (url->passwd)
1915             {
1916               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1917                 quoted_passwd = HIDDEN_PASSWORD;
1918               else
1919                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1920             }
1921         }
1922     }
1923
1924   /* In the unlikely event that the host name contains non-printable
1925      characters, quote it for displaying to the user.  */
1926   quoted_host = url_escape_allow_passthrough (url->host);
1927
1928   /* Undo the quoting of colons that URL escaping performs.  IPv6
1929      addresses may legally contain colons, and in that case must be
1930      placed in square brackets.  */
1931   if (quoted_host != url->host)
1932     unescape_single_char (quoted_host, ':');
1933   brackets_around_host = strchr (quoted_host, ':') != NULL;
1934
1935   size = (strlen (scheme_str)
1936           + strlen (quoted_host)
1937           + (brackets_around_host ? 2 : 0)
1938           + fplen
1939           + 1);
1940   if (url->port != scheme_port)
1941     size += 1 + numdigit (url->port);
1942   if (quoted_user)
1943     {
1944       size += 1 + strlen (quoted_user);
1945       if (quoted_passwd)
1946         size += 1 + strlen (quoted_passwd);
1947     }
1948
1949   p = result = xmalloc (size);
1950
1951   APPEND (p, scheme_str);
1952   if (quoted_user)
1953     {
1954       APPEND (p, quoted_user);
1955       if (quoted_passwd)
1956         {
1957           *p++ = ':';
1958           APPEND (p, quoted_passwd);
1959         }
1960       *p++ = '@';
1961     }
1962
1963   if (brackets_around_host)
1964     *p++ = '[';
1965   APPEND (p, quoted_host);
1966   if (brackets_around_host)
1967     *p++ = ']';
1968   if (url->port != scheme_port)
1969     {
1970       *p++ = ':';
1971       p = number_to_string (p, url->port);
1972     }
1973
1974   full_path_write (url, p);
1975   p += fplen;
1976   *p++ = '\0';
1977
1978   assert (p - result == size);
1979
1980   if (quoted_user && quoted_user != url->user)
1981     xfree (quoted_user);
1982   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1983       && quoted_passwd != url->passwd)
1984     xfree (quoted_passwd);
1985   if (quoted_host != url->host)
1986     xfree (quoted_host);
1987
1988   return result;
1989 }
1990 \f
1991 /* Return true if scheme a is similar to scheme b.
1992
1993    Schemes are similar if they are equal.  If SSL is supported, schemes
1994    are also similar if one is http (SCHEME_HTTP) and the other is https
1995    (SCHEME_HTTPS).  */
1996 bool
1997 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1998 {
1999   if (a == b)
2000     return true;
2001 #ifdef HAVE_SSL
2002   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2003       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2004     return true;
2005 #endif
2006   return false;
2007 }
2008 \f
/* Read the logical character at the start of the URL string STR,
   store it in *C, and return the number of bytes of STR it occupies.

   - A byte other than '%' is stored as is; the result is 1.
   - "%XY", where X and Y are hex digits and the decoded character is
     not one for which URL_RESERVED_CHAR is true, is stored decoded;
     the result is 3.
   - Any other '%' (not followed by two hex digits, or decoding to a
     reserved character) is stored as a literal '%' with result 1, so
     the bytes after it are read verbatim by subsequent calls.

   STR must be non-empty and C must be non-NULL.  The result is
   always 1 or 3.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  char decoded;

  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* str[2] is examined only when str[1] is a hex digit, i.e. not the
     terminator, so the read stays inside the string.  A NUL at either
     position is not a hex digit, so a truncated escape is handled
     here; no separate end-of-string check is needed afterwards.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  decoded = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (decoded))
    {
      /* Escaped reserved characters are not equivalent to their
         literal form; leave the escape in place.  */
      *c = '%';
      return 1;
    }

  *c = decoded;
  return 3;
}
2046
/* Compare the URL strings U1 and U2 and return true if they match.

   Both strings are walked in parallel, one logical character at a
   time as produced by getchar_from_escaped_string, so a character
   and its %XY spelling compare equal wherever that function decodes
   the escape.  Characters are compared after folding with c_tolower.
   The strings match only if both are exhausted at the same time.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left, *right;

  assert(u1 && u2);

  left = u1;
  right = u2;

  while (*left != '\0' && *right != '\0')
    {
      char left_ch, right_ch;
      int left_step, right_step;

      left_step = getchar_from_escaped_string (left, &left_ch);
      if (left_step == 0)
        break;

      right_step = getchar_from_escaped_string (right, &right_ch);
      if (right_step == 0)
        break;

      if (c_tolower (left_ch) != c_tolower (right_ch))
        break;

      left += left_step;
      right += right_step;
    }

  return *left == '\0' && *right == '\0';
}
2069 \f
2070 #ifdef TESTING
2071 /* Debugging and testing support for path_simplify. */
2072
2073 #if 0
2074 /* Debug: run path_simplify on PATH and return the result in a new
2075    string.  Useful for calling from the debugger.  */
2076 static char *
2077 ps (char *path)
2078 {
2079   char *copy = xstrdup (path);
2080   path_simplify (copy);
2081   return copy;
2082 }
2083 #endif
2084
2085 static const char *
2086 run_test (char *test, char *expected_result, enum url_scheme scheme,
2087           bool expected_change)
2088 {
2089   char *test_copy = xstrdup (test);
2090   bool modified = path_simplify (scheme, test_copy);
2091
2092   if (0 != strcmp (test_copy, expected_result))
2093     {
2094       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2095               test, expected_result, test_copy);
2096       mu_assert ("", 0);
2097     }
2098   if (modified != expected_change)
2099     {
2100       if (expected_change)
2101         printf ("Expected modification with path_simplify(\"%s\").\n",
2102                 test);
2103       else
2104         printf ("Expected no modification with path_simplify(\"%s\").\n",
2105                 test);
2106     }
2107   xfree (test_copy);
2108   mu_assert ("", modified == expected_change);
2109   return NULL;
2110 }
2111
2112 const char *
2113 test_path_simplify (void)
2114 {
2115   static struct {
2116     char *test, *result;
2117     enum url_scheme scheme;
2118     bool should_modify;
2119   } tests[] = {
2120     { "",                       "",             SCHEME_HTTP, false },
2121     { ".",                      "",             SCHEME_HTTP, true },
2122     { "./",                     "",             SCHEME_HTTP, true },
2123     { "..",                     "",             SCHEME_HTTP, true },
2124     { "../",                    "",             SCHEME_HTTP, true },
2125     { "..",                     "..",           SCHEME_FTP,  false },
2126     { "../",                    "../",          SCHEME_FTP,  false },
2127     { "foo",                    "foo",          SCHEME_HTTP, false },
2128     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2129     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2130     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2131     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2132     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2133     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2134     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2135     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2136     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2137     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2138     { "foo/..",                 "",             SCHEME_HTTP, true },
2139     { "foo/../..",              "",             SCHEME_HTTP, true },
2140     { "foo/../../..",           "",             SCHEME_HTTP, true },
2141     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2142     { "foo/../..",              "..",           SCHEME_FTP,  true },
2143     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2144     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2145     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2146     { "./a/../b",               "b",            SCHEME_HTTP, true }
2147   };
2148   int i;
2149
2150   for (i = 0; i < countof (tests); i++)
2151     {
2152       const char *message;
2153       char *test = tests[i].test;
2154       char *expected_result = tests[i].result;
2155       enum url_scheme scheme = tests[i].scheme;
2156       bool  expected_change = tests[i].should_modify;
2157       message = run_test (test, expected_result, scheme, expected_change);
2158       if (message) return message;
2159     }
2160   return NULL;
2161 }
2162
2163 const char *
2164 test_append_uri_pathel()
2165 {
2166   int i;
2167   struct {
2168     char *original_url;
2169     char *input;
2170     bool escaped;
2171     char *expected_result;
2172   } test_array[] = {
2173     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2174   };
2175
2176   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2177     {
2178       struct growable dest;
2179       const char *p = test_array[i].input;
2180
2181       memset (&dest, 0, sizeof (dest));
2182
2183       append_string (test_array[i].original_url, &dest);
2184       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2185       append_char ('\0', &dest);
2186
2187       mu_assert ("test_append_uri_pathel: wrong result",
2188                  strcmp (dest.base, test_array[i].expected_result) == 0);
2189     }
2190
2191   return NULL;
2192 }
2193
/* Table-driven test for are_urls_equal.

   Each entry holds two URL strings and the expected comparison
   result.  Returns NULL when every entry behaves as expected;
   otherwise mu_assert is relied upon to return the failure message
   from this function.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
    }

  return NULL;
}
2219
2220 #endif /* TESTING */
2221
2222 /*
2223  * vim: et ts=2 sw=2
2224  */
2225