sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
   4    Inc.
   5
   6 This file is part of GNU Wget.
   7
   8 GNU Wget is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at
  11 your option) any later version.
  12
  13 GNU Wget is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  20
  21 Additional permission under GNU GPL version 3 section 7
  22
  23 If you modify this program, or any covered work, by linking or
  24 combining it with the OpenSSL project's OpenSSL library (or a
  25 modified version of that library), containing parts covered by the
  26 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  27 grants you additional permission to convey the resulting work.
  28 Corresponding Source for a non-source form of such a combination
  29 shall include the source code for the parts of OpenSSL used as well
  30 as that of the covered work.  */
  31
  32 #include "wget.h"
  33
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <string.h>
  37 #include <unistd.h>
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "utils.h"
  42 #include "url.h"
  43 #include "host.h"  /* for is_valid_ipv6_address */
  44
  45 #ifdef __VMS
  46 #include "vms.h"
  47 #endif /* def __VMS */
  48
  49 #ifdef TESTING
  50 #include "test.h"
  51 #endif
  52
  53 enum {
  54   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  55   scm_has_params = 2,           /* whether scheme has ;params */
  56   scm_has_query = 4,            /* whether scheme has ?query */
  57   scm_has_fragment = 8          /* whether scheme has #fragment */
  58 };
  59
  60 struct scheme_data
  61 {
  62   /* Short name of the scheme, such as "http" or "ftp". */
  63   const char *name;
  64   /* Leading string that identifies the scheme, such as "https://". */
  65   const char *leading_string;
  66   /* Default port of the scheme when none is specified. */
  67   int default_port;
  68   /* Various flags. */
  69   int flags;
  70 };
  71
  72 /* Supported schemes: */
  73 static struct scheme_data supported_schemes[] =
  74 {
  75   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  76 #ifdef HAVE_SSL
  77   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  78 #endif
  79   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  80
  81   /* SCHEME_INVALID */
  82   { NULL,       NULL,       -1,                 0 }
  83 };
  84
  85 /* Forward declarations: */
  86
  87 static bool path_simplify (enum url_scheme, char *);
  88 \f
  89 /* Support for escaping and unescaping of URL strings.  */
  90
  91 /* Table of "reserved" and "unsafe" characters.  Those terms are
  92    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  93    specs, but the general idea remains.
  94
  95    A reserved character is the one that you can't decode without
  96    changing the meaning of the URL.  For example, you can't decode
  97    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  98    path components is different.  Non-reserved characters can be
  99    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
 100    unsafe characters are loosely based on rfc1738, plus "$" and ",",
 101    as recommended by rfc2396, and minus "~", which is very frequently
 102    used (and sometimes unrecognized as %7E by broken servers).
 103
 104    An unsafe character is the one that should be encoded when URLs are
 105    placed in foreign environments.  E.g. space and newline are unsafe
 106    in HTTP contexts because HTTP uses them as separator and line
 107    terminator, so they must be encoded to %20 and %0A respectively.
 108    "*" is unsafe in shell context, etc.
 109
 110    We determine whether a character is unsafe through static table
 111    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 112
 113 enum {
 114   /* rfc1738 reserved chars + "$" and ",".  */
 115   urlchr_reserved = 1,
 116
 117   /* rfc1738 unsafe chars, plus non-printables.  */
 118   urlchr_unsafe   = 2
 119 };
 120
 121 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 122 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 123 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 124
 125 /* Shorthands for the table: */
 126 #define R  urlchr_reserved
 127 #define U  urlchr_unsafe
 128 #define RU R|U
 129
 130 static const unsigned char urlchr_table[256] =
 131 {
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 133   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 134   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 135   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 136   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 137   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 139   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 140  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 141   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 142   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 143   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 144   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 145   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 146   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 147   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 148
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 152   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 153
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 156   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 157   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 158 };
 159 #undef R
 160 #undef U
 161 #undef RU
 162
 163 /* URL-unescape the string S.
 164
 165    This is done by transforming the sequences "%HH" to the character
 166    represented by the hexadecimal digits HH.  If % is not followed by
 167    two hexadecimal digits, it is inserted literally.
 168
 169    The transformation is done in place.  If you need the original
 170    string intact, make a copy before calling this function.  */
 171
 172 static void
 173 url_unescape (char *s)
 174 {
 175   char *t = s;                  /* t - tortoise */
 176   char *h = s;                  /* h - hare     */
 177
 178   for (; *h; h++, t++)
 179     {
 180       if (*h != '%')
 181         {
 182         copychar:
 183           *t = *h;
 184         }
 185       else
 186         {
 187           char c;
 188           /* Do nothing if '%' is not followed by two hex digits. */
 189           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 190             goto copychar;
 191           c = X2DIGITS_TO_NUM (h[1], h[2]);
 192           /* Don't unescape %00 because there is no way to insert it
 193              into a C string without effectively truncating it. */
 194           if (c == '\0')
 195             goto copychar;
 196           *t = c;
 197           h += 2;
 198         }
 199     }
 200   *t = '\0';
 201 }
 202
 203 /* The core of url_escape_* functions.  Escapes the characters that
 204    match the provided mask in urlchr_table.
 205
 206    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 207    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 208    allocated string will be returned in all cases.  */
 209
 210 static char *
 211 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 212 {
 213   const char *p1;
 214   char *p2, *newstr;
 215   int newlen;
 216   int addition = 0;
 217
 218   for (p1 = s; *p1; p1++)
 219     if (urlchr_test (*p1, mask))
 220       addition += 2;            /* Two more characters (hex digits) */
 221
 222   if (!addition)
 223     return allow_passthrough ? (char *)s : xstrdup (s);
 224
 225   newlen = (p1 - s) + addition;
 226   newstr = xmalloc (newlen + 1);
 227
 228   p1 = s;
 229   p2 = newstr;
 230   while (*p1)
 231     {
 232       /* Quote the characters that match the test mask. */
 233       if (urlchr_test (*p1, mask))
 234         {
 235           unsigned char c = *p1++;
 236           *p2++ = '%';
 237           *p2++ = XNUM_TO_DIGIT (c >> 4);
 238           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 239         }
 240       else
 241         *p2++ = *p1++;
 242     }
 243   assert (p2 - newstr == newlen);
 244   *p2 = '\0';
 245
 246   return newstr;
 247 }
 248
 249 /* URL-escape the unsafe characters (see urlchr_table) in a given
 250    string, returning a freshly allocated string.  */
 251
 252 char *
 253 url_escape (const char *s)
 254 {
 255   return url_escape_1 (s, urlchr_unsafe, false);
 256 }
 257
 258 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
 259    a given string, returning a freshly allocated string.  */
 260
 261 char *
 262 url_escape_unsafe_and_reserved (const char *s)
 263 {
 264   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
 265 }
 266
 267 /* URL-escape the unsafe characters (see urlchr_table) in a given
 268    string.  If no characters are unsafe, S is returned.  */
 269
 270 static char *
 271 url_escape_allow_passthrough (const char *s)
 272 {
 273   return url_escape_1 (s, urlchr_unsafe, true);
 274 }
 275 \f
 276 /* Decide whether the char at position P needs to be encoded.  (It is
 277    not enough to pass a single char *P because the function may need
 278    to inspect the surrounding context.)
 279
 280    Return true if the char should be escaped as %XX, false otherwise.  */
 281
 282 static inline bool
 283 char_needs_escaping (const char *p)
 284 {
 285   if (*p == '%')
 286     {
 287       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 288         return false;
 289       else
 290         /* Garbled %.. sequence: encode `%'. */
 291         return true;
 292     }
 293   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 294     return true;
 295   else
 296     return false;
 297 }
 298
 299 /* Translate a %-escaped (but possibly non-conformant) input string S
 300    into a %-escaped (and conformant) output string.  If no characters
 301    are encoded or decoded, return the same string S; otherwise, return
 302    a freshly allocated string with the new contents.
 303
 304    After a URL has been run through this function, the protocols that
 305    use `%' as the quote character can use the resulting string as-is,
 306    while those that don't can use url_unescape to get to the intended
 307    data.  This function is stable: once the input is transformed,
 308    further transformations of the result yield the same output.
 309
 310    Let's discuss why this function is needed.
 311
 312    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 313    a raw space character would mess up the HTTP request, it needs to
 314    be quoted, like this:
 315
 316        GET /abc%20def HTTP/1.0
 317
 318    It would appear that the unsafe chars need to be quoted, for
 319    example with url_escape.  But what if we're requested to download
 320    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 321    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 322    part of URL syntax, "%20" is the correct way to denote a literal
 323    space on the Wget command line.  This leads to the conclusion that
 324    in that case Wget should not call url_escape, but leave the `%20'
 325    as is.  This is clearly contradictory, but it only gets worse.
 326
 327    What if the requested URI is `abc%20 def'?  If we call url_escape,
 328    we end up with `/abc%2520%20def', which is almost certainly not
 329    intended.  If we don't call url_escape, we are left with the
 330    embedded space and cannot complete the request.  What the user
 331    meant was for Wget to request `/abc%20%20def', and this is where
 332    reencode_escapes kicks in.
 333
 334    Wget used to solve this by first decoding %-quotes, and then
 335    encoding all the "unsafe" characters found in the resulting string.
 336    This was wrong because it didn't preserve certain URL special
 337    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 338    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 339    whether we considered `+' reserved (it is).  One of these results
 340    is inevitable because by the second step we would lose information
 341    on whether the `+' was originally encoded or not.  Both results
 342    were wrong because in CGI parameters + means space, while %2B means
 343    literal plus.  reencode_escapes correctly translates the above to
 344    "a%2B+b", i.e. returns the original string.
 345
 346    This function uses a modified version of the algorithm originally
 347    proposed by Anon Sricharoenchai:
 348
 349    * Encode all "unsafe" characters, except those that are also
 350      "reserved", to %XX.  See urlchr_table for which characters are
 351      unsafe and reserved.
 352
 353    * Encode the "%" characters not followed by two hex digits to
 354      "%25".
 355
 356    * Pass through all other characters and %XX escapes as-is.  (Up to
 357      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 358      characters, but that was obtrusive and broke some servers.)
 359
 360    Anon's test case:
 361
 362    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 363    ->
 364    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 365
 366    Simpler test cases:
 367
 368    "foo bar"         -> "foo%20bar"
 369    "foo%20bar"       -> "foo%20bar"
 370    "foo %20bar"      -> "foo%20%20bar"
 371    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 372    "foo%25%20bar"    -> "foo%25%20bar"
 373    "foo%2%20bar"     -> "foo%252%20bar"
 374    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 375    "foo%2b+bar"      -> "foo%2b+bar"  */
 376
 377 static char *
 378 reencode_escapes (const char *s)
 379 {
 380   const char *p1;
 381   char *newstr, *p2;
 382   int oldlen, newlen;
 383
 384   int encode_count = 0;
 385
 386   /* First pass: inspect the string to see if there's anything to do,
 387      and to calculate the new length.  */
 388   for (p1 = s; *p1; p1++)
 389     if (char_needs_escaping (p1))
 390       ++encode_count;
 391
 392   if (!encode_count)
 393     /* The string is good as it is. */
 394     return (char *) s;          /* C const model sucks. */
 395
 396   oldlen = p1 - s;
 397   /* Each encoding adds two characters (hex digits).  */
 398   newlen = oldlen + 2 * encode_count;
 399   newstr = xmalloc (newlen + 1);
 400
 401   /* Second pass: copy the string to the destination address, encoding
 402      chars when needed.  */
 403   p1 = s;
 404   p2 = newstr;
 405
 406   while (*p1)
 407     if (char_needs_escaping (p1))
 408       {
 409         unsigned char c = *p1++;
 410         *p2++ = '%';
 411         *p2++ = XNUM_TO_DIGIT (c >> 4);
 412         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 413       }
 414     else
 415       *p2++ = *p1++;
 416
 417   *p2 = '\0';
 418   assert (p2 - newstr == newlen);
 419   return newstr;
 420 }
 421 \f
 422 /* Returns the scheme type if the scheme is supported, or
 423    SCHEME_INVALID if not.  */
 424
 425 enum url_scheme
 426 url_scheme (const char *url)
 427 {
 428   int i;
 429
 430   for (i = 0; supported_schemes[i].leading_string; i++)
 431     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 432                           strlen (supported_schemes[i].leading_string)))
 433       {
 434         if (!(supported_schemes[i].flags & scm_disabled))
 435           return (enum url_scheme) i;
 436         else
 437           return SCHEME_INVALID;
 438       }
 439
 440   return SCHEME_INVALID;
 441 }
 442
 443 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 444
 445 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 446    currently implemented, it returns true if URL begins with
 447    [-+a-zA-Z0-9]+: .  */
 448
 449 bool
 450 url_has_scheme (const char *url)
 451 {
 452   const char *p = url;
 453
 454   /* The first char must be a scheme char. */
 455   if (!*p || !SCHEME_CHAR (*p))
 456     return false;
 457   ++p;
 458   /* Followed by 0 or more scheme chars. */
 459   while (*p && SCHEME_CHAR (*p))
 460     ++p;
 461   /* Terminated by ':'. */
 462   return *p == ':';
 463 }
 464
 465 bool
 466 url_valid_scheme (const char *url)
 467 {
 468   enum url_scheme scheme = url_scheme (url);
 469   return scheme != SCHEME_INVALID;
 470 }
 471
 472 int
 473 scheme_default_port (enum url_scheme scheme)
 474 {
 475   return supported_schemes[scheme].default_port;
 476 }
 477
 478 void
 479 scheme_disable (enum url_scheme scheme)
 480 {
 481   supported_schemes[scheme].flags |= scm_disabled;
 482 }
 483
 484 /* Skip the username and password, if present in the URL.  The
 485    function should *not* be called with the complete URL, but with the
 486    portion after the scheme.
 487
 488    If no username and password are found, return URL.  */
 489
 490 static const char *
 491 url_skip_credentials (const char *url)
 492 {
 493   /* Look for '@' that comes before terminators, such as '/', '?',
 494      '#', or ';'.  */
 495   const char *p = (const char *)strpbrk (url, "@/?#;");
 496   if (!p || *p != '@')
 497     return url;
 498   return p + 1;
 499 }
 500
 501 /* Parse credentials contained in [BEG, END).  The region is expected
 502    to have come from a URL and is unescaped.  */
 503
 504 static bool
 505 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 506 {
 507   char *colon;
 508   const char *userend;
 509
 510   if (beg == end)
 511     return false;               /* empty user name */
 512
 513   colon = memchr (beg, ':', end - beg);
 514   if (colon == beg)
 515     return false;               /* again empty user name */
 516
 517   if (colon)
 518     {
 519       *passwd = strdupdelim (colon + 1, end);
 520       userend = colon;
 521       url_unescape (*passwd);
 522     }
 523   else
 524     {
 525       *passwd = NULL;
 526       userend = end;
 527     }
 528   *user = strdupdelim (beg, userend);
 529   url_unescape (*user);
 530   return true;
 531 }
 532
 533 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 534    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 535    like this:
 536
 537    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 538    www.foo.com[:port]            -> http://www.foo.com[:port]
 539
 540    FTP shorthands look like this:
 541
 542    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 543    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 544
 545    If the URL needs not or cannot be rewritten, return NULL.  */
 546
 547 char *
 548 rewrite_shorthand_url (const char *url)
 549 {
 550   const char *p;
 551   char *ret;
 552
 553   if (url_scheme (url) != SCHEME_INVALID)
 554     return NULL;
 555
 556   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 557      latter Netscape.  */
 558   p = strpbrk (url, ":/");
 559   if (p == url)
 560     return NULL;
 561
 562   /* If we're looking at "://", it means the URL uses a scheme we
 563      don't support, which may include "https" when compiled without
 564      SSL support.  Don't bogusly rewrite such URLs.  */
 565   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 566     return NULL;
 567
 568   if (p && *p == ':')
 569     {
 570       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 571          special case of http port number ("localhost:10000").  */
 572       int digits = strspn (p + 1, "0123456789");
 573       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 574         goto http;
 575
 576       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 577       ret = aprintf ("ftp://%s", url);
 578       ret[6 + (p - url)] = '/';
 579     }
 580   else
 581     {
 582     http:
 583       /* Just prepend "http://" to URL. */
 584       ret = aprintf ("http://%s", url);
 585     }
 586   return ret;
 587 }
 588 \f
 589 static void split_path (const char *, char **, char **);
 590
 591 /* Like strpbrk, with the exception that it returns the pointer to the
 592    terminating zero (end-of-string aka "eos") if no matching character
 593    is found.  */
 594
 595 static inline char *
 596 strpbrk_or_eos (const char *s, const char *accept)
 597 {
 598   char *p = strpbrk (s, accept);
 599   if (!p)
 600     p = strchr (s, '\0');
 601   return p;
 602 }
 603
 604 /* Turn STR into lowercase; return true if a character was actually
 605    changed. */
 606
 607 static bool
 608 lowercase_str (char *str)
 609 {
 610   bool changed = false;
 611   for (; *str; str++)
 612     if (c_isupper (*str))
 613       {
 614         changed = true;
 615         *str = c_tolower (*str);
 616       }
 617   return changed;
 618 }
 619
 620 static const char *
 621 init_seps (enum url_scheme scheme)
 622 {
 623   static char seps[8] = ":/";
 624   char *p = seps + 2;
 625   int flags = supported_schemes[scheme].flags;
 626
 627   if (flags & scm_has_params)
 628     *p++ = ';';
 629   if (flags & scm_has_query)
 630     *p++ = '?';
 631   if (flags & scm_has_fragment)
 632     *p++ = '#';
 633   *p = '\0';
 634   return seps;
 635 }
 636
 637 static const char *parse_errors[] = {
 638 #define PE_NO_ERROR                     0
 639   N_("No error"),
 640 #define PE_UNSUPPORTED_SCHEME           1
 641   N_("Unsupported scheme %s"), /* support for format token only here */
 642 #define PE_MISSING_SCHEME               2
 643   N_("Scheme missing"),
 644 #define PE_INVALID_HOST_NAME            3
 645   N_("Invalid host name"),
 646 #define PE_BAD_PORT_NUMBER              4
 647   N_("Bad port number"),
 648 #define PE_INVALID_USER_NAME            5
 649   N_("Invalid user name"),
 650 #define PE_UNTERMINATED_IPV6_ADDRESS    6
 651   N_("Unterminated IPv6 numeric address"),
 652 #define PE_IPV6_NOT_SUPPORTED           7
 653   N_("IPv6 addresses not supported"),
 654 #define PE_INVALID_IPV6_ADDRESS         8
 655   N_("Invalid IPv6 numeric address")
 656 };
 657
 658 /* Parse a URL.
 659
 660    Return a new struct url if successful, NULL on error.  In case of
 661    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 662    error code. */
 663 struct url *
 664 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 665 {
 666   struct url *u;
 667   const char *p;
 668   bool path_modified, host_modified;
 669
 670   enum url_scheme scheme;
 671   const char *seps;
 672
 673   const char *uname_b,     *uname_e;
 674   const char *host_b,      *host_e;
 675   const char *path_b,      *path_e;
 676   const char *params_b,    *params_e;
 677   const char *query_b,     *query_e;
 678   const char *fragment_b,  *fragment_e;
 679
 680   int port;
 681   char *user = NULL, *passwd = NULL;
 682
 683   const char *url_encoded = NULL;
 684   char *new_url = NULL;
 685
 686   int error_code;
 687
 688   scheme = url_scheme (url);
 689   if (scheme == SCHEME_INVALID)
 690     {
 691       if (url_has_scheme (url))
 692         error_code = PE_UNSUPPORTED_SCHEME;
 693       else
 694         error_code = PE_MISSING_SCHEME;
 695       goto error;
 696     }
 697
 698   if (iri && iri->utf8_encode)
 699     {
 700       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
 701       if (!iri->utf8_encode)
 702         new_url = NULL;
 703       else
 704         iri->orig_url = xstrdup (url);
 705     }
 706
 707   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
 708   if (percent_encode)
 709     url_encoded = reencode_escapes (new_url ? new_url : url);
 710   else
 711     url_encoded = new_url ? new_url : url;
 712
 713   p = url_encoded;
 714
 715   if (new_url && url_encoded != new_url)
 716     xfree (new_url);
 717
 718   p += strlen (supported_schemes[scheme].leading_string);
 719   uname_b = p;
 720   p = url_skip_credentials (p);
 721   uname_e = p;
 722
 723   /* scheme://user:pass@host[:port]... */
 724   /*                    ^              */
 725
 726   /* We attempt to break down the URL into the components path,
 727      params, query, and fragment.  They are ordered like this:
 728
 729        scheme://host[:port][/path][;params][?query][#fragment]  */
 730
 731   path_b     = path_e     = NULL;
 732   params_b   = params_e   = NULL;
 733   query_b    = query_e    = NULL;
 734   fragment_b = fragment_e = NULL;
 735
 736   /* Initialize separators for optional parts of URL, depending on the
 737      scheme.  For example, FTP has params, and HTTP and HTTPS have
 738      query string and fragment. */
 739   seps = init_seps (scheme);
 740
 741   host_b = p;
 742
 743   if (*p == '[')
 744     {
 745       /* Handle IPv6 address inside square brackets.  Ideally we'd
 746          just look for the terminating ']', but rfc2732 mandates
 747          rejecting invalid IPv6 addresses.  */
 748
 749       /* The address begins after '['. */
 750       host_b = p + 1;
 751       host_e = strchr (host_b, ']');
 752
 753       if (!host_e)
 754         {
 755           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 756           goto error;
 757         }
 758
 759 #ifdef ENABLE_IPV6
 760       /* Check if the IPv6 address is valid. */
 761       if (!is_valid_ipv6_address(host_b, host_e))
 762         {
 763           error_code = PE_INVALID_IPV6_ADDRESS;
 764           goto error;
 765         }
 766
 767       /* Continue parsing after the closing ']'. */
 768       p = host_e + 1;
 769 #else
 770       error_code = PE_IPV6_NOT_SUPPORTED;
 771       goto error;
 772 #endif
 773
 774       /* The closing bracket must be followed by a separator or by the
 775          null char.  */
 776       /* http://[::1]... */
 777       /*             ^   */
 778       if (!strchr (seps, *p))
 779         {
 780           /* Trailing garbage after []-delimited IPv6 address. */
 781           error_code = PE_INVALID_HOST_NAME;
 782           goto error;
 783         }
 784     }
 785   else
 786     {
 787       p = strpbrk_or_eos (p, seps);
 788       host_e = p;
 789     }
 790   ++seps;                       /* advance to '/' */
 791
 792   if (host_b == host_e)
 793     {
 794       error_code = PE_INVALID_HOST_NAME;
 795       goto error;
 796     }
 797
 798   port = scheme_default_port (scheme);
 799   if (*p == ':')
 800     {
 801       const char *port_b, *port_e, *pp;
 802
 803       /* scheme://host:port/tralala */
 804       /*              ^             */
 805       ++p;
 806       port_b = p;
 807       p = strpbrk_or_eos (p, seps);
 808       port_e = p;
 809
 810       /* Allow empty port, as per rfc2396. */
 811       if (port_b != port_e)
 812         for (port = 0, pp = port_b; pp < port_e; pp++)
 813           {
 814             if (!c_isdigit (*pp))
 815               {
 816                 /* http://host:12randomgarbage/blah */
 817                 /*               ^                  */
 818                 error_code = PE_BAD_PORT_NUMBER;
 819                 goto error;
 820               }
 821             port = 10 * port + (*pp - '0');
 822             /* Check for too large port numbers here, before we have
 823                a chance to overflow on bogus port values.  */
 824             if (port > 0xffff)
 825               {
 826                 error_code = PE_BAD_PORT_NUMBER;
 827                 goto error;
 828               }
 829           }
 830     }
 831   /* Advance to the first separator *after* '/' (either ';' or '?',
 832      depending on the scheme).  */
 833   ++seps;
 834
 835   /* Get the optional parts of URL, each part being delimited by
 836      current location and the position of the next separator.  */
 837 #define GET_URL_PART(sepchar, var) do {                         \
 838   if (*p == sepchar)                                            \
 839     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 840   ++seps;                                                       \
 841 } while (0)
 842
 843   GET_URL_PART ('/', path);
 844   if (supported_schemes[scheme].flags & scm_has_params)
 845     GET_URL_PART (';', params);
 846   if (supported_schemes[scheme].flags & scm_has_query)
 847     GET_URL_PART ('?', query);
 848   if (supported_schemes[scheme].flags & scm_has_fragment)
 849     GET_URL_PART ('#', fragment);
 850
 851 #undef GET_URL_PART
 852   assert (*p == 0);
 853
 854   if (uname_b != uname_e)
 855     {
 856       /* http://user:pass@host */
 857       /*        ^         ^    */
 858       /*     uname_b   uname_e */
 859       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 860         {
 861           error_code = PE_INVALID_USER_NAME;
 862           goto error;
 863         }
 864     }
 865
 866   u = xnew0 (struct url);
 867   u->scheme = scheme;
 868   u->host   = strdupdelim (host_b, host_e);
 869   u->port   = port;
 870   u->user   = user;
 871   u->passwd = passwd;
 872
 873   u->path = strdupdelim (path_b, path_e);
 874   path_modified = path_simplify (scheme, u->path);
 875   split_path (u->path, &u->dir, &u->file);
 876
 877   host_modified = lowercase_str (u->host);
 878
 879   /* Decode %HH sequences in host name.  This is important not so much
 880      to support %HH sequences in host names (which other browser
 881      don't), but to support binary characters (which will have been
 882      converted to %HH by reencode_escapes).  */
 883   if (strchr (u->host, '%'))
 884     {
 885       url_unescape (u->host);
 886       host_modified = true;
 887
 888       /* Apply IDNA regardless of iri->utf8_encode status */
 889       if (opt.enable_iri && iri)
 890         {
 891           char *new = idn_encode (iri, u->host);
 892           if (new)
 893             {
 894               xfree (u->host);
 895               u->host = new;
 896               host_modified = true;
 897             }
 898         }
 899     }
 900
 901   if (params_b)
 902     u->params = strdupdelim (params_b, params_e);
 903   if (query_b)
 904     u->query = strdupdelim (query_b, query_e);
 905   if (fragment_b)
 906     u->fragment = strdupdelim (fragment_b, fragment_e);
 907
 908   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 909     {
 910       /* If we suspect that a transformation has rendered what
 911          url_string might return different from URL_ENCODED, rebuild
 912          u->url using url_string.  */
 913       u->url = url_string (u, URL_AUTH_SHOW);
 914
 915       if (url_encoded != url)
 916         xfree ((char *) url_encoded);
 917     }
 918   else
 919     {
 920       if (url_encoded == url)
 921         u->url = xstrdup (url);
 922       else
 923         u->url = (char *) url_encoded;
 924     }
 925
 926   return u;
 927
 928  error:
 929   /* Cleanup in case of error: */
 930   if (url_encoded && url_encoded != url)
 931     xfree ((char *) url_encoded);
 932
 933   /* Transmit the error code to the caller, if the caller wants to
 934      know.  */
 935   if (error)
 936     *error = error_code;
 937   return NULL;
 938 }
 939
 940 /* Return the error message string from ERROR_CODE, which should have
 941    been retrieved from url_parse.  The error message is translated.  */
 942
 943 char *
 944 url_error (const char *url, int error_code)
 945 {
 946   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 947
 948   if (error_code == PE_UNSUPPORTED_SCHEME)
 949     {
 950       char *error, *p;
 951       char *scheme = xstrdup (url);
 952       assert (url_has_scheme (url));
 953
 954       if ((p = strchr (scheme, ':')))
 955         *p = '\0';
 956       if (!strcasecmp (scheme, "https"))
 957         error = aprintf (_("HTTPS support not compiled in"));
 958       else
 959         error = aprintf (_(parse_errors[error_code]), quote (scheme));
 960       xfree (scheme);
 961
 962       return error;
 963     }
 964   else
 965     return xstrdup (_(parse_errors[error_code]));
 966 }
 967
 968 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 969    expected to be URL-escaped.
 970
 971    The path is split into directory (the part up to the last slash)
 972    and file (the part after the last slash), which are subsequently
 973    unescaped.  Examples:
 974
 975    PATH                 DIR           FILE
 976    "foo/bar/baz"        "foo/bar"     "baz"
 977    "foo/bar/"           "foo/bar"     ""
 978    "foo"                ""            "foo"
 979    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 980
 981    DIR and FILE are freshly allocated.  */
 982
 983 static void
 984 split_path (const char *path, char **dir, char **file)
 985 {
 986   char *last_slash = strrchr (path, '/');
 987   if (!last_slash)
 988     {
 989       *dir = xstrdup ("");
 990       *file = xstrdup (path);
 991     }
 992   else
 993     {
 994       *dir = strdupdelim (path, last_slash);
 995       *file = xstrdup (last_slash + 1);
 996     }
 997   url_unescape (*dir);
 998   url_unescape (*file);
 999 }
1000
1001 /* Note: URL's "full path" is the path with the query string and
1002    params appended.  The "fragment" (#foo) is intentionally ignored,
1003    but that might be changed.  For example, if the original URL was
1004    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1005    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1006
1007 /* Return the length of the full path, without the terminating
1008    zero.  */
1009
1010 static int
1011 full_path_length (const struct url *url)
1012 {
1013   int len = 0;
1014
1015 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1016
1017   FROB (path);
1018   FROB (params);
1019   FROB (query);
1020
1021 #undef FROB
1022
1023   return len;
1024 }
1025
1026 /* Write out the full path. */
1027
1028 static void
1029 full_path_write (const struct url *url, char *where)
1030 {
1031 #define FROB(el, chr) do {                      \
1032   char *f_el = url->el;                         \
1033   if (f_el) {                                   \
1034     int l = strlen (f_el);                      \
1035     *where++ = chr;                             \
1036     memcpy (where, f_el, l);                    \
1037     where += l;                                 \
1038   }                                             \
1039 } while (0)
1040
1041   FROB (path, '/');
1042   FROB (params, ';');
1043   FROB (query, '?');
1044
1045 #undef FROB
1046 }
1047
/* Public accessor for the "full path" of URL.  With u->path being
   "foo/bar" and u->query being "param=value", the result is
   "/foo/bar?param=value".  The string is obtained from xmalloc and
   is zero-terminated.  */

char *
url_full_path (const struct url *url)
{
  const int n = full_path_length (url);
  char *result = xmalloc (n + 1);

  /* full_path_write does not terminate the string, so do it here.  */
  result[n] = '\0';
  full_path_write (url, result);

  return result;
}
1063
/* Undo the %XX escaping of one particular character, CHR, inside the
   otherwise escaped string STR.  Used to selectively keep certain
   characters, such as "/" and ":", unescaped.  STR is rewritten in
   place and only gets shorter; nothing is returned.

   Only the exact two hex digits that XNUM_TO_DIGIT yields for CHR are
   recognized after the '%'.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  const char *src = str;        /* read position */
  char *dst = str;              /* write position, never ahead of SRC */

  while (*src)
    {
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
        {
          /* Collapse the three-character escape into CHR.  */
          *dst++ = chr;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
1087
1088 /* Escape unsafe and reserved characters, except for the slash
1089    characters.  */
1090
1091 static char *
1092 url_escape_dir (const char *dir)
1093 {
1094   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1095   if (newdir == dir)
1096     return (char *)dir;
1097
1098   unescape_single_char (newdir, '/');
1099   return newdir;
1100 }
1101
1102 /* Sync u->path and u->url with u->dir and u->file.  Called after
1103    u->file or u->dir have been changed, typically by the FTP code.  */
1104
1105 static void
1106 sync_path (struct url *u)
1107 {
1108   char *newpath, *efile, *edir;
1109
1110   xfree (u->path);
1111
1112   /* u->dir and u->file are not escaped.  URL-escape them before
1113      reassembling them into u->path.  That way, if they contain
1114      separators like '?' or even if u->file contains slashes, the
1115      path will be correctly assembled.  (u->file can contain slashes
1116      if the URL specifies it with %2f, or if an FTP server returns
1117      it.)  */
1118   edir = url_escape_dir (u->dir);
1119   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1120
1121   if (!*edir)
1122     newpath = xstrdup (efile);
1123   else
1124     {
1125       int dirlen = strlen (edir);
1126       int filelen = strlen (efile);
1127
1128       /* Copy "DIR/FILE" to newpath. */
1129       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1130       memcpy (p, edir, dirlen);
1131       p += dirlen;
1132       *p++ = '/';
1133       memcpy (p, efile, filelen);
1134       p += filelen;
1135       *p = '\0';
1136     }
1137
1138   u->path = newpath;
1139
1140   if (edir != u->dir)
1141     xfree (edir);
1142   if (efile != u->file)
1143     xfree (efile);
1144
1145   /* Regenerate u->url as well.  */
1146   xfree (u->url);
1147   u->url = url_string (u, URL_AUTH_SHOW);
1148 }
1149
1150 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1151    This way we can sync u->path and u->url when they get changed.  */
1152
1153 void
1154 url_set_dir (struct url *url, const char *newdir)
1155 {
1156   xfree (url->dir);
1157   url->dir = xstrdup (newdir);
1158   sync_path (url);
1159 }
1160
1161 void
1162 url_set_file (struct url *url, const char *newfile)
1163 {
1164   xfree (url->file);
1165   url->file = xstrdup (newfile);
1166   sync_path (url);
1167 }
1168
1169 void
1170 url_free (struct url *url)
1171 {
1172   xfree (url->host);
1173   xfree (url->path);
1174   xfree (url->url);
1175
1176   xfree_null (url->params);
1177   xfree_null (url->query);
1178   xfree_null (url->fragment);
1179   xfree_null (url->user);
1180   xfree_null (url->passwd);
1181
1182   xfree (url->dir);
1183   xfree (url->file);
1184
1185   xfree (url);
1186 }
1187 \f
1188 /* Create all the necessary directories for PATH (a file).  Calls
1189    make_directory internally.  */
1190 int
1191 mkalldirs (const char *path)
1192 {
1193   const char *p;
1194   char *t;
1195   struct_stat st;
1196   int res;
1197
1198   p = path + strlen (path);
1199   for (; *p != '/' && p != path; p--)
1200     ;
1201
1202   /* Don't create if it's just a file.  */
1203   if ((p == path) && (*p != '/'))
1204     return 0;
1205   t = strdupdelim (path, p);
1206
1207   /* Check whether the directory exists.  */
1208   if ((stat (t, &st) == 0))
1209     {
1210       if (S_ISDIR (st.st_mode))
1211         {
1212           xfree (t);
1213           return 0;
1214         }
1215       else
1216         {
1217           /* If the dir exists as a file name, remove it first.  This
1218              is *only* for Wget to work with buggy old CERN http
1219              servers.  Here is the scenario: When Wget tries to
1220              retrieve a directory without a slash, e.g.
1221              http://foo/bar (bar being a directory), CERN server will
1222              not redirect it too http://foo/bar/ -- it will generate a
1223              directory listing containing links to bar/file1,
1224              bar/file2, etc.  Wget will lose because it saves this
1225              HTML listing to a file `bar', so it cannot create the
1226              directory.  To work around this, if the file of the same
1227              name exists, we just remove it and create the directory
1228              anyway.  */
1229           DEBUGP (("Removing %s because of directory danger!\n", t));
1230           unlink (t);
1231         }
1232     }
1233   res = make_directory (t);
1234   if (res != 0)
1235     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1236   xfree (t);
1237   return res;
1238 }
1239 \f
1240 /* Functions for constructing the file name out of URL components.  */
1241
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* allocated size, as tracked by DO_REALLOC */
  int tail;                     /* number of characters stored so far */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position, by asking DO_REALLOC for room for
   TAIL+APPEND_SIZE characters.  G is evaluated exactly once;
   APPEND_SIZE is parenthesized so that any expression (including a
   conditional one) can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1270
1271 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1272    terminated.  */
1273
1274 static void
1275 append_string (const char *str, struct growable *dest)
1276 {
1277   int l = strlen (str);
1278   GROW (dest, l);
1279   memcpy (TAIL (dest), str, l);
1280   TAIL_INCR (dest, l);
1281 }
1282
1283 /* Append CH to DEST.  For example, append_char (0, DEST)
1284    zero-terminates DEST.  */
1285
1286 static void
1287 append_char (char ch, struct growable *dest)
1288 {
1289   GROW (dest, 1);
1290   *TAIL (dest) = ch;
1291   TAIL_INCR (dest, 1);
1292 }
1293
/* Bit flags stored in filechr_table, one set per byte value.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* True if character C must be quoted in a file name: either non-ASCII
   characters are restricted and C is one, or C's table entry has one
   of the bits in MASK.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))

/* Short names for the table entries; undefined again right after the
   table.  */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above),
   indexed by byte value, sixteen entries per row.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
  /* 0x00-0x0F: NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI */
  UWC, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
  /* 0x10-0x1F: DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US */
  C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
  /* 0x20-0x2F: SP ! " # $ % & ' ( ) * + , - . / */
  0, 0, W, 0, 0, 0, 0, 0, 0, 0, W, 0, 0, 0, 0, UW,
  /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, W, 0, W, 0, W, W,
  /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x50-0x5F: P Q R S T U V W X Y Z [ backslash ] ^ _ */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, W, 0, 0, 0,
  /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x70-0x7F: p q r s t u v w x y z { | } ~ DEL */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, W, 0, 0, C,

  /* 0x80-0x8F (128-143) */
  C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
  /* 0x90-0x9F (144-159) */
  C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,

  /* 0xA0-0xFF (160-255) */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1353
/* FN_PORT_SEP separates host and port in file names when the port is
   not the scheme's default, as in "www.xemacs.org:4001/index.html".
   It is ':' unless file names are restricted for Windows, which
   can't handle ':' in file names; then it is '+'.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '?' unless file names are restricted for Windows, which cannot
   handle '?' in file names; then it is '@'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1364
1365 /* Quote path element, characters in [b, e), as file name, and append
1366    the quoted string to DEST.  Each character is quoted as per
1367    file_unsafe_char and the corresponding table.
1368
1369    If ESCAPED is true, the path element is considered to be
1370    URL-escaped and will be unescaped prior to inspection.  */
1371
1372 static void
1373 append_uri_pathel (const char *b, const char *e, bool escaped,
1374                    struct growable *dest)
1375 {
1376   const char *p;
1377   int quoted, outlen;
1378
1379   int mask;
1380   if (opt.restrict_files_os == restrict_unix)
1381     mask = filechr_not_unix;
1382   else
1383     mask = filechr_not_windows;
1384   if (opt.restrict_files_ctrl)
1385     mask |= filechr_control;
1386
1387   /* Copy [b, e) to PATHEL and URL-unescape it. */
1388   if (escaped)
1389     {
1390       char *unescaped;
1391       BOUNDED_TO_ALLOCA (b, e, unescaped);
1392       url_unescape (unescaped);
1393       b = unescaped;
1394       e = unescaped + strlen (unescaped);
1395     }
1396
1397   /* Defang ".." when found as component of path.  Remember that path
1398      comes from the URL and might contain malicious input.  */
1399   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1400     {
1401       b = "%2E%2E";
1402       e = b + 6;
1403     }
1404
1405   /* Walk the PATHEL string and check how many characters we'll need
1406      to quote.  */
1407   quoted = 0;
1408   for (p = b; p < e; p++)
1409     if (FILE_CHAR_TEST (*p, mask))
1410       ++quoted;
1411
1412   /* Calculate the length of the output string.  e-b is the input
1413      string length.  Each quoted char introduces two additional
1414      characters in the string, hence 2*quoted.  */
1415   outlen = (e - b) + (2 * quoted);
1416   GROW (dest, outlen);
1417
1418   if (!quoted)
1419     {
1420       /* If there's nothing to quote, we can simply append the string
1421          without processing it again.  */
1422       memcpy (TAIL (dest), b, outlen);
1423     }
1424   else
1425     {
1426       char *q = TAIL (dest);
1427       for (p = b; p < e; p++)
1428         {
1429           if (!FILE_CHAR_TEST (*p, mask))
1430             *q++ = *p;
1431           else
1432             {
1433               unsigned char ch = *p;
1434               *q++ = '%';
1435               *q++ = XNUM_TO_DIGIT (ch >> 4);
1436               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1437             }
1438         }
1439       assert (q - TAIL (dest) == outlen);
1440     }
1441
1442   /* Perform inline case transformation if required.  */
1443   if (opt.restrict_files_case == restrict_lowercase
1444       || opt.restrict_files_case == restrict_uppercase)
1445     {
1446       char *q;
1447       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1448         {
1449           if (opt.restrict_files_case == restrict_lowercase)
1450             *q = c_tolower (*q);
1451           else
1452             *q = c_toupper (*q);
1453         }
1454     }
1455
1456   TAIL_INCR (dest, outlen);
1457 }
1458
1459 /* Append to DEST the directory structure that corresponds the
1460    directory part of URL's path.  For example, if the URL is
1461    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1462
1463    Each path element ("dir1" and "dir2" in the above example) is
1464    examined, url-unescaped, and re-escaped as file name element.
1465
1466    Additionally, it cuts as many directories from the path as
1467    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1468    will produce "bar" for the above example.  For 2 or more, it will
1469    produce "".
1470
1471    Each component of the path is quoted for use as file name.  */
1472
1473 static void
1474 append_dir_structure (const struct url *u, struct growable *dest)
1475 {
1476   char *pathel, *next;
1477   int cut = opt.cut_dirs;
1478
1479   /* Go through the path components, de-URL-quote them, and quote them
1480      (if necessary) as file names.  */
1481
1482   pathel = u->path;
1483   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1484     {
1485       if (cut-- > 0)
1486         continue;
1487       if (pathel == next)
1488         /* Ignore empty pathels.  */
1489         continue;
1490
1491       if (dest->tail)
1492         append_char ('/', dest);
1493       append_uri_pathel (pathel, next, true, dest);
1494     }
1495 }
1496
1497 /* Return a unique file name that matches the given URL as good as
1498    possible.  Does not create directories on the file system.  */
1499
1500 char *
1501 url_file_name (const struct url *u, char *replaced_filename)
1502 {
1503   struct growable fnres;        /* stands for "file name result" */
1504
1505   const char *u_file, *u_query;
1506   char *fname, *unique;
1507   char *index_filename = "index.html"; /* The default index file is index.html */
1508
1509   fnres.base = NULL;
1510   fnres.size = 0;
1511   fnres.tail = 0;
1512
1513   /* If an alternative index file was defined, change index_filename */
1514   if (opt.default_page)
1515     index_filename = opt.default_page;
1516
1517
1518   /* Start with the directory prefix, if specified. */
1519   if (opt.dir_prefix)
1520     append_string (opt.dir_prefix, &fnres);
1521
1522   /* If "dirstruct" is turned on (typically the case with -r), add
1523      the host and port (unless those have been turned off) and
1524      directory structure.  */
1525   if (opt.dirstruct)
1526     {
1527       if (opt.protocol_directories)
1528         {
1529           if (fnres.tail)
1530             append_char ('/', &fnres);
1531           append_string (supported_schemes[u->scheme].name, &fnres);
1532         }
1533       if (opt.add_hostdir)
1534         {
1535           if (fnres.tail)
1536             append_char ('/', &fnres);
1537           if (0 != strcmp (u->host, ".."))
1538             append_string (u->host, &fnres);
1539           else
1540             /* Host name can come from the network; malicious DNS may
1541                allow ".." to be resolved, causing us to write to
1542                "../<file>".  Defang such host names.  */
1543             append_string ("%2E%2E", &fnres);
1544           if (u->port != scheme_default_port (u->scheme))
1545             {
1546               char portstr[24];
1547               number_to_string (portstr, u->port);
1548               append_char (FN_PORT_SEP, &fnres);
1549               append_string (portstr, &fnres);
1550             }
1551         }
1552
1553       append_dir_structure (u, &fnres);
1554     }
1555
1556   if (!replaced_filename)
1557     {
1558       /* Add the file name. */
1559       if (fnres.tail)
1560         append_char ('/', &fnres);
1561       u_file = *u->file ? u->file : index_filename;
1562       append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1563
1564       /* Append "?query" to the file name. */
1565       u_query = u->query && *u->query ? u->query : NULL;
1566       if (u_query)
1567         {
1568           append_char (FN_QUERY_SEP, &fnres);
1569           append_uri_pathel (u_query, u_query + strlen (u_query),
1570                              true, &fnres);
1571         }
1572     }
1573   else
1574     {
1575       if (fnres.tail)
1576         append_char ('/', &fnres);
1577       u_file = replaced_filename;
1578       append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1579     }
1580
1581   /* Zero-terminate the file name. */
1582   append_char ('\0', &fnres);
1583
1584   fname = fnres.base;
1585
1586   /* Check the cases in which the unique extensions are not used:
1587      1) Clobbering is turned off (-nc).
1588      2) Retrieval with regetting.
1589      3) Timestamping is used.
1590      4) Hierarchy is built.
1591
1592      The exception is the case when file does exist and is a
1593      directory (see `mkalldirs' for explanation).  */
1594
1595   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1596       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1597     {
1598       unique = fname;
1599     }
1600   else
1601     {
1602       unique = unique_name (fname, true);
1603       if (unique != fname)
1604         xfree (fname);
1605     }
1606
1607 /* On VMS, alter the name as required. */
1608 #ifdef __VMS
1609   {
1610     char *unique2;
1611
1612     unique2 = ods_conform( unique);
1613     if (unique2 != unique)
1614       {
1615         xfree (unique);
1616         unique = unique2;
1617       }
1618   }
1619 #endif /* def __VMS */
1620
1621   return unique;
1622 }
1623 \f
1624 /* Resolve "." and ".." elements of PATH by destructively modifying
1625    PATH and return true if PATH has been modified, false otherwise.
1626
1627    The algorithm is in spirit similar to the one described in rfc1808,
1628    although implemented differently, in one pass.  To recap, path
1629    elements containing only "." are removed, and ".." is taken to mean
1630    "back up one element".  Single leading and trailing slashes are
1631    preserved.
1632
1633    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1634    test examples are provided below.  If you change anything in this
1635    function, run test_path_simplify to make sure you haven't broken a
1636    test case.  */
1637
1638 static bool
1639 path_simplify (enum url_scheme scheme, char *path)
1640 {
1641   char *h = path;               /* hare */
1642   char *t = path;               /* tortoise */
1643   char *beg = path;
1644   char *end = strchr (path, '\0');
1645
1646   while (h < end)
1647     {
1648       /* Hare should be at the beginning of a path element. */
1649
1650       if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
1651         {
1652           /* Ignore "./". */
1653           h += 2;
1654         }
1655       else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
1656         {
1657           /* Handle "../" by retreating the tortoise by one path
1658              element -- but not past beggining.  */
1659           if (t > beg)
1660             {
1661               /* Move backwards until T hits the beginning of the
1662                  previous path element or the beginning of path. */
1663               for (--t; t > beg && t[-1] != '/'; t--)
1664                 ;
1665             }
1666           else if (scheme == SCHEME_FTP)
1667             {
1668               /* If we're at the beginning, copy the "../" literally
1669                  and move the beginning so a later ".." doesn't remove
1670                  it.  This violates RFC 3986; but we do it for FTP
1671                  anyway because there is otherwise no way to get at a
1672                  parent directory, when the FTP server drops us in a
1673                  non-root directory (which is not uncommon). */
1674               beg = t + 3;
1675               goto regular;
1676             }
1677           h += 3;
1678         }
1679       else
1680         {
1681         regular:
1682           /* A regular path element.  If H hasn't advanced past T,
1683              simply skip to the next path element.  Otherwise, copy
1684              the path element until the next slash.  */
1685           if (t == h)
1686             {
1687               /* Skip the path element, including the slash.  */
1688               while (h < end && *h != '/')
1689                 t++, h++;
1690               if (h < end)
1691                 t++, h++;
1692             }
1693           else
1694             {
1695               /* Copy the path element, including the final slash.  */
1696               while (h < end && *h != '/')
1697                 *t++ = *h++;
1698               if (h < end)
1699                 *t++ = *h++;
1700             }
1701         }
1702     }
1703
1704   if (t != h)
1705     *t = '\0';
1706
1707   return t != h;
1708 }
1709 \f
1710 /* Return the length of URL's path.  Path is considered to be
1711    terminated by one or more of the ?query or ;params or #fragment,
1712    depending on the scheme.  */
1713
1714 static const char *
1715 path_end (const char *url)
1716 {
1717   enum url_scheme scheme = url_scheme (url);
1718   const char *seps;
1719   if (scheme == SCHEME_INVALID)
1720     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1721   /* +2 to ignore the first two separators ':' and '/' */
1722   seps = init_seps (scheme) + 2;
1723   return strpbrk_or_eos (url, seps);
1724 }
1725
/* Search the range [b, e) backwards for character C.  Evaluates to
   whatever memrchr returns for that range.  B is evaluated twice.  */
#define find_last_char(b, e, c) \
  memrchr ((b), (c), (e) - (b))
1729
1730 /* Merge BASE with LINK and return the resulting URI.
1731
1732    Either of the URIs may be absolute or relative, complete with the
1733    host name, or path only.  This tries to reasonably handle all
1734    foreseeable cases.  It only employs minimal URL parsing, without
1735    knowledge of the specifics of schemes.
1736
1737    I briefly considered making this function call path_simplify after
1738    the merging process, as rfc1738 seems to suggest.  This is a bad
1739    idea for several reasons: 1) it complexifies the code, and 2)
1740    url_parse has to simplify path anyway, so it's wasteful to boot.  */
1741
1742 char *
1743 uri_merge (const char *base, const char *link)
1744 {
1745   int linklength;
1746   const char *end;
1747   char *merge;
1748
1749   if (url_has_scheme (link))
1750     return xstrdup (link);
1751
1752   /* We may not examine BASE past END. */
1753   end = path_end (base);
1754   linklength = strlen (link);
1755
1756   if (!*link)
1757     {
1758       /* Empty LINK points back to BASE, query string and all. */
1759       return xstrdup (base);
1760     }
1761   else if (*link == '?')
1762     {
1763       /* LINK points to the same location, but changes the query
1764          string.  Examples: */
1765       /* uri_merge("path",         "?new") -> "path?new"     */
1766       /* uri_merge("path?foo",     "?new") -> "path?new"     */
1767       /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1768       /* uri_merge("path#foo",     "?new") -> "path?new"     */
1769       int baselength = end - base;
1770       merge = xmalloc (baselength + linklength + 1);
1771       memcpy (merge, base, baselength);
1772       memcpy (merge + baselength, link, linklength);
1773       merge[baselength + linklength] = '\0';
1774     }
1775   else if (*link == '#')
1776     {
1777       /* uri_merge("path",         "#new") -> "path#new"     */
1778       /* uri_merge("path#foo",     "#new") -> "path#new"     */
1779       /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1780       /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1781       int baselength;
1782       const char *end1 = strchr (base, '#');
1783       if (!end1)
1784         end1 = base + strlen (base);
1785       baselength = end1 - base;
1786       merge = xmalloc (baselength + linklength + 1);
1787       memcpy (merge, base, baselength);
1788       memcpy (merge + baselength, link, linklength);
1789       merge[baselength + linklength] = '\0';
1790     }
1791   else if (*link == '/' && *(link + 1) == '/')
1792     {
1793       /* LINK begins with "//" and so is a net path: we need to
1794          replace everything after (and including) the double slash
1795          with LINK. */
1796
1797       /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1798       /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1799       /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1800
1801       int span;
1802       const char *slash;
1803       const char *start_insert;
1804
1805       /* Look for first slash. */
1806       slash = memchr (base, '/', end - base);
1807       /* If found slash and it is a double slash, then replace
1808          from this point, else default to replacing from the
1809          beginning.  */
1810       if (slash && *(slash + 1) == '/')
1811         start_insert = slash;
1812       else
1813         start_insert = base;
1814
1815       span = start_insert - base;
1816       merge = xmalloc (span + linklength + 1);
1817       if (span)
1818         memcpy (merge, base, span);
1819       memcpy (merge + span, link, linklength);
1820       merge[span + linklength] = '\0';
1821     }
1822   else if (*link == '/')
1823     {
1824       /* LINK is an absolute path: we need to replace everything
1825          after (and including) the FIRST slash with LINK.
1826
1827          So, if BASE is "http://host/whatever/foo/bar", and LINK is
1828          "/qux/xyzzy", our result should be
1829          "http://host/qux/xyzzy".  */
1830       int span;
1831       const char *slash;
1832       const char *start_insert = NULL; /* for gcc to shut up. */
1833       const char *pos = base;
1834       bool seen_slash_slash = false;
1835       /* We're looking for the first slash, but want to ignore
1836          double slash. */
1837     again:
1838       slash = memchr (pos, '/', end - pos);
1839       if (slash && !seen_slash_slash)
1840         if (*(slash + 1) == '/')
1841           {
1842             pos = slash + 2;
1843             seen_slash_slash = true;
1844             goto again;
1845           }
1846
1847       /* At this point, SLASH is the location of the first / after
1848          "//", or the first slash altogether.  START_INSERT is the
1849          pointer to the location where LINK will be inserted.  When
1850          examining the last two examples, keep in mind that LINK
1851          begins with '/'. */
1852
1853       if (!slash && !seen_slash_slash)
1854         /* example: "foo" */
1855         /*           ^    */
1856         start_insert = base;
1857       else if (!slash && seen_slash_slash)
1858         /* example: "http://foo" */
1859         /*                     ^ */
1860         start_insert = end;
1861       else if (slash && !seen_slash_slash)
1862         /* example: "foo/bar" */
1863         /*           ^        */
1864         start_insert = base;
1865       else if (slash && seen_slash_slash)
1866         /* example: "http://something/" */
1867         /*                           ^  */
1868         start_insert = slash;
1869
1870       span = start_insert - base;
1871       merge = xmalloc (span + linklength + 1);
1872       if (span)
1873         memcpy (merge, base, span);
1874       memcpy (merge + span, link, linklength);
1875       merge[span + linklength] = '\0';
1876     }
1877   else
1878     {
1879       /* LINK is a relative URL: we need to replace everything
1880          after last slash (possibly empty) with LINK.
1881
1882          So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1883          our result should be "whatever/foo/qux/xyzzy".  */
1884       bool need_explicit_slash = false;
1885       int span;
1886       const char *start_insert;
1887       const char *last_slash = find_last_char (base, end, '/');
1888       if (!last_slash)
1889         {
1890           /* No slash found at all.  Replace what we have with LINK. */
1891           start_insert = base;
1892         }
1893       else if (last_slash && last_slash >= base + 2
1894                && last_slash[-2] == ':' && last_slash[-1] == '/')
1895         {
1896           /* example: http://host"  */
1897           /*                      ^ */
1898           start_insert = end + 1;
1899           need_explicit_slash = true;
1900         }
1901       else
1902         {
1903           /* example: "whatever/foo/bar" */
1904           /*                        ^    */
1905           start_insert = last_slash + 1;
1906         }
1907
1908       span = start_insert - base;
1909       merge = xmalloc (span + linklength + 1);
1910       if (span)
1911         memcpy (merge, base, span);
1912       if (need_explicit_slash)
1913         merge[span - 1] = '/';
1914       memcpy (merge + span, link, linklength);
1915       merge[span + linklength] = '\0';
1916     }
1917
1918   return merge;
1919 }
1920 \f
/* Copy the NUL-terminated string S to the location P points at and
   advance P past the copied bytes.  No terminating NUL is written.

   S is evaluated exactly once; P must be a modifiable pointer lvalue
   and is evaluated more than once.  The internal names carry a
   trailing underscore so that they cannot capture an argument such as
   a caller's own `len'.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1926
/* Placeholder printed in place of the real password whenever the
   password must not be revealed.  It is deliberately a fixed string:
   unlike the masking used by previous versions, its length says
   nothing about how many characters the real password has.  */
#define HIDDEN_PASSWORD "*password*"
1932
1933 /* Recreate the URL string from the data in URL.
1934
1935    If HIDE is true (as it is when we're calling this on a URL we plan
1936    to print, but not when calling it to canonicalize a URL for use
1937    within the program), password will be hidden.  Unsafe characters in
1938    the URL will be quoted.  */
1939
1940 char *
1941 url_string (const struct url *url, enum url_auth_mode auth_mode)
1942 {
1943   int size;
1944   char *result, *p;
1945   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1946
1947   int scheme_port = supported_schemes[url->scheme].default_port;
1948   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1949   int fplen = full_path_length (url);
1950
1951   bool brackets_around_host;
1952
1953   assert (scheme_str != NULL);
1954
1955   /* Make sure the user name and password are quoted. */
1956   if (url->user)
1957     {
1958       if (auth_mode != URL_AUTH_HIDE)
1959         {
1960           quoted_user = url_escape_allow_passthrough (url->user);
1961           if (url->passwd)
1962             {
1963               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1964                 quoted_passwd = HIDDEN_PASSWORD;
1965               else
1966                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1967             }
1968         }
1969     }
1970
1971   /* In the unlikely event that the host name contains non-printable
1972      characters, quote it for displaying to the user.  */
1973   quoted_host = url_escape_allow_passthrough (url->host);
1974
1975   /* Undo the quoting of colons that URL escaping performs.  IPv6
1976      addresses may legally contain colons, and in that case must be
1977      placed in square brackets.  */
1978   if (quoted_host != url->host)
1979     unescape_single_char (quoted_host, ':');
1980   brackets_around_host = strchr (quoted_host, ':') != NULL;
1981
1982   size = (strlen (scheme_str)
1983           + strlen (quoted_host)
1984           + (brackets_around_host ? 2 : 0)
1985           + fplen
1986           + 1);
1987   if (url->port != scheme_port)
1988     size += 1 + numdigit (url->port);
1989   if (quoted_user)
1990     {
1991       size += 1 + strlen (quoted_user);
1992       if (quoted_passwd)
1993         size += 1 + strlen (quoted_passwd);
1994     }
1995
1996   p = result = xmalloc (size);
1997
1998   APPEND (p, scheme_str);
1999   if (quoted_user)
2000     {
2001       APPEND (p, quoted_user);
2002       if (quoted_passwd)
2003         {
2004           *p++ = ':';
2005           APPEND (p, quoted_passwd);
2006         }
2007       *p++ = '@';
2008     }
2009
2010   if (brackets_around_host)
2011     *p++ = '[';
2012   APPEND (p, quoted_host);
2013   if (brackets_around_host)
2014     *p++ = ']';
2015   if (url->port != scheme_port)
2016     {
2017       *p++ = ':';
2018       p = number_to_string (p, url->port);
2019     }
2020
2021   full_path_write (url, p);
2022   p += fplen;
2023   *p++ = '\0';
2024
2025   assert (p - result == size);
2026
2027   if (quoted_user && quoted_user != url->user)
2028     xfree (quoted_user);
2029   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2030       && quoted_passwd != url->passwd)
2031     xfree (quoted_passwd);
2032   if (quoted_host != url->host)
2033     xfree (quoted_host);
2034
2035   return result;
2036 }
2037 \f
2038 /* Return true if scheme a is similar to scheme b.
2039
2040    Schemes are similar if they are equal.  If SSL is supported, schemes
2041    are also similar if one is http (SCHEME_HTTP) and the other is https
2042    (SCHEME_HTTPS).  */
2043 bool
2044 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2045 {
2046   if (a == b)
2047     return true;
2048 #ifdef HAVE_SSL
2049   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2050       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2051     return true;
2052 #endif
2053   return false;
2054 }
2055 \f
/* Read one logical character from the start of STR and store it in
   *C, decoding a "%XX" escape when possible.

   Returns the number of bytes of STR the character occupies:

     3  STR starts with '%' followed by two hex digits, and the
        decoded value is not a URL_RESERVED_CHAR; *C is the decoded
        character.
     1  in every other case.  *C is STR[0], which is a literal '%'
        when the escape is malformed or decodes to a reserved
        character (the two following bytes are then read as ordinary
        characters by subsequent calls).

   STR must be non-NULL and non-empty, and C must be non-NULL.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The short-circuit keeps us inside the string: str[2] is examined
     only after str[1] proved to be a hex digit, i.e. not the
     terminating NUL.  For the same reason no separate check for a
     NUL at str[2] is needed once both tests have passed.  */
  if (c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);
      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
    }

  /* Malformed escape, or one that hides a reserved character: hand
     back the '%' itself and leave the rest undecoded.  */
  *c = '%';
  return 1;
}
2093
/* Compare the URL strings U1 and U2 for equivalence.

   Both strings are walked one logical character at a time using
   getchar_from_escaped_string, so a "%XX" escape and the character it
   decodes to compare equal unless the helper leaves the escape
   undecoded.  Characters are compared without regard to ASCII case.
   Returns true only if both strings are exhausted together.  U1 and
   U2 must be non-NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert(u1 && u2);

  for (;;)
    {
      char left_ch, right_ch;
      int left_step, right_step;

      if (*left == '\0' || *right == '\0')
        break;

      left_step = getchar_from_escaped_string (left, &left_ch);
      if (left_step == 0)
        break;

      right_step = getchar_from_escaped_string (right, &right_ch);
      if (right_step == 0)
        break;

      if (c_tolower (left_ch) != c_tolower (right_ch))
        break;

      left += left_step;
      right += right_step;
    }

  /* Equal only when neither string has anything left over.  */
  return *left == '\0' && *right == '\0';
}
2116 \f
2117 #ifdef TESTING
/* Debugging and testing support for path_simplify,
   append_uri_pathel and are_urls_equal. */
2119
#if 0
/* Debug helper: return a freshly allocated copy of PATH after running
   path_simplify on it; PATH itself is left untouched.  Meant to be
   called by hand from a debugger, hence compiled out.

   path_simplify takes a scheme as its first argument; SCHEME_HTTP is
   passed here as an arbitrary default -- change it when debugging
   another scheme.  The caller frees the result.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2131
2132 static const char *
2133 run_test (char *test, char *expected_result, enum url_scheme scheme,
2134           bool expected_change)
2135 {
2136   char *test_copy = xstrdup (test);
2137   bool modified = path_simplify (scheme, test_copy);
2138
2139   if (0 != strcmp (test_copy, expected_result))
2140     {
2141       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2142               test, expected_result, test_copy);
2143       mu_assert ("", 0);
2144     }
2145   if (modified != expected_change)
2146     {
2147       if (expected_change)
2148         printf ("Expected modification with path_simplify(\"%s\").\n",
2149                 test);
2150       else
2151         printf ("Expected no modification with path_simplify(\"%s\").\n",
2152                 test);
2153     }
2154   xfree (test_copy);
2155   mu_assert ("", modified == expected_change);
2156   return NULL;
2157 }
2158
2159 const char *
2160 test_path_simplify (void)
2161 {
2162   static struct {
2163     char *test, *result;
2164     enum url_scheme scheme;
2165     bool should_modify;
2166   } tests[] = {
2167     { "",                       "",             SCHEME_HTTP, false },
2168     { ".",                      "",             SCHEME_HTTP, true },
2169     { "./",                     "",             SCHEME_HTTP, true },
2170     { "..",                     "",             SCHEME_HTTP, true },
2171     { "../",                    "",             SCHEME_HTTP, true },
2172     { "..",                     "..",           SCHEME_FTP,  false },
2173     { "../",                    "../",          SCHEME_FTP,  false },
2174     { "foo",                    "foo",          SCHEME_HTTP, false },
2175     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2176     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2177     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2178     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2179     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2180     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2181     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2182     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2183     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2184     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2185     { "foo/..",                 "",             SCHEME_HTTP, true },
2186     { "foo/../..",              "",             SCHEME_HTTP, true },
2187     { "foo/../../..",           "",             SCHEME_HTTP, true },
2188     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2189     { "foo/../..",              "..",           SCHEME_FTP,  true },
2190     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2191     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2192     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2193     { "./a/../b",               "b",            SCHEME_HTTP, true }
2194   };
2195   int i;
2196
2197   for (i = 0; i < countof (tests); i++)
2198     {
2199       const char *message;
2200       char *test = tests[i].test;
2201       char *expected_result = tests[i].result;
2202       enum url_scheme scheme = tests[i].scheme;
2203       bool  expected_change = tests[i].should_modify;
2204       message = run_test (test, expected_result, scheme, expected_change);
2205       if (message) return message;
2206     }
2207   return NULL;
2208 }
2209
2210 const char *
2211 test_append_uri_pathel()
2212 {
2213   int i;
2214   struct {
2215     char *original_url;
2216     char *input;
2217     bool escaped;
2218     char *expected_result;
2219   } test_array[] = {
2220     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2221   };
2222
2223   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2224     {
2225       struct growable dest;
2226       const char *p = test_array[i].input;
2227
2228       memset (&dest, 0, sizeof (dest));
2229
2230       append_string (test_array[i].original_url, &dest);
2231       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2232       append_char ('\0', &dest);
2233
2234       mu_assert ("test_append_uri_pathel: wrong result",
2235                  strcmp (dest.base, test_array[i].expected_result) == 0);
2236     }
2237
2238   return NULL;
2239 }
2240
/* Test are_urls_equal against a table of URL pairs, each with the
   verdict the comparison is expected to give.  The table covers
   identical URLs, differing paths and hosts, an escaped unreserved
   character ("%7e" versus "~"), a length mismatch, and an escaped
   slash ("%2f"), which must not match a literal '/'.

   Returns NULL when every pair gives the expected verdict; a failed
   mu_assert is expected to return its message to the caller
   instead.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  static const struct {
    const char *url1;
    const char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      bool verdict = are_urls_equal (test_array[i].url1, test_array[i].url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 verdict == test_array[i].expected_result);
    }

  return NULL;
}
2266
2267 #endif /* TESTING */
2268
2269 /*
2270  * vim: et ts=2 sw=2
2271  */
2272