sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   3    2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <assert.h>
  39
  40 #include "utils.h"
  41 #include "url.h"
  42 #include "host.h"  /* for is_valid_ipv6_address */
  43
  44 #ifdef __VMS
  45 #include "vms.h"
  46 #endif /* def __VMS */
  47
  48 #ifdef TESTING
  49 #include "test.h"
  50 #endif
  51
  52 enum {
  53   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  54   scm_has_params = 2,           /* whether scheme has ;params */
  55   scm_has_query = 4,            /* whether scheme has ?query */
  56   scm_has_fragment = 8          /* whether scheme has #fragment */
  57 };
  58
  59 struct scheme_data
  60 {
  61   /* Short name of the scheme, such as "http" or "ftp". */
  62   const char *name;
  63   /* Leading string that identifies the scheme, such as "https://". */
  64   const char *leading_string;
  65   /* Default port of the scheme when none is specified. */
  66   int default_port;
  67   /* Various flags. */
  68   int flags;
  69 };
  70
  71 /* Supported schemes: */
  72 static struct scheme_data supported_schemes[] =
  73 {
  74   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  75 #ifdef HAVE_SSL
  76   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  77 #endif
  78   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  79
  80   /* SCHEME_INVALID */
  81   { NULL,       NULL,       -1,                 0 }
  82 };
  83
  84 /* Forward declarations: */
  85
  86 static bool path_simplify (enum url_scheme, char *);
  87 \f
  88 /* Support for escaping and unescaping of URL strings.  */
  89
  90 /* Table of "reserved" and "unsafe" characters.  Those terms are
  91    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  92    specs, but the general idea remains.
  93
  94    A reserved character is the one that you can't decode without
  95    changing the meaning of the URL.  For example, you can't decode
  96    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  97    path components is different.  Non-reserved characters can be
  98    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  99    unsafe characters are loosely based on rfc1738, plus "$" and ",",
 100    as recommended by rfc2396, and minus "~", which is very frequently
 101    used (and sometimes unrecognized as %7E by broken servers).
 102
 103    An unsafe character is the one that should be encoded when URLs are
 104    placed in foreign environments.  E.g. space and newline are unsafe
 105    in HTTP contexts because HTTP uses them as separator and line
 106    terminator, so they must be encoded to %20 and %0A respectively.
 107    "*" is unsafe in shell context, etc.
 108
 109    We determine whether a character is unsafe through static table
 110    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 111
 112 enum {
 113   /* rfc1738 reserved chars + "$" and ",".  */
 114   urlchr_reserved = 1,
 115
 116   /* rfc1738 unsafe chars, plus non-printables.  */
 117   urlchr_unsafe   = 2
 118 };
 119
 120 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 121 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 122 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 123
 124 /* Shorthands for the table: */
 125 #define R  urlchr_reserved
 126 #define U  urlchr_unsafe
 127 #define RU R|U
 128
 129 static const unsigned char urlchr_table[256] =
 130 {
 131   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 132   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 133   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 134   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 135   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 136   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 137   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 138   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 139  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 140   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 141   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 142   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 143   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 144   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 145   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 146   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 147
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 151   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 152
 153   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 154   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 155   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 156   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 157 };
 158 #undef R
 159 #undef U
 160 #undef RU
 161
 162 /* URL-unescape the string S.
 163
 164    This is done by transforming the sequences "%HH" to the character
 165    represented by the hexadecimal digits HH.  If % is not followed by
 166    two hexadecimal digits, it is inserted literally.
 167
 168    The transformation is done in place.  If you need the original
 169    string intact, make a copy before calling this function.  */
 170
 171 static void
 172 url_unescape (char *s)
 173 {
 174   char *t = s;                  /* t - tortoise */
 175   char *h = s;                  /* h - hare     */
 176
 177   for (; *h; h++, t++)
 178     {
 179       if (*h != '%')
 180         {
 181         copychar:
 182           *t = *h;
 183         }
 184       else
 185         {
 186           char c;
 187           /* Do nothing if '%' is not followed by two hex digits. */
 188           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 189             goto copychar;
 190           c = X2DIGITS_TO_NUM (h[1], h[2]);
 191           /* Don't unescape %00 because there is no way to insert it
 192              into a C string without effectively truncating it. */
 193           if (c == '\0')
 194             goto copychar;
 195           *t = c;
 196           h += 2;
 197         }
 198     }
 199   *t = '\0';
 200 }
 201
 202 /* The core of url_escape_* functions.  Escapes the characters that
 203    match the provided mask in urlchr_table.
 204
 205    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 206    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 207    allocated string will be returned in all cases.  */
 208
 209 static char *
 210 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 211 {
 212   const char *p1;
 213   char *p2, *newstr;
 214   int newlen;
 215   int addition = 0;
 216
 217   for (p1 = s; *p1; p1++)
 218     if (urlchr_test (*p1, mask))
 219       addition += 2;            /* Two more characters (hex digits) */
 220
 221   if (!addition)
 222     return allow_passthrough ? (char *)s : xstrdup (s);
 223
 224   newlen = (p1 - s) + addition;
 225   newstr = xmalloc (newlen + 1);
 226
 227   p1 = s;
 228   p2 = newstr;
 229   while (*p1)
 230     {
 231       /* Quote the characters that match the test mask. */
 232       if (urlchr_test (*p1, mask))
 233         {
 234           unsigned char c = *p1++;
 235           *p2++ = '%';
 236           *p2++ = XNUM_TO_DIGIT (c >> 4);
 237           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 238         }
 239       else
 240         *p2++ = *p1++;
 241     }
 242   assert (p2 - newstr == newlen);
 243   *p2 = '\0';
 244
 245   return newstr;
 246 }
 247
 248 /* URL-escape the unsafe characters (see urlchr_table) in a given
 249    string, returning a freshly allocated string.  */
 250
 251 char *
 252 url_escape (const char *s)
 253 {
 254   return url_escape_1 (s, urlchr_unsafe, false);
 255 }
 256
 257 /* URL-escape the unsafe and reserved characters (see urlchr_table) in
 258    a given string, returning a freshly allocated string.  */
 259
 260 char *
 261 url_escape_unsafe_and_reserved (const char *s)
 262 {
 263   return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
 264 }
 265
 266 /* URL-escape the unsafe characters (see urlchr_table) in a given
 267    string.  If no characters are unsafe, S is returned.  */
 268
 269 static char *
 270 url_escape_allow_passthrough (const char *s)
 271 {
 272   return url_escape_1 (s, urlchr_unsafe, true);
 273 }
 274 \f
 275 /* Decide whether the char at position P needs to be encoded.  (It is
 276    not enough to pass a single char *P because the function may need
 277    to inspect the surrounding context.)
 278
 279    Return true if the char should be escaped as %XX, false otherwise.  */
 280
 281 static inline bool
 282 char_needs_escaping (const char *p)
 283 {
 284   if (*p == '%')
 285     {
 286       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 287         return false;
 288       else
 289         /* Garbled %.. sequence: encode `%'. */
 290         return true;
 291     }
 292   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 293     return true;
 294   else
 295     return false;
 296 }
 297
 298 /* Translate a %-escaped (but possibly non-conformant) input string S
 299    into a %-escaped (and conformant) output string.  If no characters
 300    are encoded or decoded, return the same string S; otherwise, return
 301    a freshly allocated string with the new contents.
 302
 303    After a URL has been run through this function, the protocols that
 304    use `%' as the quote character can use the resulting string as-is,
 305    while those that don't can use url_unescape to get to the intended
 306    data.  This function is stable: once the input is transformed,
 307    further transformations of the result yield the same output.
 308
 309    Let's discuss why this function is needed.
 310
 311    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 312    a raw space character would mess up the HTTP request, it needs to
 313    be quoted, like this:
 314
 315        GET /abc%20def HTTP/1.0
 316
 317    It would appear that the unsafe chars need to be quoted, for
 318    example with url_escape.  But what if we're requested to download
 319    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 320    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 321    part of URL syntax, "%20" is the correct way to denote a literal
 322    space on the Wget command line.  This leads to the conclusion that
 323    in that case Wget should not call url_escape, but leave the `%20'
 324    as is.  This is clearly contradictory, but it only gets worse.
 325
 326    What if the requested URI is `abc%20 def'?  If we call url_escape,
 327    we end up with `/abc%2520%20def', which is almost certainly not
 328    intended.  If we don't call url_escape, we are left with the
 329    embedded space and cannot complete the request.  What the user
 330    meant was for Wget to request `/abc%20%20def', and this is where
 331    reencode_escapes kicks in.
 332
 333    Wget used to solve this by first decoding %-quotes, and then
 334    encoding all the "unsafe" characters found in the resulting string.
 335    This was wrong because it didn't preserve certain URL special
 336    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 337    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 338    whether we considered `+' reserved (it is).  One of these results
 339    is inevitable because by the second step we would lose information
 340    on whether the `+' was originally encoded or not.  Both results
 341    were wrong because in CGI parameters + means space, while %2B means
 342    literal plus.  reencode_escapes correctly translates the above to
 343    "a%2B+b", i.e. returns the original string.
 344
 345    This function uses a modified version of the algorithm originally
 346    proposed by Anon Sricharoenchai:
 347
 348    * Encode all "unsafe" characters, except those that are also
 349      "reserved", to %XX.  See urlchr_table for which characters are
 350      unsafe and reserved.
 351
 352    * Encode the "%" characters not followed by two hex digits to
 353      "%25".
 354
 355    * Pass through all other characters and %XX escapes as-is.  (Up to
 356      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 357      characters, but that was obtrusive and broke some servers.)
 358
 359    Anon's test case:
 360
 361    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 362    ->
 363    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 364
 365    Simpler test cases:
 366
 367    "foo bar"         -> "foo%20bar"
 368    "foo%20bar"       -> "foo%20bar"
 369    "foo %20bar"      -> "foo%20%20bar"
 370    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 371    "foo%25%20bar"    -> "foo%25%20bar"
 372    "foo%2%20bar"     -> "foo%252%20bar"
 373    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 374    "foo%2b+bar"      -> "foo%2b+bar"  */
 375
 376 static char *
 377 reencode_escapes (const char *s)
 378 {
 379   const char *p1;
 380   char *newstr, *p2;
 381   int oldlen, newlen;
 382
 383   int encode_count = 0;
 384
 385   /* First pass: inspect the string to see if there's anything to do,
 386      and to calculate the new length.  */
 387   for (p1 = s; *p1; p1++)
 388     if (char_needs_escaping (p1))
 389       ++encode_count;
 390
 391   if (!encode_count)
 392     /* The string is good as it is. */
 393     return (char *) s;          /* C const model sucks. */
 394
 395   oldlen = p1 - s;
 396   /* Each encoding adds two characters (hex digits).  */
 397   newlen = oldlen + 2 * encode_count;
 398   newstr = xmalloc (newlen + 1);
 399
 400   /* Second pass: copy the string to the destination address, encoding
 401      chars when needed.  */
 402   p1 = s;
 403   p2 = newstr;
 404
 405   while (*p1)
 406     if (char_needs_escaping (p1))
 407       {
 408         unsigned char c = *p1++;
 409         *p2++ = '%';
 410         *p2++ = XNUM_TO_DIGIT (c >> 4);
 411         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 412       }
 413     else
 414       *p2++ = *p1++;
 415
 416   *p2 = '\0';
 417   assert (p2 - newstr == newlen);
 418   return newstr;
 419 }
 420 \f
 421 /* Returns the scheme type if the scheme is supported, or
 422    SCHEME_INVALID if not.  */
 423
 424 enum url_scheme
 425 url_scheme (const char *url)
 426 {
 427   int i;
 428
 429   for (i = 0; supported_schemes[i].leading_string; i++)
 430     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 431                           strlen (supported_schemes[i].leading_string)))
 432       {
 433         if (!(supported_schemes[i].flags & scm_disabled))
 434           return (enum url_scheme) i;
 435         else
 436           return SCHEME_INVALID;
 437       }
 438
 439   return SCHEME_INVALID;
 440 }
 441
 442 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 443
 444 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 445    currently implemented, it returns true if URL begins with
 446    [-+a-zA-Z0-9]+: .  */
 447
 448 bool
 449 url_has_scheme (const char *url)
 450 {
 451   const char *p = url;
 452
 453   /* The first char must be a scheme char. */
 454   if (!*p || !SCHEME_CHAR (*p))
 455     return false;
 456   ++p;
 457   /* Followed by 0 or more scheme chars. */
 458   while (*p && SCHEME_CHAR (*p))
 459     ++p;
 460   /* Terminated by ':'. */
 461   return *p == ':';
 462 }
 463
 464 bool
 465 url_valid_scheme (const char *url)
 466 {
 467   enum url_scheme scheme = url_scheme (url);
 468   return scheme != SCHEME_INVALID;
 469 }
 470
 471 int
 472 scheme_default_port (enum url_scheme scheme)
 473 {
 474   return supported_schemes[scheme].default_port;
 475 }
 476
 477 void
 478 scheme_disable (enum url_scheme scheme)
 479 {
 480   supported_schemes[scheme].flags |= scm_disabled;
 481 }
 482
 483 /* Skip the username and password, if present in the URL.  The
 484    function should *not* be called with the complete URL, but with the
 485    portion after the scheme.
 486
 487    If no username and password are found, return URL.  */
 488
 489 static const char *
 490 url_skip_credentials (const char *url)
 491 {
 492   /* Look for '@' that comes before terminators, such as '/', '?',
 493      '#', or ';'.  */
 494   const char *p = (const char *)strpbrk (url, "@/?#;");
 495   if (!p || *p != '@')
 496     return url;
 497   return p + 1;
 498 }
 499
 500 /* Parse credentials contained in [BEG, END).  The region is expected
 501    to have come from a URL and is unescaped.  */
 502
 503 static bool
 504 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 505 {
 506   char *colon;
 507   const char *userend;
 508
 509   if (beg == end)
 510     return false;               /* empty user name */
 511
 512   colon = memchr (beg, ':', end - beg);
 513   if (colon == beg)
 514     return false;               /* again empty user name */
 515
 516   if (colon)
 517     {
 518       *passwd = strdupdelim (colon + 1, end);
 519       userend = colon;
 520       url_unescape (*passwd);
 521     }
 522   else
 523     {
 524       *passwd = NULL;
 525       userend = end;
 526     }
 527   *user = strdupdelim (beg, userend);
 528   url_unescape (*user);
 529   return true;
 530 }
 531
 532 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 533    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 534    like this:
 535
 536    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 537    www.foo.com[:port]            -> http://www.foo.com[:port]
 538
 539    FTP shorthands look like this:
 540
 541    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 542    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 543
 544    If the URL needs not or cannot be rewritten, return NULL.  */
 545
 546 char *
 547 rewrite_shorthand_url (const char *url)
 548 {
 549   const char *p;
 550   char *ret;
 551
 552   if (url_scheme (url) != SCHEME_INVALID)
 553     return NULL;
 554
 555   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 556      latter Netscape.  */
 557   p = strpbrk (url, ":/");
 558   if (p == url)
 559     return NULL;
 560
 561   /* If we're looking at "://", it means the URL uses a scheme we
 562      don't support, which may include "https" when compiled without
 563      SSL support.  Don't bogusly rewrite such URLs.  */
 564   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 565     return NULL;
 566
 567   if (p && *p == ':')
 568     {
 569       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 570          special case of http port number ("localhost:10000").  */
 571       int digits = strspn (p + 1, "0123456789");
 572       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 573         goto http;
 574
 575       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 576       ret = aprintf ("ftp://%s", url);
 577       ret[6 + (p - url)] = '/';
 578     }
 579   else
 580     {
 581     http:
 582       /* Just prepend "http://" to URL. */
 583       ret = aprintf ("http://%s", url);
 584     }
 585   return ret;
 586 }
 587 \f
 588 static void split_path (const char *, char **, char **);
 589
 590 /* Like strpbrk, with the exception that it returns the pointer to the
 591    terminating zero (end-of-string aka "eos") if no matching character
 592    is found.  */
 593
 594 static inline char *
 595 strpbrk_or_eos (const char *s, const char *accept)
 596 {
 597   char *p = strpbrk (s, accept);
 598   if (!p)
 599     p = strchr (s, '\0');
 600   return p;
 601 }
 602
 603 /* Turn STR into lowercase; return true if a character was actually
 604    changed. */
 605
 606 static bool
 607 lowercase_str (char *str)
 608 {
 609   bool changed = false;
 610   for (; *str; str++)
 611     if (c_isupper (*str))
 612       {
 613         changed = true;
 614         *str = c_tolower (*str);
 615       }
 616   return changed;
 617 }
 618
 619 static const char *
 620 init_seps (enum url_scheme scheme)
 621 {
 622   static char seps[8] = ":/";
 623   char *p = seps + 2;
 624   int flags = supported_schemes[scheme].flags;
 625
 626   if (flags & scm_has_params)
 627     *p++ = ';';
 628   if (flags & scm_has_query)
 629     *p++ = '?';
 630   if (flags & scm_has_fragment)
 631     *p++ = '#';
 632   *p = '\0';
 633   return seps;
 634 }
 635
 636 static const char *parse_errors[] = {
 637 #define PE_NO_ERROR                     0
 638   N_("No error"),
 639 #define PE_UNSUPPORTED_SCHEME           1
 640   N_("Unsupported scheme %s"), /* support for format token only here */
 641 #define PE_MISSING_SCHEME               2
 642   N_("Scheme missing"),
 643 #define PE_INVALID_HOST_NAME            3
 644   N_("Invalid host name"),
 645 #define PE_BAD_PORT_NUMBER              4
 646   N_("Bad port number"),
 647 #define PE_INVALID_USER_NAME            5
 648   N_("Invalid user name"),
 649 #define PE_UNTERMINATED_IPV6_ADDRESS    6
 650   N_("Unterminated IPv6 numeric address"),
 651 #define PE_IPV6_NOT_SUPPORTED           7
 652   N_("IPv6 addresses not supported"),
 653 #define PE_INVALID_IPV6_ADDRESS         8
 654   N_("Invalid IPv6 numeric address")
 655 };
 656
 657 /* Parse a URL.
 658
 659    Return a new struct url if successful, NULL on error.  In case of
 660    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 661    error code. */
 662 struct url *
 663 url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
 664 {
 665   struct url *u;
 666   const char *p;
 667   bool path_modified, host_modified;
 668
 669   enum url_scheme scheme;
 670   const char *seps;
 671
 672   const char *uname_b,     *uname_e;
 673   const char *host_b,      *host_e;
 674   const char *path_b,      *path_e;
 675   const char *params_b,    *params_e;
 676   const char *query_b,     *query_e;
 677   const char *fragment_b,  *fragment_e;
 678
 679   int port;
 680   char *user = NULL, *passwd = NULL;
 681
 682   const char *url_encoded = NULL;
 683   char *new_url = NULL;
 684
 685   int error_code;
 686
 687   scheme = url_scheme (url);
 688   if (scheme == SCHEME_INVALID)
 689     {
 690       if (url_has_scheme (url))
 691         error_code = PE_UNSUPPORTED_SCHEME;
 692       else
 693         error_code = PE_MISSING_SCHEME;
 694       goto error;
 695     }
 696
 697   if (iri && iri->utf8_encode)
 698     {
 699       iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
 700       if (!iri->utf8_encode)
 701         new_url = NULL;
 702       else
 703         iri->orig_url = xstrdup (url);
 704     }
 705
 706   /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
 707   if (percent_encode)
 708     url_encoded = reencode_escapes (new_url ? new_url : url);
 709   else
 710     url_encoded = new_url ? new_url : url;
 711
 712   p = url_encoded;
 713
 714   if (new_url && url_encoded != new_url)
 715     xfree (new_url);
 716
 717   p += strlen (supported_schemes[scheme].leading_string);
 718   uname_b = p;
 719   p = url_skip_credentials (p);
 720   uname_e = p;
 721
 722   /* scheme://user:pass@host[:port]... */
 723   /*                    ^              */
 724
 725   /* We attempt to break down the URL into the components path,
 726      params, query, and fragment.  They are ordered like this:
 727
 728        scheme://host[:port][/path][;params][?query][#fragment]  */
 729
 730   path_b     = path_e     = NULL;
 731   params_b   = params_e   = NULL;
 732   query_b    = query_e    = NULL;
 733   fragment_b = fragment_e = NULL;
 734
 735   /* Initialize separators for optional parts of URL, depending on the
 736      scheme.  For example, FTP has params, and HTTP and HTTPS have
 737      query string and fragment. */
 738   seps = init_seps (scheme);
 739
 740   host_b = p;
 741
 742   if (*p == '[')
 743     {
 744       /* Handle IPv6 address inside square brackets.  Ideally we'd
 745          just look for the terminating ']', but rfc2732 mandates
 746          rejecting invalid IPv6 addresses.  */
 747
 748       /* The address begins after '['. */
 749       host_b = p + 1;
 750       host_e = strchr (host_b, ']');
 751
 752       if (!host_e)
 753         {
 754           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 755           goto error;
 756         }
 757
 758 #ifdef ENABLE_IPV6
 759       /* Check if the IPv6 address is valid. */
 760       if (!is_valid_ipv6_address(host_b, host_e))
 761         {
 762           error_code = PE_INVALID_IPV6_ADDRESS;
 763           goto error;
 764         }
 765
 766       /* Continue parsing after the closing ']'. */
 767       p = host_e + 1;
 768 #else
 769       error_code = PE_IPV6_NOT_SUPPORTED;
 770       goto error;
 771 #endif
 772
 773       /* The closing bracket must be followed by a separator or by the
 774          null char.  */
 775       /* http://[::1]... */
 776       /*             ^   */
 777       if (!strchr (seps, *p))
 778         {
 779           /* Trailing garbage after []-delimited IPv6 address. */
 780           error_code = PE_INVALID_HOST_NAME;
 781           goto error;
 782         }
 783     }
 784   else
 785     {
 786       p = strpbrk_or_eos (p, seps);
 787       host_e = p;
 788     }
 789   ++seps;                       /* advance to '/' */
 790
 791   if (host_b == host_e)
 792     {
 793       error_code = PE_INVALID_HOST_NAME;
 794       goto error;
 795     }
 796
 797   port = scheme_default_port (scheme);
 798   if (*p == ':')
 799     {
 800       const char *port_b, *port_e, *pp;
 801
 802       /* scheme://host:port/tralala */
 803       /*              ^             */
 804       ++p;
 805       port_b = p;
 806       p = strpbrk_or_eos (p, seps);
 807       port_e = p;
 808
 809       /* Allow empty port, as per rfc2396. */
 810       if (port_b != port_e)
 811         for (port = 0, pp = port_b; pp < port_e; pp++)
 812           {
 813             if (!c_isdigit (*pp))
 814               {
 815                 /* http://host:12randomgarbage/blah */
 816                 /*               ^                  */
 817                 error_code = PE_BAD_PORT_NUMBER;
 818                 goto error;
 819               }
 820             port = 10 * port + (*pp - '0');
 821             /* Check for too large port numbers here, before we have
 822                a chance to overflow on bogus port values.  */
 823             if (port > 0xffff)
 824               {
 825                 error_code = PE_BAD_PORT_NUMBER;
 826                 goto error;
 827               }
 828           }
 829     }
 830   /* Advance to the first separator *after* '/' (either ';' or '?',
 831      depending on the scheme).  */
 832   ++seps;
 833
 834   /* Get the optional parts of URL, each part being delimited by
 835      current location and the position of the next separator.  */
 836 #define GET_URL_PART(sepchar, var) do {                         \
 837   if (*p == sepchar)                                            \
 838     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 839   ++seps;                                                       \
 840 } while (0)
 841
 842   GET_URL_PART ('/', path);
 843   if (supported_schemes[scheme].flags & scm_has_params)
 844     GET_URL_PART (';', params);
 845   if (supported_schemes[scheme].flags & scm_has_query)
 846     GET_URL_PART ('?', query);
 847   if (supported_schemes[scheme].flags & scm_has_fragment)
 848     GET_URL_PART ('#', fragment);
 849
 850 #undef GET_URL_PART
 851   assert (*p == 0);
 852
 853   if (uname_b != uname_e)
 854     {
 855       /* http://user:pass@host */
 856       /*        ^         ^    */
 857       /*     uname_b   uname_e */
 858       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 859         {
 860           error_code = PE_INVALID_USER_NAME;
 861           goto error;
 862         }
 863     }
 864
 865   u = xnew0 (struct url);
 866   u->scheme = scheme;
 867   u->host   = strdupdelim (host_b, host_e);
 868   u->port   = port;
 869   u->user   = user;
 870   u->passwd = passwd;
 871
 872   u->path = strdupdelim (path_b, path_e);
 873   path_modified = path_simplify (scheme, u->path);
 874   split_path (u->path, &u->dir, &u->file);
 875
 876   host_modified = lowercase_str (u->host);
 877
 878   /* Decode %HH sequences in host name.  This is important not so much
 879      to support %HH sequences in host names (which other browser
 880      don't), but to support binary characters (which will have been
 881      converted to %HH by reencode_escapes).  */
 882   if (strchr (u->host, '%'))
 883     {
 884       url_unescape (u->host);
 885       host_modified = true;
 886
 887       /* Apply IDNA regardless of iri->utf8_encode status */
 888       if (opt.enable_iri && iri)
 889         {
 890           char *new = idn_encode (iri, u->host);
 891           if (new)
 892             {
 893               xfree (u->host);
 894               u->host = new;
 895               host_modified = true;
 896             }
 897         }
 898     }
 899
 900   if (params_b)
 901     u->params = strdupdelim (params_b, params_e);
 902   if (query_b)
 903     u->query = strdupdelim (query_b, query_e);
 904   if (fragment_b)
 905     u->fragment = strdupdelim (fragment_b, fragment_e);
 906
 907   if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
 908     {
 909       /* If we suspect that a transformation has rendered what
 910          url_string might return different from URL_ENCODED, rebuild
 911          u->url using url_string.  */
 912       u->url = url_string (u, URL_AUTH_SHOW);
 913
 914       if (url_encoded != url)
 915         xfree ((char *) url_encoded);
 916     }
 917   else
 918     {
 919       if (url_encoded == url)
 920         u->url = xstrdup (url);
 921       else
 922         u->url = (char *) url_encoded;
 923     }
 924
 925   return u;
 926
 927  error:
 928   /* Cleanup in case of error: */
 929   if (url_encoded && url_encoded != url)
 930     xfree ((char *) url_encoded);
 931
 932   /* Transmit the error code to the caller, if the caller wants to
 933      know.  */
 934   if (error)
 935     *error = error_code;
 936   return NULL;
 937 }
 938
 939 /* Return the error message string from ERROR_CODE, which should have
 940    been retrieved from url_parse.  The error message is translated.  */
 941
 942 char *
 943 url_error (const char *url, int error_code)
 944 {
 945   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 946
 947   if (error_code == PE_UNSUPPORTED_SCHEME)
 948     {
 949       char *error, *p;
 950       char *scheme = xstrdup (url);
 951       assert (url_has_scheme (url));
 952
 953       if ((p = strchr (scheme, ':')))
 954         *p = '\0';
 955       if (!strcasecmp (scheme, "https"))
 956         error = aprintf (_("HTTPS support not compiled in"));
 957       else
 958         error = aprintf (_(parse_errors[error_code]), quote (scheme));
 959       xfree (scheme);
 960
 961       return error;
 962     }
 963   else
 964     return xstrdup (_(parse_errors[error_code]));
 965 }
 966
 967 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 968    expected to be URL-escaped.
 969
 970    The path is split into directory (the part up to the last slash)
 971    and file (the part after the last slash), which are subsequently
 972    unescaped.  Examples:
 973
 974    PATH                 DIR           FILE
 975    "foo/bar/baz"        "foo/bar"     "baz"
 976    "foo/bar/"           "foo/bar"     ""
 977    "foo"                ""            "foo"
 978    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 979
 980    DIR and FILE are freshly allocated.  */
 981
 982 static void
 983 split_path (const char *path, char **dir, char **file)
 984 {
 985   char *last_slash = strrchr (path, '/');
 986   if (!last_slash)
 987     {
 988       *dir = xstrdup ("");
 989       *file = xstrdup (path);
 990     }
 991   else
 992     {
 993       *dir = strdupdelim (path, last_slash);
 994       *file = xstrdup (last_slash + 1);
 995     }
 996   url_unescape (*dir);
 997   url_unescape (*file);
 998 }
 999
1000 /* Note: URL's "full path" is the path with the query string and
1001    params appended.  The "fragment" (#foo) is intentionally ignored,
1002    but that might be changed.  For example, if the original URL was
1003    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1004    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1005
1006 /* Return the length of the full path, without the terminating
1007    zero.  */
1008
1009 static int
1010 full_path_length (const struct url *url)
1011 {
1012   int len = 0;
1013
1014 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1015
1016   FROB (path);
1017   FROB (params);
1018   FROB (query);
1019
1020 #undef FROB
1021
1022   return len;
1023 }
1024
1025 /* Write out the full path. */
1026
1027 static void
1028 full_path_write (const struct url *url, char *where)
1029 {
1030 #define FROB(el, chr) do {                      \
1031   char *f_el = url->el;                         \
1032   if (f_el) {                                   \
1033     int l = strlen (f_el);                      \
1034     *where++ = chr;                             \
1035     memcpy (where, f_el, l);                    \
1036     where += l;                                 \
1037   }                                             \
1038 } while (0)
1039
1040   FROB (path, '/');
1041   FROB (params, ';');
1042   FROB (query, '?');
1043
1044 #undef FROB
1045 }
1046
/* Return URL's "full path" as a newly allocated, zero-terminated
   string.  For instance, with u->path "foo/bar" and u->query
   "param=value" the result is "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *result = xmalloc (len + 1);

  /* full_path_write does not terminate its output, so do it here.  */
  result[len] = '\0';
  full_path_write (url, result);

  return result;
}
1062
/* Unescape CHR in an otherwise escaped STR.  Used to selectively undo
   the escaping of certain characters, such as "/" and ":".

   STR is rewritten in place: every "%XY" sequence whose two digits
   equal the hex digits XNUM_TO_DIGIT yields for CHR is replaced by
   CHR itself, and the rest of the string is shifted down accordingly.
   Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Split CHR into nibbles via unsigned char: with a signed plain
     char, a CHR above 0x7f would otherwise hand a negative value to
     XNUM_TO_DIGIT.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: reads the escaped input */
  char *t = str;                /* tortoise: writes the output */

  for (; *h; h++, t++)
    {
      /* The && chain stops at the terminating zero, so h[1] and h[2]
         are never read past the end of STR.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* skip the two hex digits */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1086
1087 /* Escape unsafe and reserved characters, except for the slash
1088    characters.  */
1089
1090 static char *
1091 url_escape_dir (const char *dir)
1092 {
1093   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1094   if (newdir == dir)
1095     return (char *)dir;
1096
1097   unescape_single_char (newdir, '/');
1098   return newdir;
1099 }
1100
1101 /* Sync u->path and u->url with u->dir and u->file.  Called after
1102    u->file or u->dir have been changed, typically by the FTP code.  */
1103
1104 static void
1105 sync_path (struct url *u)
1106 {
1107   char *newpath, *efile, *edir;
1108
1109   xfree (u->path);
1110
1111   /* u->dir and u->file are not escaped.  URL-escape them before
1112      reassembling them into u->path.  That way, if they contain
1113      separators like '?' or even if u->file contains slashes, the
1114      path will be correctly assembled.  (u->file can contain slashes
1115      if the URL specifies it with %2f, or if an FTP server returns
1116      it.)  */
1117   edir = url_escape_dir (u->dir);
1118   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1119
1120   if (!*edir)
1121     newpath = xstrdup (efile);
1122   else
1123     {
1124       int dirlen = strlen (edir);
1125       int filelen = strlen (efile);
1126
1127       /* Copy "DIR/FILE" to newpath. */
1128       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1129       memcpy (p, edir, dirlen);
1130       p += dirlen;
1131       *p++ = '/';
1132       memcpy (p, efile, filelen);
1133       p += filelen;
1134       *p = '\0';
1135     }
1136
1137   u->path = newpath;
1138
1139   if (edir != u->dir)
1140     xfree (edir);
1141   if (efile != u->file)
1142     xfree (efile);
1143
1144   /* Regenerate u->url as well.  */
1145   xfree (u->url);
1146   u->url = url_string (u, URL_AUTH_SHOW);
1147 }
1148
1149 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1150    This way we can sync u->path and u->url when they get changed.  */
1151
1152 void
1153 url_set_dir (struct url *url, const char *newdir)
1154 {
1155   xfree (url->dir);
1156   url->dir = xstrdup (newdir);
1157   sync_path (url);
1158 }
1159
1160 void
1161 url_set_file (struct url *url, const char *newfile)
1162 {
1163   xfree (url->file);
1164   url->file = xstrdup (newfile);
1165   sync_path (url);
1166 }
1167
1168 void
1169 url_free (struct url *url)
1170 {
1171   xfree (url->host);
1172   xfree (url->path);
1173   xfree (url->url);
1174
1175   xfree_null (url->params);
1176   xfree_null (url->query);
1177   xfree_null (url->fragment);
1178   xfree_null (url->user);
1179   xfree_null (url->passwd);
1180
1181   xfree (url->dir);
1182   xfree (url->file);
1183
1184   xfree (url);
1185 }
1186 \f
1187 /* Create all the necessary directories for PATH (a file).  Calls
1188    make_directory internally.  */
1189 int
1190 mkalldirs (const char *path)
1191 {
1192   const char *p;
1193   char *t;
1194   struct_stat st;
1195   int res;
1196
1197   p = path + strlen (path);
1198   for (; *p != '/' && p != path; p--)
1199     ;
1200
1201   /* Don't create if it's just a file.  */
1202   if ((p == path) && (*p != '/'))
1203     return 0;
1204   t = strdupdelim (path, p);
1205
1206   /* Check whether the directory exists.  */
1207   if ((stat (t, &st) == 0))
1208     {
1209       if (S_ISDIR (st.st_mode))
1210         {
1211           xfree (t);
1212           return 0;
1213         }
1214       else
1215         {
1216           /* If the dir exists as a file name, remove it first.  This
1217              is *only* for Wget to work with buggy old CERN http
1218              servers.  Here is the scenario: When Wget tries to
1219              retrieve a directory without a slash, e.g.
1220              http://foo/bar (bar being a directory), CERN server will
1221              not redirect it too http://foo/bar/ -- it will generate a
1222              directory listing containing links to bar/file1,
1223              bar/file2, etc.  Wget will lose because it saves this
1224              HTML listing to a file `bar', so it cannot create the
1225              directory.  To work around this, if the file of the same
1226              name exists, we just remove it and create the directory
1227              anyway.  */
1228           DEBUGP (("Removing %s because of directory danger!\n", t));
1229           unlink (t);
1230         }
1231     }
1232   res = make_directory (t);
1233   if (res != 0)
1234     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1235   xfree (t);
1236   return res;
1237 }
1238 \f
1239 /* Functions for constructing the file name out of URL components.  */
1240
1241 /* A growable string structure, used by url_file_name and friends.
1242    This should perhaps be moved to utils.c.
1243
1244    The idea is to have a convenient and efficient way to construct a
1245    string by having various functions append data to it.  Instead of
1246    passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1247    functions in questions, we pass the pointer to this struct.  */
1248
struct growable {
  char *base;                   /* start of the buffer; url_file_name
                                   starts out with NULL */
  int size;                     /* size bookkeeping passed to
                                   DO_REALLOC by GROW */
  int tail;                     /* offset from BASE at which the next
                                   character is appended */
};
1254
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   G is evaluated once; APPEND_SIZE may be an arbitrary expression.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string.  R is evaluated twice. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1269
/* Copy the characters of STR, without its terminating zero, to the
   tail of DEST and advance the tail past them.  DEST is therefore
   NOT zero-terminated afterwards.  */

static void
append_string (const char *str, struct growable *dest)
{
  int nchars = strlen (str);

  /* Make room first, then copy and account for the new characters.  */
  GROW (dest, nchars);
  memcpy (TAIL (dest), str, nchars);
  TAIL_INCR (dest, nchars);
}
1281
1282 /* Append CH to DEST.  For example, append_char (0, DEST)
1283    zero-terminates DEST.  */
1284
1285 static void
1286 append_char (char ch, struct growable *dest)
1287 {
1288   GROW (dest, 1);
1289   *TAIL (dest) = ch;
1290   TAIL_INCR (dest, 1);
1291 }
1292
/* Bit flags stored in filechr_table; they may be OR-ed together.  */
enum {
  filechr_not_unix    = 1 << 0, /* unusable on Unix, / and \0 */
  filechr_not_windows = 1 << 1, /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 1 << 2  /* a control character, e.g. 0-31 */
};
1298
/* Nonzero if character C must be quoted in a file name: either
   opt.restrict_files_nonascii is set and C is not ASCII, or C's
   filechr_table entry has at least one of the bits in MASK.  C is
   converted to unsigned char before use and may be evaluated twice,
   so it must not have side effects.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))
1302
/* Shorthands for the table; they are #undef'ed again right after
   it: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above),
   indexed by the character's value as an unsigned char.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1352
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when file names are restricted for Windows, which cannot handle
   '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1363
1364 /* Quote path element, characters in [b, e), as file name, and append
1365    the quoted string to DEST.  Each character is quoted as per
1366    file_unsafe_char and the corresponding table.
1367
1368    If ESCAPED is true, the path element is considered to be
1369    URL-escaped and will be unescaped prior to inspection.  */
1370
1371 static void
1372 append_uri_pathel (const char *b, const char *e, bool escaped,
1373                    struct growable *dest)
1374 {
1375   const char *p;
1376   int quoted, outlen;
1377
1378   int mask;
1379   if (opt.restrict_files_os == restrict_unix)
1380     mask = filechr_not_unix;
1381   else
1382     mask = filechr_not_windows;
1383   if (opt.restrict_files_ctrl)
1384     mask |= filechr_control;
1385
1386   /* Copy [b, e) to PATHEL and URL-unescape it. */
1387   if (escaped)
1388     {
1389       char *unescaped;
1390       BOUNDED_TO_ALLOCA (b, e, unescaped);
1391       url_unescape (unescaped);
1392       b = unescaped;
1393       e = unescaped + strlen (unescaped);
1394     }
1395
1396   /* Defang ".." when found as component of path.  Remember that path
1397      comes from the URL and might contain malicious input.  */
1398   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1399     {
1400       b = "%2E%2E";
1401       e = b + 6;
1402     }
1403
1404   /* Walk the PATHEL string and check how many characters we'll need
1405      to quote.  */
1406   quoted = 0;
1407   for (p = b; p < e; p++)
1408     if (FILE_CHAR_TEST (*p, mask))
1409       ++quoted;
1410
1411   /* Calculate the length of the output string.  e-b is the input
1412      string length.  Each quoted char introduces two additional
1413      characters in the string, hence 2*quoted.  */
1414   outlen = (e - b) + (2 * quoted);
1415   GROW (dest, outlen);
1416
1417   if (!quoted)
1418     {
1419       /* If there's nothing to quote, we can simply append the string
1420          without processing it again.  */
1421       memcpy (TAIL (dest), b, outlen);
1422     }
1423   else
1424     {
1425       char *q = TAIL (dest);
1426       for (p = b; p < e; p++)
1427         {
1428           if (!FILE_CHAR_TEST (*p, mask))
1429             *q++ = *p;
1430           else
1431             {
1432               unsigned char ch = *p;
1433               *q++ = '%';
1434               *q++ = XNUM_TO_DIGIT (ch >> 4);
1435               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1436             }
1437         }
1438       assert (q - TAIL (dest) == outlen);
1439     }
1440
1441   /* Perform inline case transformation if required.  */
1442   if (opt.restrict_files_case == restrict_lowercase
1443       || opt.restrict_files_case == restrict_uppercase)
1444     {
1445       char *q;
1446       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1447         {
1448           if (opt.restrict_files_case == restrict_lowercase)
1449             *q = c_tolower (*q);
1450           else
1451             *q = c_toupper (*q);
1452         }
1453     }
1454
1455   TAIL_INCR (dest, outlen);
1456 }
1457
1458 /* Append to DEST the directory structure that corresponds the
1459    directory part of URL's path.  For example, if the URL is
1460    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1461
1462    Each path element ("dir1" and "dir2" in the above example) is
1463    examined, url-unescaped, and re-escaped as file name element.
1464
1465    Additionally, it cuts as many directories from the path as
1466    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1467    will produce "bar" for the above example.  For 2 or more, it will
1468    produce "".
1469
1470    Each component of the path is quoted for use as file name.  */
1471
1472 static void
1473 append_dir_structure (const struct url *u, struct growable *dest)
1474 {
1475   char *pathel, *next;
1476   int cut = opt.cut_dirs;
1477
1478   /* Go through the path components, de-URL-quote them, and quote them
1479      (if necessary) as file names.  */
1480
1481   pathel = u->path;
1482   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1483     {
1484       if (cut-- > 0)
1485         continue;
1486       if (pathel == next)
1487         /* Ignore empty pathels.  */
1488         continue;
1489
1490       if (dest->tail)
1491         append_char ('/', dest);
1492       append_uri_pathel (pathel, next, true, dest);
1493     }
1494 }
1495
1496 /* Return a unique file name that matches the given URL as good as
1497    possible.  Does not create directories on the file system.  */
1498
1499 char *
1500 url_file_name (const struct url *u, char *replaced_filename)
1501 {
1502   struct growable fnres;        /* stands for "file name result" */
1503
1504   const char *u_file, *u_query;
1505   char *fname, *unique;
1506   char *index_filename = "index.html"; /* The default index file is index.html */
1507
1508   fnres.base = NULL;
1509   fnres.size = 0;
1510   fnres.tail = 0;
1511
1512   /* If an alternative index file was defined, change index_filename */
1513   if (opt.default_page)
1514     index_filename = opt.default_page;
1515
1516
1517   /* Start with the directory prefix, if specified. */
1518   if (opt.dir_prefix)
1519     append_string (opt.dir_prefix, &fnres);
1520
1521   /* If "dirstruct" is turned on (typically the case with -r), add
1522      the host and port (unless those have been turned off) and
1523      directory structure.  */
1524   if (opt.dirstruct)
1525     {
1526       if (opt.protocol_directories)
1527         {
1528           if (fnres.tail)
1529             append_char ('/', &fnres);
1530           append_string (supported_schemes[u->scheme].name, &fnres);
1531         }
1532       if (opt.add_hostdir)
1533         {
1534           if (fnres.tail)
1535             append_char ('/', &fnres);
1536           if (0 != strcmp (u->host, ".."))
1537             append_string (u->host, &fnres);
1538           else
1539             /* Host name can come from the network; malicious DNS may
1540                allow ".." to be resolved, causing us to write to
1541                "../<file>".  Defang such host names.  */
1542             append_string ("%2E%2E", &fnres);
1543           if (u->port != scheme_default_port (u->scheme))
1544             {
1545               char portstr[24];
1546               number_to_string (portstr, u->port);
1547               append_char (FN_PORT_SEP, &fnres);
1548               append_string (portstr, &fnres);
1549             }
1550         }
1551
1552       append_dir_structure (u, &fnres);
1553     }
1554
1555   if (!replaced_filename)
1556     {
1557       /* Add the file name. */
1558       if (fnres.tail)
1559         append_char ('/', &fnres);
1560       u_file = *u->file ? u->file : index_filename;
1561       append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1562
1563       /* Append "?query" to the file name. */
1564       u_query = u->query && *u->query ? u->query : NULL;
1565       if (u_query)
1566         {
1567           append_char (FN_QUERY_SEP, &fnres);
1568           append_uri_pathel (u_query, u_query + strlen (u_query),
1569                              true, &fnres);
1570         }
1571     }
1572   else
1573     {
1574       if (fnres.tail)
1575         append_char ('/', &fnres);
1576       u_file = replaced_filename;
1577       append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1578     }
1579
1580   /* Zero-terminate the file name. */
1581   append_char ('\0', &fnres);
1582
1583   fname = fnres.base;
1584
1585   /* Check the cases in which the unique extensions are not used:
1586      1) Clobbering is turned off (-nc).
1587      2) Retrieval with regetting.
1588      3) Timestamping is used.
1589      4) Hierarchy is built.
1590
1591      The exception is the case when file does exist and is a
1592      directory (see `mkalldirs' for explanation).  */
1593
1594   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1595       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1596     {
1597       unique = fname;
1598     }
1599   else
1600     {
1601       unique = unique_name (fname, true);
1602       if (unique != fname)
1603         xfree (fname);
1604     }
1605
1606 /* On VMS, alter the name as required. */
1607 #ifdef __VMS
1608   {
1609     char *unique2;
1610
1611     unique2 = ods_conform( unique);
1612     if (unique2 != unique)
1613       {
1614         xfree (unique);
1615         unique = unique2;
1616       }
1617   }
1618 #endif /* def __VMS */
1619
1620   return unique;
1621 }
1622 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   SCHEME only matters for a ".." that would back up past the start of
   the path: for SCHEME_FTP it is kept literally, for other schemes it
   is dropped.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static bool
path_simplify (enum url_scheme scheme, char *path)
{
  char *h = path;               /* hare: read position */
  char *t = path;               /* tortoise: write position */
  char *beg = path;             /* the tortoise never retreats past this */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./" (or a final "."). */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else if (scheme == SCHEME_FTP)
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  This violates RFC 3986; but we do it for FTP
                 anyway because there is otherwise no way to get at a
                 parent directory, when the FTP server drops us in a
                 non-root directory (which is not uncommon). */
              beg = t + 3;
              goto regular;
            }
          /* Skip over the "../" in the input.  */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* If the tortoise fell behind the hare, something was removed:
     terminate the shortened string.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1708 \f
1709 /* Return the length of URL's path.  Path is considered to be
1710    terminated by one or more of the ?query or ;params or #fragment,
1711    depending on the scheme.  */
1712
1713 static const char *
1714 path_end (const char *url)
1715 {
1716   enum url_scheme scheme = url_scheme (url);
1717   const char *seps;
1718   if (scheme == SCHEME_INVALID)
1719     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1720   /* +2 to ignore the first two separators ':' and '/' */
1721   seps = init_seps (scheme) + 2;
1722   return strpbrk_or_eos (url, seps);
1723 }
1724
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr;
   note that B is evaluated twice.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1728
1729 /* Merge BASE with LINK and return the resulting URI.
1730
1731    Either of the URIs may be absolute or relative, complete with the
1732    host name, or path only.  This tries to reasonably handle all
1733    foreseeable cases.  It only employs minimal URL parsing, without
1734    knowledge of the specifics of schemes.
1735
1736    I briefly considered making this function call path_simplify after
1737    the merging process, as rfc1738 seems to suggest.  This is a bad
1738    idea for several reasons: 1) it complexifies the code, and 2)
1739    url_parse has to simplify path anyway, so it's wasteful to boot.  */
1740
1741 char *
1742 uri_merge (const char *base, const char *link)
1743 {
1744   int linklength;
1745   const char *end;
1746   char *merge;
1747
1748   if (url_has_scheme (link))
1749     return xstrdup (link);
1750
1751   /* We may not examine BASE past END. */
1752   end = path_end (base);
1753   linklength = strlen (link);
1754
1755   if (!*link)
1756     {
1757       /* Empty LINK points back to BASE, query string and all. */
1758       return xstrdup (base);
1759     }
1760   else if (*link == '?')
1761     {
1762       /* LINK points to the same location, but changes the query
1763          string.  Examples: */
1764       /* uri_merge("path",         "?new") -> "path?new"     */
1765       /* uri_merge("path?foo",     "?new") -> "path?new"     */
1766       /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1767       /* uri_merge("path#foo",     "?new") -> "path?new"     */
1768       int baselength = end - base;
1769       merge = xmalloc (baselength + linklength + 1);
1770       memcpy (merge, base, baselength);
1771       memcpy (merge + baselength, link, linklength);
1772       merge[baselength + linklength] = '\0';
1773     }
1774   else if (*link == '#')
1775     {
1776       /* uri_merge("path",         "#new") -> "path#new"     */
1777       /* uri_merge("path#foo",     "#new") -> "path#new"     */
1778       /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1779       /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1780       int baselength;
1781       const char *end1 = strchr (base, '#');
1782       if (!end1)
1783         end1 = base + strlen (base);
1784       baselength = end1 - base;
1785       merge = xmalloc (baselength + linklength + 1);
1786       memcpy (merge, base, baselength);
1787       memcpy (merge + baselength, link, linklength);
1788       merge[baselength + linklength] = '\0';
1789     }
1790   else if (*link == '/' && *(link + 1) == '/')
1791     {
1792       /* LINK begins with "//" and so is a net path: we need to
1793          replace everything after (and including) the double slash
1794          with LINK. */
1795
1796       /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1797       /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1798       /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1799
1800       int span;
1801       const char *slash;
1802       const char *start_insert;
1803
1804       /* Look for first slash. */
1805       slash = memchr (base, '/', end - base);
1806       /* If found slash and it is a double slash, then replace
1807          from this point, else default to replacing from the
1808          beginning.  */
1809       if (slash && *(slash + 1) == '/')
1810         start_insert = slash;
1811       else
1812         start_insert = base;
1813
1814       span = start_insert - base;
1815       merge = xmalloc (span + linklength + 1);
1816       if (span)
1817         memcpy (merge, base, span);
1818       memcpy (merge + span, link, linklength);
1819       merge[span + linklength] = '\0';
1820     }
1821   else if (*link == '/')
1822     {
1823       /* LINK is an absolute path: we need to replace everything
1824          after (and including) the FIRST slash with LINK.
1825
1826          So, if BASE is "http://host/whatever/foo/bar", and LINK is
1827          "/qux/xyzzy", our result should be
1828          "http://host/qux/xyzzy".  */
1829       int span;
1830       const char *slash;
1831       const char *start_insert = NULL; /* for gcc to shut up. */
1832       const char *pos = base;
1833       bool seen_slash_slash = false;
1834       /* We're looking for the first slash, but want to ignore
1835          double slash. */
1836     again:
1837       slash = memchr (pos, '/', end - pos);
1838       if (slash && !seen_slash_slash)
1839         if (*(slash + 1) == '/')
1840           {
1841             pos = slash + 2;
1842             seen_slash_slash = true;
1843             goto again;
1844           }
1845
1846       /* At this point, SLASH is the location of the first / after
1847          "//", or the first slash altogether.  START_INSERT is the
1848          pointer to the location where LINK will be inserted.  When
1849          examining the last two examples, keep in mind that LINK
1850          begins with '/'. */
1851
1852       if (!slash && !seen_slash_slash)
1853         /* example: "foo" */
1854         /*           ^    */
1855         start_insert = base;
1856       else if (!slash && seen_slash_slash)
1857         /* example: "http://foo" */
1858         /*                     ^ */
1859         start_insert = end;
1860       else if (slash && !seen_slash_slash)
1861         /* example: "foo/bar" */
1862         /*           ^        */
1863         start_insert = base;
1864       else if (slash && seen_slash_slash)
1865         /* example: "http://something/" */
1866         /*                           ^  */
1867         start_insert = slash;
1868
1869       span = start_insert - base;
1870       merge = xmalloc (span + linklength + 1);
1871       if (span)
1872         memcpy (merge, base, span);
1873       memcpy (merge + span, link, linklength);
1874       merge[span + linklength] = '\0';
1875     }
1876   else
1877     {
1878       /* LINK is a relative URL: we need to replace everything
1879          after last slash (possibly empty) with LINK.
1880
1881          So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1882          our result should be "whatever/foo/qux/xyzzy".  */
1883       bool need_explicit_slash = false;
1884       int span;
1885       const char *start_insert;
1886       const char *last_slash = find_last_char (base, end, '/');
1887       if (!last_slash)
1888         {
1889           /* No slash found at all.  Replace what we have with LINK. */
1890           start_insert = base;
1891         }
1892       else if (last_slash && last_slash >= base + 2
1893                && last_slash[-2] == ':' && last_slash[-1] == '/')
1894         {
1895           /* example: http://host"  */
1896           /*                      ^ */
1897           start_insert = end + 1;
1898           need_explicit_slash = true;
1899         }
1900       else
1901         {
1902           /* example: "whatever/foo/bar" */
1903           /*                        ^    */
1904           start_insert = last_slash + 1;
1905         }
1906
1907       span = start_insert - base;
1908       merge = xmalloc (span + linklength + 1);
1909       if (span)
1910         memcpy (merge, base, span);
1911       if (need_explicit_slash)
1912         merge[span - 1] = '/';
1913       memcpy (merge + span, link, linklength);
1914       merge[span + linklength] = '\0';
1915     }
1916
1917   return merge;
1918 }
1919 \f
/* Copy the NUL-terminated string S to the location P points at, then
   advance P past the copied bytes.  No terminating NUL is written and
   no bounds are checked: the caller must have reserved enough room.

   P must be a modifiable pointer lvalue; it is evaluated twice.  S is
   evaluated exactly once, and the length is kept in a size_t so that
   the result of strlen is not narrowed.  The temporaries carry an
   unusual suffix so they cannot capture a variable named in S.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Placeholder written in place of the real password when the password
   is supposed to stay hidden.  The string is intentionally generic so
   that it does not give away the number of characters in the
   password, which previous versions did give away.  */
#define HIDDEN_PASSWORD "*password*"
1931
1932 /* Recreate the URL string from the data in URL.
1933
1934    If HIDE is true (as it is when we're calling this on a URL we plan
1935    to print, but not when calling it to canonicalize a URL for use
1936    within the program), password will be hidden.  Unsafe characters in
1937    the URL will be quoted.  */
1938
1939 char *
1940 url_string (const struct url *url, enum url_auth_mode auth_mode)
1941 {
1942   int size;
1943   char *result, *p;
1944   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1945
1946   int scheme_port = supported_schemes[url->scheme].default_port;
1947   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1948   int fplen = full_path_length (url);
1949
1950   bool brackets_around_host;
1951
1952   assert (scheme_str != NULL);
1953
1954   /* Make sure the user name and password are quoted. */
1955   if (url->user)
1956     {
1957       if (auth_mode != URL_AUTH_HIDE)
1958         {
1959           quoted_user = url_escape_allow_passthrough (url->user);
1960           if (url->passwd)
1961             {
1962               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1963                 quoted_passwd = HIDDEN_PASSWORD;
1964               else
1965                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1966             }
1967         }
1968     }
1969
1970   /* In the unlikely event that the host name contains non-printable
1971      characters, quote it for displaying to the user.  */
1972   quoted_host = url_escape_allow_passthrough (url->host);
1973
1974   /* Undo the quoting of colons that URL escaping performs.  IPv6
1975      addresses may legally contain colons, and in that case must be
1976      placed in square brackets.  */
1977   if (quoted_host != url->host)
1978     unescape_single_char (quoted_host, ':');
1979   brackets_around_host = strchr (quoted_host, ':') != NULL;
1980
1981   size = (strlen (scheme_str)
1982           + strlen (quoted_host)
1983           + (brackets_around_host ? 2 : 0)
1984           + fplen
1985           + 1);
1986   if (url->port != scheme_port)
1987     size += 1 + numdigit (url->port);
1988   if (quoted_user)
1989     {
1990       size += 1 + strlen (quoted_user);
1991       if (quoted_passwd)
1992         size += 1 + strlen (quoted_passwd);
1993     }
1994
1995   p = result = xmalloc (size);
1996
1997   APPEND (p, scheme_str);
1998   if (quoted_user)
1999     {
2000       APPEND (p, quoted_user);
2001       if (quoted_passwd)
2002         {
2003           *p++ = ':';
2004           APPEND (p, quoted_passwd);
2005         }
2006       *p++ = '@';
2007     }
2008
2009   if (brackets_around_host)
2010     *p++ = '[';
2011   APPEND (p, quoted_host);
2012   if (brackets_around_host)
2013     *p++ = ']';
2014   if (url->port != scheme_port)
2015     {
2016       *p++ = ':';
2017       p = number_to_string (p, url->port);
2018     }
2019
2020   full_path_write (url, p);
2021   p += fplen;
2022   *p++ = '\0';
2023
2024   assert (p - result == size);
2025
2026   if (quoted_user && quoted_user != url->user)
2027     xfree (quoted_user);
2028   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2029       && quoted_passwd != url->passwd)
2030     xfree (quoted_passwd);
2031   if (quoted_host != url->host)
2032     xfree (quoted_host);
2033
2034   return result;
2035 }
2036 \f
2037 /* Return true if scheme a is similar to scheme b.
2038
2039    Schemes are similar if they are equal.  If SSL is supported, schemes
2040    are also similar if one is http (SCHEME_HTTP) and the other is https
2041    (SCHEME_HTTPS).  */
2042 bool
2043 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2044 {
2045   if (a == b)
2046     return true;
2047 #ifdef HAVE_SSL
2048   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2049       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2050     return true;
2051 #endif
2052   return false;
2053 }
2054 \f
/* Read one logical character from the beginning of the (possibly
   %-escaped) string STR and store it in *C.

   Returns the number of bytes of STR that make up that character:

     3  when STR begins with '%' followed by two hex digits that
        decode to a character URL_RESERVED_CHAR does not flag; *C is
        the decoded character.
     1  in every other case; *C is STR[0].  This includes a '%' that
        is not followed by two hex digits, and an escape that decodes
        to a character flagged by URL_RESERVED_CHAR: such an escape
        is left undecoded, so its '%' and hex digits are returned one
        at a time by successive calls.

   The function never returns 0.  STR must be non-NULL and non-empty
   and C must be non-NULL (all asserted).  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The && chain stops at the first failing test, so str[2] is only
     examined when str[1] is a hex digit (hence not the terminating
     NUL).  A separate check for a NUL at str[2] is not needed: NUL is
     not a hex digit.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
      /* Escaped reserved character: fall through and hand back the
         '%' itself, leaving the escape undecoded.  */
    }

  *c = str[0];
  return 1;
}
2092
/* Compare the URL strings U1 and U2 and return true if they are
   equal.

   The strings are walked in parallel, one logical character at a
   time as delivered by getchar_from_escaped_string, and characters
   are compared after c_tolower.  The URLs are equal only if both
   strings are exhausted at the same moment.  Neither argument may be
   NULL (asserted).  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert(u1 && u2);

  while (*left && *right)
    {
      char lc, rc;
      int lstep, rstep;

      lstep = getchar_from_escaped_string (left, &lc);
      if (!lstep)
        break;

      rstep = getchar_from_escaped_string (right, &rc);
      if (!rstep)
        break;

      if (c_tolower (lc) != c_tolower (rc))
        break;

      left += lstep;
      right += rstep;
    }

  /* Equal only when the loop ran both strings to their end.  */
  return *left == 0 && *right == 0;
}
2115 \f
2116 #ifdef TESTING
2117 /* Debugging and testing support for path_simplify. */
2118
#if 0
/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.

   path_simplify takes the URL scheme as its first argument (see
   run_test below).  SCHEME_HTTP is an arbitrary choice here; use
   SCHEME_FTP instead to inspect FTP behavior, which the test table
   below shows to differ (e.g. for a leading "..").  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2130
2131 static const char *
2132 run_test (char *test, char *expected_result, enum url_scheme scheme,
2133           bool expected_change)
2134 {
2135   char *test_copy = xstrdup (test);
2136   bool modified = path_simplify (scheme, test_copy);
2137
2138   if (0 != strcmp (test_copy, expected_result))
2139     {
2140       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2141               test, expected_result, test_copy);
2142       mu_assert ("", 0);
2143     }
2144   if (modified != expected_change)
2145     {
2146       if (expected_change)
2147         printf ("Expected modification with path_simplify(\"%s\").\n",
2148                 test);
2149       else
2150         printf ("Expected no modification with path_simplify(\"%s\").\n",
2151                 test);
2152     }
2153   xfree (test_copy);
2154   mu_assert ("", modified == expected_change);
2155   return NULL;
2156 }
2157
2158 const char *
2159 test_path_simplify (void)
2160 {
2161   static struct {
2162     char *test, *result;
2163     enum url_scheme scheme;
2164     bool should_modify;
2165   } tests[] = {
2166     { "",                       "",             SCHEME_HTTP, false },
2167     { ".",                      "",             SCHEME_HTTP, true },
2168     { "./",                     "",             SCHEME_HTTP, true },
2169     { "..",                     "",             SCHEME_HTTP, true },
2170     { "../",                    "",             SCHEME_HTTP, true },
2171     { "..",                     "..",           SCHEME_FTP,  false },
2172     { "../",                    "../",          SCHEME_FTP,  false },
2173     { "foo",                    "foo",          SCHEME_HTTP, false },
2174     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2175     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2176     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2177     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2178     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2179     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2180     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2181     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2182     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2183     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2184     { "foo/..",                 "",             SCHEME_HTTP, true },
2185     { "foo/../..",              "",             SCHEME_HTTP, true },
2186     { "foo/../../..",           "",             SCHEME_HTTP, true },
2187     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2188     { "foo/../..",              "..",           SCHEME_FTP,  true },
2189     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2190     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2191     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2192     { "./a/../b",               "b",            SCHEME_HTTP, true }
2193   };
2194   int i;
2195
2196   for (i = 0; i < countof (tests); i++)
2197     {
2198       const char *message;
2199       char *test = tests[i].test;
2200       char *expected_result = tests[i].result;
2201       enum url_scheme scheme = tests[i].scheme;
2202       bool  expected_change = tests[i].should_modify;
2203       message = run_test (test, expected_result, scheme, expected_change);
2204       if (message) return message;
2205     }
2206   return NULL;
2207 }
2208
2209 const char *
2210 test_append_uri_pathel()
2211 {
2212   int i;
2213   struct {
2214     char *original_url;
2215     char *input;
2216     bool escaped;
2217     char *expected_result;
2218   } test_array[] = {
2219     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2220   };
2221
2222   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2223     {
2224       struct growable dest;
2225       const char *p = test_array[i].input;
2226
2227       memset (&dest, 0, sizeof (dest));
2228
2229       append_string (test_array[i].original_url, &dest);
2230       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2231       append_char ('\0', &dest);
2232
2233       mu_assert ("test_append_uri_pathel: wrong result",
2234                  strcmp (dest.base, test_array[i].expected_result) == 0);
2235     }
2236
2237   return NULL;
2238 }
2239
/* Table-driven test of are_urls_equal.

   Each entry holds two URL strings and the result are_urls_equal is
   expected to give for them.  The table covers identical URLs,
   differing paths and hosts, an escape that matches its literal
   character ("%7e" against "~"), and an escape that does not
   ("%2f" against "/").  Returns NULL when every entry
   behaves as expected, or a failure message via mu_assert.  */
const char *
test_are_urls_equal (void)
{
  int i;
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
    }

  return NULL;
}
2265
2266 #endif /* TESTING */
2267
2268 /*
2269  * vim: et ts=2 sw=2
2270  */
2271