sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   3    2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif
  39 #include <errno.h>
  40 #include <assert.h>
  41
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45 #include "iri.h"
  46
  47 #ifdef TESTING
  48 #include "test.h"
  49 #endif
  50
  51 enum {
  52   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  53   scm_has_params = 2,           /* whether scheme has ;params */
  54   scm_has_query = 4,            /* whether scheme has ?query */
  55   scm_has_fragment = 8          /* whether scheme has #fragment */
  56 };
  57
/* Static description of one URL scheme.  The members are filled in
   positionally by the supported_schemes initializer, so their order
   must not change without updating that table.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme compares it case-insensitively against the start of
     the URL; a NULL value marks the end of the table.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Bitwise OR of the scm_* flags. */
  int flags;
};
  69
/* Supported schemes.

   url_scheme walks this table and returns the index of the matching
   entry cast to enum url_scheme, and scheme_default_port,
   scheme_disable and init_seps index the table directly with an enum
   url_scheme value.  The entry order therefore has to agree with the
   enumerator order (NOTE(review): the enum is declared outside this
   file -- confirm the ordering there, including the HAVE_SSL case).

   The table is not const because scheme_disable sets scm_disabled in
   the flags member at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID: the NULL leading_string terminates table walks. */
  { NULL,       NULL,       -1,                 0 }
};
  82
  83 /* Forward declarations: */
  84
  85 static bool path_simplify (enum url_scheme, char *);
  86 \f
  87 /* Support for escaping and unescaping of URL strings.  */
  88
  89 /* Table of "reserved" and "unsafe" characters.  Those terms are
  90    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  91    specs, but the general idea remains.
  92
  93    A reserved character is the one that you can't decode without
  94    changing the meaning of the URL.  For example, you can't decode
  95    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  96    path components is different.  Non-reserved characters can be
  97    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  98    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  99    as recommended by rfc2396, and minus "~", which is very frequently
 100    used (and sometimes unrecognized as %7E by broken servers).
 101
 102    An unsafe character is the one that should be encoded when URLs are
 103    placed in foreign environments.  E.g. space and newline are unsafe
 104    in HTTP contexts because HTTP uses them as separator and line
 105    terminator, so they must be encoded to %20 and %0A respectively.
 106    "*" is unsafe in shell context, etc.
 107
 108    We determine whether a character is unsafe through static table
 109    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 110
 111 enum {
 112   /* rfc1738 reserved chars + "$" and ",".  */
 113   urlchr_reserved = 1,
 114
 115   /* rfc1738 unsafe chars, plus non-printables.  */
 116   urlchr_unsafe   = 2
 117 };
 118
/* Look up the classification bits of character C and keep only those
   selected by MASK.  The result is non-zero when C has any of the
   requested bits.  C is converted to unsigned char first so that
   characters with the high bit set index the table correctly even
   where plain char is signed.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Non-zero if C is marked reserved in urlchr_table.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Non-zero if C is marked unsafe in urlchr_table.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table.  They exist only while the table is being
   defined and are #undef'd right after it.  RU is deliberately left
   unparenthesized; it is only ever used as a complete initializer
   element.  */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Classification of every 8-bit character value, indexed by the
   character code.  The first 128 entries are annotated with the ASCII
   character they describe; all codes from 128 up are marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 0x80 - 0xBF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  /* 0xC0 - 0xFF */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
 160
 161 /* URL-unescape the string S.
 162
 163    This is done by transforming the sequences "%HH" to the character
 164    represented by the hexadecimal digits HH.  If % is not followed by
 165    two hexadecimal digits, it is inserted literally.
 166
 167    The transformation is done in place.  If you need the original
 168    string intact, make a copy before calling this function.  */
 169
 170 static void
 171 url_unescape (char *s)
 172 {
 173   char *t = s;                  /* t - tortoise */
 174   char *h = s;                  /* h - hare     */
 175
 176   for (; *h; h++, t++)
 177     {
 178       if (*h != '%')
 179         {
 180         copychar:
 181           *t = *h;
 182         }
 183       else
 184         {
 185           char c;
 186           /* Do nothing if '%' is not followed by two hex digits. */
 187           if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
 188             goto copychar;
 189           c = X2DIGITS_TO_NUM (h[1], h[2]);
 190           /* Don't unescape %00 because there is no way to insert it
 191              into a C string without effectively truncating it. */
 192           if (c == '\0')
 193             goto copychar;
 194           *t = c;
 195           h += 2;
 196         }
 197     }
 198   *t = '\0';
 199 }
 200
 201 /* The core of url_escape_* functions.  Escapes the characters that
 202    match the provided mask in urlchr_table.
 203
 204    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 205    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 206    allocated string will be returned in all cases.  */
 207
 208 static char *
 209 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 210 {
 211   const char *p1;
 212   char *p2, *newstr;
 213   int newlen;
 214   int addition = 0;
 215
 216   for (p1 = s; *p1; p1++)
 217     if (urlchr_test (*p1, mask))
 218       addition += 2;            /* Two more characters (hex digits) */
 219
 220   if (!addition)
 221     return allow_passthrough ? (char *)s : xstrdup (s);
 222
 223   newlen = (p1 - s) + addition;
 224   newstr = xmalloc (newlen + 1);
 225
 226   p1 = s;
 227   p2 = newstr;
 228   while (*p1)
 229     {
 230       /* Quote the characters that match the test mask. */
 231       if (urlchr_test (*p1, mask))
 232         {
 233           unsigned char c = *p1++;
 234           *p2++ = '%';
 235           *p2++ = XNUM_TO_DIGIT (c >> 4);
 236           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 237         }
 238       else
 239         *p2++ = *p1++;
 240     }
 241   assert (p2 - newstr == newlen);
 242   *p2 = '\0';
 243
 244   return newstr;
 245 }
 246
 247 /* URL-escape the unsafe characters (see urlchr_table) in a given
 248    string, returning a freshly allocated string.  */
 249
 250 char *
 251 url_escape (const char *s)
 252 {
 253   return url_escape_1 (s, urlchr_unsafe, false);
 254 }
 255
 256 /* URL-escape the unsafe characters (see urlchr_table) in a given
 257    string.  If no characters are unsafe, S is returned.  */
 258
 259 static char *
 260 url_escape_allow_passthrough (const char *s)
 261 {
 262   return url_escape_1 (s, urlchr_unsafe, true);
 263 }
 264 \f
 265 /* Decide whether the char at position P needs to be encoded.  (It is
 266    not enough to pass a single char *P because the function may need
 267    to inspect the surrounding context.)
 268
 269    Return true if the char should be escaped as %XX, false otherwise.  */
 270
 271 static inline bool
 272 char_needs_escaping (const char *p)
 273 {
 274   if (*p == '%')
 275     {
 276       if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
 277         return false;
 278       else
 279         /* Garbled %.. sequence: encode `%'. */
 280         return true;
 281     }
 282   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 283     return true;
 284   else
 285     return false;
 286 }
 287
 288 /* Translate a %-escaped (but possibly non-conformant) input string S
 289    into a %-escaped (and conformant) output string.  If no characters
 290    are encoded or decoded, return the same string S; otherwise, return
 291    a freshly allocated string with the new contents.
 292
 293    After a URL has been run through this function, the protocols that
 294    use `%' as the quote character can use the resulting string as-is,
 295    while those that don't can use url_unescape to get to the intended
 296    data.  This function is stable: once the input is transformed,
 297    further transformations of the result yield the same output.
 298
 299    Let's discuss why this function is needed.
 300
 301    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 302    a raw space character would mess up the HTTP request, it needs to
 303    be quoted, like this:
 304
 305        GET /abc%20def HTTP/1.0
 306
 307    It would appear that the unsafe chars need to be quoted, for
 308    example with url_escape.  But what if we're requested to download
 309    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 310    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 311    part of URL syntax, "%20" is the correct way to denote a literal
 312    space on the Wget command line.  This leads to the conclusion that
 313    in that case Wget should not call url_escape, but leave the `%20'
 314    as is.  This is clearly contradictory, but it only gets worse.
 315
 316    What if the requested URI is `abc%20 def'?  If we call url_escape,
 317    we end up with `/abc%2520%20def', which is almost certainly not
 318    intended.  If we don't call url_escape, we are left with the
 319    embedded space and cannot complete the request.  What the user
 320    meant was for Wget to request `/abc%20%20def', and this is where
 321    reencode_escapes kicks in.
 322
 323    Wget used to solve this by first decoding %-quotes, and then
 324    encoding all the "unsafe" characters found in the resulting string.
 325    This was wrong because it didn't preserve certain URL special
 326    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 327    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 328    whether we considered `+' reserved (it is).  One of these results
 329    is inevitable because by the second step we would lose information
 330    on whether the `+' was originally encoded or not.  Both results
 331    were wrong because in CGI parameters + means space, while %2B means
 332    literal plus.  reencode_escapes correctly translates the above to
 333    "a%2B+b", i.e. returns the original string.
 334
 335    This function uses a modified version of the algorithm originally
 336    proposed by Anon Sricharoenchai:
 337
 338    * Encode all "unsafe" characters, except those that are also
 339      "reserved", to %XX.  See urlchr_table for which characters are
 340      unsafe and reserved.
 341
 342    * Encode the "%" characters not followed by two hex digits to
 343      "%25".
 344
 345    * Pass through all other characters and %XX escapes as-is.  (Up to
 346      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 347      characters, but that was obtrusive and broke some servers.)
 348
 349    Anon's test case:
 350
 351    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 352    ->
 353    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 354
 355    Simpler test cases:
 356
 357    "foo bar"         -> "foo%20bar"
 358    "foo%20bar"       -> "foo%20bar"
 359    "foo %20bar"      -> "foo%20%20bar"
 360    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 361    "foo%25%20bar"    -> "foo%25%20bar"
 362    "foo%2%20bar"     -> "foo%252%20bar"
 363    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 364    "foo%2b+bar"      -> "foo%2b+bar"  */
 365
 366 static char *
 367 reencode_escapes (const char *s)
 368 {
 369   const char *p1;
 370   char *newstr, *p2;
 371   int oldlen, newlen;
 372
 373   int encode_count = 0;
 374
 375   /* First pass: inspect the string to see if there's anything to do,
 376      and to calculate the new length.  */
 377   for (p1 = s; *p1; p1++)
 378     if (char_needs_escaping (p1))
 379       ++encode_count;
 380
 381   if (!encode_count)
 382     /* The string is good as it is. */
 383     return (char *) s;          /* C const model sucks. */
 384
 385   oldlen = p1 - s;
 386   /* Each encoding adds two characters (hex digits).  */
 387   newlen = oldlen + 2 * encode_count;
 388   newstr = xmalloc (newlen + 1);
 389
 390   /* Second pass: copy the string to the destination address, encoding
 391      chars when needed.  */
 392   p1 = s;
 393   p2 = newstr;
 394
 395   while (*p1)
 396     if (char_needs_escaping (p1))
 397       {
 398         unsigned char c = *p1++;
 399         *p2++ = '%';
 400         *p2++ = XNUM_TO_DIGIT (c >> 4);
 401         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 402       }
 403     else
 404       *p2++ = *p1++;
 405
 406   *p2 = '\0';
 407   assert (p2 - newstr == newlen);
 408   return newstr;
 409 }
 410 \f
 411 /* Returns the scheme type if the scheme is supported, or
 412    SCHEME_INVALID if not.  */
 413
 414 enum url_scheme
 415 url_scheme (const char *url)
 416 {
 417   int i;
 418
 419   for (i = 0; supported_schemes[i].leading_string; i++)
 420     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 421                           strlen (supported_schemes[i].leading_string)))
 422       {
 423         if (!(supported_schemes[i].flags & scm_disabled))
 424           return (enum url_scheme) i;
 425         else
 426           return SCHEME_INVALID;
 427       }
 428
 429   return SCHEME_INVALID;
 430 }
 431
 432 #define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
 433
 434 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 435    currently implemented, it returns true if URL begins with
 436    [-+a-zA-Z0-9]+: .  */
 437
 438 bool
 439 url_has_scheme (const char *url)
 440 {
 441   const char *p = url;
 442
 443   /* The first char must be a scheme char. */
 444   if (!*p || !SCHEME_CHAR (*p))
 445     return false;
 446   ++p;
 447   /* Followed by 0 or more scheme chars. */
 448   while (*p && SCHEME_CHAR (*p))
 449     ++p;
 450   /* Terminated by ':'. */
 451   return *p == ':';
 452 }
 453
 454 int
 455 scheme_default_port (enum url_scheme scheme)
 456 {
 457   return supported_schemes[scheme].default_port;
 458 }
 459
 460 void
 461 scheme_disable (enum url_scheme scheme)
 462 {
 463   supported_schemes[scheme].flags |= scm_disabled;
 464 }
 465
 466 /* Skip the username and password, if present in the URL.  The
 467    function should *not* be called with the complete URL, but with the
 468    portion after the scheme.
 469
 470    If no username and password are found, return URL.  */
 471
 472 static const char *
 473 url_skip_credentials (const char *url)
 474 {
 475   /* Look for '@' that comes before terminators, such as '/', '?',
 476      '#', or ';'.  */
 477   const char *p = (const char *)strpbrk (url, "@/?#;");
 478   if (!p || *p != '@')
 479     return url;
 480   return p + 1;
 481 }
 482
 483 /* Parse credentials contained in [BEG, END).  The region is expected
 484    to have come from a URL and is unescaped.  */
 485
 486 static bool
 487 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 488 {
 489   char *colon;
 490   const char *userend;
 491
 492   if (beg == end)
 493     return false;               /* empty user name */
 494
 495   colon = memchr (beg, ':', end - beg);
 496   if (colon == beg)
 497     return false;               /* again empty user name */
 498
 499   if (colon)
 500     {
 501       *passwd = strdupdelim (colon + 1, end);
 502       userend = colon;
 503       url_unescape (*passwd);
 504     }
 505   else
 506     {
 507       *passwd = NULL;
 508       userend = end;
 509     }
 510   *user = strdupdelim (beg, userend);
 511   url_unescape (*user);
 512   return true;
 513 }
 514
 515 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 516    originally popularized by Netscape and NcFTP.  HTTP shorthands look
 517    like this:
 518
 519    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 520    www.foo.com[:port]            -> http://www.foo.com[:port]
 521
 522    FTP shorthands look like this:
 523
 524    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 525    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 526
 527    If the URL needs not or cannot be rewritten, return NULL.  */
 528
 529 char *
 530 rewrite_shorthand_url (const char *url)
 531 {
 532   const char *p;
 533   char *ret;
 534
 535   if (url_scheme (url) != SCHEME_INVALID)
 536     return NULL;
 537
 538   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 539      latter Netscape.  */
 540   p = strpbrk (url, ":/");
 541   if (p == url)
 542     return NULL;
 543
 544   /* If we're looking at "://", it means the URL uses a scheme we
 545      don't support, which may include "https" when compiled without
 546      SSL support.  Don't bogusly rewrite such URLs.  */
 547   if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
 548     return NULL;
 549
 550   if (p && *p == ':')
 551     {
 552       /* Colon indicates ftp, as in foo.bar.com:path.  Check for
 553          special case of http port number ("localhost:10000").  */
 554       int digits = strspn (p + 1, "0123456789");
 555       if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 556         goto http;
 557
 558       /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
 559       ret = aprintf ("ftp://%s", url);
 560       ret[6 + (p - url)] = '/';
 561     }
 562   else
 563     {
 564     http:
 565       /* Just prepend "http://" to URL. */
 566       ret = aprintf ("http://%s", url);
 567     }
 568   return ret;
 569 }
 570 \f
 571 static void split_path (const char *, char **, char **);
 572
 573 /* Like strpbrk, with the exception that it returns the pointer to the
 574    terminating zero (end-of-string aka "eos") if no matching character
 575    is found.  */
 576
 577 static inline char *
 578 strpbrk_or_eos (const char *s, const char *accept)
 579 {
 580   char *p = strpbrk (s, accept);
 581   if (!p)
 582     p = strchr (s, '\0');
 583   return p;
 584 }
 585
 586 /* Turn STR into lowercase; return true if a character was actually
 587    changed. */
 588
 589 static bool
 590 lowercase_str (char *str)
 591 {
 592   bool changed = false;
 593   for (; *str; str++)
 594     if (c_isupper (*str))
 595       {
 596         changed = true;
 597         *str = c_tolower (*str);
 598       }
 599   return changed;
 600 }
 601
 602 static const char *
 603 init_seps (enum url_scheme scheme)
 604 {
 605   static char seps[8] = ":/";
 606   char *p = seps + 2;
 607   int flags = supported_schemes[scheme].flags;
 608
 609   if (flags & scm_has_params)
 610     *p++ = ';';
 611   if (flags & scm_has_query)
 612     *p++ = '?';
 613   if (flags & scm_has_fragment)
 614     *p++ = '#';
 615   *p++ = '\0';
 616   return seps;
 617 }
 618
/* Messages for the error codes url_parse reports through its ERROR
   argument.  Each PE_* code is defined right next to the message it
   selects, so that the code value and the array position stay in
   step.  The strings are marked with N_() here and passed through
   _() in url_error.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};
 637
/* Parse a URL.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values; see url_error).

   The URL is first normalized with reencode_escapes and then split,
   left to right, into credentials, host, port, path, params, query
   and fragment.  Which of the trailing parts are recognized depends
   on the flags of the scheme.  The pieces are copied into the
   returned structure; u->url holds either the normalized input or a
   string rebuilt with url_string.  */
struct url *
url_parse (const char *url, int *error)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Each X_b/X_e pair delimits the component X inside URL_ENCODED. */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  char *url_encoded = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      error_code = PE_UNSUPPORTED_SCHEME;
      goto error;
    }

  if (opt.enable_iri)
    {
      /* NOTE(review): url_unescape works in place, and the cast drops
         the const of the caller's string, so the caller's buffer is
         modified here.  Confirm that every caller passes writable
         storage.  URL is then rebound to whatever locale_to_utf8
         returns; whether that is a new allocation, and who frees it,
         cannot be seen from this function.  */
      url_unescape ((char *) url);
      url = locale_to_utf8(url);
    }

  /* URL_ENCODED is either URL itself or a new string; the pointer
     comparisons against URL further down rely on that.  */
  url_encoded = reencode_escapes (url);
  p = url_encoded;

  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      /* Without IPv6 support every bracketed host is rejected; the
         code below this point in the block is then unreachable.  */
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  /* SEPS is consumed from the front as parsing proceeds: dropping the
     leading ':' means a ':' no longer terminates the port scan.  */
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  The
     macro also drops one character from the front of SEPS, so it must
     be invoked exactly for the parts that init_seps put there.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every character of the URL must have been assigned to a part. */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e - 1 excludes the '@' itself. */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* No failure exits past this point, so USER and PASSWD are always
     handed over to U.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  /* NOTE(review): when the URL has no '/' after the authority, path_b
     and path_e are both still NULL here, so strdupdelim is called
     with (NULL, NULL).  Presumably that yields an empty string --
     confirm against strdupdelim.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (scheme, u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browser
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;
    }

  if (opt.enable_iri)
    {
      char *new = idn_encode (u->host);
      if (new)
        {
          xfree (u->host);
          u->host = new;
          host_modified = true;
        }
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* URL_ENCODED is no longer needed; free it unless it is the
         string URL points to.  */
      if (url_encoded != url)
        xfree ((char *) url_encoded);
    }
  else
    {
      /* Keep the normalized text as u->url.  It is copied when it is
         the same string as URL, otherwise ownership of URL_ENCODED
         moves to U.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
 903
 904 /* Return the error message string from ERROR_CODE, which should have
 905    been retrieved from url_parse.  The error message is translated.  */
 906
 907 const char *
 908 url_error (int error_code)
 909 {
 910   assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
 911   return _(parse_errors[error_code]);
 912 }
 913
 914 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 915    expected to be URL-escaped.
 916
 917    The path is split into directory (the part up to the last slash)
 918    and file (the part after the last slash), which are subsequently
 919    unescaped.  Examples:
 920
 921    PATH                 DIR           FILE
 922    "foo/bar/baz"        "foo/bar"     "baz"
 923    "foo/bar/"           "foo/bar"     ""
 924    "foo"                ""            "foo"
 925    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 926
 927    DIR and FILE are freshly allocated.  */
 928
 929 static void
 930 split_path (const char *path, char **dir, char **file)
 931 {
 932   char *last_slash = strrchr (path, '/');
 933   if (!last_slash)
 934     {
 935       *dir = xstrdup ("");
 936       *file = xstrdup (path);
 937     }
 938   else
 939     {
 940       *dir = strdupdelim (path, last_slash);
 941       *file = xstrdup (last_slash + 1);
 942     }
 943   url_unescape (*dir);
 944   url_unescape (*file);
 945 }
 946
 947 /* Note: URL's "full path" is the path with the query string and
 948    params appended.  The "fragment" (#foo) is intentionally ignored,
 949    but that might be changed.  For example, if the original URL was
 950    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 951    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 952
 953 /* Return the length of the full path, without the terminating
 954    zero.  */
 955
 956 static int
 957 full_path_length (const struct url *url)
 958 {
 959   int len = 0;
 960
 961 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 962
 963   FROB (path);
 964   FROB (params);
 965   FROB (query);
 966
 967 #undef FROB
 968
 969   return len;
 970 }
 971
 972 /* Write out the full path. */
 973
 974 static void
 975 full_path_write (const struct url *url, char *where)
 976 {
 977 #define FROB(el, chr) do {                      \
 978   char *f_el = url->el;                         \
 979   if (f_el) {                                   \
 980     int l = strlen (f_el);                      \
 981     *where++ = chr;                             \
 982     memcpy (where, f_el, l);                    \
 983     where += l;                                 \
 984   }                                             \
 985 } while (0)
 986
 987   FROB (path, '/');
 988   FROB (params, ';');
 989   FROB (query, '?');
 990
 991 #undef FROB
 992 }
 993
 994 /* Public function for getting the "full path".  E.g. if u->path is
 995    "foo/bar" and u->query is "param=value", full_path will be
 996    "/foo/bar?param=value". */
 997
 998 char *
 999 url_full_path (const struct url *url)
1000 {
1001   int length = full_path_length (url);
1002   char *full_path = xmalloc (length + 1);
1003
1004   full_path_write (url, full_path);
1005   full_path[length] = '\0';
1006
1007   return full_path;
1008 }
1009
/* Unescape CHR in an otherwise escaped STR.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".

   Every occurrence of "%XY" in STR, where XY are the two hex digits
   that XNUM_TO_DIGIT produces for CHR, is replaced in place by CHR
   itself.  All other text, including other %-escapes, is left
   untouched.  STR only ever shrinks, so no reallocation is needed.
   Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Work on the unsigned value: plain char may be signed, and a
     negative CHR shifted right would hand a negative number to
     XNUM_TO_DIGIT (presumably a digit-table lookup -- confirm).  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare: read position */
  char *t = str;                /* tortoise: write position */
  for (; *h; h++, t++)
    {
      /* The && chain stops at the first mismatch, so the terminating
         zero is never read past.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;               /* skip the two hex digits */
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1033
1034 /* Escape unsafe and reserved characters, except for the slash
1035    characters.  */
1036
1037 static char *
1038 url_escape_dir (const char *dir)
1039 {
1040   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1041   if (newdir == dir)
1042     return (char *)dir;
1043
1044   unescape_single_char (newdir, '/');
1045   return newdir;
1046 }
1047
1048 /* Sync u->path and u->url with u->dir and u->file.  Called after
1049    u->file or u->dir have been changed, typically by the FTP code.  */
1050
1051 static void
1052 sync_path (struct url *u)
1053 {
1054   char *newpath, *efile, *edir;
1055
1056   xfree (u->path);
1057
1058   /* u->dir and u->file are not escaped.  URL-escape them before
1059      reassembling them into u->path.  That way, if they contain
1060      separators like '?' or even if u->file contains slashes, the
1061      path will be correctly assembled.  (u->file can contain slashes
1062      if the URL specifies it with %2f, or if an FTP server returns
1063      it.)  */
1064   edir = url_escape_dir (u->dir);
1065   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1066
1067   if (!*edir)
1068     newpath = xstrdup (efile);
1069   else
1070     {
1071       int dirlen = strlen (edir);
1072       int filelen = strlen (efile);
1073
1074       /* Copy "DIR/FILE" to newpath. */
1075       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1076       memcpy (p, edir, dirlen);
1077       p += dirlen;
1078       *p++ = '/';
1079       memcpy (p, efile, filelen);
1080       p += filelen;
1081       *p = '\0';
1082     }
1083
1084   u->path = newpath;
1085
1086   if (edir != u->dir)
1087     xfree (edir);
1088   if (efile != u->file)
1089     xfree (efile);
1090
1091   /* Regenerate u->url as well.  */
1092   xfree (u->url);
1093   u->url = url_string (u, URL_AUTH_SHOW);
1094 }
1095
1096 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1097    This way we can sync u->path and u->url when they get changed.  */
1098
1099 void
1100 url_set_dir (struct url *url, const char *newdir)
1101 {
1102   xfree (url->dir);
1103   url->dir = xstrdup (newdir);
1104   sync_path (url);
1105 }
1106
1107 void
1108 url_set_file (struct url *url, const char *newfile)
1109 {
1110   xfree (url->file);
1111   url->file = xstrdup (newfile);
1112   sync_path (url);
1113 }
1114
1115 void
1116 url_free (struct url *url)
1117 {
1118   xfree (url->host);
1119   xfree (url->path);
1120   xfree (url->url);
1121
1122   xfree_null (url->params);
1123   xfree_null (url->query);
1124   xfree_null (url->fragment);
1125   xfree_null (url->user);
1126   xfree_null (url->passwd);
1127
1128   xfree (url->dir);
1129   xfree (url->file);
1130
1131   xfree (url);
1132 }
1133 \f
1134 /* Create all the necessary directories for PATH (a file).  Calls
1135    make_directory internally.  */
1136 int
1137 mkalldirs (const char *path)
1138 {
1139   const char *p;
1140   char *t;
1141   struct_stat st;
1142   int res;
1143
1144   p = path + strlen (path);
1145   for (; *p != '/' && p != path; p--)
1146     ;
1147
1148   /* Don't create if it's just a file.  */
1149   if ((p == path) && (*p != '/'))
1150     return 0;
1151   t = strdupdelim (path, p);
1152
1153   /* Check whether the directory exists.  */
1154   if ((stat (t, &st) == 0))
1155     {
1156       if (S_ISDIR (st.st_mode))
1157         {
1158           xfree (t);
1159           return 0;
1160         }
1161       else
1162         {
1163           /* If the dir exists as a file name, remove it first.  This
1164              is *only* for Wget to work with buggy old CERN http
1165              servers.  Here is the scenario: When Wget tries to
1166              retrieve a directory without a slash, e.g.
1167              http://foo/bar (bar being a directory), CERN server will
1168              not redirect it too http://foo/bar/ -- it will generate a
1169              directory listing containing links to bar/file1,
1170              bar/file2, etc.  Wget will lose because it saves this
1171              HTML listing to a file `bar', so it cannot create the
1172              directory.  To work around this, if the file of the same
1173              name exists, we just remove it and create the directory
1174              anyway.  */
1175           DEBUGP (("Removing %s because of directory danger!\n", t));
1176           unlink (t);
1177         }
1178     }
1179   res = make_directory (t);
1180   if (res != 0)
1181     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1182   xfree (t);
1183   return res;
1184 }
1185 \f
1186 /* Functions for constructing the file name out of URL components.  */
1187
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.

   The string is not kept zero-terminated; a caller that needs a
   terminator appends one explicitly.  */

struct growable {
  char *base;                   /* start of the buffer; NULL while empty */
  int size;                     /* size handed to DO_REALLOC by GROW;
                                   presumably the allocated capacity */
  int tail;                     /* offset from BASE of the next write */
};
1201
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   G is evaluated once; both arguments are parenthesized so that
   arbitrary expressions can be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string, i.e. a pointer to the
   place where the next character will be written.  */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1216
1217 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1218    terminated.  */
1219
1220 static void
1221 append_string (const char *str, struct growable *dest)
1222 {
1223   int l = strlen (str);
1224   GROW (dest, l);
1225   memcpy (TAIL (dest), str, l);
1226   TAIL_INCR (dest, l);
1227 }
1228
1229 /* Append CH to DEST.  For example, append_char (0, DEST)
1230    zero-terminates DEST.  */
1231
1232 static void
1233 append_char (char ch, struct growable *dest)
1234 {
1235   GROW (dest, 1);
1236   *TAIL (dest) = ch;
1237   TAIL_INCR (dest, 1);
1238 }
1239
/* Bit flags stored in filechr_table, one entry per character.  They
   are OR-ed together to form the mask given to FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};
1245
/* Nonzero if character C has any of the filechr_* bits in MASK set
   in filechr_table.  C is converted to unsigned char before being
   used as the index.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).
   It is indexed by the character's value as an unsigned char.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
/* The shorthands are only meant for the table above.  */
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1297
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names.  The choice is
   made at run time from opt.restrict_files_os.  */
#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there.  The choice is made at run
   time from opt.restrict_files_os.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1308
1309 /* Quote path element, characters in [b, e), as file name, and append
1310    the quoted string to DEST.  Each character is quoted as per
1311    file_unsafe_char and the corresponding table.
1312
1313    If ESCAPED is true, the path element is considered to be
1314    URL-escaped and will be unescaped prior to inspection.  */
1315
1316 static void
1317 append_uri_pathel (const char *b, const char *e, bool escaped,
1318                    struct growable *dest)
1319 {
1320   const char *p;
1321   int quoted, outlen;
1322
1323   int mask;
1324   if (opt.restrict_files_os == restrict_unix)
1325     mask = filechr_not_unix;
1326   else
1327     mask = filechr_not_windows;
1328   if (opt.restrict_files_ctrl)
1329     mask |= filechr_control;
1330
1331   /* Copy [b, e) to PATHEL and URL-unescape it. */
1332   if (escaped)
1333     {
1334       char *unescaped;
1335       BOUNDED_TO_ALLOCA (b, e, unescaped);
1336       url_unescape (unescaped);
1337       b = unescaped;
1338       e = unescaped + strlen (unescaped);
1339     }
1340
1341   /* Defang ".." when found as component of path.  Remember that path
1342      comes from the URL and might contain malicious input.  */
1343   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1344     {
1345       b = "%2E%2E";
1346       e = b + 6;
1347     }
1348
1349   /* Walk the PATHEL string and check how many characters we'll need
1350      to quote.  */
1351   quoted = 0;
1352   for (p = b; p < e; p++)
1353     if (FILE_CHAR_TEST (*p, mask))
1354       ++quoted;
1355
1356   /* Calculate the length of the output string.  e-b is the input
1357      string length.  Each quoted char introduces two additional
1358      characters in the string, hence 2*quoted.  */
1359   outlen = (e - b) + (2 * quoted);
1360   GROW (dest, outlen);
1361
1362   if (!quoted)
1363     {
1364       /* If there's nothing to quote, we can simply append the string
1365          without processing it again.  */
1366       memcpy (TAIL (dest), b, outlen);
1367     }
1368   else
1369     {
1370       char *q = TAIL (dest);
1371       for (p = b; p < e; p++)
1372         {
1373           if (!FILE_CHAR_TEST (*p, mask))
1374             *q++ = *p;
1375           else
1376             {
1377               unsigned char ch = *p;
1378               *q++ = '%';
1379               *q++ = XNUM_TO_DIGIT (ch >> 4);
1380               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1381             }
1382         }
1383       assert (q - TAIL (dest) == outlen);
1384     }
1385
1386   /* Perform inline case transformation if required.  */
1387   if (opt.restrict_files_case == restrict_lowercase
1388       || opt.restrict_files_case == restrict_uppercase)
1389     {
1390       char *q;
1391       for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1392         {
1393           if (opt.restrict_files_case == restrict_lowercase)
1394             *q = c_tolower (*q);
1395           else
1396             *q = c_toupper (*q);
1397         }
1398     }
1399
1400   TAIL_INCR (dest, outlen);
1401 }
1402
1403 /* Append to DEST the directory structure that corresponds the
1404    directory part of URL's path.  For example, if the URL is
1405    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1406
1407    Each path element ("dir1" and "dir2" in the above example) is
1408    examined, url-unescaped, and re-escaped as file name element.
1409
1410    Additionally, it cuts as many directories from the path as
1411    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1412    will produce "bar" for the above example.  For 2 or more, it will
1413    produce "".
1414
1415    Each component of the path is quoted for use as file name.  */
1416
1417 static void
1418 append_dir_structure (const struct url *u, struct growable *dest)
1419 {
1420   char *pathel, *next;
1421   int cut = opt.cut_dirs;
1422
1423   /* Go through the path components, de-URL-quote them, and quote them
1424      (if necessary) as file names.  */
1425
1426   pathel = u->path;
1427   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1428     {
1429       if (cut-- > 0)
1430         continue;
1431       if (pathel == next)
1432         /* Ignore empty pathels.  */
1433         continue;
1434
1435       if (dest->tail)
1436         append_char ('/', dest);
1437       append_uri_pathel (pathel, next, true, dest);
1438     }
1439 }
1440
1441 /* Return a unique file name that matches the given URL as good as
1442    possible.  Does not create directories on the file system.  */
1443
1444 char *
1445 url_file_name (const struct url *u)
1446 {
1447   struct growable fnres;        /* stands for "file name result" */
1448
1449   const char *u_file, *u_query;
1450   char *fname, *unique;
1451
1452   fnres.base = NULL;
1453   fnres.size = 0;
1454   fnres.tail = 0;
1455
1456   /* Start with the directory prefix, if specified. */
1457   if (opt.dir_prefix)
1458     append_string (opt.dir_prefix, &fnres);
1459
1460   /* If "dirstruct" is turned on (typically the case with -r), add
1461      the host and port (unless those have been turned off) and
1462      directory structure.  */
1463   if (opt.dirstruct)
1464     {
1465       if (opt.protocol_directories)
1466         {
1467           if (fnres.tail)
1468             append_char ('/', &fnres);
1469           append_string (supported_schemes[u->scheme].name, &fnres);
1470         }
1471       if (opt.add_hostdir)
1472         {
1473           if (fnres.tail)
1474             append_char ('/', &fnres);
1475           if (0 != strcmp (u->host, ".."))
1476             append_string (u->host, &fnres);
1477           else
1478             /* Host name can come from the network; malicious DNS may
1479                allow ".." to be resolved, causing us to write to
1480                "../<file>".  Defang such host names.  */
1481             append_string ("%2E%2E", &fnres);
1482           if (u->port != scheme_default_port (u->scheme))
1483             {
1484               char portstr[24];
1485               number_to_string (portstr, u->port);
1486               append_char (FN_PORT_SEP, &fnres);
1487               append_string (portstr, &fnres);
1488             }
1489         }
1490
1491       append_dir_structure (u, &fnres);
1492     }
1493
1494   /* Add the file name. */
1495   if (fnres.tail)
1496     append_char ('/', &fnres);
1497   u_file = *u->file ? u->file : "index.html";
1498   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1499
1500   /* Append "?query" to the file name. */
1501   u_query = u->query && *u->query ? u->query : NULL;
1502   if (u_query)
1503     {
1504       append_char (FN_QUERY_SEP, &fnres);
1505       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1506     }
1507
1508   /* Zero-terminate the file name. */
1509   append_char ('\0', &fnres);
1510
1511   fname = fnres.base;
1512
1513   /* Check the cases in which the unique extensions are not used:
1514      1) Clobbering is turned off (-nc).
1515      2) Retrieval with regetting.
1516      3) Timestamping is used.
1517      4) Hierarchy is built.
1518
1519      The exception is the case when file does exist and is a
1520      directory (see `mkalldirs' for explanation).  */
1521
1522   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1523       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1524     return fname;
1525
1526   unique = unique_name (fname, true);
1527   if (unique != fname)
1528     xfree (fname);
1529   return unique;
1530 }
1531 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.

   SCHEME only matters for a ".." that would climb above the start of
   the path: for SCHEME_FTP it is kept literally, for every other
   scheme it is dropped.  */

static bool
path_simplify (enum url_scheme scheme, char *path)
{
  char *h = path;               /* hare: read position */
  char *t = path;               /* tortoise: write position */
  char *beg = path;             /* T never retreats past this point */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./" (or a final "."): advance the hare only.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else if (scheme == SCHEME_FTP)
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  This violates RFC 3986; but we do it for FTP
                 anyway because there is otherwise no way to get at a
                 parent directory, when the FTP server drops us in a
                 non-root directory (which is not uncommon). */
              beg = t + 3;
              goto regular;
            }
          /* Reached when the tortoise retreated, or when there was
             nothing to retreat over and the scheme is not FTP: the
             ".." itself is skipped without being copied.  */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  Nothing
                 has been removed so far, so no copying is needed.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T differs from H exactly when something was dropped; only then
     does the shortened string need a new terminator.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1617 \f
1618 /* Return the length of URL's path.  Path is considered to be
1619    terminated by one or more of the ?query or ;params or #fragment,
1620    depending on the scheme.  */
1621
1622 static const char *
1623 path_end (const char *url)
1624 {
1625   enum url_scheme scheme = url_scheme (url);
1626   const char *seps;
1627   if (scheme == SCHEME_INVALID)
1628     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1629   /* +2 to ignore the first two separators ':' and '/' */
1630   seps = init_seps (scheme) + 2;
1631   return strpbrk_or_eos (url, seps);
1632 }
1633
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr,
   called with the length e - b.  NOTE(review): memrchr is not part of
   ISO C; presumably the platform or a project fallback provides it --
   confirm.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1637
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string made of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   The result is always freshly allocated.  The form of LINK selects
   what is kept from BASE:

     has a scheme    nothing; LINK is returned as is
     empty           all of BASE
     "?..."          BASE up to the end of its path
     "#..."          BASE up to its fragment
     "//..."         BASE up to its "//", if it has one
     "/..."          BASE up to the first slash after its "//"
     anything else   BASE up to and including its last slash

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_concat (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      return uri_merge_concat (base, end1 - base, link, linklength);
    }

  if (*link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* Look for first slash.  If it is a double slash, replace
         from that point, else replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert
        = (slash && *(slash + 1) == '/') ? slash : base;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      bool seen_slash_slash = false;

      /* We're looking for the first slash, but want to ignore a
         double slash, so after one "//" the search is repeated
         once from just behind it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where LINK will be inserted.  Keep
         in mind that LINK begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo" and "foo/bar" */
        /*            ^         ^        */
        start_insert = base;
      else if (slash)
        /* example: "http://something/" */
        /*                           ^  */
        start_insert = slash;
      else
        /* example: "http://foo" */
        /*                     ^ */
        start_insert = end;

      return uri_merge_concat (base, start_insert - base, link, linklength);
    }

  {
    /* LINK is a relative URL: we need to replace everything
       after last slash (possibly empty) with LINK.

       So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
       our result should be "whatever/foo/qux/xyzzy".  */
    bool need_explicit_slash = false;
    int span;
    char *merge;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host"  */
        /*                      ^  */
        /* The last slash belongs to "://", so BASE has no path.
           Keep all of BASE and reserve one more byte for the slash
           that must separate the host from LINK.  */
        start_insert = end + 1;
        need_explicit_slash = true;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = uri_merge_concat (base, span, link, linklength);
    if (need_explicit_slash)
      /* The byte copied from *END (a separator or the terminating
         zero of BASE) is overwritten with the slash.  */
      merge[span - 1] = '/';
    return merge;
  }
}
1828 \f
/* Copy the NUL-terminated string S to the location P points at (the
   terminating NUL itself is not copied) and advance P past the
   copied bytes.

   P must be a modifiable char pointer lvalue with room for at least
   strlen (S) bytes; it is evaluated more than once.  S is evaluated
   exactly once, so it may have side effects.  The temporaries carry
   deliberately unusual names so that an argument called `len' or
   `src' is not captured by the expansion.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)
1834
/* Placeholder written instead of the real password when the actual
   password is supposed to be hidden.  A fixed, generic string is used
   on purpose: unlike previous versions, it does not give away the
   number of characters in the password.  */
#define HIDDEN_PASSWORD "*password*"
1840
1841 /* Recreate the URL string from the data in URL.
1842
1843    If HIDE is true (as it is when we're calling this on a URL we plan
1844    to print, but not when calling it to canonicalize a URL for use
1845    within the program), password will be hidden.  Unsafe characters in
1846    the URL will be quoted.  */
1847
1848 char *
1849 url_string (const struct url *url, enum url_auth_mode auth_mode)
1850 {
1851   int size;
1852   char *result, *p;
1853   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1854
1855   int scheme_port = supported_schemes[url->scheme].default_port;
1856   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1857   int fplen = full_path_length (url);
1858
1859   bool brackets_around_host;
1860
1861   assert (scheme_str != NULL);
1862
1863   /* Make sure the user name and password are quoted. */
1864   if (url->user)
1865     {
1866       if (auth_mode != URL_AUTH_HIDE)
1867         {
1868           quoted_user = url_escape_allow_passthrough (url->user);
1869           if (url->passwd)
1870             {
1871               if (auth_mode == URL_AUTH_HIDE_PASSWD)
1872                 quoted_passwd = HIDDEN_PASSWORD;
1873               else
1874                 quoted_passwd = url_escape_allow_passthrough (url->passwd);
1875             }
1876         }
1877     }
1878
1879   /* In the unlikely event that the host name contains non-printable
1880      characters, quote it for displaying to the user.  */
1881   quoted_host = url_escape_allow_passthrough (url->host);
1882
1883   /* Undo the quoting of colons that URL escaping performs.  IPv6
1884      addresses may legally contain colons, and in that case must be
1885      placed in square brackets.  */
1886   if (quoted_host != url->host)
1887     unescape_single_char (quoted_host, ':');
1888   brackets_around_host = strchr (quoted_host, ':') != NULL;
1889
1890   size = (strlen (scheme_str)
1891           + strlen (quoted_host)
1892           + (brackets_around_host ? 2 : 0)
1893           + fplen
1894           + 1);
1895   if (url->port != scheme_port)
1896     size += 1 + numdigit (url->port);
1897   if (quoted_user)
1898     {
1899       size += 1 + strlen (quoted_user);
1900       if (quoted_passwd)
1901         size += 1 + strlen (quoted_passwd);
1902     }
1903
1904   p = result = xmalloc (size);
1905
1906   APPEND (p, scheme_str);
1907   if (quoted_user)
1908     {
1909       APPEND (p, quoted_user);
1910       if (quoted_passwd)
1911         {
1912           *p++ = ':';
1913           APPEND (p, quoted_passwd);
1914         }
1915       *p++ = '@';
1916     }
1917
1918   if (brackets_around_host)
1919     *p++ = '[';
1920   APPEND (p, quoted_host);
1921   if (brackets_around_host)
1922     *p++ = ']';
1923   if (url->port != scheme_port)
1924     {
1925       *p++ = ':';
1926       p = number_to_string (p, url->port);
1927     }
1928
1929   full_path_write (url, p);
1930   p += fplen;
1931   *p++ = '\0';
1932
1933   assert (p - result == size);
1934
1935   if (quoted_user && quoted_user != url->user)
1936     xfree (quoted_user);
1937   if (quoted_passwd && auth_mode == URL_AUTH_SHOW
1938       && quoted_passwd != url->passwd)
1939     xfree (quoted_passwd);
1940   if (quoted_host != url->host)
1941     xfree (quoted_host);
1942
1943   return result;
1944 }
1945 \f
1946 /* Return true if scheme a is similar to scheme b.
1947
1948    Schemes are similar if they are equal.  If SSL is supported, schemes
1949    are also similar if one is http (SCHEME_HTTP) and the other is https
1950    (SCHEME_HTTPS).  */
1951 bool
1952 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1953 {
1954   if (a == b)
1955     return true;
1956 #ifdef HAVE_SSL
1957   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1958       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1959     return true;
1960 #endif
1961   return false;
1962 }
1963 \f
/* Read one logical character from the start of the non-empty string
   STR, store it in *C and return the number of bytes of STR it
   occupies.

   A "%XY" sequence with two hexadecimal digits is decoded to the
   character it encodes and 3 is returned, unless that character
   satisfies URL_RESERVED_CHAR.  In that case, and also when '%' is
   not followed by two hexadecimal digits, the '%' itself is the
   result and 1 is returned, so the rest of the sequence is read as
   ordinary characters by subsequent calls.  Any other character is
   returned as is, with a length of 1.

   The return value is always 1 or 3.  An earlier "return 0" for a
   truncated escape could never execute, because a NUL byte is not a
   hexadecimal digit and such input is already handled as a literal
   '%'.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The && chain stops at the first non-hex byte, so nothing past the
     terminating NUL is ever read.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);
      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
      /* Escaped reserved character: leave the escape in place and
         fall through to hand back the literal '%'.  */
    }

  *c = str[0];
  return 1;
}
2001
/* Return true if the URL strings U1 and U2 are equal once each is
   read through getchar_from_escaped_string, i.e. with the %XY escapes
   that function decodes replaced by the characters they stand for.
   Characters are compared after mapping both through c_tolower.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left, *right;

  assert(u1 && u2);

  left = u1;
  right = u2;

  for (;;)
    {
      char left_ch, right_ch;
      int left_len, right_len;

      /* Stop as soon as either string is exhausted.  */
      if (*left == '\0' || *right == '\0')
        break;

      /* A zero length from the reader means it could not produce a
         character; stop without advancing.  */
      left_len = getchar_from_escaped_string (left, &left_ch);
      if (left_len == 0)
        break;
      right_len = getchar_from_escaped_string (right, &right_ch);
      if (right_len == 0)
        break;

      if (c_tolower (left_ch) != c_tolower (right_ch))
        break;

      left += left_len;
      right += right_len;
    }

  /* Equal only if both strings were consumed completely.  */
  return *left == '\0' && *right == '\0';
}
2024 \f
2025 #ifdef TESTING
2026 /* Debugging and testing support for path_simplify. */
2027
#if 0
/* Debug: run path_simplify on a copy of PATH and return the result in
   a new string, which the caller owns.  Useful for calling from the
   debugger.

   path_simplify takes the URL scheme as its first argument (see
   run_test below), so one has to be supplied here; SCHEME_HTTP is
   used as the default.  Edit the call if another scheme needs to be
   examined.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2039
2040 static const char *
2041 run_test (char *test, char *expected_result, enum url_scheme scheme,
2042           bool expected_change)
2043 {
2044   char *test_copy = xstrdup (test);
2045   bool modified = path_simplify (scheme, test_copy);
2046
2047   if (0 != strcmp (test_copy, expected_result))
2048     {
2049       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2050               test, expected_result, test_copy);
2051       mu_assert ("", 0);
2052     }
2053   if (modified != expected_change)
2054     {
2055       if (expected_change)
2056         printf ("Expected modification with path_simplify(\"%s\").\n",
2057                 test);
2058       else
2059         printf ("Expected no modification with path_simplify(\"%s\").\n",
2060                 test);
2061     }
2062   xfree (test_copy);
2063   mu_assert ("", modified == expected_change);
2064   return NULL;
2065 }
2066
2067 const char *
2068 test_path_simplify (void)
2069 {
2070   static struct {
2071     char *test, *result;
2072     enum url_scheme scheme;
2073     bool should_modify;
2074   } tests[] = {
2075     { "",                       "",             SCHEME_HTTP, false },
2076     { ".",                      "",             SCHEME_HTTP, true },
2077     { "./",                     "",             SCHEME_HTTP, true },
2078     { "..",                     "",             SCHEME_HTTP, true },
2079     { "../",                    "",             SCHEME_HTTP, true },
2080     { "..",                     "..",           SCHEME_FTP,  false },
2081     { "../",                    "../",          SCHEME_FTP,  false },
2082     { "foo",                    "foo",          SCHEME_HTTP, false },
2083     { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2084     { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2085     { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2086     { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2087     { "foo./",                  "foo./",        SCHEME_HTTP, false },
2088     { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2089     { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2090     { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2091     { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2092     { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2093     { "foo/..",                 "",             SCHEME_HTTP, true },
2094     { "foo/../..",              "",             SCHEME_HTTP, true },
2095     { "foo/../../..",           "",             SCHEME_HTTP, true },
2096     { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2097     { "foo/../..",              "..",           SCHEME_FTP,  true },
2098     { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2099     { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2100     { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2101     { "./a/../b",               "b",            SCHEME_HTTP, true }
2102   };
2103   int i;
2104
2105   for (i = 0; i < countof (tests); i++)
2106     {
2107       const char *message;
2108       char *test = tests[i].test;
2109       char *expected_result = tests[i].result;
2110       enum url_scheme scheme = tests[i].scheme;
2111       bool  expected_change = tests[i].should_modify;
2112       message = run_test (test, expected_result, scheme, expected_change);
2113       if (message) return message;
2114     }
2115   return NULL;
2116 }
2117
2118 const char *
2119 test_append_uri_pathel()
2120 {
2121   int i;
2122   struct {
2123     char *original_url;
2124     char *input;
2125     bool escaped;
2126     char *expected_result;
2127   } test_array[] = {
2128     { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2129   };
2130
2131   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2132     {
2133       struct growable dest;
2134       const char *p = test_array[i].input;
2135
2136       memset (&dest, 0, sizeof (dest));
2137
2138       append_string (test_array[i].original_url, &dest);
2139       append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2140       append_char ('\0', &dest);
2141
2142       mu_assert ("test_append_uri_pathel: wrong result",
2143                  strcmp (dest.base, test_array[i].expected_result) == 0);
2144     }
2145
2146   return NULL;
2147 }
2148
2149 const char*
2150 test_are_urls_equal()
2151 {
2152   int i;
2153   struct {
2154     char *url1;
2155     char *url2;
2156     bool expected_result;
2157   } test_array[] = {
2158     { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
2159     { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
2160     { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
2161     { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
2162     { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
2163     { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
2164   };
2165
2166   for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2167     {
2168       mu_assert ("test_are_urls_equal: wrong result",
2169                  are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
2170     }
2171
2172   return NULL;
2173 }
2174
2175 #endif /* TESTING */
2176
2177 /*
2178  * vim: et ts=2 sw=2
2179  */
2180