sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1996-2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "utils.h"
  43 #include "url.h"
  44 #include "host.h"  /* for is_valid_ipv6_address */
  45
  46 enum {
  47   scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  48   scm_has_params = 2,           /* whether scheme has ;params */
  49   scm_has_query = 4,            /* whether scheme has ?query */
  50   scm_has_fragment = 8          /* whether scheme has #fragment */
  51 };
  52
  53 struct scheme_data
  54 {
  55   /* Short name of the scheme, such as "http" or "ftp". */
  56   const char *name;
  57   /* Leading string that identifies the scheme, such as "https://". */
  58   const char *leading_string;
  59   /* Default port of the scheme when none is specified. */
  60   int default_port;
  61   /* Various flags. */
  62   int flags;
  63 };
  64
  65 /* Supported schemes: */
  66 static struct scheme_data supported_schemes[] =
  67 {
  68   { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  69 #ifdef HAVE_SSL
  70   { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  71 #endif
  72   { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  73
  74   /* SCHEME_INVALID */
  75   { NULL,       NULL,       -1,                 0 }
  76 };
  77
  78 /* Forward declarations: */
  79
  80 static bool path_simplify (char *);
  81 \f
  82 /* Support for escaping and unescaping of URL strings.  */
  83
  84 /* Table of "reserved" and "unsafe" characters.  Those terms are
  85    rfc1738-speak, as such largely obsoleted by rfc2396 and later
  86    specs, but the general idea remains.
  87
  88    A reserved character is the one that you can't decode without
  89    changing the meaning of the URL.  For example, you can't decode
  90    "/foo/%2f/bar" into "/foo///bar" because the number and contents of
  91    path components is different.  Non-reserved characters can be
  92    changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
  93    unsafe characters are loosely based on rfc1738, plus "$" and ",",
  94    as recommended by rfc2396, and minus "~", which is very frequently
  95    used (and sometimes unrecognized as %7E by broken servers).
  96
  97    An unsafe character is the one that should be encoded when URLs are
  98    placed in foreign environments.  E.g. space and newline are unsafe
  99    in HTTP contexts because HTTP uses them as separator and line
 100    terminator, so they must be encoded to %20 and %0A respectively.
 101    "*" is unsafe in shell context, etc.
 102
 103    We determine whether a character is unsafe through static table
 104    lookup.  This code assumes ASCII character set and 8-bit chars.  */
 105
 106 enum {
 107   /* rfc1738 reserved chars + "$" and ",".  */
 108   urlchr_reserved = 1,
 109
 110   /* rfc1738 unsafe chars, plus non-printables.  */
 111   urlchr_unsafe   = 2
 112 };
 113
 114 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 115 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 116 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 117
 118 /* Shorthands for the table: */
 119 #define R  urlchr_reserved
 120 #define U  urlchr_unsafe
 121 #define RU R|U
 122
 123 static const unsigned char urlchr_table[256] =
 124 {
 125   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 126   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 127   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 128   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 129   U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 130   0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 132   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 133  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 134   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 135   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 136   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 137   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 138   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 139   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 140   0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
 141
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 145   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 146
 147   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 148   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 149   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 150   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 151 };
 152 #undef R
 153 #undef U
 154 #undef RU
 155
 156 /* URL-unescape the string S.
 157
 158    This is done by transforming the sequences "%HH" to the character
 159    represented by the hexadecimal digits HH.  If % is not followed by
 160    two hexadecimal digits, it is inserted literally.
 161
 162    The transformation is done in place.  If you need the original
 163    string intact, make a copy before calling this function.  */
 164
 165 static void
 166 url_unescape (char *s)
 167 {
 168   char *t = s;                  /* t - tortoise */
 169   char *h = s;                  /* h - hare     */
 170
 171   for (; *h; h++, t++)
 172     {
 173       if (*h != '%')
 174         {
 175         copychar:
 176           *t = *h;
 177         }
 178       else
 179         {
 180           char c;
 181           /* Do nothing if '%' is not followed by two hex digits. */
 182           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 183             goto copychar;
 184           c = X2DIGITS_TO_NUM (h[1], h[2]);
 185           /* Don't unescape %00 because there is no way to insert it
 186              into a C string without effectively truncating it. */
 187           if (c == '\0')
 188             goto copychar;
 189           *t = c;
 190           h += 2;
 191         }
 192     }
 193   *t = '\0';
 194 }
 195
 196 /* The core of url_escape_* functions.  Escapes the characters that
 197    match the provided mask in urlchr_table.
 198
 199    If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
 200    returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
 201    allocated string will be returned in all cases.  */
 202
 203 static char *
 204 url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
 205 {
 206   const char *p1;
 207   char *p2, *newstr;
 208   int newlen;
 209   int addition = 0;
 210
 211   for (p1 = s; *p1; p1++)
 212     if (urlchr_test (*p1, mask))
 213       addition += 2;            /* Two more characters (hex digits) */
 214
 215   if (!addition)
 216     return allow_passthrough ? (char *)s : xstrdup (s);
 217
 218   newlen = (p1 - s) + addition;
 219   newstr = xmalloc (newlen + 1);
 220
 221   p1 = s;
 222   p2 = newstr;
 223   while (*p1)
 224     {
 225       /* Quote the characters that match the test mask. */
 226       if (urlchr_test (*p1, mask))
 227         {
 228           unsigned char c = *p1++;
 229           *p2++ = '%';
 230           *p2++ = XNUM_TO_DIGIT (c >> 4);
 231           *p2++ = XNUM_TO_DIGIT (c & 0xf);
 232         }
 233       else
 234         *p2++ = *p1++;
 235     }
 236   assert (p2 - newstr == newlen);
 237   *p2 = '\0';
 238
 239   return newstr;
 240 }
 241
 242 /* URL-escape the unsafe characters (see urlchr_table) in a given
 243    string, returning a freshly allocated string.  */
 244
 245 char *
 246 url_escape (const char *s)
 247 {
 248   return url_escape_1 (s, urlchr_unsafe, false);
 249 }
 250
 251 /* URL-escape the unsafe characters (see urlchr_table) in a given
 252    string.  If no characters are unsafe, S is returned.  */
 253
 254 static char *
 255 url_escape_allow_passthrough (const char *s)
 256 {
 257   return url_escape_1 (s, urlchr_unsafe, true);
 258 }
 259 \f
 260 /* Decide whether the char at position P needs to be encoded.  (It is
 261    not enough to pass a single char *P because the function may need
 262    to inspect the surrounding context.)
 263
 264    Return true if the char should be escaped as %XX, false otherwise.  */
 265
 266 static inline bool
 267 char_needs_escaping (const char *p)
 268 {
 269   if (*p == '%')
 270     {
 271       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 272         return false;
 273       else
 274         /* Garbled %.. sequence: encode `%'. */
 275         return true;
 276     }
 277   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 278     return true;
 279   else
 280     return false;
 281 }
 282
 283 /* Translate a %-escaped (but possibly non-conformant) input string S
 284    into a %-escaped (and conformant) output string.  If no characters
 285    are encoded or decoded, return the same string S; otherwise, return
 286    a freshly allocated string with the new contents.
 287
 288    After a URL has been run through this function, the protocols that
 289    use `%' as the quote character can use the resulting string as-is,
 290    while those that don't can use url_unescape to get to the intended
 291    data.  This function is stable: once the input is transformed,
 292    further transformations of the result yield the same output.
 293
 294    Let's discuss why this function is needed.
 295
 296    Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
 297    a raw space character would mess up the HTTP request, it needs to
 298    be quoted, like this:
 299
 300        GET /abc%20def HTTP/1.0
 301
 302    It would appear that the unsafe chars need to be quoted, for
 303    example with url_escape.  But what if we're requested to download
 304    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 305    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 306    part of URL syntax, "%20" is the correct way to denote a literal
 307    space on the Wget command line.  This leads to the conclusion that
 308    in that case Wget should not call url_escape, but leave the `%20'
 309    as is.  This is clearly contradictory, but it only gets worse.
 310
 311    What if the requested URI is `abc%20 def'?  If we call url_escape,
 312    we end up with `/abc%2520%20def', which is almost certainly not
 313    intended.  If we don't call url_escape, we are left with the
 314    embedded space and cannot complete the request.  What the user
 315    meant was for Wget to request `/abc%20%20def', and this is where
 316    reencode_escapes kicks in.
 317
 318    Wget used to solve this by first decoding %-quotes, and then
 319    encoding all the "unsafe" characters found in the resulting string.
 320    This was wrong because it didn't preserve certain URL special
 321    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 322    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 323    whether we considered `+' reserved (it is).  One of these results
 324    is inevitable because by the second step we would lose information
 325    on whether the `+' was originally encoded or not.  Both results
 326    were wrong because in CGI parameters + means space, while %2B means
 327    literal plus.  reencode_escapes correctly translates the above to
 328    "a%2B+b", i.e. returns the original string.
 329
 330    This function uses a modified version of the algorithm originally
 331    proposed by Anon Sricharoenchai:
 332
 333    * Encode all "unsafe" characters, except those that are also
 334      "reserved", to %XX.  See urlchr_table for which characters are
 335      unsafe and reserved.
 336
 337    * Encode the "%" characters not followed by two hex digits to
 338      "%25".
 339
 340    * Pass through all other characters and %XX escapes as-is.  (Up to
 341      Wget 1.10 this decoded %XX escapes corresponding to "safe"
 342      characters, but that was obtrusive and broke some servers.)
 343
 344    Anon's test case:
 345
 346    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 347    ->
 348    "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
 349
 350    Simpler test cases:
 351
 352    "foo bar"         -> "foo%20bar"
 353    "foo%20bar"       -> "foo%20bar"
 354    "foo %20bar"      -> "foo%20%20bar"
 355    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 356    "foo%25%20bar"    -> "foo%25%20bar"
 357    "foo%2%20bar"     -> "foo%252%20bar"
 358    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 359    "foo%2b+bar"      -> "foo%2b+bar"  */
 360
 361 static char *
 362 reencode_escapes (const char *s)
 363 {
 364   const char *p1;
 365   char *newstr, *p2;
 366   int oldlen, newlen;
 367
 368   int encode_count = 0;
 369
 370   /* First pass: inspect the string to see if there's anything to do,
 371      and to calculate the new length.  */
 372   for (p1 = s; *p1; p1++)
 373     if (char_needs_escaping (p1))
 374       ++encode_count;
 375
 376   if (!encode_count)
 377     /* The string is good as it is. */
 378     return (char *) s;          /* C const model sucks. */
 379
 380   oldlen = p1 - s;
 381   /* Each encoding adds two characters (hex digits).  */
 382   newlen = oldlen + 2 * encode_count;
 383   newstr = xmalloc (newlen + 1);
 384
 385   /* Second pass: copy the string to the destination address, encoding
 386      chars when needed.  */
 387   p1 = s;
 388   p2 = newstr;
 389
 390   while (*p1)
 391     if (char_needs_escaping (p1))
 392       {
 393         unsigned char c = *p1++;
 394         *p2++ = '%';
 395         *p2++ = XNUM_TO_DIGIT (c >> 4);
 396         *p2++ = XNUM_TO_DIGIT (c & 0xf);
 397       }
 398     else
 399       *p2++ = *p1++;
 400
 401   *p2 = '\0';
 402   assert (p2 - newstr == newlen);
 403   return newstr;
 404 }
 405 \f
 406 /* Returns the scheme type if the scheme is supported, or
 407    SCHEME_INVALID if not.  */
 408
 409 enum url_scheme
 410 url_scheme (const char *url)
 411 {
 412   int i;
 413
 414   for (i = 0; supported_schemes[i].leading_string; i++)
 415     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 416                           strlen (supported_schemes[i].leading_string)))
 417       {
 418         if (!(supported_schemes[i].flags & scm_disabled))
 419           return (enum url_scheme) i;
 420         else
 421           return SCHEME_INVALID;
 422       }
 423
 424   return SCHEME_INVALID;
 425 }
 426
 427 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 428
 429 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 430    currently implemented, it returns true if URL begins with
 431    [-+a-zA-Z0-9]+: .  */
 432
 433 bool
 434 url_has_scheme (const char *url)
 435 {
 436   const char *p = url;
 437
 438   /* The first char must be a scheme char. */
 439   if (!*p || !SCHEME_CHAR (*p))
 440     return false;
 441   ++p;
 442   /* Followed by 0 or more scheme chars. */
 443   while (*p && SCHEME_CHAR (*p))
 444     ++p;
 445   /* Terminated by ':'. */
 446   return *p == ':';
 447 }
 448
 449 int
 450 scheme_default_port (enum url_scheme scheme)
 451 {
 452   return supported_schemes[scheme].default_port;
 453 }
 454
 455 void
 456 scheme_disable (enum url_scheme scheme)
 457 {
 458   supported_schemes[scheme].flags |= scm_disabled;
 459 }
 460
 461 /* Skip the username and password, if present in the URL.  The
 462    function should *not* be called with the complete URL, but with the
 463    portion after the scheme.
 464
 465    If no username and password are found, return URL.  */
 466
 467 static const char *
 468 url_skip_credentials (const char *url)
 469 {
 470   /* Look for '@' that comes before terminators, such as '/', '?',
 471      '#', or ';'.  */
 472   const char *p = (const char *)strpbrk (url, "@/?#;");
 473   if (!p || *p != '@')
 474     return url;
 475   return p + 1;
 476 }
 477
 478 /* Parse credentials contained in [BEG, END).  The region is expected
 479    to have come from a URL and is unescaped.  */
 480
 481 static bool
 482 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 483 {
 484   char *colon;
 485   const char *userend;
 486
 487   if (beg == end)
 488     return false;               /* empty user name */
 489
 490   colon = memchr (beg, ':', end - beg);
 491   if (colon == beg)
 492     return false;               /* again empty user name */
 493
 494   if (colon)
 495     {
 496       *passwd = strdupdelim (colon + 1, end);
 497       userend = colon;
 498       url_unescape (*passwd);
 499     }
 500   else
 501     {
 502       *passwd = NULL;
 503       userend = end;
 504     }
 505   *user = strdupdelim (beg, userend);
 506   url_unescape (*user);
 507   return true;
 508 }
 509
 510 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 511    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 512
 513    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 514    www.foo.com[:port]            -> http://www.foo.com[:port]
 515
 516    FTP shorthands look like this:
 517
 518    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 519    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 520
 521    If the URL needs not or cannot be rewritten, return NULL.  */
 522
 523 char *
 524 rewrite_shorthand_url (const char *url)
 525 {
 526   const char *p;
 527
 528   if (url_scheme (url) != SCHEME_INVALID)
 529     return NULL;
 530
 531   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 532      latter Netscape.  */
 533   for (p = url; *p && *p != ':' && *p != '/'; p++)
 534     ;
 535
 536   if (p == url)
 537     return NULL;
 538
 539   /* If we're looking at "://", it means the URL uses a scheme we
 540      don't support, which may include "https" when compiled without
 541      SSL support.  Don't bogusly rewrite such URLs.  */
 542   if (p[0] == ':' && p[1] == '/' && p[2] == '/')
 543     return NULL;
 544
 545   if (*p == ':')
 546     {
 547       const char *pp;
 548       char *res;
 549       /* If the characters after the colon and before the next slash
 550          or end of string are all digits, it's HTTP.  */
 551       int digits = 0;
 552       for (pp = p + 1; ISDIGIT (*pp); pp++)
 553         ++digits;
 554       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 555         goto http;
 556
 557       /* Prepend "ftp://" to the entire URL... */
 558       res = xmalloc (6 + strlen (url) + 1);
 559       sprintf (res, "ftp://%s", url);
 560       /* ...and replace ':' with '/'. */
 561       res[6 + (p - url)] = '/';
 562       return res;
 563     }
 564   else
 565     {
 566       char *res;
 567     http:
 568       /* Just prepend "http://" to what we have. */
 569       res = xmalloc (7 + strlen (url) + 1);
 570       sprintf (res, "http://%s", url);
 571       return res;
 572     }
 573 }
 574 \f
 575 static void split_path (const char *, char **, char **);
 576
 577 /* Like strpbrk, with the exception that it returns the pointer to the
 578    terminating zero (end-of-string aka "eos") if no matching character
 579    is found.  */
 580
 581 static inline char *
 582 strpbrk_or_eos (const char *s, const char *accept)
 583 {
 584   char *p = strpbrk (s, accept);
 585   if (!p)
 586     p = strchr (s, '\0');
 587   return p;
 588 }
 589
 590 /* Turn STR into lowercase; return true if a character was actually
 591    changed. */
 592
 593 static bool
 594 lowercase_str (char *str)
 595 {
 596   bool changed = false;
 597   for (; *str; str++)
 598     if (ISUPPER (*str))
 599       {
 600         changed = true;
 601         *str = TOLOWER (*str);
 602       }
 603   return changed;
 604 }
 605
 606 static const char *
 607 init_seps (enum url_scheme scheme)
 608 {
 609   static char seps[8] = ":/";
 610   char *p = seps + 2;
 611   int flags = supported_schemes[scheme].flags;
 612
 613   if (flags & scm_has_params)
 614     *p++ = ';';
 615   if (flags & scm_has_query)
 616     *p++ = '?';
 617   if (flags & scm_has_fragment)
 618     *p++ = '#';
 619   *p++ = '\0';
 620   return seps;
 621 }
 622
 623 static const char *parse_errors[] = {
 624 #define PE_NO_ERROR                     0
 625   N_("No error"),
 626 #define PE_UNSUPPORTED_SCHEME           1
 627   N_("Unsupported scheme"),
 628 #define PE_INVALID_HOST_NAME            2
 629   N_("Invalid host name"),
 630 #define PE_BAD_PORT_NUMBER              3
 631   N_("Bad port number"),
 632 #define PE_INVALID_USER_NAME            4
 633   N_("Invalid user name"),
 634 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 635   N_("Unterminated IPv6 numeric address"),
 636 #define PE_IPV6_NOT_SUPPORTED           6
 637   N_("IPv6 addresses not supported"),
 638 #define PE_INVALID_IPV6_ADDRESS         7
 639   N_("Invalid IPv6 numeric address")
 640 };
 641
 642 /* Parse a URL.
 643
 644    Return a new struct url if successful, NULL on error.  In case of
 645    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 646    error code. */
 647 struct url *
 648 url_parse (const char *url, int *error)
 649 {
 650   struct url *u;
 651   const char *p;
 652   bool path_modified, host_modified;
 653
 654   enum url_scheme scheme;
 655   const char *seps;
 656
 657   const char *uname_b,     *uname_e;
 658   const char *host_b,      *host_e;
 659   const char *path_b,      *path_e;
 660   const char *params_b,    *params_e;
 661   const char *query_b,     *query_e;
 662   const char *fragment_b,  *fragment_e;
 663
 664   int port;
 665   char *user = NULL, *passwd = NULL;
 666
 667   char *url_encoded = NULL;
 668
 669   int error_code;
 670
 671   scheme = url_scheme (url);
 672   if (scheme == SCHEME_INVALID)
 673     {
 674       error_code = PE_UNSUPPORTED_SCHEME;
 675       goto error;
 676     }
 677
 678   url_encoded = reencode_escapes (url);
 679   p = url_encoded;
 680
 681   p += strlen (supported_schemes[scheme].leading_string);
 682   uname_b = p;
 683   p = url_skip_credentials (p);
 684   uname_e = p;
 685
 686   /* scheme://user:pass@host[:port]... */
 687   /*                    ^              */
 688
 689   /* We attempt to break down the URL into the components path,
 690      params, query, and fragment.  They are ordered like this:
 691
 692        scheme://host[:port][/path][;params][?query][#fragment]  */
 693
 694   path_b     = path_e     = NULL;
 695   params_b   = params_e   = NULL;
 696   query_b    = query_e    = NULL;
 697   fragment_b = fragment_e = NULL;
 698
 699   /* Initialize separators for optional parts of URL, depending on the
 700      scheme.  For example, FTP has params, and HTTP and HTTPS have
 701      query string and fragment. */
 702   seps = init_seps (scheme);
 703
 704   host_b = p;
 705
 706   if (*p == '[')
 707     {
 708       /* Handle IPv6 address inside square brackets.  Ideally we'd
 709          just look for the terminating ']', but rfc2732 mandates
 710          rejecting invalid IPv6 addresses.  */
 711
 712       /* The address begins after '['. */
 713       host_b = p + 1;
 714       host_e = strchr (host_b, ']');
 715
 716       if (!host_e)
 717         {
 718           error_code = PE_UNTERMINATED_IPV6_ADDRESS;
 719           goto error;
 720         }
 721
 722 #ifdef ENABLE_IPV6
 723       /* Check if the IPv6 address is valid. */
 724       if (!is_valid_ipv6_address(host_b, host_e))
 725         {
 726           error_code = PE_INVALID_IPV6_ADDRESS;
 727           goto error;
 728         }
 729
 730       /* Continue parsing after the closing ']'. */
 731       p = host_e + 1;
 732 #else
 733       error_code = PE_IPV6_NOT_SUPPORTED;
 734       goto error;
 735 #endif
 736
 737       /* The closing bracket must be followed by a separator or by the
 738          null char.  */
 739       /* http://[::1]... */
 740       /*             ^   */
 741       if (!strchr (seps, *p))
 742         {
 743           /* Trailing garbage after []-delimited IPv6 address. */
 744           error_code = PE_INVALID_HOST_NAME;
 745           goto error;
 746         }
 747     }
 748   else
 749     {
 750       p = strpbrk_or_eos (p, seps);
 751       host_e = p;
 752     }
 753   ++seps;                       /* advance to '/' */
 754
 755   if (host_b == host_e)
 756     {
 757       error_code = PE_INVALID_HOST_NAME;
 758       goto error;
 759     }
 760
 761   port = scheme_default_port (scheme);
 762   if (*p == ':')
 763     {
 764       const char *port_b, *port_e, *pp;
 765
 766       /* scheme://host:port/tralala */
 767       /*              ^             */
 768       ++p;
 769       port_b = p;
 770       p = strpbrk_or_eos (p, seps);
 771       port_e = p;
 772
 773       /* Allow empty port, as per rfc2396. */
 774       if (port_b != port_e)
 775         for (port = 0, pp = port_b; pp < port_e; pp++)
 776           {
 777             if (!ISDIGIT (*pp))
 778               {
 779                 /* http://host:12randomgarbage/blah */
 780                 /*               ^                  */
 781                 error_code = PE_BAD_PORT_NUMBER;
 782                 goto error;
 783               }
 784             port = 10 * port + (*pp - '0');
 785             /* Check for too large port numbers here, before we have
 786                a chance to overflow on bogus port values.  */
 787             if (port > 0xffff)
 788               {
 789                 error_code = PE_BAD_PORT_NUMBER;
 790                 goto error;
 791               }
 792           }
 793     }
 794   /* Advance to the first separator *after* '/' (either ';' or '?',
 795      depending on the scheme).  */
 796   ++seps;
 797
 798   /* Get the optional parts of URL, each part being delimited by
 799      current location and the position of the next separator.  */
 800 #define GET_URL_PART(sepchar, var) do {                         \
 801   if (*p == sepchar)                                            \
 802     var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
 803   ++seps;                                                       \
 804 } while (0)
 805
 806   GET_URL_PART ('/', path);
 807   if (supported_schemes[scheme].flags & scm_has_params)
 808     GET_URL_PART (';', params);
 809   if (supported_schemes[scheme].flags & scm_has_query)
 810     GET_URL_PART ('?', query);
 811   if (supported_schemes[scheme].flags & scm_has_fragment)
 812     GET_URL_PART ('#', fragment);
 813
 814 #undef GET_URL_PART
 815   assert (*p == 0);
 816
 817   if (uname_b != uname_e)
 818     {
 819       /* http://user:pass@host */
 820       /*        ^         ^    */
 821       /*     uname_b   uname_e */
 822       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 823         {
 824           error_code = PE_INVALID_USER_NAME;
 825           goto error;
 826         }
 827     }
 828
 829   u = xnew0 (struct url);
 830   u->scheme = scheme;
 831   u->host   = strdupdelim (host_b, host_e);
 832   u->port   = port;
 833   u->user   = user;
 834   u->passwd = passwd;
 835
 836   u->path = strdupdelim (path_b, path_e);
 837   path_modified = path_simplify (u->path);
 838   split_path (u->path, &u->dir, &u->file);
 839
 840   host_modified = lowercase_str (u->host);
 841
 842   /* Decode %HH sequences in host name.  This is important not so much
 843      to support %HH sequences in host names (which other browser
 844      don't), but to support binary characters (which will have been
 845      converted to %HH by reencode_escapes).  */
 846   if (strchr (u->host, '%'))
 847     {
 848       url_unescape (u->host);
 849       host_modified = true;
 850     }
 851
 852   if (params_b)
 853     u->params = strdupdelim (params_b, params_e);
 854   if (query_b)
 855     u->query = strdupdelim (query_b, query_e);
 856   if (fragment_b)
 857     u->fragment = strdupdelim (fragment_b, fragment_e);
 858
 859   if (path_modified || u->fragment || host_modified || path_b == path_e)
 860     {
 861       /* If we suspect that a transformation has rendered what
 862          url_string might return different from URL_ENCODED, rebuild
 863          u->url using url_string.  */
 864       u->url = url_string (u, false);
 865
 866       if (url_encoded != url)
 867         xfree ((char *) url_encoded);
 868     }
 869   else
 870     {
 871       if (url_encoded == url)
 872         u->url = xstrdup (url);
 873       else
 874         u->url = url_encoded;
 875     }
 876
 877   return u;
 878
 879  error:
 880   /* Cleanup in case of error: */
 881   if (url_encoded && url_encoded != url)
 882     xfree (url_encoded);
 883
 884   /* Transmit the error code to the caller, if the caller wants to
 885      know.  */
 886   if (error)
 887     *error = error_code;
 888   return NULL;
 889 }
 890
 891 /* Return the error message string from ERROR_CODE, which should have
 892    been retrieved from url_parse.  The error message is translated.  */
 893
 894 const char *
 895 url_error (int error_code)
 896 {
 897   assert (error_code >= 0 && error_code < countof (parse_errors));
 898   return _(parse_errors[error_code]);
 899 }
 900
 901 /* Split PATH into DIR and FILE.  PATH comes from the URL and is
 902    expected to be URL-escaped.
 903
 904    The path is split into directory (the part up to the last slash)
 905    and file (the part after the last slash), which are subsequently
 906    unescaped.  Examples:
 907
 908    PATH                 DIR           FILE
 909    "foo/bar/baz"        "foo/bar"     "baz"
 910    "foo/bar/"           "foo/bar"     ""
 911    "foo"                ""            "foo"
 912    "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
 913
 914    DIR and FILE are freshly allocated.  */
 915
 916 static void
 917 split_path (const char *path, char **dir, char **file)
 918 {
 919   char *last_slash = strrchr (path, '/');
 920   if (!last_slash)
 921     {
 922       *dir = xstrdup ("");
 923       *file = xstrdup (path);
 924     }
 925   else
 926     {
 927       *dir = strdupdelim (path, last_slash);
 928       *file = xstrdup (last_slash + 1);
 929     }
 930   url_unescape (*dir);
 931   url_unescape (*file);
 932 }
 933
 934 /* Note: URL's "full path" is the path with the query string and
 935    params appended.  The "fragment" (#foo) is intentionally ignored,
 936    but that might be changed.  For example, if the original URL was
 937    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 938    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 939
 940 /* Return the length of the full path, without the terminating
 941    zero.  */
 942
 943 static int
 944 full_path_length (const struct url *url)
 945 {
 946   int len = 0;
 947
 948 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 949
 950   FROB (path);
 951   FROB (params);
 952   FROB (query);
 953
 954 #undef FROB
 955
 956   return len;
 957 }
 958
 959 /* Write out the full path. */
 960
 961 static void
 962 full_path_write (const struct url *url, char *where)
 963 {
 964 #define FROB(el, chr) do {                      \
 965   char *f_el = url->el;                         \
 966   if (f_el) {                                   \
 967     int l = strlen (f_el);                      \
 968     *where++ = chr;                             \
 969     memcpy (where, f_el, l);                    \
 970     where += l;                                 \
 971   }                                             \
 972 } while (0)
 973
 974   FROB (path, '/');
 975   FROB (params, ';');
 976   FROB (query, '?');
 977
 978 #undef FROB
 979 }
 980
 981 /* Public function for getting the "full path".  E.g. if u->path is
 982    "foo/bar" and u->query is "param=value", full_path will be
 983    "/foo/bar?param=value". */
 984
 985 char *
 986 url_full_path (const struct url *url)
 987 {
 988   int length = full_path_length (url);
 989   char *full_path = xmalloc (length + 1);
 990
 991   full_path_write (url, full_path);
 992   full_path[length] = '\0';
 993
 994   return full_path;
 995 }
 996
 997 /* Unescape CHR in an otherwise escaped STR.  Used to selectively
 998    escaping of certain characters, such as "/" and ":".  Returns a
 999    count of unescaped chars.  */
1000
1001 static void
1002 unescape_single_char (char *str, char chr)
1003 {
1004   const char c1 = XNUM_TO_DIGIT (chr >> 4);
1005   const char c2 = XNUM_TO_DIGIT (chr & 0xf);
1006   char *h = str;                /* hare */
1007   char *t = str;                /* tortoise */
1008   for (; *h; h++, t++)
1009     {
1010       if (h[0] == '%' && h[1] == c1 && h[2] == c2)
1011         {
1012           *t = chr;
1013           h += 2;
1014         }
1015       else
1016         *t = *h;
1017     }
1018   *t = '\0';
1019 }
1020
1021 /* Escape unsafe and reserved characters, except for the slash
1022    characters.  */
1023
1024 static char *
1025 url_escape_dir (const char *dir)
1026 {
1027   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1028   if (newdir == dir)
1029     return (char *)dir;
1030
1031   unescape_single_char (newdir, '/');
1032   return newdir;
1033 }
1034
1035 /* Sync u->path and u->url with u->dir and u->file.  Called after
1036    u->file or u->dir have been changed, typically by the FTP code.  */
1037
1038 static void
1039 sync_path (struct url *u)
1040 {
1041   char *newpath, *efile, *edir;
1042
1043   xfree (u->path);
1044
1045   /* u->dir and u->file are not escaped.  URL-escape them before
1046      reassembling them into u->path.  That way, if they contain
1047      separators like '?' or even if u->file contains slashes, the
1048      path will be correctly assembled.  (u->file can contain slashes
1049      if the URL specifies it with %2f, or if an FTP server returns
1050      it.)  */
1051   edir = url_escape_dir (u->dir);
1052   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1053
1054   if (!*edir)
1055     newpath = xstrdup (efile);
1056   else
1057     {
1058       int dirlen = strlen (edir);
1059       int filelen = strlen (efile);
1060
1061       /* Copy "DIR/FILE" to newpath. */
1062       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1063       memcpy (p, edir, dirlen);
1064       p += dirlen;
1065       *p++ = '/';
1066       memcpy (p, efile, filelen);
1067       p += filelen;
1068       *p = '\0';
1069     }
1070
1071   u->path = newpath;
1072
1073   if (edir != u->dir)
1074     xfree (edir);
1075   if (efile != u->file)
1076     xfree (efile);
1077
1078   /* Regenerate u->url as well.  */
1079   xfree (u->url);
1080   u->url = url_string (u, false);
1081 }
1082
1083 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1084    This way we can sync u->path and u->url when they get changed.  */
1085
1086 void
1087 url_set_dir (struct url *url, const char *newdir)
1088 {
1089   xfree (url->dir);
1090   url->dir = xstrdup (newdir);
1091   sync_path (url);
1092 }
1093
1094 void
1095 url_set_file (struct url *url, const char *newfile)
1096 {
1097   xfree (url->file);
1098   url->file = xstrdup (newfile);
1099   sync_path (url);
1100 }
1101
1102 void
1103 url_free (struct url *url)
1104 {
1105   xfree (url->host);
1106   xfree (url->path);
1107   xfree (url->url);
1108
1109   xfree_null (url->params);
1110   xfree_null (url->query);
1111   xfree_null (url->fragment);
1112   xfree_null (url->user);
1113   xfree_null (url->passwd);
1114
1115   xfree (url->dir);
1116   xfree (url->file);
1117
1118   xfree (url);
1119 }
1120 \f
1121 /* Create all the necessary directories for PATH (a file).  Calls
1122    make_directory internally.  */
1123 int
1124 mkalldirs (const char *path)
1125 {
1126   const char *p;
1127   char *t;
1128   struct_stat st;
1129   int res;
1130
1131   p = path + strlen (path);
1132   for (; *p != '/' && p != path; p--)
1133     ;
1134
1135   /* Don't create if it's just a file.  */
1136   if ((p == path) && (*p != '/'))
1137     return 0;
1138   t = strdupdelim (path, p);
1139
1140   /* Check whether the directory exists.  */
1141   if ((stat (t, &st) == 0))
1142     {
1143       if (S_ISDIR (st.st_mode))
1144         {
1145           xfree (t);
1146           return 0;
1147         }
1148       else
1149         {
1150           /* If the dir exists as a file name, remove it first.  This
1151              is *only* for Wget to work with buggy old CERN http
1152              servers.  Here is the scenario: When Wget tries to
1153              retrieve a directory without a slash, e.g.
1154              http://foo/bar (bar being a directory), CERN server will
1155              not redirect it too http://foo/bar/ -- it will generate a
1156              directory listing containing links to bar/file1,
1157              bar/file2, etc.  Wget will lose because it saves this
1158              HTML listing to a file `bar', so it cannot create the
1159              directory.  To work around this, if the file of the same
1160              name exists, we just remove it and create the directory
1161              anyway.  */
1162           DEBUGP (("Removing %s because of directory danger!\n", t));
1163           unlink (t);
1164         }
1165     }
1166   res = make_directory (t);
1167   if (res != 0)
1168     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1169   xfree (t);
1170   return res;
1171 }
1172 \f
1173 /* Functions for constructing the file name out of URL components.  */
1174
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the string buffer */
  int size;                     /* allocated size, handed to DO_REALLOC */
  int tail;                     /* offset in BASE of the next free byte */
};

/* Ensure that the string G can accept APPEND_SIZE more characters
   past the current TAIL position.  If necessary, this will grow the
   string and update its allocated size.  If the string is already
   large enough to take TAIL+APPEND_SIZE characters, this does
   nothing.  The arguments are parenthesized so that any expression
   can be passed safely.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string R, as a pointer. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position of R by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1203
1204 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1205    terminated.  */
1206
1207 static void
1208 append_string (const char *str, struct growable *dest)
1209 {
1210   int l = strlen (str);
1211   GROW (dest, l);
1212   memcpy (TAIL (dest), str, l);
1213   TAIL_INCR (dest, l);
1214 }
1215
1216 /* Append CH to DEST.  For example, append_char (0, DEST)
1217    zero-terminates DEST.  */
1218
1219 static void
1220 append_char (char ch, struct growable *dest)
1221 {
1222   GROW (dest, 1);
1223   *TAIL (dest) = ch;
1224   TAIL_INCR (dest, 1);
1225 }
1226
/* Flags describing why a character is unsuitable in a file name.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C carries any of the flags in MASK.  C is
   converted to unsigned char before it is used as table index.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' is flagged W because it is one of the characters unusable on
   Windows, and DEL is flagged C because it is a control character
   just like 0-31.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1284
/* FN_PORT_SEP separates host and port in file names built for
   non-standard port numbers.  Windows can't handle ':' in file names,
   so '+' is used when file names are restricted for Windows; in all
   other cases it is ':', as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')
1290
/* FN_QUERY_SEP separates the file name from the URL query.  Windows
   cannot handle '?' as part of a file name, so '@' is used when file
   names are restricted for Windows; in all other cases it is '?'.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1295
1296 /* Quote path element, characters in [b, e), as file name, and append
1297    the quoted string to DEST.  Each character is quoted as per
1298    file_unsafe_char and the corresponding table.
1299
1300    If ESCAPED is true, the path element is considered to be
1301    URL-escaped and will be unescaped prior to inspection.  */
1302
1303 static void
1304 append_uri_pathel (const char *b, const char *e, bool escaped,
1305                    struct growable *dest)
1306 {
1307   const char *p;
1308   int quoted, outlen;
1309
1310   int mask;
1311   if (opt.restrict_files_os == restrict_unix)
1312     mask = filechr_not_unix;
1313   else
1314     mask = filechr_not_windows;
1315   if (opt.restrict_files_ctrl)
1316     mask |= filechr_control;
1317
1318   /* Copy [b, e) to PATHEL and URL-unescape it. */
1319   if (escaped)
1320     {
1321       char *unescaped;
1322       BOUNDED_TO_ALLOCA (b, e, unescaped);
1323       url_unescape (unescaped);
1324       b = unescaped;
1325       e = unescaped + strlen (unescaped);
1326     }
1327
1328   /* Defang ".." when found as component of path.  Remember that path
1329      comes from the URL and might contain malicious input.  */
1330   if (e - b == 2 && b[0] == '.' && b[1] == '.')
1331     {
1332       b = "%2E%2E";
1333       e = b + 6;
1334     }
1335
1336   /* Walk the PATHEL string and check how many characters we'll need
1337      to quote.  */
1338   quoted = 0;
1339   for (p = b; p < e; p++)
1340     if (FILE_CHAR_TEST (*p, mask))
1341       ++quoted;
1342
1343   /* Calculate the length of the output string.  e-b is the input
1344      string length.  Each quoted char introduces two additional
1345      characters in the string, hence 2*quoted.  */
1346   outlen = (e - b) + (2 * quoted);
1347   GROW (dest, outlen);
1348
1349   if (!quoted)
1350     {
1351       /* If there's nothing to quote, we can simply append the string
1352          without processing it again.  */
1353       memcpy (TAIL (dest), b, outlen);
1354     }
1355   else
1356     {
1357       char *q = TAIL (dest);
1358       for (p = b; p < e; p++)
1359         {
1360           if (!FILE_CHAR_TEST (*p, mask))
1361             *q++ = *p;
1362           else
1363             {
1364               unsigned char ch = *p;
1365               *q++ = '%';
1366               *q++ = XNUM_TO_DIGIT (ch >> 4);
1367               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1368             }
1369         }
1370       assert (q - TAIL (dest) == outlen);
1371     }
1372   TAIL_INCR (dest, outlen);
1373 }
1374
1375 /* Append to DEST the directory structure that corresponds the
1376    directory part of URL's path.  For example, if the URL is
1377    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1378
1379    Each path element ("dir1" and "dir2" in the above example) is
1380    examined, url-unescaped, and re-escaped as file name element.
1381
1382    Additionally, it cuts as many directories from the path as
1383    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1384    will produce "bar" for the above example.  For 2 or more, it will
1385    produce "".
1386
1387    Each component of the path is quoted for use as file name.  */
1388
1389 static void
1390 append_dir_structure (const struct url *u, struct growable *dest)
1391 {
1392   char *pathel, *next;
1393   int cut = opt.cut_dirs;
1394
1395   /* Go through the path components, de-URL-quote them, and quote them
1396      (if necessary) as file names.  */
1397
1398   pathel = u->path;
1399   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1400     {
1401       if (cut-- > 0)
1402         continue;
1403       if (pathel == next)
1404         /* Ignore empty pathels.  */
1405         continue;
1406
1407       if (dest->tail)
1408         append_char ('/', dest);
1409       append_uri_pathel (pathel, next, true, dest);
1410     }
1411 }
1412
1413 /* Return a unique file name that matches the given URL as good as
1414    possible.  Does not create directories on the file system.  */
1415
1416 char *
1417 url_file_name (const struct url *u)
1418 {
1419   struct growable fnres;        /* stands for "file name result" */
1420
1421   const char *u_file, *u_query;
1422   char *fname, *unique;
1423
1424   fnres.base = NULL;
1425   fnres.size = 0;
1426   fnres.tail = 0;
1427
1428   /* Start with the directory prefix, if specified. */
1429   if (opt.dir_prefix)
1430     append_string (opt.dir_prefix, &fnres);
1431
1432   /* If "dirstruct" is turned on (typically the case with -r), add
1433      the host and port (unless those have been turned off) and
1434      directory structure.  */
1435   if (opt.dirstruct)
1436     {
1437       if (opt.protocol_directories)
1438         {
1439           if (fnres.tail)
1440             append_char ('/', &fnres);
1441           append_string (supported_schemes[u->scheme].name, &fnres);
1442         }
1443       if (opt.add_hostdir)
1444         {
1445           if (fnres.tail)
1446             append_char ('/', &fnres);
1447           if (0 != strcmp (u->host, ".."))
1448             append_string (u->host, &fnres);
1449           else
1450             /* Host name can come from the network; malicious DNS may
1451                allow ".." to be resolved, causing us to write to
1452                "../<file>".  Defang such host names.  */
1453             append_string ("%2E%2E", &fnres);
1454           if (u->port != scheme_default_port (u->scheme))
1455             {
1456               char portstr[24];
1457               number_to_string (portstr, u->port);
1458               append_char (FN_PORT_SEP, &fnres);
1459               append_string (portstr, &fnres);
1460             }
1461         }
1462
1463       append_dir_structure (u, &fnres);
1464     }
1465
1466   /* Add the file name. */
1467   if (fnres.tail)
1468     append_char ('/', &fnres);
1469   u_file = *u->file ? u->file : "index.html";
1470   append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
1471
1472   /* Append "?query" to the file name. */
1473   u_query = u->query && *u->query ? u->query : NULL;
1474   if (u_query)
1475     {
1476       append_char (FN_QUERY_SEP, &fnres);
1477       append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
1478     }
1479
1480   /* Zero-terminate the file name. */
1481   append_char ('\0', &fnres);
1482
1483   fname = fnres.base;
1484
1485   /* Check the cases in which the unique extensions are not used:
1486      1) Clobbering is turned off (-nc).
1487      2) Retrieval with regetting.
1488      3) Timestamping is used.
1489      4) Hierarchy is built.
1490
1491      The exception is the case when file does exist and is a
1492      directory (see `mkalldirs' for explanation).  */
1493
1494   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1495       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1496     return fname;
1497
1498   unique = unique_name (fname, true);
1499   if (unique != fname)
1500     xfree (fname);
1501   return unique;
1502 }
1503 \f
/* Resolve the "." and ".." elements of PATH by modifying PATH in
   place.  Returns true if PATH has been modified, false otherwise; a
   PATH that needs no change is not written to at all.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  Path elements that
   consist only of "." are removed, and ".." means "back up one
   element".  A ".." that has nothing left to back up over is kept
   literally.  Single leading and trailing slashes are preserved.

   For example, "a/b/c/./../d/.." will yield "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.  */

static bool
path_simplify (char *path)
{
  char *src = path;             /* reads the original path */
  char *dst = path;             /* writes the simplified path */
  char *low = path;             /* DST may not retreat below this */
  char *const limit = path + strlen (path);
  bool changed;

  while (src < limit)
    {
      /* SRC is at the beginning of a path element here.  */
      const bool dot = (src[0] == '.'
                        && (src[1] == '/' || src[1] == '\0'));
      const bool dotdot = (src[0] == '.' && src[1] == '.'
                           && (src[2] == '/' || src[2] == '\0'));

      if (dot)
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (dotdot)
        {
          if (dst > low)
            {
              /* Retreat DST to the beginning of the previous path
                 element, but not past LOW.  */
              do
                --dst;
              while (dst > low && dst[-1] != '/');
              src += 3;
              continue;
            }

          /* Nothing to back up over: the "../" is kept literally
             below, and LOW moves past it so that a later ".." does
             not remove it.  */
          low = dst + 3;
        }

      /* A regular element (or a literal ".."): advance over it up to
         and including the following slash.  Bytes are stored only
         once DST has fallen behind SRC.  */
      while (src < limit && *src != '/')
        {
          if (dst != src)
            *dst = *src;
          ++dst, ++src;
        }
      if (src < limit)
        {
          if (dst != src)
            *dst = *src;
          ++dst, ++src;
        }
    }

  changed = dst != src;
  if (changed)
    *dst = '\0';

  return changed;
}
1586 \f
1587 /* Return the length of URL's path.  Path is considered to be
1588    terminated by one or more of the ?query or ;params or #fragment,
1589    depending on the scheme.  */
1590
1591 static const char *
1592 path_end (const char *url)
1593 {
1594   enum url_scheme scheme = url_scheme (url);
1595   const char *seps;
1596   if (scheme == SCHEME_INVALID)
1597     scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1598   /* +2 to ignore the first two separators ':' and '/' */
1599   seps = init_seps (scheme) + 2;
1600   return strpbrk_or_eos (url, seps);
1601 }
1602
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  The character at E itself is outside
   the range and is never examined, while the one at B is.  We might
   want to use memrchr (a GNU extension) under GNU libc.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before looking, so that the scan covers e-1 down to b
     inclusive.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1615
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string made of the first SPAN bytes of BASE followed by the
   LINKLENGTH bytes of LINK.  */

static char *
uri_merge_concat (const char *base, int span,
                  const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);

  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI, which is always
   freshly allocated.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complicates the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with a scheme of its own is already complete.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  switch (*link)
    {
    case '\0':
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);

    case '?':
      /* LINK points to the same location, but changes the query
         string, so everything from END on is replaced.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_concat (base, end - base, link, linklength);

    case '#':
      {
        /* LINK only changes the fragment: keep BASE up to its own
           '#', or all of it when it has none.  */
        /* uri_merge("path",         "#new") -> "path#new"     */
        /* uri_merge("path#foo",     "#new") -> "path#new"     */
        /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
        /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
        const char *hash = strchr (base, '#');
        if (!hash)
          hash = base + strlen (base);
        return uri_merge_concat (base, hash - base, link, linklength);
      }

    case '/':
      if (link[1] == '/')
        {
          /* LINK begins with "//" and so is a net path: we need to
             replace everything after (and including) the double
             slash with LINK. */

          /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
          /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
          /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

          /* If the first slash of BASE starts a double slash, LINK
             goes there; otherwise it replaces BASE from the
             beginning.  */
          const char *slash = memchr (base, '/', end - base);
          const char *start_insert = base;

          if (slash && slash[1] == '/')
            start_insert = slash;

          return uri_merge_concat (base, start_insert - base,
                                   link, linklength);
        }
      else
        {
          /* LINK is an absolute path: we need to replace everything
             after (and including) the FIRST slash with LINK.

             So, if BASE is "http://host/whatever/foo/bar", and LINK
             is "/qux/xyzzy", our result should be
             "http://host/qux/xyzzy".  */
          bool seen_slash_slash = false;
          const char *start_insert;
          const char *slash = memchr (base, '/', end - base);

          /* We're looking for the first slash, but want to ignore a
             double slash: when the first slash found begins one,
             search again right after it.  */
          if (slash && slash[1] == '/')
            {
              const char *pos = slash + 2;
              seen_slash_slash = true;
              slash = memchr (pos, '/', end - pos);
            }

          /* SLASH is now the first / after "//", or the first slash
             altogether.  Keep in mind that LINK begins with '/'.  */
          if (!seen_slash_slash)
            /* examples: "foo" and "foo/bar" -- replace all of BASE */
            start_insert = base;
          else if (slash)
            /* example: "http://something/" -- insert at that slash */
            start_insert = slash;
          else
            /* example: "http://foo" -- insert at END */
            start_insert = end;

          return uri_merge_concat (base, start_insert - base,
                                   link, linklength);
        }

    default:
      {
        /* LINK is a relative URL: we need to replace everything
           after last slash (possibly empty) with LINK.

           So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
           our result should be "whatever/foo/qux/xyzzy".  */
        const char *last_slash = find_last_char (base, end, '/');
        const char *start_insert;
        bool need_explicit_slash = false;
        char *merge;

        if (!last_slash)
          /* No slash found at all.  Replace what we have with LINK. */
          start_insert = base;
        else if (last_slash >= base + 2
                 && last_slash[-2] == ':' && last_slash[-1] == '/')
          {
            /* example: "http://host" -- the last slash belongs to
               "://".  Copy BASE through the byte at END and turn
               that byte into the slash that is missing.  */
            start_insert = end + 1;
            need_explicit_slash = true;
          }
        else
          /* example: "whatever/foo/bar" -- insert after the last
             slash.  */
          start_insert = last_slash + 1;

        merge = uri_merge_concat (base, start_insert - base,
                                  link, linklength);
        if (need_explicit_slash)
          merge[start_insert - base - 1] = '/';
        return merge;
      }
    }
}
1806 \f
/* Copy the string S to the position P and advance P past the copied
   characters.  No terminating zero is written.  P must be a
   modifiable pointer; S is evaluated exactly once, so it may be an
   expression with side effects.  The locals carry a trailing
   underscore to stay clear of identifiers used by the caller.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  int append_len_ = strlen (append_src_);       \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)
1812
/* Use this instead of the password when the actual password is
   supposed to be hidden.  We intentionally use a generic string
   rather than one that gives away the number of characters in the
   password, as previous versions did.  */
1817 #define HIDDEN_PASSWORD "*password*"
1818
1819 /* Recreate the URL string from the data in URL.
1820
1821    If HIDE is true (as it is when we're calling this on a URL we plan
1822    to print, but not when calling it to canonicalize a URL for use
1823    within the program), password will be hidden.  Unsafe characters in
1824    the URL will be quoted.  */
1825
1826 char *
1827 url_string (const struct url *url, bool hide_password)
1828 {
1829   int size;
1830   char *result, *p;
1831   char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1832
1833   int scheme_port = supported_schemes[url->scheme].default_port;
1834   const char *scheme_str = supported_schemes[url->scheme].leading_string;
1835   int fplen = full_path_length (url);
1836
1837   bool brackets_around_host;
1838
1839   assert (scheme_str != NULL);
1840
1841   /* Make sure the user name and password are quoted. */
1842   if (url->user)
1843     {
1844       quoted_user = url_escape_allow_passthrough (url->user);
1845       if (url->passwd)
1846         {
1847           if (hide_password)
1848             quoted_passwd = HIDDEN_PASSWORD;
1849           else
1850             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1851         }
1852     }
1853
1854   /* In the unlikely event that the host name contains non-printable
1855      characters, quote it for displaying to the user.  */
1856   quoted_host = url_escape_allow_passthrough (url->host);
1857
1858   /* Undo the quoting of colons that URL escaping performs.  IPv6
1859      addresses may legally contain colons, and in that case must be
1860      placed in square brackets.  */
1861   if (quoted_host != url->host)
1862     unescape_single_char (quoted_host, ':');
1863   brackets_around_host = strchr (quoted_host, ':') != NULL;
1864
1865   size = (strlen (scheme_str)
1866           + strlen (quoted_host)
1867           + (brackets_around_host ? 2 : 0)
1868           + fplen
1869           + 1);
1870   if (url->port != scheme_port)
1871     size += 1 + numdigit (url->port);
1872   if (quoted_user)
1873     {
1874       size += 1 + strlen (quoted_user);
1875       if (quoted_passwd)
1876         size += 1 + strlen (quoted_passwd);
1877     }
1878
1879   p = result = xmalloc (size);
1880
1881   APPEND (p, scheme_str);
1882   if (quoted_user)
1883     {
1884       APPEND (p, quoted_user);
1885       if (quoted_passwd)
1886         {
1887           *p++ = ':';
1888           APPEND (p, quoted_passwd);
1889         }
1890       *p++ = '@';
1891     }
1892
1893   if (brackets_around_host)
1894     *p++ = '[';
1895   APPEND (p, quoted_host);
1896   if (brackets_around_host)
1897     *p++ = ']';
1898   if (url->port != scheme_port)
1899     {
1900       *p++ = ':';
1901       p = number_to_string (p, url->port);
1902     }
1903
1904   full_path_write (url, p);
1905   p += fplen;
1906   *p++ = '\0';
1907
1908   assert (p - result == size);
1909
1910   if (quoted_user && quoted_user != url->user)
1911     xfree (quoted_user);
1912   if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
1913     xfree (quoted_passwd);
1914   if (quoted_host != url->host)
1915     xfree (quoted_host);
1916
1917   return result;
1918 }
1919 \f
1920 /* Return true if scheme a is similar to scheme b.
1921
1922    Schemes are similar if they are equal.  If SSL is supported, schemes
1923    are also similar if one is http (SCHEME_HTTP) and the other is https
1924    (SCHEME_HTTPS).  */
1925 bool
1926 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
1927 {
1928   if (a == b)
1929     return true;
1930 #ifdef HAVE_SSL
1931   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
1932       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
1933     return true;
1934 #endif
1935   return false;
1936 }
1937 \f
1938 #if 0
1939 /* Debugging and testing support for path_simplify. */
1940
/* Debugger helper: duplicate PATH with xstrdup, run path_simplify on
   the duplicate and hand the duplicate back, leaving PATH itself
   untouched.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
1950
/* Run path_simplify on a private copy of TEST and print a diagnostic
   if the simplified string differs from EXPECTED_RESULT, or if the
   value returned by path_simplify does not match EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, bool expected_change)
{
  char *scratch = xstrdup (test);
  bool changed = path_simplify (scratch);

  /* Check the simplified text first...  */
  if (strcmp (scratch, expected_result) != 0)
    printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
            test, expected_result, scratch);

  /* ...then whether a modification was reported as expected.  */
  if (expected_change && !changed)
    printf ("Expected modification with path_simplify(\"%s\").\n",
            test);
  else if (!expected_change && changed)
    printf ("Expected no modification with path_simplify(\"%s\").\n",
            test);

  xfree (scratch);
}
1973
/* Feed a fixed table of inputs to run_test.  Each entry gives the
   input path, the expected simplified path, and whether
   path_simplify is expected to report a modification.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    bool should_modify;
  } tests[] = {
    { "",                       "",             false },
    { ".",                      "",             true },
    { "./",                     "",             true },
    { "..",                     "..",           false },
    { "../",                    "../",          false },
    { "foo",                    "foo",          false },
    { "foo/bar",                "foo/bar",      false },
    { "foo///bar",              "foo///bar",    false },
    { "foo/.",                  "foo/",         true },
    { "foo/./",                 "foo/",         true },
    { "foo./",                  "foo./",        false },
    { "foo/../bar",             "bar",          true },
    { "foo/../bar/",            "bar/",         true },
    { "foo/bar/..",             "foo/",         true },
    { "foo/bar/../x",           "foo/x",        true },
    { "foo/bar/../x/",          "foo/x/",       true },
    { "foo/..",                 "",             true },
    { "foo/../..",              "..",           true },
    { "foo/../../..",           "../..",        true },
    { "foo/../../bar/../../baz", "../../baz",   true },
    { "a/b/../../c",            "c",            true },
    { "./a/../b",               "b",            true }
  };
  int idx = 0;

  /* Walk the table in order, one run_test call per entry.  */
  while (idx < countof (tests))
    {
      run_test (tests[idx].test, tests[idx].result,
                tests[idx].should_modify);
      ++idx;
    }
}
2014 #endif