sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or (at
  10 your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget; if not, write to the Free Software
  19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20
  21 In addition, as a special exception, the Free Software Foundation
  22 gives permission to link the code of its release of Wget with the
  23 OpenSSL project's "OpenSSL" library (or with modified versions of it
  24 that use the same license as the "OpenSSL" library), and distribute
  25 the linked executables.  You must obey the GNU General Public License
  26 in all respects for all of the code used other than "OpenSSL".  If you
  27 modify this file, you may extend this exception to your version of the
  28 file, but you are not obligated to do so.  If you do not wish to do
  29 so, delete this exception statement from your version.  */
  30
  31 #include <config.h>
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #ifdef HAVE_STRING_H
  36 # include <string.h>
  37 #else
  38 # include <strings.h>
  39 #endif
  40 #include <sys/types.h>
  41 #ifdef HAVE_UNISTD_H
  42 # include <unistd.h>
  43 #endif
  44 #include <errno.h>
  45 #include <assert.h>
  46
  47 #include "wget.h"
  48 #include "utils.h"
  49 #include "url.h"
  50
  51 #ifndef errno
  52 extern int errno;
  53 #endif
  54
  55 struct scheme_data          /* one record per URL scheme known to this file */
  56 {
  57   char *leading_string;     /* scheme prefix including "://"; url_scheme matches it case-insensitively */
  58   int default_port;         /* value returned by scheme_default_port */
  59   int enabled;              /* non-zero while usable; scheme_disable clears it and url_scheme then rejects the scheme */
  60 };
  61
  62 /* Supported schemes: */
  63 static struct scheme_data supported_schemes[] =
  64 {
     /* url_scheme returns the index of the matching entry cast to
        enum url_scheme, and scheme_default_port/scheme_disable index
        this array with that enum, so the entry order presumably
        mirrors the enum declared in url.h -- TODO confirm before
        reordering.  */
  65   { "http://",  DEFAULT_HTTP_PORT,  1 },
  66 #ifdef HAVE_SSL
  67   { "https://", DEFAULT_HTTPS_PORT, 1 },
  68 #endif
  69   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  70
  71   /* SCHEME_INVALID */
     /* The NULL leading_string is the sentinel that ends the scan loop
        in url_scheme.  */
  72   { NULL,       -1,                 0 }
  73 };
  74
  75 /* Forward declarations: */
  76
  77 static int path_simplify PARAMS ((char *));  /* url_parse treats a non-zero result as "path was modified" */
  78 \f
  79 /* Support for encoding and decoding of URL strings.  We determine
  80    whether a character is unsafe through static table lookup.  This
  81    code assumes ASCII character set and 8-bit chars.  */
  82
  83 enum {
  84   /* rfc1738 reserved chars, preserved from encoding.  */
  85   urlchr_reserved = 1,
  86
  87   /* rfc1738 unsafe chars, plus some more.  */
  88   urlchr_unsafe   = 2
  89 };
  90
  91 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  92 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  93 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  94
  95 /* Shorthands for the table: */
  96 #define R  urlchr_reserved
  97 #define U  urlchr_unsafe
  98 #define RU R|U
  99
 100 const static unsigned char urlchr_table[256] =
 101 {
 102   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 104   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 105   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 106   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 107   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 108   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 109   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 110  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 111   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 112   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 113   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 114   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 115   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 116   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 117   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 118
 119   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123
 124   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 127   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 128 };
 129 #undef R
 130 #undef U
 131 #undef RU
 132
 133 /* URL-unescape the string S.
 134
 135    This is done by transforming the sequences "%HH" to the character
 136    represented by the hexadecimal digits HH.  If % is not followed by
 137    two hexadecimal digits, it is inserted literally.
 138
 139    The transformation is done in place.  If you need the original
 140    string intact, make a copy before calling this function.  */
 141
 142 static void
 143 url_unescape (char *s)
 144 {
 145   char *t = s;                  /* t - tortoise */
 146   char *h = s;                  /* h - hare     */
 147
 148   for (; *h; h++, t++)
 149     {
 150       if (*h != '%')
 151         {
 152         copychar:
 153           *t = *h;
 154         }
 155       else
 156         {
 157           /* Do nothing if '%' is not followed by two hex digits. */
 158           if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
 159             goto copychar;
 160           *t = X2DIGITS_TO_NUM (h[1], h[2]);
 161           h += 2;
 162         }
 163     }
 164   *t = '\0';
 165 }
 166
 167 /* The core of url_escape_* functions.  Escapes the characters that
 168    match the provided mask in urlchr_table.
 169
 170    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
 171    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
 172    freshly allocated string will be returned in all cases.  */
 173
 174 static char *
 175 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
 176 {
 177   const char *p1;
 178   char *p2, *newstr;
 179   int newlen;
 180   int addition = 0;
 181
 182   for (p1 = s; *p1; p1++)
 183     if (urlchr_test (*p1, mask))
 184       addition += 2;            /* Two more characters (hex digits) */
 185
 186   if (!addition)
 187     return allow_passthrough ? (char *)s : xstrdup (s);
 188
 189   newlen = (p1 - s) + addition;
 190   newstr = (char *)xmalloc (newlen + 1);
 191
 192   p1 = s;
 193   p2 = newstr;
 194   while (*p1)
 195     {
 196       /* Quote the characters that match the test mask. */
 197       if (urlchr_test (*p1, mask))
 198         {
 199           unsigned char c = *p1++;
 200           *p2++ = '%';
 201           *p2++ = XNUM_TO_digit (c >> 4);
 202           *p2++ = XNUM_TO_digit (c & 0xf);
 203         }
 204       else
 205         *p2++ = *p1++;
 206     }
 207   assert (p2 - newstr == newlen);
 208   *p2 = '\0';
 209
 210   return newstr;
 211 }
 212
 213 /* URL-escape the unsafe characters (see urlchr_table) in a given
 214    string, returning a freshly allocated string.  */
 215
 216 char *
 217 url_escape (const char *s)
 218 {
 219   return url_escape_1 (s, urlchr_unsafe, 0);
 220 }
 221
 222 /* URL-escape the unsafe characters (see urlchr_table) in a given
 223    string.  If no characters are unsafe, S is returned.  */
 224
 225 static char *
 226 url_escape_allow_passthrough (const char *s)
 227 {
 228   return url_escape_1 (s, urlchr_unsafe, 1);
 229 }
 230 \f
 231 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 232
 233 /* Decide whether to encode, decode, or pass through the char at P.
 234    This used to be a macro, but it got a little too convoluted.  */
 235 static inline enum copy_method
 236 decide_copy_method (const char *p)
 237 {
 238   if (*p == '%')
 239     {
 240       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 241         {
 242           /* %xx sequence: decode it, unless it would decode to an
 243              unsafe or a reserved char; in that case, leave it as
 244              is. */
 245           char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
 246           if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
 247             return CM_PASSTHROUGH;
 248           else
 249             return CM_DECODE;
 250         }
 251       else
 252         /* Garbled %.. sequence: encode `%'. */
 253         return CM_ENCODE;
 254     }
 255   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
 256     return CM_ENCODE;
 257   else
 258     return CM_PASSTHROUGH;
 259 }
 260
 261 /* Translate a %-escaped (but possibly non-conformant) input string S
 262    into a %-escaped (and conformant) output string.  If no characters
 263    are encoded or decoded, return the same string S; otherwise, return
 264    a freshly allocated string with the new contents.
 265
 266    After a URL has been run through this function, the protocols that
 267    use `%' as the quote character can use the resulting string as-is,
 268    while those that don't call url_unescape() to get to the intended
 269    data.  This function is also stable: after an input string is
 270    transformed the first time, all further transformations of the
 271    result yield the same result string.
 272
 273    Let's discuss why this function is needed.
 274
 275    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 276    space character would mess up the HTTP request, it needs to be
 277    quoted, like this:
 278
 279        GET /abc%20def HTTP/1.0
 280
 281    It appears that the unsafe chars need to be quoted, for example
 282    with url_escape.  But what if we're requested to download
 283    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
 284    us with `abc%2520def'.  This is incorrect -- since %-escapes are
 285    part of URL syntax, "%20" is the correct way to denote a literal
 286    space on the Wget command line.  This leaves us in the conclusion
 287    that in that case Wget should not call url_escape, but leave the
 288    `%20' as is.
 289
 290    And what if the requested URI is `abc%20 def'?  If we call
 291    url_escape, we end up with `/abc%2520%20def', which is almost
 292    certainly not intended.  If we don't call url_escape, we are left
 293    with the embedded space and cannot complete the request.  What the
 294    user meant was for Wget to request `/abc%20%20def', and this is
 295    where reencode_escapes kicks in.
 296
 297    Wget used to solve this by first decoding %-quotes, and then
 298    encoding all the "unsafe" characters found in the resulting string.
 299    This was wrong because it didn't preserve certain URL special
 300    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 301    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 302    whether we considered `+' reserved (it is).  One of these results
 303    is inevitable because by the second step we would lose information
 304    on whether the `+' was originally encoded or not.  Both results
 305    were wrong because in CGI parameters + means space, while %2B means
 306    literal plus.  reencode_escapes correctly translates the above to
 307    "a%2B+b", i.e. returns the original string.
 308
 309    This function uses an algorithm proposed by Anon Sricharoenchai:
 310
 311    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 312       hexdigits.
 313
 314    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 315       "+".
 316
 317    ...except that this code conflates the two steps, and decides
 318    whether to encode, decode, or pass through each character in turn.
 319    The function still uses two passes, but their logic is the same --
 320    the first pass exists merely for the sake of allocation.  Another
 321    small difference is that we include `+' to URL_RESERVED.
 322
 323    Anon's test case:
 324
 325    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 326    ->
 327    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 328
 329    Simpler test cases:
 330
 331    "foo bar"         -> "foo%20bar"
 332    "foo%20bar"       -> "foo%20bar"
 333    "foo %20bar"      -> "foo%20%20bar"
 334    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 335    "foo%25%20bar"    -> "foo%25%20bar"
 336    "foo%2%20bar"     -> "foo%252%20bar"
 337    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 338    "foo%2b+bar"      -> "foo%2b+bar"  */
 339
 340 static char *
 341 reencode_escapes (const char *s)
 342 {
 343   const char *p1;
 344   char *newstr, *p2;
 345   int oldlen, newlen;
 346
 347   int encode_count = 0;
 348   int decode_count = 0;
 349
 350   /* First, pass through the string to see if there's anything to do,
 351      and to calculate the new length.  */
 352   for (p1 = s; *p1; p1++)
 353     {
 354       switch (decide_copy_method (p1))
 355         {
 356         case CM_ENCODE:
 357           ++encode_count;
 358           break;
 359         case CM_DECODE:
 360           ++decode_count;
 361           break;
 362         case CM_PASSTHROUGH:
 363           break;
 364         }
 365     }
 366
 367   if (!encode_count && !decode_count)
 368     /* The string is good as it is. */
 369     return (char *)s;           /* C const model sucks. */
 370
 371   oldlen = p1 - s;
 372   /* Each encoding adds two characters (hex digits), while each
 373      decoding removes two characters.  */
 374   newlen = oldlen + 2 * (encode_count - decode_count);
 375   newstr = xmalloc (newlen + 1);
 376
 377   p1 = s;
 378   p2 = newstr;
 379
 380   while (*p1)
 381     {
 382       switch (decide_copy_method (p1))
 383         {
 384         case CM_ENCODE:
 385           {
 386             unsigned char c = *p1++;
 387             *p2++ = '%';
 388             *p2++ = XNUM_TO_DIGIT (c >> 4);
 389             *p2++ = XNUM_TO_DIGIT (c & 0xf);
 390           }
 391           break;
 392         case CM_DECODE:
 393           *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
 394           p1 += 3;              /* skip %xx */
 395           break;
 396         case CM_PASSTHROUGH:
 397           *p2++ = *p1++;
 398         }
 399     }
 400   *p2 = '\0';
 401   assert (p2 - newstr == newlen);
 402   return newstr;
 403 }
 404 \f
 405 /* Returns the scheme type if the scheme is supported, or
 406    SCHEME_INVALID if not.  */
 407
 408 enum url_scheme
 409 url_scheme (const char *url)
 410 {
 411   int i;
 412
 413   for (i = 0; supported_schemes[i].leading_string; i++)
 414     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 415                           strlen (supported_schemes[i].leading_string)))
 416       {
 417         if (supported_schemes[i].enabled)
 418           return (enum url_scheme) i;
 419         else
 420           return SCHEME_INVALID;
 421       }
 422
 423   return SCHEME_INVALID;
 424 }
 425
 426 #define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
 427
 428 /* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
 429    currently implemented, it returns true if URL begins with
 430    [-+a-zA-Z0-9]+: .  */
 431
 432 int
 433 url_has_scheme (const char *url)
 434 {
 435   const char *p = url;
 436
 437   /* The first char must be a scheme char. */
 438   if (!*p || !SCHEME_CHAR (*p))
 439     return 0;
 440   ++p;
 441   /* Followed by 0 or more scheme chars. */
 442   while (*p && SCHEME_CHAR (*p))
 443     ++p;
 444   /* Terminated by ':'. */
 445   return *p == ':';
 446 }
 447
 448 int
 449 scheme_default_port (enum url_scheme scheme)
 450 {
 451   return supported_schemes[scheme].default_port;
 452 }
 453
 454 void
 455 scheme_disable (enum url_scheme scheme)
 456 {
 457   supported_schemes[scheme].enabled = 0;
 458 }
 459
 460 /* Skip the username and password, if present here.  The function
 461    should *not* be called with the complete URL, but with the part
 462    right after the scheme.
 463
 464    If no username and password are found, return 0.  */
 465
 466 static int
 467 url_skip_credentials (const char *url)
 468 {
 469   /* Look for '@' that comes before terminators, such as '/', '?',
 470      '#', or ';'.  */
 471   const char *p = (const char *)strpbrk (url, "@/?#;");
 472   if (!p || *p != '@')
 473     return 0;
 474   return p + 1 - url;
 475 }
 476
 477 /* Parse credentials contained in [BEG, END).  The region is expected
 478    to have come from a URL and is unescaped.  */
 479
 480 static int
 481 parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 482 {
 483   char *colon;
 484   const char *userend;
 485
 486   if (beg == end)
 487     return 0;                   /* empty user name */
 488
 489   colon = memchr (beg, ':', end - beg);
 490   if (colon == beg)
 491     return 0;                   /* again empty user name */
 492
 493   if (colon)
 494     {
 495       *passwd = strdupdelim (colon + 1, end);
 496       userend = colon;
 497       url_unescape (*passwd);
 498     }
 499   else
 500     {
 501       *passwd = NULL;
 502       userend = end;
 503     }
 504   *user = strdupdelim (beg, userend);
 505   url_unescape (*user);
 506   return 1;
 507 }
 508
 509 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 510    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 511
 512    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 513    www.foo.com[:port]            -> http://www.foo.com[:port]
 514
 515    FTP shorthands look like this:
 516
 517    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 518    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 519
 520    If the URL needs not or cannot be rewritten, return NULL.  */
 521
 522 char *
 523 rewrite_shorthand_url (const char *url)
 524 {
 525   const char *p;
 526
 527   if (url_has_scheme (url))
 528     return NULL;
 529
 530   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 531      latter Netscape.  */
 532   for (p = url; *p && *p != ':' && *p != '/'; p++)
 533     ;
 534
 535   if (p == url)
 536     return NULL;
 537
 538   if (*p == ':')
 539     {
 540       const char *pp;
 541       char *res;
 542       /* If the characters after the colon and before the next slash
 543          or end of string are all digits, it's HTTP.  */
 544       int digits = 0;
 545       for (pp = p + 1; ISDIGIT (*pp); pp++)
 546         ++digits;
 547       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 548         goto http;
 549
 550       /* Prepend "ftp://" to the entire URL... */
 551       res = xmalloc (6 + strlen (url) + 1);
 552       sprintf (res, "ftp://%s", url);
 553       /* ...and replace ':' with '/'. */
 554       res[6 + (p - url)] = '/';
 555       return res;
 556     }
 557   else
 558     {
 559       char *res;
 560     http:
 561       /* Just prepend "http://" to what we have. */
 562       res = xmalloc (7 + strlen (url) + 1);
 563       sprintf (res, "http://%s", url);
 564       return res;
 565     }
 566 }
 567 \f
 568 static void split_path PARAMS ((const char *, char **, char **));  /* defined after url_parse, which calls it */
 569
 570 /* Like strpbrk, with the exception that it returns the pointer to the
 571    terminating zero (end-of-string aka "eos") if no matching character
 572    is found.
 573
 574    Although I normally balk at Gcc-specific optimizations, it probably
 575    makes sense here: glibc has optimizations that detect strpbrk being
 576    called with literal string as ACCEPT and inline the search.  That
 577    optimization is defeated if strpbrk is hidden within the call to
 578    another function.  (And no, making strpbrk_or_eos inline doesn't
 579    help because the check for literal accept is in the
 580    preprocessor.)  */
 581
 582 #ifdef __GNUC__
 583
 584 #define strpbrk_or_eos(s, accept) ({            \
 585   char *SOE_p = strpbrk (s, accept);            \
 586   if (!SOE_p)                                   \
 587     SOE_p = (char *)s + strlen (s);             \
 588   SOE_p;                                        \
 589 })
 590
 591 #else  /* not __GNUC__ */
 592
 593 static char *
 594 strpbrk_or_eos (const char *s, const char *accept)
 595 {
 596   char *p = strpbrk (s, accept);
 597   if (!p)
 598     p = (char *)s + strlen (s);
 599   return p;
 600 }
 601 #endif
 602
 603 /* Turn STR into lowercase; return non-zero if a character was
 604    actually changed. */
 605
 606 static int
 607 lowercase_str (char *str)
 608 {
 609   int change = 0;
 610   for (; *str; str++)
 611     if (ISUPPER (*str))
 612       {
 613         change = 1;
 614         *str = TOLOWER (*str);
 615       }
 616   return change;
 617 }
 618
 619 static char *parse_errors[] = {
     /* Messages returned by url_error, indexed by the PE_* code that
        url_parse stores through its ERROR argument.  Each PE_*
        constant is defined immediately before the message it
        indexes, so a new entry needs both lines, in the same
        position.  */
 620 #define PE_NO_ERROR                     0
 621   "No error",
 622 #define PE_UNSUPPORTED_SCHEME           1
 623   "Unsupported scheme",
 624 #define PE_EMPTY_HOST                   2
 625   "Empty host",
 626 #define PE_BAD_PORT_NUMBER              3
 627   "Bad port number",
 628 #define PE_INVALID_USER_NAME            4
 629   "Invalid user name",
 630 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 631   "Unterminated IPv6 numeric address",
 632 #define PE_IPV6_NOT_SUPPORTED           6
 633   "IPv6 addresses not supported",
 634 #define PE_INVALID_IPV6_ADDRESS         7
 635   "Invalid IPv6 numeric address"
 636 };
 637
     /* Store V through P unless P is NULL.  The do/while (0) wrapper
        makes the macro behave as a single statement.  */
 638 #define SETERR(p, v) do {                       \
 639   if (p)                                        \
 640     *(p) = (v);                                 \
 641 } while (0)
 642
 643 #ifdef ENABLE_IPV6
 644 /* The following two functions were adapted from glibc. */
 645
 646 static int
 647 is_valid_ipv4_address (const char *str, const char *end)
 648 {
 649   int saw_digit, octets;
 650   int val;
 651
 652   saw_digit = 0;
 653   octets = 0;
 654   val = 0;
 655
 656   while (str < end) {
 657     int ch = *str++;
 658
 659     if (ch >= '0' && ch <= '9') {
 660       val = val * 10 + (ch - '0');
 661
 662       if (val > 255)
 663         return 0;
 664       if (saw_digit == 0) {
 665         if (++octets > 4)
 666           return 0;
 667         saw_digit = 1;
 668       }
 669     } else if (ch == '.' && saw_digit == 1) {
 670       if (octets == 4)
 671         return 0;
 672       val = 0;
 673       saw_digit = 0;
 674     } else
 675       return 0;
 676   }
 677   if (octets < 4)
 678     return 0;
 679
 680   return 1;
 681 }
 682
 683 static const int NS_INADDRSZ  = 4;
 684 static const int NS_IN6ADDRSZ = 16;
 685 static const int NS_INT16SZ   = 2;
 686
 687 static int
 688 is_valid_ipv6_address (const char *str, const char *end)
 689 {
 690   static const char xdigits[] = "0123456789abcdef";
 691   const char *curtok;
 692   int tp;
 693   const char *colonp;
 694   int saw_xdigit;
 695   unsigned int val;
 696
 697   tp = 0;
 698   colonp = NULL;
 699
 700   if (str == end)
 701     return 0;
 702
 703   /* Leading :: requires some special handling. */
 704   if (*str == ':')
 705     {
 706       ++str;
 707       if (str == end || *str != ':')
 708         return 0;
 709     }
 710
 711   curtok = str;
 712   saw_xdigit = 0;
 713   val = 0;
 714
 715   while (str < end) {
 716     int ch = *str++;
 717     const char *pch;
 718
 719     /* if ch is a number, add it to val. */
 720     pch = strchr(xdigits, ch);
 721     if (pch != NULL) {
 722       val <<= 4;
 723       val |= (pch - xdigits);
 724       if (val > 0xffff)
 725         return 0;
 726       saw_xdigit = 1;
 727       continue;
 728     }
 729
 730     /* if ch is a colon ... */
 731     if (ch == ':') {
 732       curtok = str;
 733       if (saw_xdigit == 0) {
 734         if (colonp != NULL)
 735           return 0;
 736         colonp = str + tp;
 737         continue;
 738       } else if (str == end) {
 739         return 0;
 740       }
 741       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 742         return 0;
 743       tp += NS_INT16SZ;
 744       saw_xdigit = 0;
 745       val = 0;
 746       continue;
 747     }
 748
 749     /* if ch is a dot ... */
 750     if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
 751         is_valid_ipv4_address(curtok, end) == 1) {
 752       tp += NS_INADDRSZ;
 753       saw_xdigit = 0;
 754       break;
 755     }
 756
 757     return 0;
 758   }
 759
 760   if (saw_xdigit == 1) {
 761     if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 762       return 0;
 763     tp += NS_INT16SZ;
 764   }
 765
 766   if (colonp != NULL) {
 767     if (tp == NS_IN6ADDRSZ)
 768       return 0;
 769     tp = NS_IN6ADDRSZ;
 770   }
 771
 772   if (tp != NS_IN6ADDRSZ)
 773     return 0;
 774
 775   return 1;
 776 }
 777 #endif
 778
 779 /* Parse a URL.
 780
 781    Return a new struct url if successful, NULL on error.  In case of
 782    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 783    error code. */
 784 struct url *
 785 url_parse (const char *url, int *error)
 786 {
 787   struct url *u;
 788   const char *p;
 789   int path_modified, host_modified;
 790
 791   enum url_scheme scheme;
 792
 793   const char *uname_b,     *uname_e;
 794   const char *host_b,      *host_e;
 795   const char *path_b,      *path_e;
 796   const char *params_b,    *params_e;
 797   const char *query_b,     *query_e;
 798   const char *fragment_b,  *fragment_e;
 799
 800   int port;
 801   char *user = NULL, *passwd = NULL;
 802
 803   char *url_encoded;
 804
 805   scheme = url_scheme (url);
 806   if (scheme == SCHEME_INVALID)
 807     {
 808       SETERR (error, PE_UNSUPPORTED_SCHEME);
 809       return NULL;
 810     }
 811
 812   url_encoded = reencode_escapes (url);
 813   p = url_encoded;
 814
 815   p += strlen (supported_schemes[scheme].leading_string);
 816   uname_b = p;
 817   p += url_skip_credentials (p);
 818   uname_e = p;
 819
 820   /* scheme://user:pass@host[:port]... */
 821   /*                    ^              */
 822
 823   /* We attempt to break down the URL into the components path,
 824      params, query, and fragment.  They are ordered like this:
 825
 826        scheme://host[:port][/path][;params][?query][#fragment]  */
 827
 828   params_b   = params_e   = NULL;
 829   query_b    = query_e    = NULL;
 830   fragment_b = fragment_e = NULL;
 831
 832   host_b = p;
 833
 834   if (*p == '[')
 835     {
 836       /* Handle IPv6 address inside square brackets.  Ideally we'd
 837          just look for the terminating ']', but rfc2732 mandates
 838          rejecting invalid IPv6 addresses.  */
 839
 840       /* The address begins after '['. */
 841       host_b = p + 1;
 842       host_e = strchr (host_b, ']');
 843
 844       if (!host_e)
 845         {
 846           SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
 847           return NULL;
 848         }
 849
 850 #ifdef ENABLE_IPV6
 851       /* Check if the IPv6 address is valid. */
 852       if (!is_valid_ipv6_address(host_b, host_e))
 853         {
 854           SETERR (error, PE_INVALID_IPV6_ADDRESS);
 855           return NULL;
 856         }
 857
 858       /* Continue parsing after the closing ']'. */
 859       p = host_e + 1;
 860 #else
 861       SETERR (error, PE_IPV6_NOT_SUPPORTED);
 862       return NULL;
 863 #endif
 864     }
 865   else
 866     {
 867       p = strpbrk_or_eos (p, ":/;?#");
 868       host_e = p;
 869     }
 870
 871   if (host_b == host_e)
 872     {
 873       SETERR (error, PE_EMPTY_HOST);
 874       return NULL;
 875     }
 876
 877   port = scheme_default_port (scheme);
 878   if (*p == ':')
 879     {
 880       const char *port_b, *port_e, *pp;
 881
 882       /* scheme://host:port/tralala */
 883       /*              ^             */
 884       ++p;
 885       port_b = p;
 886       p = strpbrk_or_eos (p, "/;?#");
 887       port_e = p;
 888
 889       if (port_b == port_e)
 890         {
 891           /* http://host:/whatever */
 892           /*             ^         */
 893           SETERR (error, PE_BAD_PORT_NUMBER);
 894           return NULL;
 895         }
 896
 897       for (port = 0, pp = port_b; pp < port_e; pp++)
 898         {
 899           if (!ISDIGIT (*pp))
 900             {
 901               /* http://host:12randomgarbage/blah */
 902               /*               ^                  */
 903               SETERR (error, PE_BAD_PORT_NUMBER);
 904               return NULL;
 905             }
 906
 907           port = 10 * port + (*pp - '0');
 908         }
 909     }
 910
 911   if (*p == '/')
 912     {
 913       ++p;
 914       path_b = p;
 915       p = strpbrk_or_eos (p, ";?#");
 916       path_e = p;
 917     }
 918   else
 919     {
 920       /* Path is not allowed not to exist. */
 921       path_b = path_e = p;
 922     }
 923
 924   if (*p == ';')
 925     {
 926       ++p;
 927       params_b = p;
 928       p = strpbrk_or_eos (p, "?#");
 929       params_e = p;
 930     }
 931   if (*p == '?')
 932     {
 933       ++p;
 934       query_b = p;
 935       p = strpbrk_or_eos (p, "#");
 936       query_e = p;
 937
 938       /* Hack that allows users to use '?' (a wildcard character) in
 939          FTP URLs without it being interpreted as a query string
 940          delimiter.  */
 941       if (scheme == SCHEME_FTP)
 942         {
 943           query_b = query_e = NULL;
 944           path_e = p;
 945         }
 946     }
 947   if (*p == '#')
 948     {
 949       ++p;
 950       fragment_b = p;
 951       p += strlen (p);
 952       fragment_e = p;
 953     }
 954   assert (*p == 0);
 955
 956   if (uname_b != uname_e)
 957     {
 958       /* http://user:pass@host */
 959       /*        ^         ^    */
 960       /*     uname_b   uname_e */
 961       if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
 962         {
 963           SETERR (error, PE_INVALID_USER_NAME);
 964           return NULL;
 965         }
 966     }
 967
 968   u = (struct url *)xmalloc (sizeof (struct url));
 969   memset (u, 0, sizeof (*u));
 970
 971   u->scheme = scheme;
 972   u->host   = strdupdelim (host_b, host_e);
 973   u->port   = port;
 974   u->user   = user;
 975   u->passwd = passwd;
 976
 977   u->path = strdupdelim (path_b, path_e);
 978   path_modified = path_simplify (u->path);
 979   split_path (u->path, &u->dir, &u->file);
 980
 981   host_modified = lowercase_str (u->host);
 982
 983   if (params_b)
 984     u->params = strdupdelim (params_b, params_e);
 985   if (query_b)
 986     u->query = strdupdelim (query_b, query_e);
 987   if (fragment_b)
 988     u->fragment = strdupdelim (fragment_b, fragment_e);
 989
 990   if (path_modified || u->fragment || host_modified || path_b == path_e)
 991     {
 992       /* If we suspect that a transformation has rendered what
 993          url_string might return different from URL_ENCODED, rebuild
 994          u->url using url_string.  */
 995       u->url = url_string (u, 0);
 996
 997       if (url_encoded != url)
 998         xfree ((char *) url_encoded);
 999     }
1000   else
1001     {
1002       if (url_encoded == url)
1003         u->url = xstrdup (url);
1004       else
1005         u->url = url_encoded;
1006     }
1007   url_encoded = NULL;
1008
1009   return u;
1010 }
1011
1012 const char *
1013 url_error (int error_code)
1014 {
1015   assert (error_code >= 0 && error_code < countof (parse_errors));
1016   return parse_errors[error_code];
1017 }
1018
/* Break PATH (taken from a URL, hence expected to be URL-escaped)
   into its directory and file parts.

   Everything before the last slash becomes DIR, everything after it
   becomes FILE; a PATH with no slash at all is a bare file name with
   an empty directory.  Both parts are URL-unescaped after the split,
   so an escaped slash ends up as a literal '/' in the result:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   *DIR and *FILE receive freshly allocated strings.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *sep = strrchr (path, '/');

  if (sep != NULL)
    {
      /* DIR is [path, sep); FILE is whatever follows the slash.  */
      *dir = strdupdelim (path, sep);
      *file = xstrdup (sep + 1);
    }
  else
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  /* Unescape only after splitting, so that %2f cannot influence
     where the split happens.  */
  url_unescape (*dir);
  url_unescape (*file);
}
1051
1052 /* Note: URL's "full path" is the path with the query string and
1053    params appended.  The "fragment" (#foo) is intentionally ignored,
1054    but that might be changed.  For example, if the original URL was
1055    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1056    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1057
1058 /* Return the length of the full path, without the terminating
1059    zero.  */
1060
1061 static int
1062 full_path_length (const struct url *url)
1063 {
1064   int len = 0;
1065
1066 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1067
1068   FROB (path);
1069   FROB (params);
1070   FROB (query);
1071
1072 #undef FROB
1073
1074   return len;
1075 }
1076
1077 /* Write out the full path. */
1078
1079 static void
1080 full_path_write (const struct url *url, char *where)
1081 {
1082 #define FROB(el, chr) do {                      \
1083   char *f_el = url->el;                         \
1084   if (f_el) {                                   \
1085     int l = strlen (f_el);                      \
1086     *where++ = chr;                             \
1087     memcpy (where, f_el, l);                    \
1088     where += l;                                 \
1089   }                                             \
1090 } while (0)
1091
1092   FROB (path, '/');
1093   FROB (params, ';');
1094   FROB (query, '?');
1095
1096 #undef FROB
1097 }
1098
/* Public entry point for obtaining the "full path" as a freshly
   allocated, zero-terminated string.  E.g. when u->path is "foo/bar"
   and u->query is "param=value", the result is
   "/foo/bar?param=value".  */

char *
url_full_path (const struct url *url)
{
  int fplen = full_path_length (url);
  char *buf = (char *) xmalloc (fplen + 1);

  /* full_path_write does not terminate its output, so do it here.  */
  buf[fplen] = '\0';
  full_path_write (url, buf);

  return buf;
}
1114
1115 /* Escape unsafe and reserved characters, except for the slash
1116    characters.  */
1117
1118 static char *
1119 url_escape_dir (const char *dir)
1120 {
1121   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1122   char *h, *t;
1123   if (newdir == dir)
1124     return (char *)dir;
1125
1126   /* Unescape slashes in NEWDIR. */
1127
1128   h = newdir;                   /* hare */
1129   t = newdir;                   /* tortoise */
1130
1131   for (; *h; h++, t++)
1132     {
1133       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1134         {
1135           *t = '/';
1136           h += 2;
1137         }
1138       else
1139         *t = *h;
1140     }
1141   *t = '\0';
1142
1143   return newdir;
1144 }
1145
1146 /* Sync u->path and u->url with u->dir and u->file.  Called after
1147    u->file or u->dir have been changed, typically by the FTP code.  */
1148
1149 static void
1150 sync_path (struct url *u)
1151 {
1152   char *newpath, *efile, *edir;
1153
1154   xfree (u->path);
1155
1156   /* u->dir and u->file are not escaped.  URL-escape them before
1157      reassembling them into u->path.  That way, if they contain
1158      separators like '?' or even if u->file contains slashes, the
1159      path will be correctly assembled.  (u->file can contain slashes
1160      if the URL specifies it with %2f, or if an FTP server returns
1161      it.)  */
1162   edir = url_escape_dir (u->dir);
1163   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1164
1165   if (!*edir)
1166     newpath = xstrdup (efile);
1167   else
1168     {
1169       int dirlen = strlen (edir);
1170       int filelen = strlen (efile);
1171
1172       /* Copy "DIR/FILE" to newpath. */
1173       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1174       memcpy (p, edir, dirlen);
1175       p += dirlen;
1176       *p++ = '/';
1177       memcpy (p, efile, filelen);
1178       p += filelen;
1179       *p++ = '\0';
1180     }
1181
1182   u->path = newpath;
1183
1184   if (edir != u->dir)
1185     xfree (edir);
1186   if (efile != u->file)
1187     xfree (efile);
1188
1189   /* Regenerate u->url as well.  */
1190   xfree (u->url);
1191   u->url = url_string (u, 0);
1192 }
1193
1194 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1195    This way we can sync u->path and u->url when they get changed.  */
1196
1197 void
1198 url_set_dir (struct url *url, const char *newdir)
1199 {
1200   xfree (url->dir);
1201   url->dir = xstrdup (newdir);
1202   sync_path (url);
1203 }
1204
1205 void
1206 url_set_file (struct url *url, const char *newfile)
1207 {
1208   xfree (url->file);
1209   url->file = xstrdup (newfile);
1210   sync_path (url);
1211 }
1212
1213 void
1214 url_free (struct url *url)
1215 {
1216   xfree (url->host);
1217   xfree (url->path);
1218   xfree (url->url);
1219
1220   FREE_MAYBE (url->params);
1221   FREE_MAYBE (url->query);
1222   FREE_MAYBE (url->fragment);
1223   FREE_MAYBE (url->user);
1224   FREE_MAYBE (url->passwd);
1225
1226   xfree (url->dir);
1227   xfree (url->file);
1228
1229   xfree (url);
1230 }
1231 \f
1232 /* Create all the necessary directories for PATH (a file).  Calls
1233    mkdirhier() internally.  */
1234 int
1235 mkalldirs (const char *path)
1236 {
1237   const char *p;
1238   char *t;
1239   struct stat st;
1240   int res;
1241
1242   p = path + strlen (path);
1243   for (; *p != '/' && p != path; p--)
1244     ;
1245
1246   /* Don't create if it's just a file.  */
1247   if ((p == path) && (*p != '/'))
1248     return 0;
1249   t = strdupdelim (path, p);
1250
1251   /* Check whether the directory exists.  */
1252   if ((stat (t, &st) == 0))
1253     {
1254       if (S_ISDIR (st.st_mode))
1255         {
1256           xfree (t);
1257           return 0;
1258         }
1259       else
1260         {
1261           /* If the dir exists as a file name, remove it first.  This
1262              is *only* for Wget to work with buggy old CERN http
1263              servers.  Here is the scenario: When Wget tries to
1264              retrieve a directory without a slash, e.g.
1265              http://foo/bar (bar being a directory), CERN server will
1266              not redirect it too http://foo/bar/ -- it will generate a
1267              directory listing containing links to bar/file1,
1268              bar/file2, etc.  Wget will lose because it saves this
1269              HTML listing to a file `bar', so it cannot create the
1270              directory.  To work around this, if the file of the same
1271              name exists, we just remove it and create the directory
1272              anyway.  */
1273           DEBUGP (("Removing %s because of directory danger!\n", t));
1274           unlink (t);
1275         }
1276     }
1277   res = make_directory (t);
1278   if (res != 0)
1279     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1280   xfree (t);
1281   return res;
1282 }
1283 \f
1284 /* Functions for constructing the file name out of URL components.  */
1285
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer */
  int size;                     /* size variable handed to DO_REALLOC */
  int tail;                     /* offset in BASE where the next write goes */
};

/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   Both arguments are parenthesized in the expansion, so that an
   argument such as `cond ? a : b' is added to TAIL as a whole.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1314
1315 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
1316    terminated.  */
1317
1318 static void
1319 append_string (const char *str, struct growable *dest)
1320 {
1321   int l = strlen (str);
1322   GROW (dest, l);
1323   memcpy (TAIL (dest), str, l);
1324   TAIL_INCR (dest, l);
1325 }
1326
1327 /* Append CH to DEST.  For example, append_char (0, DEST)
1328    zero-terminates DEST.  */
1329
1330 static void
1331 append_char (char ch, struct growable *dest)
1332 {
1333   GROW (dest, 1);
1334   *TAIL (dest) = ch;
1335   TAIL_INCR (dest, 1);
1336 }
1337
/* Flags describing why a character cannot be used verbatim in a file
   name.  They are combined into a mask and looked up in
   filechr_table via FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};

/* Non-zero if character C has any of the flags in MASK set.  C is
   converted to unsigned char first, so plain (possibly negative)
   chars index the table correctly.  */
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   `|' carries W because it is one of the characters listed above as
   unusable on Windows, and DEL carries C because it is a control
   character just like 0-31 and 128-159.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1395
/* FN_PORT_SEP separates host and port in file names generated for
   non-standard port numbers.  It is '+' when file names are
   restricted for Windows, which can't handle ':' in file names, and
   ':' otherwise, as in "www.xemacs.org:4001/index.html".  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   '@' when file names are restricted for Windows, which cannot handle
   '?' as part of a file name, and '?' otherwise.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1406
1407 /* Quote path element, characters in [b, e), as file name, and append
1408    the quoted string to DEST.  Each character is quoted as per
1409    file_unsafe_char and the corresponding table.  */
1410
1411 static void
1412 append_uri_pathel (const char *b, const char *e, struct growable *dest)
1413 {
1414   char *pathel;
1415   int pathlen;
1416
1417   const char *p;
1418   int quoted, outlen;
1419
1420   int mask;
1421   if (opt.restrict_files_os == restrict_unix)
1422     mask = filechr_not_unix;
1423   else
1424     mask = filechr_not_windows;
1425   if (opt.restrict_files_ctrl)
1426     mask |= filechr_control;
1427
1428   /* Copy [b, e) to PATHEL and URL-unescape it. */
1429   BOUNDED_TO_ALLOCA (b, e, pathel);
1430   url_unescape (pathel);
1431   pathlen = strlen (pathel);
1432
1433   /* Go through PATHEL and check how many characters we'll need to
1434      add for file quoting. */
1435   quoted = 0;
1436   for (p = pathel; *p; p++)
1437     if (FILE_CHAR_TEST (*p, mask))
1438       ++quoted;
1439
1440   /* p - pathel is the string length.  Each quoted char means two
1441      additional characters in the string, hence 2*quoted.  */
1442   outlen = (p - pathel) + (2 * quoted);
1443   GROW (dest, outlen);
1444
1445   if (!quoted)
1446     {
1447       /* If there's nothing to quote, we don't need to go through the
1448          string the second time.  */
1449       memcpy (TAIL (dest), pathel, outlen);
1450     }
1451   else
1452     {
1453       char *q = TAIL (dest);
1454       for (p = pathel; *p; p++)
1455         {
1456           if (!FILE_CHAR_TEST (*p, mask))
1457             *q++ = *p;
1458           else
1459             {
1460               unsigned char ch = *p;
1461               *q++ = '%';
1462               *q++ = XNUM_TO_DIGIT (ch >> 4);
1463               *q++ = XNUM_TO_DIGIT (ch & 0xf);
1464             }
1465         }
1466       assert (q - TAIL (dest) == outlen);
1467     }
1468   TAIL_INCR (dest, outlen);
1469 }
1470
1471 /* Append to DEST the directory structure that corresponds the
1472    directory part of URL's path.  For example, if the URL is
1473    http://server/dir1/dir2/file, this appends "/dir1/dir2".
1474
1475    Each path element ("dir1" and "dir2" in the above example) is
1476    examined, url-unescaped, and re-escaped as file name element.
1477
1478    Additionally, it cuts as many directories from the path as
1479    specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1480    will produce "bar" for the above example.  For 2 or more, it will
1481    produce "".
1482
1483    Each component of the path is quoted for use as file name.  */
1484
1485 static void
1486 append_dir_structure (const struct url *u, struct growable *dest)
1487 {
1488   char *pathel, *next;
1489   int cut = opt.cut_dirs;
1490
1491   /* Go through the path components, de-URL-quote them, and quote them
1492      (if necessary) as file names.  */
1493
1494   pathel = u->path;
1495   for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1496     {
1497       if (cut-- > 0)
1498         continue;
1499       if (pathel == next)
1500         /* Ignore empty pathels.  path_simplify should remove
1501            occurrences of "//" from the path, but it has special cases
1502            for starting / which generates an empty pathel here.  */
1503         continue;
1504
1505       if (dest->tail)
1506         append_char ('/', dest);
1507       append_uri_pathel (pathel, next, dest);
1508     }
1509 }
1510
1511 /* Return a unique file name that matches the given URL as good as
1512    possible.  Does not create directories on the file system.  */
1513
1514 char *
1515 url_file_name (const struct url *u)
1516 {
1517   struct growable fnres;
1518
1519   char *u_file, *u_query;
1520   char *fname, *unique;
1521
1522   fnres.base = NULL;
1523   fnres.size = 0;
1524   fnres.tail = 0;
1525
1526   /* Start with the directory prefix, if specified. */
1527   if (opt.dir_prefix)
1528     append_string (opt.dir_prefix, &fnres);
1529
1530   /* If "dirstruct" is turned on (typically the case with -r), add
1531      the host and port (unless those have been turned off) and
1532      directory structure.  */
1533   if (opt.dirstruct)
1534     {
1535       if (opt.add_hostdir)
1536         {
1537           if (fnres.tail)
1538             append_char ('/', &fnres);
1539           append_string (u->host, &fnres);
1540           if (u->port != scheme_default_port (u->scheme))
1541             {
1542               char portstr[24];
1543               number_to_string (portstr, u->port);
1544               append_char (FN_PORT_SEP, &fnres);
1545               append_string (portstr, &fnres);
1546             }
1547         }
1548
1549       append_dir_structure (u, &fnres);
1550     }
1551
1552   /* Add the file name. */
1553   if (fnres.tail)
1554     append_char ('/', &fnres);
1555   u_file = *u->file ? u->file : "index.html";
1556   append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1557
1558   /* Append "?query" to the file name. */
1559   u_query = u->query && *u->query ? u->query : NULL;
1560   if (u_query)
1561     {
1562       append_char (FN_QUERY_SEP, &fnres);
1563       append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1564     }
1565
1566   /* Zero-terminate the file name. */
1567   append_char ('\0', &fnres);
1568
1569   fname = fnres.base;
1570
1571   /* Check the cases in which the unique extensions are not used:
1572      1) Clobbering is turned off (-nc).
1573      2) Retrieval with regetting.
1574      3) Timestamping is used.
1575      4) Hierarchy is built.
1576
1577      The exception is the case when file does exist and is a
1578      directory (see `mkalldirs' for explanation).  */
1579
1580   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1581       && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1582     return fname;
1583
1584   unique = unique_name (fname, 1);
1585   if (unique != fname)
1586     xfree (fname);
1587   return unique;
1588 }
1589
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.

   strcspn computes exactly this: the length of the initial segment
   of URL that contains none of the terminator characters.  */
static int
path_length (const char *url)
{
  return (int) strcspn (url, "?;#");
}
1599
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?

   E itself lies outside the range and is never examined; B is the
   last position examined.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    {
      --e;
      if (*e == c)
        return e;
    }
  return NULL;
}
1612 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element (or nothing, when
   there is no preceding element).  Empty path elements, i.e. repeated
   slashes, are removed as well.  A single leading slash is preserved.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */

static int
path_simplify (char *path)
{
  char *src, *dst, *limit;
  int changed;

  /* Preserve the leading '/'. */
  if (*path == '/')
    path++;

  src = path;                   /* reads ahead */
  dst = path;                   /* trails behind, receives the output */
  limit = path + strlen (path);

  /* At the top of each iteration SRC is at the beginning of a path
     element.  */
  while (src < limit)
    {
      if (*src == '/')
        {
          /* Ignore empty path elements.  Supporting them is hard (in
             which directory do you save http://x.com///y.html?), and
             they don't bring any practical gain.  Plus, they break
             our filesystem-influenced assumptions: allowing empty
             path elements means that "x/y/../z" simplifies to
             "x/y/z", whereas most people would expect "x/z".  */
          src++;
          continue;
        }

      if (src[0] == '.' && (src[1] == '/' || src[1] == '\0'))
        {
          /* Drop "./" (or a final ".").  */
          src += 2;
          continue;
        }

      if (src[0] == '.' && src[1] == '.'
          && (src[2] == '/' || src[2] == '\0'))
        {
          /* Handle "../" by retreating DST to the beginning of the
             previous path element -- but never past the beginning of
             PATH.  */
          if (dst > path)
            {
              do
                --dst;
              while (dst > path && dst[-1] != '/');
            }
          src += 3;
          continue;
        }

      /* A regular path element.  Find where it ends, counting the
         slash that terminates it, if there is one.  */
      {
        char *stop = src;

        while (stop < limit && *stop != '/')
          ++stop;
        if (stop < limit)
          ++stop;

        if (dst == src)
          /* Nothing has been removed so far; the element is already
             where it belongs.  */
          dst = stop;
        else
          while (src < stop)
            *dst++ = *src++;

        src = stop;
      }
    }

  /* The two pointers differ exactly when something was removed; only
     then does the string need to be re-terminated.  */
  changed = (dst != src);
  if (changed)
    *dst = '\0';

  return changed;
}
1703 \f
/* Helper for uri_merge: return a freshly allocated, zero-terminated
   string made of the first SPAN characters of BASE followed by the
   LINKLENGTH characters of LINK.  */

static char *
uri_merge_join (const char *base, int span, const char *link, int linklength)
{
  char *joined = (char *) xmalloc (span + linklength + 1);

  if (span)
    memcpy (joined, base, span);
  memcpy (joined + span, link, linklength);
  joined[span + linklength] = '\0';

  return joined;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */

char *
uri_merge (const char *base, const char *link)
{
  const char *end;
  int linklength;

  /* A LINK that carries its own scheme replaces BASE entirely.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = base + path_length (base);
  linklength = strlen (link);

  if (*link == '\0')
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return uri_merge_join (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* LINK only changes the fragment; everything in BASE up to its
         own fragment, if any, is kept.  */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');

      if (!hash)
        hash = base + strlen (base);
      return uri_merge_join (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* If the first slash of BASE begins a double slash, replace
         from that point; otherwise replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return uri_merge_join (base, start_insert - base, link, linklength);
    }

  if (link[0] == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore a
         double slash: if the first slash found begins "//", search
         once more from right behind it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;

          seen_slash_slash = 1;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         location where LINK will be inserted; keep in mind that LINK
         begins with '/'.  */
      if (!seen_slash_slash)
        /* examples: "foo" and "foo/bar" -- insert at the very
           beginning in both cases.  */
        start_insert = base;
      else if (!slash)
        /* example: "http://foo" -- insert at the end.  */
        start_insert = end;
      else
        /* example: "http://something/" -- insert at that slash.  */
        start_insert = slash;

      return uri_merge_join (base, start_insert - base, link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything after the
     last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    const char *last_slash = find_last_char (base, end, '/');
    const char *start_insert;
    int need_explicit_slash;
    int span;
    char *merge;

    if (!last_slash
        || (last_slash >= base + 2
            && last_slash[-2] == ':' && last_slash[-1] == '/'))
      {
        /* Either no slash was found at all (example: "foo?..."), or
           the last slash is the second one of "://" (example:
           "http://host").  Append LINK to what we have, but we'll
           need a slash as a separator: if base == "foo" and
           link == "qux/xyzzy", just appending would give
           "fooqux/xyzzy", whereas what we want is "foo/qux/xyzzy".

           To make sure the / gets inserted, START_INSERT is set to
           end + 1, so that the length calculations work out for one
           more character, and that character is overwritten with
           '/' below.  Accessing it in BASE is fine, since it is the
           delimiter, '\0' or '?'.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* example: "whatever/foo/bar" -- insert right behind the
           last slash.  */
        start_insert = last_slash + 1;
        need_explicit_slash = 0;
      }

    span = start_insert - base;
    merge = uri_merge_join (base, span, link, linklength);
    if (need_explicit_slash)
      merge[span - 1] = '/';

    return merge;
  }
}
1908 \f
/* Copy the string S to P, without the terminating zero, and advance
   P past the copied characters.  P must be a modifiable pointer
   lvalue.  S is evaluated exactly once, and the temporaries use
   names unlikely to clash with identifiers in the arguments (an
   argument named `len', for instance, works).  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1920
1921 /* Recreate the URL string from the data in URL.
1922
1923    If HIDE is non-zero (as it is when we're calling this on a URL we
1924    plan to print, but not when calling it to canonicalize a URL for
1925    use within the program), password will be hidden.  Unsafe
1926    characters in the URL will be quoted.  */
1927
1928 char *
1929 url_string (const struct url *url, int hide_password)
1930 {
1931   int size;
1932   char *result, *p;
1933   char *quoted_user = NULL, *quoted_passwd = NULL;
1934
1935   int scheme_port  = supported_schemes[url->scheme].default_port;
1936   char *scheme_str = supported_schemes[url->scheme].leading_string;
1937   int fplen = full_path_length (url);
1938
1939   int brackets_around_host = 0;
1940
1941   assert (scheme_str != NULL);
1942
1943   /* Make sure the user name and password are quoted. */
1944   if (url->user)
1945     {
1946       quoted_user = url_escape_allow_passthrough (url->user);
1947       if (url->passwd)
1948         {
1949           if (hide_password)
1950             quoted_passwd = HIDDEN_PASSWORD;
1951           else
1952             quoted_passwd = url_escape_allow_passthrough (url->passwd);
1953         }
1954     }
1955
1956   if (strchr (url->host, ':'))
1957     brackets_around_host = 1;
1958
1959   size = (strlen (scheme_str)
1960           + strlen (url->host)
1961           + (brackets_around_host ? 2 : 0)
1962           + fplen
1963           + 1);
1964   if (url->port != scheme_port)
1965     size += 1 + numdigit (url->port);
1966   if (quoted_user)
1967     {
1968       size += 1 + strlen (quoted_user);
1969       if (quoted_passwd)
1970         size += 1 + strlen (quoted_passwd);
1971     }
1972
1973   p = result = xmalloc (size);
1974
1975   APPEND (p, scheme_str);
1976   if (quoted_user)
1977     {
1978       APPEND (p, quoted_user);
1979       if (quoted_passwd)
1980         {
1981           *p++ = ':';
1982           APPEND (p, quoted_passwd);
1983         }
1984       *p++ = '@';
1985     }
1986
1987   if (brackets_around_host)
1988     *p++ = '[';
1989   APPEND (p, url->host);
1990   if (brackets_around_host)
1991     *p++ = ']';
1992   if (url->port != scheme_port)
1993     {
1994       *p++ = ':';
1995       p = number_to_string (p, url->port);
1996     }
1997
1998   full_path_write (url, p);
1999   p += fplen;
2000   *p++ = '\0';
2001
2002   assert (p - result == size);
2003
2004   if (quoted_user && quoted_user != url->user)
2005     xfree (quoted_user);
2006   if (quoted_passwd && !hide_password
2007       && quoted_passwd != url->passwd)
2008     xfree (quoted_passwd);
2009
2010   return result;
2011 }
2012 \f
2013 /* Return non-zero if scheme a is similar to scheme b.
2014
2015    Schemes are similar if they are equal.  If SSL is supported, schemes
2016    are also similar if one is http (SCHEME_HTTP) and the other is https
2017    (SCHEME_HTTPS).  */
2018 int
2019 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2020 {
2021   if (a == b)
2022     return 1;
2023 #ifdef HAVE_SSL
2024   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2025       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2026     return 1;
2027 #endif
2028   return 0;
2029 }
2030 \f
2031 #if 0
2032 /* Debugging and testing support for path_simplify. */
2033
/* Debugger convenience: simplify a private copy of PATH with
   path_simplify and return that copy, leaving PATH itself untouched.
   The copy is obtained from xstrdup.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
2043
/* Run path_simplify on a private copy of TEST and report any mismatch
   on stdout.  Two things are checked independently:

   - the simplified string must equal EXPECTED_RESULT;
   - the value returned by path_simplify must equal EXPECTED_CHANGE
     (1 when the path is expected to be modified, 0 when it is not).

   Nothing is printed when both checks pass.  TEST itself is never
   modified.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* The message states what the test table expected, which is the
         opposite of what path_simplify actually reported.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2066
/* Return a freshly xmalloc'ed copy of S with a '/' put in front of
   it.  */
static char *
with_leading_slash (const char *s)
{
  char *slashed = xmalloc (1 + strlen (s) + 1);
  sprintf (slashed, "/%s", s);
  return slashed;
}

/* Feed a fixed table of paths to run_test.  Each entry gives the
   input, the expected simplified form, and whether path_simplify is
   expected to report a modification.  The table is walked twice:
   first as written, then with a '/' prepended to both the input and
   the expected result, to prove that a leading slash is preserved.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *simplified;
    int changes;
  } cases[] = {
    { "",               "",             0 },
    { ".",              "",             1 },
    { "..",             "",             1 },
    { "foo",            "foo",          0 },
    { "foo/bar",        "foo/bar",      0 },
    { "foo///bar",      "foo/bar",      1 },
    { "foo/.",          "foo/",         1 },
    { "foo/./",         "foo/",         1 },
    { "foo./",          "foo./",        0 },
    { "foo/../bar",     "bar",          1 },
    { "foo/../bar/",    "bar/",         1 },
    { "foo/bar/..",     "foo/",         1 },
    { "foo/bar/../x",   "foo/x",        1 },
    { "foo/bar/../x/",  "foo/x/",       1 },
    { "foo/..",         "",             1 },
    { "foo/../..",      "",             1 },
    { "a/b/../../c",    "c",            1 },
    { "./a/../b",       "b",            1 }
  };
  int n;

  /* First pass: the table entries exactly as written.  */
  for (n = 0; n < countof (cases); n++)
    run_test (cases[n].input, cases[n].simplified, cases[n].changes);

  /* Second pass: the same entries with a leading slash on both
     sides.  */
  for (n = 0; n < countof (cases); n++)
    {
      char *input = with_leading_slash (cases[n].input);
      char *simplified = with_leading_slash (cases[n].simplified);

      run_test (input, simplified, cases[n].changes);

      xfree (input);
      xfree (simplified);
    }
}
2122 #endif