sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"
  50 #include "hash.h"
  51
  52 #ifndef errno
  53 extern int errno;
  54 #endif
  55
  56 /* Is X "."?  */
  57 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  58 /* Is X ".."?  */
  59 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  60
  61 static const int NS_INADDRSZ  = 4;
  62 static const int NS_IN6ADDRSZ = 16;
  63 static const int NS_INT16SZ = 2;
  64
  65
  66 struct scheme_data
  67 {
  68   char *leading_string;
  69   int default_port;
  70   int enabled;
  71 };
  72
  73 /* Supported schemes: */
  74 static struct scheme_data supported_schemes[] =
  75 {
  76   { "http://",  DEFAULT_HTTP_PORT,  1 },
  77 #ifdef HAVE_SSL
  78   { "https://", DEFAULT_HTTPS_PORT, 1 },
  79 #endif
  80   { "ftp://",   DEFAULT_FTP_PORT,   1 },
  81
  82   /* SCHEME_INVALID */
  83   { NULL,       -1,                 0 }
  84 };
  85
  86 /* Forward declarations: */
  87
  88 static char *construct_relative PARAMS ((const char *, const char *));
  89 static int path_simplify PARAMS ((char *));
  90
  91
  92 \f
  93 /* Support for encoding and decoding of URL strings.  We determine
  94    whether a character is unsafe through static table lookup.  This
  95    code assumes ASCII character set and 8-bit chars.  */
  96
  97 enum {
  98   urlchr_reserved = 1,
  99   urlchr_unsafe   = 2
 100 };
 101
 102 #define R  urlchr_reserved
 103 #define U  urlchr_unsafe
 104 #define RU R|U
 105
 106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 107
 108 /* rfc1738 reserved chars, preserved from encoding.  */
 109
 110 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 111
 112 /* rfc1738 unsafe chars, plus some more.  */
 113
 114 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 115
 116 const static unsigned char urlchr_table[256] =
 117 {
 118   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 119   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 120   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 121   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 122   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 123   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 124   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 125   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 126  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 127   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 128   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 129   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 130   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 131   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 132   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 133   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 134
 135   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 136   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 137   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 138   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 139
 140   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 141   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 142   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 143   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 144 };
 145
 146 /* Decodes the forms %xy in a URL to the character the hexadecimal
 147    code of which is xy.  xy are hexadecimal digits from
 148    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 149    hex-digits or `%' precedes `\0', the sequence is inserted
 150    literally.  */
 151
 152 static void
 153 decode_string (char *s)
 154 {
 155   char *t = s;                  /* t - tortoise */
 156   char *h = s;                  /* h - hare     */
 157
 158   for (; *h; h++, t++)
 159     {
 160       if (*h != '%')
 161         {
 162         copychar:
 163           *t = *h;
 164         }
 165       else
 166         {
 167           /* Do nothing if '%' is not followed by two hex digits. */
 168           if (!*(h + 1) || !*(h + 2)
 169               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 170             goto copychar;
 171           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 172           h += 2;
 173         }
 174     }
 175   *t = '\0';
 176 }
 177
 178 /* Like encode_string, but return S if there are no unsafe chars.  */
 179
 180 static char *
 181 encode_string_maybe (const char *s)
 182 {
 183   const char *p1;
 184   char *p2, *newstr;
 185   int newlen;
 186   int addition = 0;
 187
 188   for (p1 = s; *p1; p1++)
 189     if (UNSAFE_CHAR (*p1))
 190       addition += 2;            /* Two more characters (hex digits) */
 191
 192   if (!addition)
 193     return (char *)s;
 194
 195   newlen = (p1 - s) + addition;
 196   newstr = (char *)xmalloc (newlen + 1);
 197
 198   p1 = s;
 199   p2 = newstr;
 200   while (*p1)
 201     {
 202       if (UNSAFE_CHAR (*p1))
 203         {
 204           unsigned char c = *p1++;
 205           *p2++ = '%';
 206           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 207           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 208         }
 209       else
 210         *p2++ = *p1++;
 211     }
 212   *p2 = '\0';
 213   assert (p2 - newstr == newlen);
 214
 215   return newstr;
 216 }
 217
 218 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 219    given string, returning a malloc-ed %XX encoded string.  */
 220
 221 char *
 222 encode_string (const char *s)
 223 {
 224   char *encoded = encode_string_maybe (s);
 225   if (encoded != s)
 226     return encoded;
 227   else
 228     return xstrdup (s);
 229 }
 230
 231 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 232    the old value of PTR is freed and PTR is made to point to the newly
 233    allocated storage.  */
 234
 235 #define ENCODE(ptr) do {                        \
 236   char *e_new = encode_string_maybe (ptr);      \
 237   if (e_new != ptr)                             \
 238     {                                           \
 239       xfree (ptr);                              \
 240       ptr = e_new;                              \
 241     }                                           \
 242 } while (0)
 243 \f
 244 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 245
 246 /* Decide whether to encode, decode, or pass through the char at P.
 247    This used to be a macro, but it got a little too convoluted.  */
 248 static inline enum copy_method
 249 decide_copy_method (const char *p)
 250 {
 251   if (*p == '%')
 252     {
 253       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 254         {
 255           /* %xx sequence: decode it, unless it would decode to an
 256              unsafe or a reserved char; in that case, leave it as
 257              is. */
 258           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 259             XCHAR_TO_XDIGIT (*(p + 2));
 260
 261           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 262             return CM_PASSTHROUGH;
 263           else
 264             return CM_DECODE;
 265         }
 266       else
 267         /* Garbled %.. sequence: encode `%'. */
 268         return CM_ENCODE;
 269     }
 270   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 271     return CM_ENCODE;
 272   else
 273     return CM_PASSTHROUGH;
 274 }
 275
 276 /* Translate a %-quoting (but possibly non-conformant) input string S
 277    into a %-quoting (and conformant) output string.  If no characters
 278    are encoded or decoded, return the same string S; otherwise, return
 279    a freshly allocated string with the new contents.
 280
 281    After a URL has been run through this function, the protocols that
 282    use `%' as the quote character can use the resulting string as-is,
 283    while those that don't call decode_string() to get to the intended
 284    data.  This function is also stable: after an input string is
 285    transformed the first time, all further transformations of the
 286    result yield the same result string.
 287
 288    Let's discuss why this function is needed.
 289
 290    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 291    space character would mess up the HTTP request, it needs to be
 292    quoted, like this:
 293
 294        GET /abc%20def HTTP/1.0
 295
 296    So it appears that the unsafe chars need to be quoted, as with
 297    encode_string.  But what if we're requested to download
 298    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 299    the user meant was a literal space, and he was kind enough to quote
 300    it.  In that case, Wget should obviously leave the `%20' as is, and
 301    send the same request as above.  So in this case we may not call
 302    encode_string.
 303
 304    But what if the requested URI is `abc%20 def'?  If we call
 305    encode_string, we end up with `/abc%2520%20def', which is almost
 306    certainly not intended.  If we don't call encode_string, we are
 307    left with the embedded space and cannot send the request.  What the
 308    user meant was for Wget to request `/abc%20%20def', and this is
 309    where reencode_string kicks in.
 310
 311    Wget used to solve this by first decoding %-quotes, and then
 312    encoding all the "unsafe" characters found in the resulting string.
 313    This was wrong because it didn't preserve certain URL special
 314    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 315    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 316    whether we considered `+' reserved (it is).  One of these results
 317    is inevitable because by the second step we would lose information
 318    on whether the `+' was originally encoded or not.  Both results
 319    were wrong because in CGI parameters + means space, while %2B means
 320    literal plus.  reencode_string correctly translates the above to
 321    "a%2B+b", i.e. returns the original string.
 322
 323    This function uses an algorithm proposed by Anon Sricharoenchai:
 324
 325    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 326       hexdigits.
 327
 328    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 329       "+".
 330
 331    ...except that this code conflates the two steps, and decides
 332    whether to encode, decode, or pass through each character in turn.
 333    The function still uses two passes, but their logic is the same --
 334    the first pass exists merely for the sake of allocation.  Another
 335    small difference is that we include `+' to URL_RESERVED.
 336
 337    Anon's test case:
 338
 339    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 340    ->
 341    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 342
 343    Simpler test cases:
 344
 345    "foo bar"         -> "foo%20bar"
 346    "foo%20bar"       -> "foo%20bar"
 347    "foo %20bar"      -> "foo%20%20bar"
 348    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 349    "foo%25%20bar"    -> "foo%25%20bar"
 350    "foo%2%20bar"     -> "foo%252%20bar"
 351    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 352    "foo%2b+bar"      -> "foo%2b+bar"  */
 353
 354 static char *
 355 reencode_string (const char *s)
 356 {
 357   const char *p1;
 358   char *newstr, *p2;
 359   int oldlen, newlen;
 360
 361   int encode_count = 0;
 362   int decode_count = 0;
 363
 364   /* First, pass through the string to see if there's anything to do,
 365      and to calculate the new length.  */
 366   for (p1 = s; *p1; p1++)
 367     {
 368       switch (decide_copy_method (p1))
 369         {
 370         case CM_ENCODE:
 371           ++encode_count;
 372           break;
 373         case CM_DECODE:
 374           ++decode_count;
 375           break;
 376         case CM_PASSTHROUGH:
 377           break;
 378         }
 379     }
 380
 381   if (!encode_count && !decode_count)
 382     /* The string is good as it is. */
 383     return (char *)s;           /* C const model sucks. */
 384
 385   oldlen = p1 - s;
 386   /* Each encoding adds two characters (hex digits), while each
 387      decoding removes two characters.  */
 388   newlen = oldlen + 2 * (encode_count - decode_count);
 389   newstr = xmalloc (newlen + 1);
 390
 391   p1 = s;
 392   p2 = newstr;
 393
 394   while (*p1)
 395     {
 396       switch (decide_copy_method (p1))
 397         {
 398         case CM_ENCODE:
 399           {
 400             unsigned char c = *p1++;
 401             *p2++ = '%';
 402             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 403             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 404           }
 405           break;
 406         case CM_DECODE:
 407           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 408                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 409           p1 += 3;              /* skip %xx */
 410           break;
 411         case CM_PASSTHROUGH:
 412           *p2++ = *p1++;
 413         }
 414     }
 415   *p2 = '\0';
 416   assert (p2 - newstr == newlen);
 417   return newstr;
 418 }
 419
 420 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 421    free PTR_VAR and make it point to the new storage.  Obviously,
 422    PTR_VAR needs to be an lvalue.  */
 423
 424 #define REENCODE(ptr_var) do {                  \
 425   char *rf_new = reencode_string (ptr_var);     \
 426   if (rf_new != ptr_var)                        \
 427     {                                           \
 428       xfree (ptr_var);                          \
 429       ptr_var = rf_new;                         \
 430     }                                           \
 431 } while (0)
 432 \f
 433 /* Returns the scheme type if the scheme is supported, or
 434    SCHEME_INVALID if not.  */
 435 enum url_scheme
 436 url_scheme (const char *url)
 437 {
 438   int i;
 439
 440   for (i = 0; supported_schemes[i].leading_string; i++)
 441     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 442                           strlen (supported_schemes[i].leading_string)))
 443       {
 444         if (supported_schemes[i].enabled)
 445           return (enum url_scheme) i;
 446         else
 447           return SCHEME_INVALID;
 448       }
 449
 450   return SCHEME_INVALID;
 451 }
 452
 453 /* Return the number of characters needed to skip the scheme part of
 454    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 455 int
 456 url_skip_scheme (const char *url)
 457 {
 458   const char *p = url;
 459
 460   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 461      etc. */
 462   while (ISALNUM (*p) || *p == '-' || *p == '+')
 463     ++p;
 464   if (*p != ':')
 465     return 0;
 466   /* Skip ':'. */
 467   ++p;
 468
 469   /* Skip "//" if found. */
 470   if (*p == '/' && *(p + 1) == '/')
 471     p += 2;
 472
 473   return p - url;
 474 }
 475
 476 /* Returns 1 if the URL begins with a scheme (supported or
 477    unsupported), 0 otherwise.  */
 478 int
 479 url_has_scheme (const char *url)
 480 {
 481   const char *p = url;
 482   while (ISALNUM (*p) || *p == '-' || *p == '+')
 483     ++p;
 484   return *p == ':';
 485 }
 486
 487 int
 488 scheme_default_port (enum url_scheme scheme)
 489 {
 490   return supported_schemes[scheme].default_port;
 491 }
 492
 493 void
 494 scheme_disable (enum url_scheme scheme)
 495 {
 496   supported_schemes[scheme].enabled = 0;
 497 }
 498
 499 /* Skip the username and password, if present here.  The function
 500    should be called *not* with the complete URL, but with the part
 501    right after the scheme.
 502
 503    If no username and password are found, return 0.  */
 504 int
 505 url_skip_uname (const char *url)
 506 {
 507   const char *p;
 508
 509   /* Look for '@' that comes before '/' or '?'. */
 510   p = (const char *)strpbrk (url, "/?@");
 511   if (!p || *p != '@')
 512     return 0;
 513
 514   return p - url + 1;
 515 }
 516
 517 static int
 518 parse_uname (const char *str, int len, char **user, char **passwd)
 519 {
 520   char *colon;
 521
 522   if (len == 0)
 523     /* Empty user name not allowed. */
 524     return 0;
 525
 526   colon = memchr (str, ':', len);
 527   if (colon == str)
 528     /* Empty user name again. */
 529     return 0;
 530
 531   if (colon)
 532     {
 533       int pwlen = len - (colon + 1 - str);
 534       *passwd = xmalloc (pwlen + 1);
 535       memcpy (*passwd, colon + 1, pwlen);
 536       (*passwd)[pwlen] = '\0';
 537       len -= pwlen + 1;
 538     }
 539   else
 540     *passwd = NULL;
 541
 542   *user = xmalloc (len + 1);
 543   memcpy (*user, str, len);
 544   (*user)[len] = '\0';
 545
 546   if (*user)
 547     decode_string (*user);
 548   if (*passwd)
 549     decode_string (*passwd);
 550
 551   return 1;
 552 }
 553
 554 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 555    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 556
 557    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 558    www.foo.com[:port]            -> http://www.foo.com[:port]
 559
 560    FTP shorthands look like this:
 561
 562    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 563    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 564
 565    If the URL needs not or cannot be rewritten, return NULL.  */
 566 char *
 567 rewrite_shorthand_url (const char *url)
 568 {
 569   const char *p;
 570
 571   if (url_has_scheme (url))
 572     return NULL;
 573
 574   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 575      latter Netscape.  */
 576   for (p = url; *p && *p != ':' && *p != '/'; p++)
 577     ;
 578
 579   if (p == url)
 580     return NULL;
 581
 582   if (*p == ':')
 583     {
 584       const char *pp;
 585       char *res;
 586       /* If the characters after the colon and before the next slash
 587          or end of string are all digits, it's HTTP.  */
 588       int digits = 0;
 589       for (pp = p + 1; ISDIGIT (*pp); pp++)
 590         ++digits;
 591       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 592         goto http;
 593
 594       /* Prepend "ftp://" to the entire URL... */
 595       res = xmalloc (6 + strlen (url) + 1);
 596       sprintf (res, "ftp://%s", url);
 597       /* ...and replace ':' with '/'. */
 598       res[6 + (p - url)] = '/';
 599       return res;
 600     }
 601   else
 602     {
 603       char *res;
 604     http:
 605       /* Just prepend "http://" to what we have. */
 606       res = xmalloc (7 + strlen (url) + 1);
 607       sprintf (res, "http://%s", url);
 608       return res;
 609     }
 610 }
 611 \f
 612 static void parse_path PARAMS ((const char *, char **, char **));
 613
 614 static char *
 615 strpbrk_or_eos (const char *s, const char *accept)
 616 {
 617   char *p = strpbrk (s, accept);
 618   if (!p)
 619     p = (char *)s + strlen (s);
 620   return p;
 621 }
 622
 623 /* Turn STR into lowercase; return non-zero if a character was
 624    actually changed. */
 625
 626 static int
 627 lowercase_str (char *str)
 628 {
 629   int change = 0;
 630   for (; *str; str++)
 631     if (ISUPPER (*str))
 632       {
 633         change = 1;
 634         *str = TOLOWER (*str);
 635       }
 636   return change;
 637 }
 638
 639 static char *parse_errors[] = {
 640 #define PE_NO_ERROR                     0
 641   "No error",
 642 #define PE_UNSUPPORTED_SCHEME           1
 643   "Unsupported scheme",
 644 #define PE_EMPTY_HOST                   2
 645   "Empty host",
 646 #define PE_BAD_PORT_NUMBER              3
 647   "Bad port number",
 648 #define PE_INVALID_USER_NAME            4
 649   "Invalid user name",
 650 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 651   "Unterminated IPv6 numeric address",
 652 #define PE_INVALID_IPV6_ADDRESS         6
 653   "Invalid IPv6 numeric address"
 654 };
 655
 656 #define SETERR(p, v) do {                       \
 657   if (p)                                        \
 658     *(p) = (v);                                 \
 659 } while (0)
 660
 661 /* The following two functions were adapted from glibc. */
 662
 663 static int
 664 is_valid_ipv4_address (const char *str, const char *end)
 665 {
 666   int saw_digit, octets;
 667   int val;
 668
 669   saw_digit = 0;
 670   octets = 0;
 671   val = 0;
 672
 673   while (str < end) {
 674     int ch = *str++;
 675
 676     if (ch >= '0' && ch <= '9') {
 677       val = val * 10 + (ch - '0');
 678
 679       if (val > 255)
 680         return 0;
 681       if (saw_digit == 0) {
 682         if (++octets > 4)
 683           return 0;
 684         saw_digit = 1;
 685       }
 686     } else if (ch == '.' && saw_digit == 1) {
 687       if (octets == 4)
 688         return 0;
 689       val = 0;
 690       saw_digit = 0;
 691     } else
 692       return 0;
 693   }
 694   if (octets < 4)
 695     return 0;
 696
 697   return 1;
 698 }
 699
 700 static int
 701 is_valid_ipv6_address (const char *str, const char *end)
 702 {
 703   static const char xdigits[] = "0123456789abcdef";
 704   const char *curtok;
 705   int tp;
 706   const char *colonp;
 707   int saw_xdigit;
 708   unsigned int val;
 709
 710   tp = 0;
 711   colonp = NULL;
 712
 713   if (str == end)
 714     return 0;
 715
 716   /* Leading :: requires some special handling. */
 717   if (*str == ':')
 718     {
 719       ++str;
 720       if (str == end || *str != ':')
 721         return 0;
 722     }
 723
 724   curtok = str;
 725   saw_xdigit = 0;
 726   val = 0;
 727
 728   while (str < end) {
 729     int ch = *str++;
 730     const char *pch;
 731
 732     /* if ch is a number, add it to val. */
 733     pch = strchr(xdigits, ch);
 734     if (pch != NULL) {
 735       val <<= 4;
 736       val |= (pch - xdigits);
 737       if (val > 0xffff)
 738         return 0;
 739       saw_xdigit = 1;
 740       continue;
 741     }
 742
 743     /* if ch is a colon ... */
 744     if (ch == ':') {
 745       curtok = str;
 746       if (saw_xdigit == 0) {
 747         if (colonp != NULL)
 748           return 0;
 749         colonp = str + tp;
 750         continue;
 751       } else if (str == end) {
 752         return 0;
 753       }
 754       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 755         return 0;
 756       tp += NS_INT16SZ;
 757       saw_xdigit = 0;
 758       val = 0;
 759       continue;
 760     }
 761
 762     /* if ch is a dot ... */
 763     if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
 764         is_valid_ipv4_address(curtok, end) == 1) {
 765       tp += NS_INADDRSZ;
 766       saw_xdigit = 0;
 767       break;
 768     }
 769
 770     return 0;
 771   }
 772
 773   if (saw_xdigit == 1) {
 774     if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
 775       return 0;
 776     tp += NS_INT16SZ;
 777   }
 778
 779   if (colonp != NULL) {
 780     if (tp == NS_IN6ADDRSZ)
 781       return 0;
 782     tp = NS_IN6ADDRSZ;
 783   }
 784
 785   if (tp != NS_IN6ADDRSZ)
 786     return 0;
 787
 788   return 1;
 789 }
 790
 791
 792 /* Parse a URL.
 793
 794    Return a new struct url if successful, NULL on error.  In case of
 795    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 796    error code. */
 797 struct url *
 798 url_parse (const char *url, int *error)
 799 {
 800   struct url *u;
 801   const char *p;
 802   int path_modified, host_modified;
 803
 804   enum url_scheme scheme;
 805
 806   const char *uname_b,     *uname_e;
 807   const char *host_b,      *host_e;
 808   const char *path_b,      *path_e;
 809   const char *params_b,    *params_e;
 810   const char *query_b,     *query_e;
 811   const char *fragment_b,  *fragment_e;
 812
 813   int port;
 814   char *user = NULL, *passwd = NULL;
 815
 816   char *url_encoded;
 817
 818   scheme = url_scheme (url);
 819   if (scheme == SCHEME_INVALID)
 820     {
 821       SETERR (error, PE_UNSUPPORTED_SCHEME);
 822       return NULL;
 823     }
 824
 825   url_encoded = reencode_string (url);
 826   p = url_encoded;
 827
 828   p += strlen (supported_schemes[scheme].leading_string);
 829   uname_b = p;
 830   p += url_skip_uname (p);
 831   uname_e = p;
 832
 833   /* scheme://user:pass@host[:port]... */
 834   /*                    ^              */
 835
 836   /* We attempt to break down the URL into the components path,
 837      params, query, and fragment.  They are ordered like this:
 838
 839        scheme://host[:port][/path][;params][?query][#fragment]  */
 840
 841   params_b   = params_e   = NULL;
 842   query_b    = query_e    = NULL;
 843   fragment_b = fragment_e = NULL;
 844
 845   host_b = p;
 846
 847   if (*p == '[')
 848     {
 849       /* Handle IPv6 address inside square brackets.  Ideally we'd
 850          just look for the terminating ']', but rfc2732 mandates
 851          rejecting invalid IPv6 addresses.  */
 852
 853       /* The address begins after '['. */
 854       host_b = p + 1;
 855       host_e = strchr (host_b, ']');
 856
 857       if (!host_e)
 858         {
 859           SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
 860           return NULL;
 861         }
 862
 863       /* Check if the IPv6 address is valid. */
 864       if (!is_valid_ipv6_address(host_b, host_e))
 865         {
 866           SETERR (error, PE_INVALID_IPV6_ADDRESS);
 867           return NULL;
 868         }
 869
 870       /* Continue parsing after the closing ']'. */
 871       p = host_e + 1;
 872     }
 873   else
 874     {
 875       p = strpbrk_or_eos (p, ":/;?#");
 876       host_e = p;
 877     }
 878
 879   if (host_b == host_e)
 880     {
 881       SETERR (error, PE_EMPTY_HOST);
 882       return NULL;
 883     }
 884
 885   port = scheme_default_port (scheme);
 886   if (*p == ':')
 887     {
 888       const char *port_b, *port_e, *pp;
 889
 890       /* scheme://host:port/tralala */
 891       /*              ^             */
 892       ++p;
 893       port_b = p;
 894       p = strpbrk_or_eos (p, "/;?#");
 895       port_e = p;
 896
 897       if (port_b == port_e)
 898         {
 899           /* http://host:/whatever */
 900           /*             ^         */
 901           SETERR (error, PE_BAD_PORT_NUMBER);
 902           return NULL;
 903         }
 904
 905       for (port = 0, pp = port_b; pp < port_e; pp++)
 906         {
 907           if (!ISDIGIT (*pp))
 908             {
 909               /* http://host:12randomgarbage/blah */
 910               /*               ^                  */
 911               SETERR (error, PE_BAD_PORT_NUMBER);
 912               return NULL;
 913             }
 914
 915           port = 10 * port + (*pp - '0');
 916         }
 917     }
 918
 919   if (*p == '/')
 920     {
 921       ++p;
 922       path_b = p;
 923       p = strpbrk_or_eos (p, ";?#");
 924       path_e = p;
 925     }
 926   else
 927     {
 928       /* Path is not allowed not to exist. */
 929       path_b = path_e = p;
 930     }
 931
 932   if (*p == ';')
 933     {
 934       ++p;
 935       params_b = p;
 936       p = strpbrk_or_eos (p, "?#");
 937       params_e = p;
 938     }
 939   if (*p == '?')
 940     {
 941       ++p;
 942       query_b = p;
 943       p = strpbrk_or_eos (p, "#");
 944       query_e = p;
 945
 946       /* Hack that allows users to use '?' (a wildcard character) in
 947          FTP URLs without it being interpreted as a query string
 948          delimiter.  */
 949       if (scheme == SCHEME_FTP)
 950         {
 951           query_b = query_e = NULL;
 952           path_e = p;
 953         }
 954     }
 955   if (*p == '#')
 956     {
 957       ++p;
 958       fragment_b = p;
 959       p += strlen (p);
 960       fragment_e = p;
 961     }
 962   assert (*p == 0);
 963
 964   if (uname_b != uname_e)
 965     {
 966       /* http://user:pass@host */
 967       /*        ^         ^    */
 968       /*     uname_b   uname_e */
 969       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 970         {
 971           SETERR (error, PE_INVALID_USER_NAME);
 972           return NULL;
 973         }
 974     }
 975
 976   u = (struct url *)xmalloc (sizeof (struct url));
 977   memset (u, 0, sizeof (*u));
 978
 979   u->scheme = scheme;
 980   u->host   = strdupdelim (host_b, host_e);
 981   u->port   = port;
 982   u->user   = user;
 983   u->passwd = passwd;
 984
 985   u->path = strdupdelim (path_b, path_e);
 986   path_modified = path_simplify (u->path);
 987   parse_path (u->path, &u->dir, &u->file);
 988
 989   host_modified = lowercase_str (u->host);
 990
 991   if (params_b)
 992     u->params = strdupdelim (params_b, params_e);
 993   if (query_b)
 994     u->query = strdupdelim (query_b, query_e);
 995   if (fragment_b)
 996     u->fragment = strdupdelim (fragment_b, fragment_e);
 997
 998   if (path_modified || u->fragment || host_modified || path_b == path_e)
 999     {
1000       /* If we suspect that a transformation has rendered what
1001          url_string might return different from URL_ENCODED, rebuild
1002          u->url using url_string.  */
1003       u->url = url_string (u, 0);
1004
1005       if (url_encoded != url)
1006         xfree ((char *) url_encoded);
1007     }
1008   else
1009     {
1010       if (url_encoded == url)
1011         u->url    = xstrdup (url);
1012       else
1013         u->url    = url_encoded;
1014     }
1015   url_encoded = NULL;
1016
1017   return u;
1018 }
1019
1020 const char *
1021 url_error (int error_code)
1022 {
1023   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1024   return parse_errors[error_code];
1025 }
1026
1027 static void
1028 parse_path (const char *quoted_path, char **dir, char **file)
1029 {
1030   char *path, *last_slash;
1031
1032   STRDUP_ALLOCA (path, quoted_path);
1033   decode_string (path);
1034
1035   last_slash = strrchr (path, '/');
1036   if (!last_slash)
1037     {
1038       *dir = xstrdup ("");
1039       *file = xstrdup (path);
1040     }
1041   else
1042     {
1043       *dir = strdupdelim (path, last_slash);
1044       *file = xstrdup (last_slash + 1);
1045     }
1046 }
1047
1048 /* Note: URL's "full path" is the path with the query string and
1049    params appended.  The "fragment" (#foo) is intentionally ignored,
1050    but that might be changed.  For example, if the original URL was
1051    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1052    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1053
1054 /* Return the length of the full path, without the terminating
1055    zero.  */
1056
1057 static int
1058 full_path_length (const struct url *url)
1059 {
1060   int len = 0;
1061
1062 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1063
1064   FROB (path);
1065   FROB (params);
1066   FROB (query);
1067
1068 #undef FROB
1069
1070   return len;
1071 }
1072
1073 /* Write out the full path. */
1074
1075 static void
1076 full_path_write (const struct url *url, char *where)
1077 {
1078 #define FROB(el, chr) do {                      \
1079   char *f_el = url->el;                         \
1080   if (f_el) {                                   \
1081     int l = strlen (f_el);                      \
1082     *where++ = chr;                             \
1083     memcpy (where, f_el, l);                    \
1084     where += l;                                 \
1085   }                                             \
1086 } while (0)
1087
1088   FROB (path, '/');
1089   FROB (params, ';');
1090   FROB (query, '?');
1091
1092 #undef FROB
1093 }
1094
1095 /* Public function for getting the "full path".  E.g. if u->path is
1096    "foo/bar" and u->query is "param=value", full_path will be
1097    "/foo/bar?param=value". */
1098
1099 char *
1100 url_full_path (const struct url *url)
1101 {
1102   int length = full_path_length (url);
1103   char *full_path = (char *)xmalloc(length + 1);
1104
1105   full_path_write (url, full_path);
1106   full_path[length] = '\0';
1107
1108   return full_path;
1109 }
1110
1111 /* Sync u->path and u->url with u->dir and u->file. */
1112
1113 static void
1114 sync_path (struct url *url)
1115 {
1116   char *newpath;
1117
1118   xfree (url->path);
1119
1120   if (!*url->dir)
1121     {
1122       newpath = xstrdup (url->file);
1123       REENCODE (newpath);
1124     }
1125   else
1126     {
1127       int dirlen = strlen (url->dir);
1128       int filelen = strlen (url->file);
1129
1130       newpath = xmalloc (dirlen + 1 + filelen + 1);
1131       memcpy (newpath, url->dir, dirlen);
1132       newpath[dirlen] = '/';
1133       memcpy (newpath + dirlen + 1, url->file, filelen);
1134       newpath[dirlen + 1 + filelen] = '\0';
1135       REENCODE (newpath);
1136     }
1137
1138   url->path = newpath;
1139
1140   /* Synchronize u->url. */
1141   xfree (url->url);
1142   url->url = url_string (url, 0);
1143 }
1144
1145 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1146    This way we can sync u->path and u->url when they get changed.  */
1147
1148 void
1149 url_set_dir (struct url *url, const char *newdir)
1150 {
1151   xfree (url->dir);
1152   url->dir = xstrdup (newdir);
1153   sync_path (url);
1154 }
1155
1156 void
1157 url_set_file (struct url *url, const char *newfile)
1158 {
1159   xfree (url->file);
1160   url->file = xstrdup (newfile);
1161   sync_path (url);
1162 }
1163
1164 void
1165 url_free (struct url *url)
1166 {
1167   xfree (url->host);
1168   xfree (url->path);
1169   xfree (url->url);
1170
1171   FREE_MAYBE (url->params);
1172   FREE_MAYBE (url->query);
1173   FREE_MAYBE (url->fragment);
1174   FREE_MAYBE (url->user);
1175   FREE_MAYBE (url->passwd);
1176
1177   xfree (url->dir);
1178   xfree (url->file);
1179
1180   xfree (url);
1181 }
1182 \f
1183 struct urlpos *
1184 get_urls_file (const char *file)
1185 {
1186   struct file_memory *fm;
1187   struct urlpos *head, *tail;
1188   const char *text, *text_end;
1189
1190   /* Load the file.  */
1191   fm = read_file (file);
1192   if (!fm)
1193     {
1194       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1195       return NULL;
1196     }
1197   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1198
1199   head = tail = NULL;
1200   text = fm->content;
1201   text_end = fm->content + fm->length;
1202   while (text < text_end)
1203     {
1204       const char *line_beg = text;
1205       const char *line_end = memchr (text, '\n', text_end - text);
1206       if (!line_end)
1207         line_end = text_end;
1208       else
1209         ++line_end;
1210       text = line_end;
1211
1212       /* Strip whitespace from the beginning and end of line. */
1213       while (line_beg < line_end && ISSPACE (*line_beg))
1214         ++line_beg;
1215       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1216         --line_end;
1217
1218       if (line_end > line_beg)
1219         {
1220           /* URL is in the [line_beg, line_end) region. */
1221
1222           int up_error_code;
1223           char *url_text;
1224           struct urlpos *entry;
1225           struct url *url;
1226
1227           /* We must copy the URL to a zero-terminated string, and we
1228              can't use alloca because we're in a loop.  *sigh*.  */
1229           url_text = strdupdelim (line_beg, line_end);
1230
1231           if (opt.base_href)
1232             {
1233               /* Merge opt.base_href with URL. */
1234               char *merged = uri_merge (opt.base_href, url_text);
1235               xfree (url_text);
1236               url_text = merged;
1237             }
1238
1239           url = url_parse (url_text, &up_error_code);
1240           if (!url)
1241             {
1242               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1243                          file, url_text, url_error (up_error_code));
1244               xfree (url_text);
1245               continue;
1246             }
1247           xfree (url_text);
1248
1249           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1250           memset (entry, 0, sizeof (*entry));
1251           entry->next = NULL;
1252           entry->url = url;
1253
1254           if (!head)
1255             head = entry;
1256           else
1257             tail->next = entry;
1258           tail = entry;
1259         }
1260     }
1261   read_file_free (fm);
1262   return head;
1263 }
1264 \f
1265 /* Free the linked list of urlpos.  */
1266 void
1267 free_urlpos (struct urlpos *l)
1268 {
1269   while (l)
1270     {
1271       struct urlpos *next = l->next;
1272       if (l->url)
1273         url_free (l->url);
1274       FREE_MAYBE (l->local_name);
1275       xfree (l);
1276       l = next;
1277     }
1278 }
1279
1280 /* Rotate FNAME opt.backups times */
1281 void
1282 rotate_backups(const char *fname)
1283 {
1284   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1285   char *from = (char *)alloca (maxlen);
1286   char *to = (char *)alloca (maxlen);
1287   struct stat sb;
1288   int i;
1289
1290   if (stat (fname, &sb) == 0)
1291     if (S_ISREG (sb.st_mode) == 0)
1292       return;
1293
1294   for (i = opt.backups; i > 1; i--)
1295     {
1296       sprintf (from, "%s.%d", fname, i - 1);
1297       sprintf (to, "%s.%d", fname, i);
1298       /* #### This will fail on machines without the rename() system
1299          call.  */
1300       rename (from, to);
1301     }
1302
1303   sprintf (to, "%s.%d", fname, 1);
1304   rename(fname, to);
1305 }
1306
1307 /* Create all the necessary directories for PATH (a file).  Calls
1308    mkdirhier() internally.  */
1309 int
1310 mkalldirs (const char *path)
1311 {
1312   const char *p;
1313   char *t;
1314   struct stat st;
1315   int res;
1316
1317   p = path + strlen (path);
1318   for (; *p != '/' && p != path; p--);
1319   /* Don't create if it's just a file.  */
1320   if ((p == path) && (*p != '/'))
1321     return 0;
1322   t = strdupdelim (path, p);
1323   /* Check whether the directory exists.  */
1324   if ((stat (t, &st) == 0))
1325     {
1326       if (S_ISDIR (st.st_mode))
1327         {
1328           xfree (t);
1329           return 0;
1330         }
1331       else
1332         {
1333           /* If the dir exists as a file name, remove it first.  This
1334              is *only* for Wget to work with buggy old CERN http
1335              servers.  Here is the scenario: When Wget tries to
1336              retrieve a directory without a slash, e.g.
1337              http://foo/bar (bar being a directory), CERN server will
1338              not redirect it too http://foo/bar/ -- it will generate a
1339              directory listing containing links to bar/file1,
1340              bar/file2, etc.  Wget will lose because it saves this
1341              HTML listing to a file `bar', so it cannot create the
1342              directory.  To work around this, if the file of the same
1343              name exists, we just remove it and create the directory
1344              anyway.  */
1345           DEBUGP (("Removing %s because of directory danger!\n", t));
1346           unlink (t);
1347         }
1348     }
1349   res = make_directory (t);
1350   if (res != 0)
1351     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1352   xfree (t);
1353   return res;
1354 }
1355
1356 static int
1357 count_slashes (const char *s)
1358 {
1359   int i = 0;
1360   while (*s)
1361     if (*s++ == '/')
1362       ++i;
1363   return i;
1364 }
1365
1366 /* Return the path name of the URL-equivalent file name, with a
1367    remote-like structure of directories.  */
1368 static char *
1369 mkstruct (const struct url *u)
1370 {
1371   char *dir, *file;
1372   char *res, *dirpref;
1373   int l;
1374
1375   if (opt.cut_dirs)
1376     {
1377       char *ptr = u->dir + (*u->dir == '/');
1378       int slash_count = 1 + count_slashes (ptr);
1379       int cut = MINVAL (opt.cut_dirs, slash_count);
1380       for (; cut && *ptr; ptr++)
1381         if (*ptr == '/')
1382           --cut;
1383       STRDUP_ALLOCA (dir, ptr);
1384     }
1385   else
1386     dir = u->dir + (*u->dir == '/');
1387
1388   /* Check for the true name (or at least a consistent name for saving
1389      to directory) of HOST, reusing the hlist if possible.  */
1390   if (opt.add_hostdir)
1391     {
1392       /* Add dir_prefix and hostname (if required) to the beginning of
1393          dir.  */
1394       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1395                                 + strlen (u->host)
1396                                 + 1 + numdigit (u->port)
1397                                 + 1);
1398       if (!DOTP (opt.dir_prefix))
1399         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1400       else
1401         strcpy (dirpref, u->host);
1402
1403       if (u->port != scheme_default_port (u->scheme))
1404         {
1405           int len = strlen (dirpref);
1406           dirpref[len] = ':';
1407           number_to_string (dirpref + len + 1, u->port);
1408         }
1409     }
1410   else                          /* not add_hostdir */
1411     {
1412       if (!DOTP (opt.dir_prefix))
1413         dirpref = opt.dir_prefix;
1414       else
1415         dirpref = "";
1416     }
1417
1418   /* If there is a prefix, prepend it.  */
1419   if (*dirpref)
1420     {
1421       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1422       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1423       dir = newdir;
1424     }
1425
1426   l = strlen (dir);
1427   if (l && dir[l - 1] == '/')
1428     dir[l - 1] = '\0';
1429
1430   if (!*u->file)
1431     file = "index.html";
1432   else
1433     file = u->file;
1434
1435   /* Finally, construct the full name.  */
1436   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1437                          + 1);
1438   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1439
1440   return res;
1441 }
1442
1443 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1444    an escaped query string.  The trick is to make sure that unsafe
1445    characters in BASE are escaped, and that slashes in QUERY are also
1446    escaped.  */
1447
1448 static char *
1449 compose_file_name (char *base, char *query)
1450 {
1451   char result[256];
1452   char *from;
1453   char *to = result;
1454
1455   /* Copy BASE to RESULT and encode all unsafe characters.  */
1456   from = base;
1457   while (*from && to - result < sizeof (result))
1458     {
1459       if (UNSAFE_CHAR (*from))
1460         {
1461           unsigned char c = *from++;
1462           *to++ = '%';
1463           *to++ = XDIGIT_TO_XCHAR (c >> 4);
1464           *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1465         }
1466       else
1467         *to++ = *from++;
1468     }
1469
1470   if (query && to - result < sizeof (result))
1471     {
1472       *to++ = '?';
1473
1474       /* Copy QUERY to RESULT and encode all '/' characters. */
1475       from = query;
1476       while (*from && to - result < sizeof (result))
1477         {
1478           if (*from == '/')
1479             {
1480               *to++ = '%';
1481               *to++ = '2';
1482               *to++ = 'F';
1483               ++from;
1484             }
1485           else
1486             *to++ = *from++;
1487         }
1488     }
1489
1490   if (to - result < sizeof (result))
1491     *to = '\0';
1492   else
1493     /* Truncate input which is too long, presumably due to a huge
1494        query string.  */
1495     result[sizeof (result) - 1] = '\0';
1496
1497   return xstrdup (result);
1498 }
1499
1500 /* Create a unique filename, corresponding to a given URL.  Calls
1501    mkstruct if necessary.  Does *not* actually create any directories.  */
1502 char *
1503 url_filename (const struct url *u)
1504 {
1505   char *file, *name;
1506
1507   char *query = u->query && *u->query ? u->query : NULL;
1508
1509   if (opt.dirstruct)
1510     {
1511       char *base = mkstruct (u);
1512       file = compose_file_name (base, query);
1513       xfree (base);
1514     }
1515   else
1516     {
1517       char *base = *u->file ? u->file : "index.html";
1518       file = compose_file_name (base, query);
1519
1520       /* Check whether the prefix directory is something other than "."
1521          before prepending it.  */
1522       if (!DOTP (opt.dir_prefix))
1523         {
1524           /* #### should just realloc FILE and prepend dir_prefix. */
1525           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1526                                          + 1 + strlen (file) + 1);
1527           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1528           xfree (file);
1529           file = nfile;
1530         }
1531     }
1532
1533   /* DOS-ish file systems don't like `%' signs in them; we change it
1534      to `@'.  */
1535 #ifdef WINDOWS
1536   {
1537     char *p = file;
1538     for (p = file; *p; p++)
1539       if (*p == '%')
1540         *p = '@';
1541   }
1542 #endif /* WINDOWS */
1543
1544   /* Check the cases in which the unique extensions are not used:
1545      1) Clobbering is turned off (-nc).
1546      2) Retrieval with regetting.
1547      3) Timestamping is used.
1548      4) Hierarchy is built.
1549
1550      The exception is the case when file does exist and is a
1551      directory (actually support for bad httpd-s).  */
1552   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1553       && !(file_exists_p (file) && !file_non_directory_p (file)))
1554     return file;
1555
1556   /* Find a unique name.  */
1557   name = unique_name (file);
1558   xfree (file);
1559   return name;
1560 }
1561
1562 /* Return the langth of URL's path.  Path is considered to be
1563    terminated by one of '?', ';', '#', or by the end of the
1564    string.  */
1565 static int
1566 path_length (const char *url)
1567 {
1568   const char *q = strpbrk_or_eos (url, "?;#");
1569   return q - url;
1570 }
1571
1572 /* Find the last occurrence of character C in the range [b, e), or
1573    NULL, if none are present.  This is equivalent to strrchr(b, c),
1574    except that it accepts an END argument instead of requiring the
1575    string to be zero-terminated.  Why is there no memrchr()?  */
1576 static const char *
1577 find_last_char (const char *b, const char *e, char c)
1578 {
1579   for (; e > b; e--)
1580     if (*e == c)
1581       return e;
1582   return NULL;
1583 }
1584 \f
1585 /* Resolve "." and ".." elements of PATH by destructively modifying
1586    PATH.  "." is resolved by removing that path element, and ".." is
1587    resolved by removing the preceding path element.  Leading and
1588    trailing slashes are preserved.
1589
1590    Return non-zero if any changes have been made.
1591
1592    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1593    test examples are provided below.  If you change anything in this
1594    function, run test_path_simplify to make sure you haven't broken a
1595    test case.
1596
1597    A previous version of this function was based on path_simplify()
1598    from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
1599
1600 static int
1601 path_simplify (char *path)
1602 {
1603   int change = 0;
1604   char *p, *end;
1605
1606   if (path[0] == '/')
1607     ++path;                     /* preserve the leading '/'. */
1608
1609   p = path;
1610   end = p + strlen (p) + 1;     /* position past the terminating zero. */
1611
1612   while (1)
1613     {
1614     again:
1615       /* P should point to the beginning of a path element. */
1616
1617       if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1618         {
1619           /* Handle "./foo" by moving "foo" two characters to the
1620              left. */
1621           if (*(p + 1) == '/')
1622             {
1623               change = 1;
1624               memmove (p, p + 2, end - p);
1625               end -= 2;
1626               goto again;
1627             }
1628           else
1629             {
1630               change = 1;
1631               *p = '\0';
1632               break;
1633             }
1634         }
1635       else if (*p == '.' && *(p + 1) == '.'
1636                && (*(p + 2) == '/' || *(p + 2) == '\0'))
1637         {
1638           /* Handle "../foo" by moving "foo" one path element to the
1639              left.  */
1640           char *b = p;          /* not p-1 because P can equal PATH */
1641
1642           /* Backtrack by one path element, but not past the beginning
1643              of PATH. */
1644
1645           /* foo/bar/../baz */
1646           /*         ^ p    */
1647           /*     ^ b        */
1648
1649           if (b > path)
1650             {
1651               /* Move backwards until B hits the beginning of the
1652                  previous path element or the beginning of path. */
1653               for (--b; b > path && *(b - 1) != '/'; b--)
1654                 ;
1655             }
1656
1657           change = 1;
1658           if (*(p + 2) == '/')
1659             {
1660               memmove (b, p + 3, end - (p + 3));
1661               end -= (p + 3) - b;
1662               p = b;
1663             }
1664           else
1665             {
1666               *b = '\0';
1667               break;
1668             }
1669
1670           goto again;
1671         }
1672       else if (*p == '/')
1673         {
1674           /* Remove empty path elements.  Not mandated by rfc1808 et
1675              al, but empty path elements are not all that useful, and
1676              the rest of Wget might not deal with them well. */
1677           char *q = p;
1678           while (*q == '/')
1679             ++q;
1680           change = 1;
1681           if (*q == '\0')
1682             {
1683               *p = '\0';
1684               break;
1685             }
1686           memmove (p, q, end - q);
1687           end -= q - p;
1688           goto again;
1689         }
1690
1691       /* Skip to the next path element. */
1692       while (*p && *p != '/')
1693         ++p;
1694       if (*p == '\0')
1695         break;
1696
1697       /* Make sure P points to the beginning of the next path element,
1698          which is location after the slash. */
1699       ++p;
1700     }
1701
1702   return change;
1703 }
1704 \f
1705 /* Resolve the result of "linking" a base URI (BASE) to a
1706    link-specified URI (LINK).
1707
1708    Either of the URIs may be absolute or relative, complete with the
1709    host name, or path only.  This tries to behave "reasonably" in all
1710    foreseeable cases.  It employs little specific knowledge about
1711    schemes or URL-specific stuff -- it just works on strings.
1712
1713    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1714    See uri_merge for a gentler interface to this functionality.
1715
1716    Perhaps this function should call path_simplify so that the callers
1717    don't have to call url_parse unconditionally.  */
1718 static char *
1719 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1720 {
1721   char *constr;
1722
1723   if (no_scheme)
1724     {
1725       const char *end = base + path_length (base);
1726
1727       if (!*link)
1728         {
1729           /* Empty LINK points back to BASE, query string and all. */
1730           constr = xstrdup (base);
1731         }
1732       else if (*link == '?')
1733         {
1734           /* LINK points to the same location, but changes the query
1735              string.  Examples: */
1736           /* uri_merge("path",         "?new") -> "path?new"     */
1737           /* uri_merge("path?foo",     "?new") -> "path?new"     */
1738           /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1739           /* uri_merge("path#foo",     "?new") -> "path?new"     */
1740           int baselength = end - base;
1741           constr = xmalloc (baselength + linklength + 1);
1742           memcpy (constr, base, baselength);
1743           memcpy (constr + baselength, link, linklength);
1744           constr[baselength + linklength] = '\0';
1745         }
1746       else if (*link == '#')
1747         {
1748           /* uri_merge("path",         "#new") -> "path#new"     */
1749           /* uri_merge("path#foo",     "#new") -> "path#new"     */
1750           /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1751           /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1752           int baselength;
1753           const char *end1 = strchr (base, '#');
1754           if (!end1)
1755             end1 = base + strlen (base);
1756           baselength = end1 - base;
1757           constr = xmalloc (baselength + linklength + 1);
1758           memcpy (constr, base, baselength);
1759           memcpy (constr + baselength, link, linklength);
1760           constr[baselength + linklength] = '\0';
1761         }
1762       else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1763         {
1764           /* LINK begins with "//" and so is a net path: we need to
1765              replace everything after (and including) the double slash
1766              with LINK. */
1767
1768           /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1769           /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1770           /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1771
1772           int span;
1773           const char *slash;
1774           const char *start_insert;
1775
1776           /* Look for first slash. */
1777           slash = memchr (base, '/', end - base);
1778           /* If found slash and it is a double slash, then replace
1779              from this point, else default to replacing from the
1780              beginning.  */
1781           if (slash && *(slash + 1) == '/')
1782             start_insert = slash;
1783           else
1784             start_insert = base;
1785
1786           span = start_insert - base;
1787           constr = (char *)xmalloc (span + linklength + 1);
1788           if (span)
1789             memcpy (constr, base, span);
1790           memcpy (constr + span, link, linklength);
1791           constr[span + linklength] = '\0';
1792         }
1793       else if (*link == '/')
1794         {
1795           /* LINK is an absolute path: we need to replace everything
1796              after (and including) the FIRST slash with LINK.
1797
1798              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1799              "/qux/xyzzy", our result should be
1800              "http://host/qux/xyzzy".  */
1801           int span;
1802           const char *slash;
1803           const char *start_insert = NULL; /* for gcc to shut up. */
1804           const char *pos = base;
1805           int seen_slash_slash = 0;
1806           /* We're looking for the first slash, but want to ignore
1807              double slash. */
1808         again:
1809           slash = memchr (pos, '/', end - pos);
1810           if (slash && !seen_slash_slash)
1811             if (*(slash + 1) == '/')
1812               {
1813                 pos = slash + 2;
1814                 seen_slash_slash = 1;
1815                 goto again;
1816               }
1817
1818           /* At this point, SLASH is the location of the first / after
1819              "//", or the first slash altogether.  START_INSERT is the
1820              pointer to the location where LINK will be inserted.  When
1821              examining the last two examples, keep in mind that LINK
1822              begins with '/'. */
1823
1824           if (!slash && !seen_slash_slash)
1825             /* example: "foo" */
1826             /*           ^    */
1827             start_insert = base;
1828           else if (!slash && seen_slash_slash)
1829             /* example: "http://foo" */
1830             /*                     ^ */
1831             start_insert = end;
1832           else if (slash && !seen_slash_slash)
1833             /* example: "foo/bar" */
1834             /*           ^        */
1835             start_insert = base;
1836           else if (slash && seen_slash_slash)
1837             /* example: "http://something/" */
1838             /*                           ^  */
1839             start_insert = slash;
1840
1841           span = start_insert - base;
1842           constr = (char *)xmalloc (span + linklength + 1);
1843           if (span)
1844             memcpy (constr, base, span);
1845           if (linklength)
1846             memcpy (constr + span, link, linklength);
1847           constr[span + linklength] = '\0';
1848         }
1849       else
1850         {
1851           /* LINK is a relative URL: we need to replace everything
1852              after last slash (possibly empty) with LINK.
1853
1854              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1855              our result should be "whatever/foo/qux/xyzzy".  */
1856           int need_explicit_slash = 0;
1857           int span;
1858           const char *start_insert;
1859           const char *last_slash = find_last_char (base, end, '/');
1860           if (!last_slash)
1861             {
1862               /* No slash found at all.  Append LINK to what we have,
1863                  but we'll need a slash as a separator.
1864
1865                  Example: if base == "foo" and link == "qux/xyzzy", then
1866                  we cannot just append link to base, because we'd get
1867                  "fooqux/xyzzy", whereas what we want is
1868                  "foo/qux/xyzzy".
1869
1870                  To make sure the / gets inserted, we set
1871                  need_explicit_slash to 1.  We also set start_insert
1872                  to end + 1, so that the length calculations work out
1873                  correctly for one more (slash) character.  Accessing
1874                  that character is fine, since it will be the
1875                  delimiter, '\0' or '?'.  */
1876               /* example: "foo?..." */
1877               /*               ^    ('?' gets changed to '/') */
1878               start_insert = end + 1;
1879               need_explicit_slash = 1;
1880             }
1881           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1882             {
1883               /* example: http://host"  */
1884               /*                      ^ */
1885               start_insert = end + 1;
1886               need_explicit_slash = 1;
1887             }
1888           else
1889             {
1890               /* example: "whatever/foo/bar" */
1891               /*                        ^    */
1892               start_insert = last_slash + 1;
1893             }
1894
1895           span = start_insert - base;
1896           constr = (char *)xmalloc (span + linklength + 1);
1897           if (span)
1898             memcpy (constr, base, span);
1899           if (need_explicit_slash)
1900             constr[span - 1] = '/';
1901           if (linklength)
1902             memcpy (constr + span, link, linklength);
1903           constr[span + linklength] = '\0';
1904         }
1905     }
1906   else /* !no_scheme */
1907     {
1908       constr = strdupdelim (link, link + linklength);
1909     }
1910   return constr;
1911 }
1912
1913 /* Merge BASE with LINK and return the resulting URI.  This is an
1914    interface to uri_merge_1 that assumes that LINK is a
1915    zero-terminated string.  */
1916 char *
1917 uri_merge (const char *base, const char *link)
1918 {
1919   return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1920 }
1921 \f
1922 #define APPEND(p, s) do {                       \
1923   int len = strlen (s);                         \
1924   memcpy (p, s, len);                           \
1925   p += len;                                     \
1926 } while (0)
1927
1928 /* Use this instead of password when the actual password is supposed
1929    to be hidden.  We intentionally use a generic string without giving
1930    away the number of characters in the password, like previous
1931    versions did.  */
1932 #define HIDDEN_PASSWORD "*password*"
1933
1934 /* Recreate the URL string from the data in URL.
1935
1936    If HIDE is non-zero (as it is when we're calling this on a URL we
1937    plan to print, but not when calling it to canonicalize a URL for
1938    use within the program), password will be hidden.  Unsafe
1939    characters in the URL will be quoted.  */
1940
1941 char *
1942 url_string (const struct url *url, int hide_password)
1943 {
1944   int size;
1945   char *result, *p;
1946   char *quoted_user = NULL, *quoted_passwd = NULL;
1947
1948   int scheme_port  = supported_schemes[url->scheme].default_port;
1949   char *scheme_str = supported_schemes[url->scheme].leading_string;
1950   int fplen = full_path_length (url);
1951
1952   int brackets_around_host = 0;
1953
1954   assert (scheme_str != NULL);
1955
1956   /* Make sure the user name and password are quoted. */
1957   if (url->user)
1958     {
1959       quoted_user = encode_string_maybe (url->user);
1960       if (url->passwd)
1961         {
1962           if (hide_password)
1963             quoted_passwd = HIDDEN_PASSWORD;
1964           else
1965             quoted_passwd = encode_string_maybe (url->passwd);
1966         }
1967     }
1968
1969   if (strchr (url->host, ':'))
1970     brackets_around_host = 1;
1971
1972   size = (strlen (scheme_str)
1973           + strlen (url->host)
1974           + (brackets_around_host ? 2 : 0)
1975           + fplen
1976           + 1);
1977   if (url->port != scheme_port)
1978     size += 1 + numdigit (url->port);
1979   if (quoted_user)
1980     {
1981       size += 1 + strlen (quoted_user);
1982       if (quoted_passwd)
1983         size += 1 + strlen (quoted_passwd);
1984     }
1985
1986   p = result = xmalloc (size);
1987
1988   APPEND (p, scheme_str);
1989   if (quoted_user)
1990     {
1991       APPEND (p, quoted_user);
1992       if (quoted_passwd)
1993         {
1994           *p++ = ':';
1995           APPEND (p, quoted_passwd);
1996         }
1997       *p++ = '@';
1998     }
1999
2000   if (brackets_around_host)
2001     *p++ = '[';
2002   APPEND (p, url->host);
2003   if (brackets_around_host)
2004     *p++ = ']';
2005   if (url->port != scheme_port)
2006     {
2007       *p++ = ':';
2008       p = number_to_string (p, url->port);
2009     }
2010
2011   full_path_write (url, p);
2012   p += fplen;
2013   *p++ = '\0';
2014
2015   assert (p - result == size);
2016
2017   if (quoted_user && quoted_user != url->user)
2018     xfree (quoted_user);
2019   if (quoted_passwd && !hide_password
2020       && quoted_passwd != url->passwd)
2021     xfree (quoted_passwd);
2022
2023   return result;
2024 }
2025 \f
2026 /* Return the URL of the proxy appropriate for url U.  */
2027 char *
2028 getproxy (struct url *u)
2029 {
2030   char *proxy = NULL;
2031   char *rewritten_url;
2032   static char rewritten_storage[1024];
2033
2034   if (!opt.use_proxy)
2035     return NULL;
2036   if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2037     return NULL;
2038
2039   switch (u->scheme)
2040     {
2041     case SCHEME_HTTP:
2042       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2043       break;
2044 #ifdef HAVE_SSL
2045     case SCHEME_HTTPS:
2046       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2047       break;
2048 #endif
2049     case SCHEME_FTP:
2050       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2051       break;
2052     case SCHEME_INVALID:
2053       break;
2054     }
2055   if (!proxy || !*proxy)
2056     return NULL;
2057
2058   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
2059      getproxy() to return static storage. */
2060   rewritten_url = rewrite_shorthand_url (proxy);
2061   if (rewritten_url)
2062     {
2063       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2064       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2065       proxy = rewritten_storage;
2066     }
2067
2068   return proxy;
2069 }
2070
2071 /* Should a host be accessed through proxy, concerning no_proxy?  */
2072 int
2073 no_proxy_match (const char *host, const char **no_proxy)
2074 {
2075   if (!no_proxy)
2076     return 1;
2077   else
2078     return !sufmatch (no_proxy, host);
2079 }
2080 \f
2081 /* Support for converting links for local viewing in downloaded HTML
2082    files.  This should be moved to another file, because it has
2083    nothing to do with processing URLs.  */
2084
2085 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2086 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2087                                          const char *));
2088 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2089                                                       const char *, int));
2090 static char *local_quote_string PARAMS ((const char *));
2091
2092 /* Change the links in one HTML file.  LINKS is a list of links in the
2093    document, along with their positions and the desired direction of
2094    the conversion.  */
2095 void
2096 convert_links (const char *file, struct urlpos *links)
2097 {
2098   struct file_memory *fm;
2099   FILE *fp;
2100   const char *p;
2101   downloaded_file_t downloaded_file_return;
2102
2103   struct urlpos *link;
2104   int to_url_count = 0, to_file_count = 0;
2105
2106   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2107
2108   {
2109     /* First we do a "dry run": go through the list L and see whether
2110        any URL needs to be converted in the first place.  If not, just
2111        leave the file alone.  */
2112     int dry_count = 0;
2113     struct urlpos *dry = links;
2114     for (dry = links; dry; dry = dry->next)
2115       if (dry->convert != CO_NOCONVERT)
2116         ++dry_count;
2117     if (!dry_count)
2118       {
2119         logputs (LOG_VERBOSE, _("nothing to do.\n"));
2120         return;
2121       }
2122   }
2123
2124   fm = read_file (file);
2125   if (!fm)
2126     {
2127       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2128                  file, strerror (errno));
2129       return;
2130     }
2131
2132   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2133   if (opt.backup_converted && downloaded_file_return)
2134     write_backup_file (file, downloaded_file_return);
2135
2136   /* Before opening the file for writing, unlink the file.  This is
2137      important if the data in FM is mmaped.  In such case, nulling the
2138      file, which is what fopen() below does, would make us read all
2139      zeroes from the mmaped region.  */
2140   if (unlink (file) < 0 && errno != ENOENT)
2141     {
2142       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2143                  file, strerror (errno));
2144       read_file_free (fm);
2145       return;
2146     }
2147   /* Now open the file for writing.  */
2148   fp = fopen (file, "wb");
2149   if (!fp)
2150     {
2151       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2152                  file, strerror (errno));
2153       read_file_free (fm);
2154       return;
2155     }
2156
2157   /* Here we loop through all the URLs in file, replacing those of
2158      them that are downloaded with relative references.  */
2159   p = fm->content;
2160   for (link = links; link; link = link->next)
2161     {
2162       char *url_start = fm->content + link->pos;
2163
2164       if (link->pos >= fm->length)
2165         {
2166           DEBUGP (("Something strange is going on.  Please investigate."));
2167           break;
2168         }
2169       /* If the URL is not to be converted, skip it.  */
2170       if (link->convert == CO_NOCONVERT)
2171         {
2172           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2173           continue;
2174         }
2175
2176       /* Echo the file contents, up to the offending URL's opening
2177          quote, to the outfile.  */
2178       fwrite (p, 1, url_start - p, fp);
2179       p = url_start;
2180
2181       switch (link->convert)
2182         {
2183         case CO_CONVERT_TO_RELATIVE:
2184           /* Convert absolute URL to relative. */
2185           {
2186             char *newname = construct_relative (file, link->local_name);
2187             char *quoted_newname = local_quote_string (newname);
2188
2189             if (!link->link_refresh_p)
2190               p = replace_attr (p, link->size, fp, quoted_newname);
2191             else
2192               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2193                                              link->refresh_timeout);
2194
2195             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2196                      link->url->url, newname, link->pos, file));
2197             xfree (newname);
2198             xfree (quoted_newname);
2199             ++to_file_count;
2200             break;
2201           }
2202         case CO_CONVERT_TO_COMPLETE:
2203           /* Convert the link to absolute URL. */
2204           {
2205             char *newlink = link->url->url;
2206             char *quoted_newlink = html_quote_string (newlink);
2207
2208             if (!link->link_refresh_p)
2209               p = replace_attr (p, link->size, fp, quoted_newlink);
2210             else
2211               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2212                                              link->refresh_timeout);
2213
2214             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2215                      newlink, link->pos, file));
2216             xfree (quoted_newlink);
2217             ++to_url_count;
2218             break;
2219           }
2220         case CO_NULLIFY_BASE:
2221           /* Change the base href to "". */
2222           p = replace_attr (p, link->size, fp, "");
2223           break;
2224         case CO_NOCONVERT:
2225           abort ();
2226           break;
2227         }
2228     }
2229
2230   /* Output the rest of the file. */
2231   if (p - fm->content < fm->length)
2232     fwrite (p, 1, fm->length - (p - fm->content), fp);
2233   fclose (fp);
2234   read_file_free (fm);
2235
2236   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2237 }
2238
2239 /* Construct and return a malloced copy of the relative link from two
2240    pieces of information: local name S1 of the referring file and
2241    local name S2 of the referred file.
2242
2243    So, if S1 is "jagor.srce.hr/index.html" and S2 is
2244    "jagor.srce.hr/images/news.gif", the function will return
2245    "images/news.gif".
2246
2247    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2248    "fly.cc.fer.hr/images/fly.gif", the function will return
2249    "../images/fly.gif".
2250
2251    Caveats: S1 should not begin with `/', unless S2 also begins with
2252    '/'.  S1 should not contain things like ".." and such --
2253    construct_relative ("fly/ioccc/../index.html",
2254    "fly/images/fly.gif") will fail.  (A workaround is to call
2255    something like path_simplify() on S1).  */
2256 static char *
2257 construct_relative (const char *s1, const char *s2)
2258 {
2259   int i, cnt, sepdirs1;
2260   char *res;
2261
2262   if (*s2 == '/')
2263     return xstrdup (s2);
2264   /* S1 should *not* be absolute, if S2 wasn't.  */
2265   assert (*s1 != '/');
2266   i = cnt = 0;
2267   /* Skip the directories common to both strings.  */
2268   while (1)
2269     {
2270       while (s1[i] && s2[i]
2271              && (s1[i] == s2[i])
2272              && (s1[i] != '/')
2273              && (s2[i] != '/'))
2274         ++i;
2275       if (s1[i] == '/' && s2[i] == '/')
2276         cnt = ++i;
2277       else
2278         break;
2279     }
2280   for (sepdirs1 = 0; s1[i]; i++)
2281     if (s1[i] == '/')
2282       ++sepdirs1;
2283   /* Now, construct the file as of:
2284      - ../ repeated sepdirs1 time
2285      - all the non-mutual directories of S2.  */
2286   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2287   for (i = 0; i < sepdirs1; i++)
2288     memcpy (res + 3 * i, "../", 3);
2289   strcpy (res + 3 * i, s2 + cnt);
2290   return res;
2291 }
2292 \f
2293 static void
2294 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2295 {
2296   /* Rather than just writing over the original .html file with the
2297      converted version, save the former to *.orig.  Note we only do
2298      this for files we've _successfully_ downloaded, so we don't
2299      clobber .orig files sitting around from previous invocations. */
2300
2301   /* Construct the backup filename as the original name plus ".orig". */
2302   size_t         filename_len = strlen(file);
2303   char*          filename_plus_orig_suffix;
2304   boolean        already_wrote_backup_file = FALSE;
2305   slist*         converted_file_ptr;
2306   static slist*  converted_files = NULL;
2307
2308   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2309     {
2310       /* Just write "orig" over "html".  We need to do it this way
2311          because when we're checking to see if we've downloaded the
2312          file before (to see if we can skip downloading it), we don't
2313          know if it's a text/html file.  Therefore we don't know yet
2314          at that stage that -E is going to cause us to tack on
2315          ".html", so we need to compare vs. the original URL plus
2316          ".orig", not the original URL plus ".html.orig". */
2317       filename_plus_orig_suffix = alloca (filename_len + 1);
2318       strcpy(filename_plus_orig_suffix, file);
2319       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2320     }
2321   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2322     {
2323       /* Append ".orig" to the name. */
2324       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2325       strcpy(filename_plus_orig_suffix, file);
2326       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2327     }
2328
2329   /* We can get called twice on the same URL thanks to the
2330      convert_all_links() call in main().  If we write the .orig file
2331      each time in such a case, it'll end up containing the first-pass
2332      conversion, not the original file.  So, see if we've already been
2333      called on this file. */
2334   converted_file_ptr = converted_files;
2335   while (converted_file_ptr != NULL)
2336     if (strcmp(converted_file_ptr->string, file) == 0)
2337       {
2338         already_wrote_backup_file = TRUE;
2339         break;
2340       }
2341     else
2342       converted_file_ptr = converted_file_ptr->next;
2343
2344   if (!already_wrote_backup_file)
2345     {
2346       /* Rename <file> to <file>.orig before former gets written over. */
2347       if (rename(file, filename_plus_orig_suffix) != 0)
2348         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2349                    file, filename_plus_orig_suffix, strerror (errno));
2350
2351       /* Remember that we've already written a .orig backup for this file.
2352          Note that we never free this memory since we need it till the
2353          convert_all_links() call, which is one of the last things the
2354          program does before terminating.  BTW, I'm not sure if it would be
2355          safe to just set 'converted_file_ptr->string' to 'file' below,
2356          rather than making a copy of the string...  Another note is that I
2357          thought I could just add a field to the urlpos structure saying
2358          that we'd written a .orig file for this URL, but that didn't work,
2359          so I had to make this separate list.
2360          -- Dan Harkless <wget@harkless.org>
2361
2362          This [adding a field to the urlpos structure] didn't work
2363          because convert_file() is called from convert_all_links at
2364          the end of the retrieval with a freshly built new urlpos
2365          list.
2366          -- Hrvoje Niksic <hniksic@arsdigita.com>
2367       */
2368       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2369       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2370       converted_file_ptr->next = converted_files;
2371       converted_files = converted_file_ptr;
2372     }
2373 }
2374
2375 static int find_fragment PARAMS ((const char *, int, const char **,
2376                                   const char **));
2377
2378 /* Replace an attribute's original text with NEW_TEXT. */
2379
2380 static const char *
2381 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2382 {
2383   int quote_flag = 0;
2384   char quote_char = '\"';       /* use "..." for quoting, unless the
2385                                    original value is quoted, in which
2386                                    case reuse its quoting char. */
2387   const char *frag_beg, *frag_end;
2388
2389   /* Structure of our string is:
2390        "...old-contents..."
2391        <---    size    --->  (with quotes)
2392      OR:
2393        ...old-contents...
2394        <---    size   -->    (no quotes)   */
2395
2396   if (*p == '\"' || *p == '\'')
2397     {
2398       quote_char = *p;
2399       quote_flag = 1;
2400       ++p;
2401       size -= 2;                /* disregard opening and closing quote */
2402     }
2403   putc (quote_char, fp);
2404   fputs (new_text, fp);
2405
2406   /* Look for fragment identifier, if any. */
2407   if (find_fragment (p, size, &frag_beg, &frag_end))
2408     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2409   p += size;
2410   if (quote_flag)
2411     ++p;
2412   putc (quote_char, fp);
2413
2414   return p;
2415 }
2416
2417 /* The same as REPLACE_ATTR, but used when replacing
2418    <meta http-equiv=refresh content="new_text"> because we need to
2419    append "timeout_value; URL=" before the next_text.  */
2420
2421 static const char *
2422 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2423                            const char *new_text, int timeout)
2424 {
2425   /* "0; URL=..." */
2426   char *new_with_timeout = (char *)alloca (numdigit (timeout)
2427                                            + 6 /* "; URL=" */
2428                                            + strlen (new_text)
2429                                            + 1);
2430   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2431
2432   return replace_attr (p, size, fp, new_with_timeout);
2433 }
2434
2435 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2436    preceded by '&'.  If the character is not found, return zero.  If
2437    the character is found, return 1 and set BP and EP to point to the
2438    beginning and end of the region.
2439
2440    This is used for finding the fragment indentifiers in URLs.  */
2441
2442 static int
2443 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2444 {
2445   const char *end = beg + size;
2446   int saw_amp = 0;
2447   for (; beg < end; beg++)
2448     {
2449       switch (*beg)
2450         {
2451         case '&':
2452           saw_amp = 1;
2453           break;
2454         case '#':
2455           if (!saw_amp)
2456             {
2457               *bp = beg;
2458               *ep = end;
2459               return 1;
2460             }
2461           /* fallthrough */
2462         default:
2463           saw_amp = 0;
2464         }
2465     }
2466   return 0;
2467 }
2468
2469 /* Quote FILE for use as local reference to an HTML file.
2470
2471    We quote ? as %3F to avoid passing part of the file name as the
2472    parameter when browsing the converted file through HTTP.  However,
2473    it is safe to do this only when `--html-extension' is turned on.
2474    This is because converting "index.html?foo=bar" to
2475    "index.html%3Ffoo=bar" would break local browsing, as the latter
2476    isn't even recognized as an HTML file!  However, converting
2477    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2478    safe for both local and HTTP-served browsing.  */
2479
2480 static char *
2481 local_quote_string (const char *file)
2482 {
2483   const char *file_sans_qmark;
2484   int qm;
2485
2486   if (!opt.html_extension)
2487     return html_quote_string (file);
2488
2489   qm = count_char (file, '?');
2490
2491   if (qm)
2492     {
2493       const char *from = file;
2494       char *to, *newname;
2495
2496       /* qm * 2 because we replace each question mark with "%3F",
2497          i.e. replace one char with three, hence two more.  */
2498       int fsqlen = strlen (file) + qm * 2;
2499
2500       to = newname = (char *)alloca (fsqlen + 1);
2501       for (; *from; from++)
2502         {
2503           if (*from != '?')
2504             *to++ = *from;
2505           else
2506             {
2507               *to++ = '%';
2508               *to++ = '3';
2509               *to++ = 'F';
2510             }
2511         }
2512       assert (to - newname == fsqlen);
2513       *to = '\0';
2514
2515       file_sans_qmark = newname;
2516     }
2517   else
2518     file_sans_qmark = file;
2519
2520   return html_quote_string (file_sans_qmark);
2521 }
2522
2523 /* We're storing "modes" of type downloaded_file_t in the hash table.
2524    However, our hash tables only accept pointers for keys and values.
2525    So when we need a pointer, we use the address of a
2526    downloaded_file_t variable of static storage.  */
2527
2528 static downloaded_file_t *
2529 downloaded_mode_to_ptr (downloaded_file_t mode)
2530 {
2531   static downloaded_file_t
2532     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2533     v2 = FILE_DOWNLOADED_NORMALLY,
2534     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2535     v4 = CHECK_FOR_FILE;
2536
2537   switch (mode)
2538     {
2539     case FILE_NOT_ALREADY_DOWNLOADED:
2540       return &v1;
2541     case FILE_DOWNLOADED_NORMALLY:
2542       return &v2;
2543     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2544       return &v3;
2545     case CHECK_FOR_FILE:
2546       return &v4;
2547     }
2548   return NULL;
2549 }
2550
2551 /* This should really be merged with dl_file_url_map and
2552    downloaded_html_files in recur.c.  This was originally a list, but
2553    I changed it to a hash table beause it was actually taking a lot of
2554    time to find things in it.  */
2555
2556 static struct hash_table *downloaded_files_hash;
2557
2558 /* Remembers which files have been downloaded.  In the standard case, should be
2559    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2560    download successfully (i.e. not for ones we have failures on or that we skip
2561    due to -N).
2562
2563    When we've downloaded a file and tacked on a ".html" extension due to -E,
2564    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2565    FILE_DOWNLOADED_NORMALLY.
2566
2567    If you just want to check if a file has been previously added without adding
2568    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2569    with local filenames, not remote URLs. */
2570 downloaded_file_t
2571 downloaded_file (downloaded_file_t mode, const char *file)
2572 {
2573   downloaded_file_t *ptr;
2574
2575   if (mode == CHECK_FOR_FILE)
2576     {
2577       if (!downloaded_files_hash)
2578         return FILE_NOT_ALREADY_DOWNLOADED;
2579       ptr = hash_table_get (downloaded_files_hash, file);
2580       if (!ptr)
2581         return FILE_NOT_ALREADY_DOWNLOADED;
2582       return *ptr;
2583     }
2584
2585   if (!downloaded_files_hash)
2586     downloaded_files_hash = make_string_hash_table (0);
2587
2588   ptr = hash_table_get (downloaded_files_hash, file);
2589   if (ptr)
2590     return *ptr;
2591
2592   ptr = downloaded_mode_to_ptr (mode);
2593   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2594
2595   return FILE_NOT_ALREADY_DOWNLOADED;
2596 }
2597
2598 static int
2599 df_free_mapper (void *key, void *value, void *ignored)
2600 {
2601   xfree (key);
2602   return 0;
2603 }
2604
2605 void
2606 downloaded_files_free (void)
2607 {
2608   if (downloaded_files_hash)
2609     {
2610       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2611       hash_table_destroy (downloaded_files_hash);
2612       downloaded_files_hash = NULL;
2613     }
2614 }
2615
2616 /* Return non-zero if scheme a is similar to scheme b.
2617
2618    Schemes are similar if they are equal.  If SSL is supported, schemes
2619    are also similar if one is http (SCHEME_HTTP) and the other is https
2620    (SCHEME_HTTPS).  */
2621 int
2622 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2623 {
2624   if (a == b)
2625     return 1;
2626 #ifdef HAVE_SSL
2627   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2628       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2629     return 1;
2630 #endif
2631   return 0;
2632 }
2633 \f
2634 #if 0
2635 /* Debugging and testing support for path_simplify. */
2636
2637 /* Debug: run path_simplify on PATH and return the result in a new
2638    string.  Useful for calling from the debugger.  */
2639 static char *
2640 ps (char *path)
2641 {
2642   char *copy = xstrdup (path);
2643   path_simplify (copy);
2644   return copy;
2645 }
2646
2647 static void
2648 run_test (char *test, char *expected_result, int expected_change)
2649 {
2650   char *test_copy = xstrdup (test);
2651   int modified = path_simplify (test_copy);
2652
2653   if (0 != strcmp (test_copy, expected_result))
2654     {
2655       printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2656               test, expected_result, test_copy);
2657     }
2658   if (modified != expected_change)
2659     {
2660       if (expected_change == 1)
2661         printf ("Expected no modification with path_simplify(\"%s\").\n",
2662                 test);
2663       else
2664         printf ("Expected modification with path_simplify(\"%s\").\n",
2665                 test);
2666     }
2667   xfree (test_copy);
2668 }
2669
2670 static void
2671 test_path_simplify (void)
2672 {
2673   static struct {
2674     char *test, *result;
2675     int should_modify;
2676   } tests[] = {
2677     { "",               "",             0 },
2678     { ".",              "",             1 },
2679     { "..",             "",             1 },
2680     { "foo",            "foo",          0 },
2681     { "foo/bar",        "foo/bar",      0 },
2682     { "foo///bar",      "foo/bar",      1 },
2683     { "foo/.",          "foo/",         1 },
2684     { "foo/./",         "foo/",         1 },
2685     { "foo./",          "foo./",        0 },
2686     { "foo/../bar",     "bar",          1 },
2687     { "foo/../bar/",    "bar/",         1 },
2688     { "foo/bar/..",     "foo/",         1 },
2689     { "foo/bar/../x",   "foo/x",        1 },
2690     { "foo/bar/../x/",  "foo/x/",       1 },
2691     { "foo/..",         "",             1 },
2692     { "foo/../..",      "",             1 },
2693     { "a/b/../../c",    "c",            1 },
2694     { "./a/../b",       "b",            1 }
2695   };
2696   int i;
2697
2698   for (i = 0; i < ARRAY_SIZE (tests); i++)
2699     {
2700       char *test = tests[i].test;
2701       char *expected_result = tests[i].result;
2702       int   expected_change = tests[i].should_modify;
2703       run_test (test, expected_result, expected_change);
2704     }
2705
2706   /* Now run all the tests with a leading slash before the test case,
2707      to prove that the slash is being preserved.  */
2708   for (i = 0; i < ARRAY_SIZE (tests); i++)
2709     {
2710       char *test, *expected_result;
2711       int expected_change = tests[i].should_modify;
2712
2713       test = xmalloc (1 + strlen (tests[i].test) + 1);
2714       sprintf (test, "/%s", tests[i].test);
2715
2716       expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2717       sprintf (expected_result, "/%s", tests[i].result);
2718
2719       run_test (test, expected_result, expected_change);
2720
2721       xfree (test);
2722       xfree (expected_result);
2723     }
2724 }
2725 #endif