2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Is the string X exactly "." ?  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Is the string X exactly ".." ?  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
/* Supported schemes: table mapping each enum url_scheme value to its
   leading string ("http://", ...) and default port.  Order must match
   the enum, since url_scheme() returns the index as the enum value.
   NOTE(review): this extract is missing the array's opening brace,
   any terminating sentinel entry, and the closing "};" -- confirm
   against the full file.  */
static struct scheme_data supported_schemes[] =
  { "http://", DEFAULT_HTTP_PORT },
  { "https://", DEFAULT_HTTPS_PORT },
  { "ftp://", DEFAULT_FTP_PORT },
71 static char *construct_relative PARAMS ((const char *, const char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Shorthands used only to initialize urlchr_table below.  */
#define R urlchr_reserved
#define U urlchr_unsafe
/* NOTE(review): the table below also uses RU; its definition
   (presumably R|U) is not visible in this extract -- confirm.  */

/* Test character C against bitmask MASK.  The cast to unsigned char
   guards against negative plain-char values being used as an index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* NOTE(review): `const static' would conventionally be written
   `static const'; also the initializer's opening "{" and closing "};"
   are not visible in this extract.  */
const static unsigned char urlchr_table[256] =
  U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
  U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
  U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
  U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
  0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
  0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
  0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
  RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
  0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
  0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
  0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
  U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
  0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
  0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
  0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
  /* All high-bit (non-ASCII) bytes are unsafe.  */
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is left in the string
   unchanged.  Decoding is done in place: the result is never longer
   than the input, so a lagging write pointer suffices.

   NOTE(review): this extract is missing the return type, braces and
   the main loop -- only the fragment below survives.  */
decode_string (char *s)
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare */
      /* Do nothing if '%' is not followed by two hex digits. */
      if (!*(h + 1) || !*(h + 2)
	  || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
      /* Store the decoded byte at the lagging write position.  */
      *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S itself (no allocation) if there
   are no unsafe chars.  Callers must therefore compare the result
   against S before freeing it.

   NOTE(review): declaration line, braces and parts of the copy loop
   (e.g. the '%' write) are missing from this extract.  */
encode_string_maybe (const char *s)
  /* First pass: count the extra bytes needed for %XX escapes.  */
  for (p1 = s; *p1; p1++)
    if (UNSAFE_CHAR (*p1))
      addition += 2;		/* Two more characters (hex digits) */
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);
  /* Second pass: copy, expanding each unsafe byte to %XX.  */
    if (UNSAFE_CHAR (*p1))
      {
	const unsigned char c = *p1++;
	*p2++ = XDIGIT_TO_XCHAR (c >> 4);
	*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
  /* Sanity check: the buffer must be filled exactly.  */
  assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the result is presumably always freshly
   allocated -- the duplication logic is not visible here; confirm.  */
encode_string (const char *s)
  char *encoded = encode_string_maybe (s);

/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  PTR must be an lvalue.
   NOTE(review): the macro's continuation lines are missing from this
   extract.  */
#define ENCODE(ptr) do {			\
  char *e_new = encode_string_maybe (ptr);	\
/* What to do with a character during reencode_string's scan.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.
   NOTE(review): braces and several branches (including the CM_DECODE
   and CM_ENCODE returns) are missing from this extract.  */
static inline enum copy_method
decide_copy_method (const char *p)
  if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
      /* %xx sequence: decode it, unless it would decode to an
	 unsafe or a reserved char; in that case, leave it as is.  */
      char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	XCHAR_TO_XDIGIT (*(p + 2));
      if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
	return CM_PASSTHROUGH;
  /* Garbled %.. sequence: encode `%'. */
  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
  return CM_PASSTHROUGH;
257 /* Translate a %-quoting (but possibly non-conformant) input string S
258 into a %-quoting (and conformant) output string. If no characters
259 are encoded or decoded, return the same string S; otherwise, return
260 a freshly allocated string with the new contents.
262 After a URL has been run through this function, the protocols that
263 use `%' as the quote character can use the resulting string as-is,
264 while those that don't call decode_string() to get to the intended
265 data. This function is also stable: after an input string is
266 transformed the first time, all further transformations of the
267 result yield the same result string.
269 Let's discuss why this function is needed.
271 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
272 space character would mess up the HTTP request, it needs to be
275 GET /abc%20def HTTP/1.0
277 So it appears that the unsafe chars need to be quoted, as with
278 encode_string. But what if we're requested to download
279 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
280 the user meant was a literal space, and he was kind enough to quote
281 it. In that case, Wget should obviously leave the `%20' as is, and
282 send the same request as above. So in this case we may not call
285 But what if the requested URI is `abc%20 def'? If we call
286 encode_string, we end up with `/abc%2520%20def', which is almost
287 certainly not intended. If we don't call encode_string, we are
288 left with the embedded space and cannot send the request. What the
289 user meant was for Wget to request `/abc%20%20def', and this is
290 where reencode_string kicks in.
292 Wget used to solve this by first decoding %-quotes, and then
293 encoding all the "unsafe" characters found in the resulting string.
294 This was wrong because it didn't preserve certain URL special
295 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
296 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
297 whether we considered `+' reserved (it is). One of these results
298 is inevitable because by the second step we would lose information
299 on whether the `+' was originally encoded or not. Both results
300 were wrong because in CGI parameters + means space, while %2B means
301 literal plus. reencode_string correctly translates the above to
302 "a%2B+b", i.e. returns the original string.
304 This function uses an algorithm proposed by Anon Sricharoenchai:
306 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
309 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
312 ...except that this code conflates the two steps, and decides
313 whether to encode, decode, or pass through each character in turn.
314 The function still uses two passes, but their logic is the same --
315 the first pass exists merely for the sake of allocation. Another
small difference is that we include `+' in URL_RESERVED.
320 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
322 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
326 "foo bar" -> "foo%20bar"
327 "foo%20bar" -> "foo%20bar"
328 "foo %20bar" -> "foo%20%20bar"
329 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
330 "foo%25%20bar" -> "foo%25%20bar"
331 "foo%2%20bar" -> "foo%252%20bar"
332 "foo+bar" -> "foo+bar" (plus is reserved!)
333 "foo%2b+bar" -> "foo%2b+bar" */
/* Translate a possibly non-conformant %-quoted string S into a
   conformant one (see the long comment above).  Returns S itself when
   nothing needs changing, otherwise a freshly allocated string.
   NOTE(review): declarations, braces, the switch-case labels and the
   second loop header are missing from this extract.  */
reencode_string (const char *s)
  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    switch (decide_copy_method (p1))
  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters.  */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);
  /* Second pass: copy while encoding/decoding per the same logic.  */
    switch (decide_copy_method (p1))
	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
	  /* Decode: collapse %xx to a single byte.  */
	  *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
		   + (XCHAR_TO_XDIGIT (*(p1 + 2))));
	  p1 += 3;		/* skip %xx */
  /* Sanity check: both passes must agree on the length.  */
  assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue.
   NOTE(review): the macro's trailing lines (free + assignment +
   "} while (0)") are missing from this extract.  */
#define REENCODE(ptr_var) do {			\
  char *rf_new = reencode_string (ptr_var);	\
  if (rf_new != ptr_var)			\
414 /* Returns the scheme type if the scheme is supported, or
415 SCHEME_INVALID if not. */
417 url_scheme (const char *url)
421 for (i = 0; supported_schemes[i].leading_string; i++)
422 if (!strncasecmp (url, supported_schemes[i].leading_string,
423 strlen (supported_schemes[i].leading_string)))
424 return (enum url_scheme)i;
425 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.
   NOTE(review): the lines between the two fragments below -- which
   presumably verify and skip the ':' after the scheme name, returning
   0 otherwise -- are missing from this extract; confirm.  */
url_skip_scheme (const char *url)
  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  A "scheme" here is a leading run of
   alphanumerics (plus `-' and `+', cf. `whois++') followed by ':'.
   NOTE(review): the trailing return was reconstructed from the
   contract comment; the original line is not visible.  */
int
url_has_scheme (const char *url)
{
  const char *p = url;

  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
463 scheme_default_port (enum url_scheme scheme)
465 return supported_schemes[scheme].default_port;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0; otherwise return
   the number of characters to skip, including the '@' separator.  */
static int
url_skip_uname (const char *url)
{
  /* Look for '@' that comes before '/' or '?'.  If '/' or '?' is hit
     first (or nothing is found), there is no user/password part.  */
  const char *p = (const char *) strpbrk (url, "/?@");

  if (p == NULL || *p != '@')
    return 0;
  return p - url + 1;
}
/* Parse the first LEN bytes of STR as "user[:password]" and store
   freshly allocated copies into *USER and *PASSWD.  An empty user
   name is rejected.
   NOTE(review): return type, braces, the early-return paths and the
   final return are missing from this extract -- presumably returns
   nonzero on success (see the caller in url_parse); confirm.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */
  colon = memchr (str, ':', len);
    /* Empty user name again. */
      /* Copy everything after the colon as the password.  */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';
  /* Copy the (possibly truncated-at-colon) user name.  */
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]          -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file        -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file    -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.
   NOTE(review): return type, braces and several early returns are
   missing from this extract.  */
rewrite_shorthand_url (const char *url)
  if (url_has_scheme (url))
  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter HTTP.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
      const char *pp, *path;
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP.  */
      for (pp = p + 1; ISDIGIT (*pp); pp++)
	  && (*pp == '/' || *pp == '\0'))
      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';
  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
578 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but when none of the ACCEPT characters occur in S,
   return a pointer to S's terminating '\0' instead of NULL.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *hit = strpbrk (s, accept);
  /* Cast away const to match strpbrk's (pre-C23) return convention.  */
  return hit ? hit : (char *) s + strlen (s);
}
/* Human-readable messages for the PE_* parse error codes; indexed by
   code, so entries must stay in the PE_* order.
   NOTE(review): the message strings for PE_NO_ERROR, PE_EMPTY_HOST,
   PE_BAD_PORT_NUMBER and PE_INVALID_USER_NAME, and the closing "};",
   are missing from this extract.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
#define PE_UNRECOGNIZED_SCHEME 1
  "Unrecognized scheme",
#define PE_EMPTY_HOST 2
#define PE_BAD_PORT_NUMBER 3
#define PE_INVALID_USER_NAME 4

/* Record error code V through pointer P -- presumably only when P is
   non-NULL (continuation lines not visible; confirm).  */
#define SETERR(p, v) do {			\
/* Parse URL into its components.
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   PE_* error code.
   NOTE(review): this extract is heavily truncated -- braces, several
   local declarations (p, u, port, oldlen...), the error-return paths
   and the final return are missing.  The surviving lines are kept
   verbatim below.  */
url_parse (const char *url, int *error)
  enum url_scheme scheme;

  /* Begin/end pointers for each URL component.  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;

  char *user = NULL, *passwd = NULL;

  const char *url_orig = url;

  /* Normalize %-quoting first; may return URL itself or new storage.  */
  p = url = reencode_string (url);

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNRECOGNIZED_SCHEME);

  p += strlen (supported_schemes[scheme].leading_string);
  p += url_skip_uname (p);
  /* scheme://user:pass@host[:port]... */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;

  p = strpbrk_or_eos (p, ":/;?#");
  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);

  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      p = strpbrk_or_eos (p, "/;?#");
      if (port_b == port_e)
	  /* http://host:/whatever */
	  SETERR (error, PE_BAD_PORT_NUMBER);
      /* Convert the digit run to a number; reject non-digits.  */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */
	      SETERR (error, PE_BAD_PORT_NUMBER);
	  port = 10 * port + (*pp - '0');
      p = strpbrk_or_eos (p, ";?#");
  /* Path is not allowed not to exist. */
      p = strpbrk_or_eos (p, "?#");
      p = strpbrk_or_eos (p, "#");
  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*      uname_b  uname_e */
      /* LEN excludes the '@' (uname_e points just past it).  */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);

  /* Allocate and populate the result.  */
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));
  u->url = xstrdup (url);
  u->url = (char *)url;
  u->host = strdupdelim (host_b, host_e);
  u->path = strdupdelim (path_b, path_e);
  path_simplify (u->path);
  u->params = strdupdelim (params_b, params_e);
  u->query = strdupdelim (query_b, query_e);
  u->fragment = strdupdelim (fragment_b, fragment_e);
  parse_path (u->path, &u->dir, &u->file);
779 url_error (int error_code)
781 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
782 return parse_errors[error_code];
/* Split QUOTED_PATH (a %-encoded URL path) into directory and file
   components, storing freshly allocated strings in *DIR and *FILE.
   The path is %-decoded first; the split point is the last '/'.
   NOTE(review): return type, braces and the no-slash branch's *dir
   assignment are missing from this extract.  */
parse_path (const char *quoted_path, char **dir, char **file)
  char *path, *last_slash;

  /* Work on a stack copy so the decode does not touch the input.  */
  STRDUP_ALLOCA (path, quoted_path);
  decode_string (path);
  last_slash = strrchr (path, '/');
      /* No slash: the whole path is the file name.  */
      *file = xstrdup (path);
      /* Otherwise split around the last slash.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   zero.  Each present component contributes its length plus one byte
   for its separator ('/', ';' or '?').
   NOTE(review): return type, braces, the FROB invocations and the
   #undef are missing from this extract.  */
full_path_length (const struct url *url)
#define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path (path[;params][?query]) to WHERE, which
   must have room for full_path_length(url) bytes; no terminating
   '\0' is written here (the caller adds it).
   NOTE(review): the macro's separator-write line, its invocations and
   the function's braces are missing from this extract.  */
full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
    int l = strlen (f_el);			\
    memcpy (where, f_el, l);			\
/* Public function for getting the "full path" (path plus params and
   query, see the note above full_path_length).  The result is
   freshly allocated; the caller owns and must free it.  */
char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *) xmalloc (length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';
  return full_path;
}
/* Sync u->path and u->url with u->dir and u->file, rebuilding the
   path as "dir/file" (or just "file" when dir is empty) and then
   regenerating the canonical URL string.
   NOTE(review): return type, braces, the branch conditions and the
   frees/assignments of the old strings are missing from this
   extract.  */
sync_path (struct url *url)
      newpath = xstrdup (url->file);
      int dirlen = strlen (url->dir);
      int filelen = strlen (url->file);

      /* Copy "dir/file" into freshly allocated NEWPATH.  */
      newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (newpath, url->dir, dirlen);
      newpath[dirlen] = '/';
      memcpy (newpath + dirlen + 1, url->file, filelen);
      newpath[dirlen + 1 + filelen] = '\0';

  /* Synchronize u->url. */
  url->url = url_string (url, 0);
/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed.
   NOTE(review): the frees of the old values and the sync_path calls
   are presumably on the missing lines -- confirm.  */
url_set_dir (struct url *url, const char *newdir)
  url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
  url->file = xstrdup (newfile);
/* Free a struct url allocated by url_parse, including all owned
   string members.  NOTE(review): the frees of url->url, url->host,
   url->path and of URL itself are on lines missing from this
   extract.  */
url_free (struct url *url)
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
  FREE_MAYBE (url->dir);
  FREE_MAYBE (url->file);
/* Read FILE and build a linked list of urlpos entries, one per
   non-blank line, with leading/trailing whitespace trimmed.
   NOTE(review): return type, braces, the list-linking code and the
   final return are missing from this extract.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;

  /* Load the file into memory (possibly mmaped).  */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      /* Strip leading and trailing whitespace from the line.  */
      while (line_beg < line_end
	     && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
	     && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
	  urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
	  memset (entry, 0, sizeof (*entry));
	  entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos, including each node's owned
   strings.  NOTE(review): the loop header, the frees of l->url and
   the node itself, and the advance to NEXT are missing from this
   extract.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: fname.(N-1) -> fname.N for
   decreasing N, then fname -> fname.1.
   NOTE(review): return type, braces, struct stat declaration and the
   rename() calls are missing from this extract.  */
rotate_backups(const char *fname)
  /* Room for "fname" + "." + digits of opt.backups + '\0'.  */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  /* Only rotate regular files.  */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
	 call.  */
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.
   NOTE(review): return type, braces, several early returns and the
   unlink call in the CERN workaround are missing from this
   extract.  */
mkalldirs (const char *path)
  /* Find the last '/' to isolate the directory part of PATH.  */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);

  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it too http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	     anyway.  */
	  DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1077 count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories: [dir_prefix/][host/]dir/file.
   NOTE(review): return type, braces, the cut_dirs loop body, frees
   and the final return are missing from this extract.  */
mkstruct (const struct url *u)
  char *host, *dir, *file, *res, *dirpref;

      /* Honor --cut-dirs by dropping the first CUT path components.  */
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    /* Skip a leading '/' so the result is a relative path.  */
    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);

  /* Add dir_prefix and hostname (if required) to the beginning of
     dir.  */
  if (opt.add_hostdir)
      if (!DOTP (opt.dir_prefix))
	  dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
				    + strlen (host) + 1);
	  sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
	STRDUP_ALLOCA (dirpref, host);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
	dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  /* %-encode unsafe characters in the directory part.  */
  dir = encode_string (dir);
  /* Drop a trailing '/' from the directory, if any.  */
  if (l && dir[l - 1] == '/')
    /* Directory URLs get a default file name.  */
    file = "index.html";

  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped (so the query cannot create path components).
   NOTE(review): return type, the RESULT buffer declaration, braces
   and parts of the copy loops are missing from this extract.  */
compose_file_name (char *base, char *query)
  /* Copy BASE to RESULT and encode all unsafe characters.  */
  while (*from && to - result < sizeof (result))
      if (UNSAFE_CHAR (*from))
	  const unsigned char c = *from++;
	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);
  if (query && to - result < sizeof (result))
      /* Copy QUERY to RESULT and encode all '/' characters. */
      while (*from && to - result < sizeof (result))
  if (to - result < sizeof (result))
      /* Truncate input which is too long, presumably due to a huge
	 query string.  */
      result[sizeof (result) - 1] = '\0';
  return xstrdup (result);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.
   NOTE(review): return type, braces, the opt.dirstruct branch
   condition, the Windows '%'-replacement body and the returns are
   missing from this extract.  */
url_filename (const struct url *u)
  int have_prefix = 0;		/* whether we must prepend opt.dir_prefix */

    /* Directory structure requested: build the remote-like path.  */
    file = mkstruct (u);
      char *base = *u->file ? u->file : "index.html";
      char *query = u->query && *u->query ? u->query : NULL;
      file = compose_file_name (base, query);

  /* Check whether the prefix directory is something other than "."
     before prepending it.  */
  if (!DOTP (opt.dir_prefix))
      char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
				     + 1 + strlen (file) + 1);
      sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we change it
     to `@'.  */
  for (p = file; *p; p++)
#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name. */
  name = unique_name (file);
/* Like strlen(), but treat '?' (the query-string separator) as the
   end of the path, so a URL path may validly end at '?'.  */
static int
urlpath_length (const char *url)
{
  const char *q = strchr (url, '?');
  return q ? (int) (q - url) : (int) strlen (url);
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string (and never reads *e).  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  const char *p = e;

  while (p > b)
    {
      --p;
      if (*p == c)
	return p;
    }
  return NULL;
}
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   #### This function should handle `./' and `../' so that the evil
   path_simplify can go.

   NOTE(review): this extract is heavily truncated -- the return type,
   the no_scheme/link-kind branching, braces, the loop around memchr
   and the final return are missing.  Surviving lines kept verbatim.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
      const char *end = base + urlpath_length (base);

	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	      /* example: http://host"  */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	      /* example: "whatever/foo/bar" */
	      start_insert = last_slash + 1;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
      else /* *link == `/' */
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	     the double slash after the scheme.  */
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
		seen_slash_slash = 1;

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	     begins with '/'.  */
	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
  else /* !no_scheme */
      /* LINK is itself an absolute URI: BASE is irrelevant.  */
      constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
char *
uri_merge (const char *base, const char *link)
{
  int linklength = strlen (link);
  int relative = !url_has_scheme (link);

  return uri_merge_1 (base, link, linklength, relative);
}
/* Append string S at write pointer P and advance P past it.
   NOTE(review): the pointer-advance line and "} while (0)" are
   missing from this extract.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.

   If HIDE is non-zero (as it is when we're calling this on a URL we
   plan to print, but not when calling it to canonicalize a URL for
   use within the program), password will be hidden.  Unsafe
   characters in the URL will be quoted.

   NOTE(review): return type, braces, the size computation's tail, the
   separator APPENDs (':', '@') and the final return are missing from
   this extract.  */
url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;

  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);

  assert (scheme_str != NULL);

  /* Make sure the user name and password are quoted. */
      quoted_user = encode_string_maybe (url->user);
	  /* Substitute a fixed placeholder so the output does not
	     leak the password or even its length.  */
	  quoted_passwd = HIDDEN_PASSWORD;
	  quoted_passwd = encode_string_maybe (url->passwd);

  /* Compute the exact output size up front.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
  /* Only print the port when it differs from the scheme default.  */
  if (url->port != scheme_port)
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);

  p = result = xmalloc (size);
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  APPEND (p, url->host);
  if (url->port != scheme_port)
      long_to_string (p, url->port);
  full_path_write (url, p);
  /* The size computation above must match what was written.  */
  assert (p - result == size);

  /* Free the quoted copies only when encode_string_maybe actually
     allocated them (it returns its argument when nothing is unsafe;
     quoted_passwd may also be the static HIDDEN_PASSWORD).  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Returns proxy host address, in accordance with SCHEME: the
   command-line option if set, otherwise the matching environment
   variable.  Returns NULL (presumably -- the return lines are not
   visible) when no proxy is configured.
   NOTE(review): the switch statement, braces and returns are missing
   from this extract.  Also note the static REWRITTEN_STORAGE makes
   this function non-reentrant.  */
getproxy (enum url_scheme scheme)
  char *rewritten_url;
  static char rewritten_storage[1024];

      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:
  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_shorthand_url (proxy);
      /* Keep the rewritten URL in static storage; the explicit
	 termination guards against strncpy truncation.  */
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns nonzero when HOST does not match any suffix in the NO_PROXY
   list.  NOTE(review): return type, braces and the NULL-list early
   return are missing from this extract.  */
no_proxy_match (const char *host, const char **no_proxy)
    return !sufmatch (no_proxy, host);
1599 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1600 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* NOTE(review): several interior lines (braces, early returns, the
   declaration/initialization of the read cursor P) are elided from
   this excerpt; comments below describe only what is visible.  */
1602 /* Change the links in an HTML document.  Accepts a structure that
1603 defines the positions of all the links. */
1605 convert_links (const char *file, urlpos *l)
1607 struct file_memory *fm;
1610 downloaded_file_t downloaded_file_return;
1612 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1615 /* First we do a "dry run": go through the list L and see whether
1616 any URL needs to be converted in the first place.  If not, just
1617 leave the file alone. */
1620 for (dry = l; dry; dry = dry->next)
1621 if (dry->convert != CO_NOCONVERT)
1625 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Slurp the whole document into memory (read_file may mmap it — see
   the unlink comment below).  */
1630 fm = read_file (file);
1633 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1634 file, strerror (errno));
/* Optionally save the pristine file as *.orig before we overwrite it;
   write_backup_file() also suppresses duplicate backups.  */
1638 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1639 if (opt.backup_converted && downloaded_file_return)
1640 write_backup_file (file, downloaded_file_return);
1642 /* Before opening the file for writing, unlink the file.  This is
1643 important if the data in FM is mmaped.  In such case, nulling the
1644 file, which is what fopen() below does, would make us read all
1645 zeroes from the mmaped region. */
1646 if (unlink (file) < 0 && errno != ENOENT)
1648 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1649 file, strerror (errno));
/* Error path: release the in-memory copy before bailing out.  */
1650 read_file_free (fm);
1653 /* Now open the file for writing. */
1654 fp = fopen (file, "wb");
1657 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1658 file, strerror (errno));
1659 read_file_free (fm);
1662 /* Here we loop through all the URLs in file, replacing those of
1663 them that are downloaded with relative references. */
1665 for (; l; l = l->next)
/* L->pos is a byte offset into the in-memory document.  */
1667 char *url_start = fm->content + l->pos;
/* Defensive check: an offset past EOF means the position data is
   stale or corrupt; don't write past the buffer.  */
1669 if (l->pos >= fm->length)
1671 DEBUGP (("Something strange is going on.  Please investigate."));
1674 /* If the URL is not to be converted, skip it. */
1675 if (l->convert == CO_NOCONVERT)
1677 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1681 /* Echo the file contents, up to the offending URL's opening
1682 quote, to the outfile. */
1683 fwrite (p, 1, url_start - p, fp);
1685 if (l->convert == CO_CONVERT_TO_RELATIVE)
1687 /* Convert absolute URL to relative. */
1688 char *newname = construct_relative (file, l->local_name);
/* HTML-escape the replacement before splicing it into an attribute
   value; replace_attr() writes it and advances P past the old URL.  */
1689 char *quoted_newname = html_quote_string (newname);
1690 replace_attr (&p, l->size, fp, quoted_newname);
1691 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1692 l->url, newname, l->pos, file));
1694 xfree (quoted_newname);
1696 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1698 /* Convert the link to absolute URL. */
1699 char *newlink = l->url;
1700 char *quoted_newlink = html_quote_string (newlink);
1701 replace_attr (&p, l->size, fp, quoted_newlink);
1702 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1703 newlink, l->pos, file));
1704 xfree (quoted_newlink);
1707 /* Output the rest of the file. */
1708 if (p - fm->content < fm->length)
1709 fwrite (p, 1, fm->length - (p - fm->content), fp);
1711 read_file_free (fm);
1712 logputs (LOG_VERBOSE, _("done.\n"));
1715 /* Construct and return a malloced copy of the relative link from two
1716 pieces of information: local name S1 of the referring file and
1717 local name S2 of the referred file.
1719 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1720 "jagor.srce.hr/images/news.gif", the function will return
1723 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1724 "fly.cc.fer.hr/images/fly.gif", the function will return
1725 "../images/fly.gif".
1727 Caveats: S1 should not begin with `/', unless S2 also begins with
1728 '/'.  S1 should not contain things like ".." and such --
1729 construct_relative ("fly/ioccc/../index.html",
1730 "fly/images/fly.gif") will fail.  (A workaround is to call
1731 something like path_simplify() on S1).  */
1733 construct_relative (const char *s1, const char *s2)
/* I scans the common prefix; CNT presumably ends up as the index just
   past the last '/' common to both names; SEPDIRS1 counts the
   directory separators remaining in S1 after the shared prefix.
   NOTE(review): the assignments establishing CNT, and the early
   absolute-S2 check, are elided from this excerpt.  */
1735 int i, cnt, sepdirs1;
/* If S2 is already usable as-is (presumably an absolute path), just
   hand back a copy — TODO(review): confirm the elided condition.  */
1739 return xstrdup (s2);
1740 /* S1 should *not* be absolute, if S2 wasn't. */
1741 assert (*s1 != '/');
1743 /* Skip the directories common to both strings. */
1746 while (s1[i] && s2[i]
/* Remember positions where both names sit on a directory boundary.  */
1751 if (s1[i] == '/' && s2[i] == '/')
/* Count how many directories of S1 must be climbed out of ("../").  */
1756 for (sepdirs1 = 0; s1[i]; i++)
1759 /* Now, construct the file as of:
1760 - ../ repeated sepdirs1 time
1761 - all the non-mutual directories of S2. */
1762 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1763 for (i = 0; i < sepdirs1; i++)
1764 memcpy (res + 3 * i, "../", 3);
/* strcpy also writes the terminating NUL accounted for by the +1.  */
1765 strcpy (res + 3 * i, s2 + cnt);
/* add_url(): prepend a new node for URL/FILE to list L and (per the
   comment below) return the new head.  The node stores private copies
   of both strings, so the caller retains ownership of its arguments.
   NOTE(review): the `t->next = l; return t;' tail is elided from this
   excerpt.  */
1769 /* Add URL to the head of the list L. */
1771 add_url (urlpos *l, const char *url, const char *file)
1775 t = (urlpos *)xmalloc (sizeof (urlpos));
/* Zero the whole node so every field not set below starts out NULL/0. */
1776 memset (t, 0, sizeof (*t));
1777 t->url = xstrdup (url);
1778 t->local_name = xstrdup (file);
/* write_backup_file(): rename FILE to a ".orig" backup (once per
   file per run) before convert_links() rewrites it in place.  */
1784 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1786 /* Rather than just writing over the original .html file with the
1787 converted version, save the former to *.orig.  Note we only do
1788 this for files we've _successfully_ downloaded, so we don't
1789 clobber .orig files sitting around from previous invocations. */
1791 /* Construct the backup filename as the original name plus ".orig". */
1792 size_t filename_len = strlen(file);
/* alloca'ed below: automatic lifetime, freed when this function
   returns — do not let the pointer escape.  */
1793 char* filename_plus_orig_suffix;
1794 boolean already_wrote_backup_file = FALSE;
1795 slist* converted_file_ptr;
/* Process-lifetime registry of files already backed up; deliberately
   never freed (see the long note below).  */
1796 static slist* converted_files = NULL;
1798 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1800 /* Just write "orig" over "html".  We need to do it this way
1801 because when we're checking to see if we've downloaded the
1802 file before (to see if we can skip downloading it), we don't
1803 know if it's a text/html file.  Therefore we don't know yet
1804 at that stage that -E is going to cause us to tack on
1805 ".html", so we need to compare vs. the original URL plus
1806 ".orig", not the original URL plus ".html.orig". */
1807 filename_plus_orig_suffix = alloca (filename_len + 1);
1808 strcpy(filename_plus_orig_suffix, file);
/* Overwrite the trailing "html" (4 chars) with "orig" — this branch
   assumes FILE ends in ".html", which -E guarantees here.  */
1809 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1811 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1813 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the terminating NUL, so no extra +1.  */
1814 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1815 strcpy(filename_plus_orig_suffix, file);
1816 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1819 /* We can get called twice on the same URL thanks to the
1820 convert_all_links() call in main().  If we write the .orig file
1821 each time in such a case, it'll end up containing the first-pass
1822 conversion, not the original file.  So, see if we've already been
1823 called on this file. */
1824 converted_file_ptr = converted_files;
/* Linear scan of the registry; fine for the typical number of
   converted files per run.  */
1825 while (converted_file_ptr != NULL)
1826 if (strcmp(converted_file_ptr->string, file) == 0)
1828 already_wrote_backup_file = TRUE;
1832 converted_file_ptr = converted_file_ptr->next;
1834 if (!already_wrote_backup_file)
1836 /* Rename <file> to <file>.orig before former gets written over. */
/* A failed rename is only logged; conversion proceeds without a
   backup rather than aborting.  */
1837 if (rename(file, filename_plus_orig_suffix) != 0)
1838 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1839 file, filename_plus_orig_suffix, strerror (errno));
1841 /* Remember that we've already written a .orig backup for this file.
1842 Note that we never free this memory since we need it till the
1843 convert_all_links() call, which is one of the last things the
1844 program does before terminating.  BTW, I'm not sure if it would be
1845 safe to just set 'converted_file_ptr->string' to 'file' below,
1846 rather than making a copy of the string...  Another note is that I
1847 thought I could just add a field to the urlpos structure saying
1848 that we'd written a .orig file for this URL, but that didn't work,
1849 so I had to make this separate list.
1850 -- Dan Harkless <wget@harkless.org>
1852 This [adding a field to the urlpos structure] didn't work
1853 because convert_file() is called twice: once after all its
1854 sublinks have been retrieved in recursive_retrieve(), and
1855 once at the end of the day in convert_all_links().  The
1856 original linked list collected in recursive_retrieve() is
1857 lost after the first invocation of convert_links(), and
1858 convert_all_links() makes a new one (it calls get_urls_html()
1859 for each file it covers.)  That's why your first approach didn't
1860 work.  The way to make it work is perhaps to make this flag a
1861 field in the `urls_html' list.
1862 -- Hrvoje Niksic <hniksic@arsdigita.com>
1864 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1865 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1866 converted_file_ptr->next = converted_files;
1867 converted_files = converted_file_ptr;
1871 static int find_fragment PARAMS ((const char *, int, const char **,
/* replace_attr(): write NEW_STR to FP in place of the RAW_SIZE-byte
   old attribute value at *PP, preserving the original quoting style
   and carrying over any #fragment from the old value.  *PP is
   advanced past the consumed input.  NOTE(review): the quote-flag
   bookkeeping and the final advance of *PP are elided from this
   excerpt.  */
1875 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1877 const char *p = *pp;
1879 int size = raw_size;
/* Default to double quotes when the old value was unquoted.  */
1880 char quote_char = '\"';
1881 const char *frag_beg, *frag_end;
1883 /* Structure of our string is:
1884 "...old-contents..."
1885 <--- l->size ---> (with quotes)
1888 <--- l->size --> (no quotes) */
/* Reuse whichever quote character the document used.  */
1890 if (*p == '\"' || *p == '\'')
1895 size -= 2; /* disregard opening and closing quote */
1897 putc (quote_char, fp);
1898 fputs (new_str, fp);
1900 /* Look for fragment identifier, if any. */
/* If the old URL carried "#frag", append it verbatim after NEW_STR so
   in-page anchors keep working.  */
1901 if (find_fragment (p, size, &frag_beg, &frag_end))
1902 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1906 putc (quote_char, fp);
1910 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1911 preceded by '&'.  If the character is not found, return zero.  If
1912 the character is found, return 1 and set BP and EP to point to the
1913 beginning and end of the region.
1915 This is used for finding the fragment identifiers in URLs.  The
1916 "not preceded by '&'" rule keeps HTML character references such as
1917 "&#38;" from being mistaken for a fragment.  */
1918 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1920 const char *end = beg + size;
/* NOTE(review): the loop body (the '&'-tracking switch and the
   assignments to *BP/*EP) is elided from this excerpt.  */
1922 for (; beg < end; beg++)
/* Singly-linked registry node recording one downloaded local file and
   how it was downloaded.  NOTE(review): the `char *file;' member
   (referenced by downloaded_file() below) is elided from this
   excerpt.  NOTE(review): tags beginning with '_' + lowercase are not
   reserved, but leading-underscore names are conventionally avoided
   at file scope.  */
1944 typedef struct _downloaded_file_list {
1946 downloaded_file_t download_type;
1947 struct _downloaded_file_list* next;
1948 } downloaded_file_list;
/* Head of the registry; file-scope, lives for the whole run until
   downloaded_files_free() is called.  */
1950 static downloaded_file_list *downloaded_files;
1952 /* Remembers which files have been downloaded.  In the standard case, should be
1953 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1954 download successfully (i.e. not for ones we have failures on or that we skip
1957 When we've downloaded a file and tacked on a ".html" extension due to -E,
1958 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1959 FILE_DOWNLOADED_NORMALLY.
1961 If you just want to check if a file has been previously added without adding
1962 it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1963 with local filenames, not remote URLs. */
1965 downloaded_file (downloaded_file_t mode, const char* file)
1967 boolean found_file = FALSE;
1968 downloaded_file_list* rover = downloaded_files;
/* Linear scan of the registry — O(n) per call, so O(n^2) over a run;
   acceptable here, but a hash table would scale better.  */
1970 while (rover != NULL)
1971 if (strcmp(rover->file, file) == 0)
1977 rover = rover->next;
/* Known file: report how it was originally downloaded.  */
1980 return rover->download_type; /* file had already been downloaded */
/* Unknown file: record it (unless this is a pure query) and report
   that it had not been seen before.  */
1983 if (mode != CHECK_FOR_FILE)
1985 rover = xmalloc(sizeof(*rover));
1986 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1987 rover->download_type = mode;
/* Prepend to the list head.  */
1988 rover->next = downloaded_files;
1989 downloaded_files = rover;
1992 return FILE_NOT_ALREADY_DOWNLOADED;
/* downloaded_files_free(): release every node of the downloaded-files
   registry.  NOTE(review): this excerpt ends mid-function; the loop
   that frees each node and resets the list head continues past it.  */
1997 downloaded_files_free (void)
1999 downloaded_file_list* rover = downloaded_files;
/* Save the successor before freeing the current node.  */
2002 downloaded_file_list *next = rover->next;
2003 xfree (rover->file);