sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40 #include "hash.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Is X "."?  */
  47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  48 /* Is X ".."?  */
  49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  50
/* Description of one URL scheme known to this file.  */
struct scheme_data
{
  /* URL prefix that selects the scheme, e.g. "http://".  NULL in the
     entry that terminates the table.  */
  char *leading_string;
  /* The value scheme_default_port returns for the scheme.  */
  int default_port;
  /* When zero, url_scheme reports SCHEME_INVALID for URLs that start
     with LEADING_STRING.  */
  int enabled;
};
  57
/* Supported schemes.  url_scheme() returns the index of the matching
   entry cast to enum url_scheme, and scheme_default_port() and
   scheme_disable() index this table by scheme, so the order of the
   entries has to agree with the values of that enum (it is declared
   outside this file -- check it when adding a scheme).  The entry
   with a NULL leading_string ends the table.  */
static struct scheme_data supported_schemes[] =
{
  { "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       -1,                 0 }
};
  70
/* Forward declarations of static helpers that are used before their
   definitions appear in this file: */

static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
  75
  76
  77 \f
  78 /* Support for encoding and decoding of URL strings.  We determine
  79    whether a character is unsafe through static table lookup.  This
  80    code assumes ASCII character set and 8-bit chars.  */
  81
  82 enum {
  83   urlchr_reserved = 1,
  84   urlchr_unsafe   = 2
  85 };
  86
  87 #define R  urlchr_reserved
  88 #define U  urlchr_unsafe
  89 #define RU R|U
  90
  91 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
  92
  93 /* rfc1738 reserved chars, preserved from encoding.  */
  94
  95 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
  96
  97 /* rfc1738 unsafe chars, plus some more.  */
  98
  99 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 100
 101 const static unsigned char urlchr_table[256] =
 102 {
 103   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 104   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 105   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 106   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 107   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 108   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 109   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 110   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 111  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 112   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 113   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 114   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 115   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 116   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 117   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 118   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 119
 120   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 121   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 122   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 123   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 124
 125   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 126   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 127   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 128   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 129 };
 130
 131 /* Decodes the forms %xy in a URL to the character the hexadecimal
 132    code of which is xy.  xy are hexadecimal digits from
 133    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 134    hex-digits or `%' precedes `\0', the sequence is inserted
 135    literally.  */
 136
 137 static void
 138 decode_string (char *s)
 139 {
 140   char *t = s;                  /* t - tortoise */
 141   char *h = s;                  /* h - hare     */
 142
 143   for (; *h; h++, t++)
 144     {
 145       if (*h != '%')
 146         {
 147         copychar:
 148           *t = *h;
 149         }
 150       else
 151         {
 152           /* Do nothing if '%' is not followed by two hex digits. */
 153           if (!*(h + 1) || !*(h + 2)
 154               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 155             goto copychar;
 156           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 157           h += 2;
 158         }
 159     }
 160   *t = '\0';
 161 }
 162
 163 /* Like encode_string, but return S if there are no unsafe chars.  */
 164
 165 static char *
 166 encode_string_maybe (const char *s)
 167 {
 168   const char *p1;
 169   char *p2, *newstr;
 170   int newlen;
 171   int addition = 0;
 172
 173   for (p1 = s; *p1; p1++)
 174     if (UNSAFE_CHAR (*p1))
 175       addition += 2;            /* Two more characters (hex digits) */
 176
 177   if (!addition)
 178     return (char *)s;
 179
 180   newlen = (p1 - s) + addition;
 181   newstr = (char *)xmalloc (newlen + 1);
 182
 183   p1 = s;
 184   p2 = newstr;
 185   while (*p1)
 186     {
 187       if (UNSAFE_CHAR (*p1))
 188         {
 189           unsigned char c = *p1++;
 190           *p2++ = '%';
 191           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 192           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 193         }
 194       else
 195         *p2++ = *p1++;
 196     }
 197   *p2 = '\0';
 198   assert (p2 - newstr == newlen);
 199
 200   return newstr;
 201 }
 202
 203 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 204    given string, returning a malloc-ed %XX encoded string.  */
 205
 206 char *
 207 encode_string (const char *s)
 208 {
 209   char *encoded = encode_string_maybe (s);
 210   if (encoded != s)
 211     return encoded;
 212   else
 213     return xstrdup (s);
 214 }
 215
/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.

   PTR is expanded several times and is assigned to, so it must be a
   side-effect-free lvalue whose value may be passed to xfree.  The
   expansion declares a local named `e_new', which would shadow an
   argument of the same name.  */

#define ENCODE(ptr) do {                        \
  char *e_new = encode_string_maybe (ptr);      \
  if (e_new != ptr)                             \
    {                                           \
      xfree (ptr);                              \
      ptr = e_new;                              \
    }                                           \
} while (0)
 228 \f
 229 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 230
 231 /* Decide whether to encode, decode, or pass through the char at P.
 232    This used to be a macro, but it got a little too convoluted.  */
 233 static inline enum copy_method
 234 decide_copy_method (const char *p)
 235 {
 236   if (*p == '%')
 237     {
 238       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 239         {
 240           /* %xx sequence: decode it, unless it would decode to an
 241              unsafe or a reserved char; in that case, leave it as
 242              is. */
 243           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 244             XCHAR_TO_XDIGIT (*(p + 2));
 245
 246           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 247             return CM_PASSTHROUGH;
 248           else
 249             return CM_DECODE;
 250         }
 251       else
 252         /* Garbled %.. sequence: encode `%'. */
 253         return CM_ENCODE;
 254     }
 255   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 256     return CM_ENCODE;
 257   else
 258     return CM_PASSTHROUGH;
 259 }
 260
 261 /* Translate a %-quoting (but possibly non-conformant) input string S
 262    into a %-quoting (and conformant) output string.  If no characters
 263    are encoded or decoded, return the same string S; otherwise, return
 264    a freshly allocated string with the new contents.
 265
 266    After a URL has been run through this function, the protocols that
 267    use `%' as the quote character can use the resulting string as-is,
 268    while those that don't call decode_string() to get to the intended
 269    data.  This function is also stable: after an input string is
 270    transformed the first time, all further transformations of the
 271    result yield the same result string.
 272
 273    Let's discuss why this function is needed.
 274
 275    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 276    space character would mess up the HTTP request, it needs to be
 277    quoted, like this:
 278
 279        GET /abc%20def HTTP/1.0
 280
 281    So it appears that the unsafe chars need to be quoted, as with
 282    encode_string.  But what if we're requested to download
 283    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 284    the user meant was a literal space, and he was kind enough to quote
 285    it.  In that case, Wget should obviously leave the `%20' as is, and
 286    send the same request as above.  So in this case we may not call
 287    encode_string.
 288
 289    But what if the requested URI is `abc%20 def'?  If we call
 290    encode_string, we end up with `/abc%2520%20def', which is almost
 291    certainly not intended.  If we don't call encode_string, we are
 292    left with the embedded space and cannot send the request.  What the
 293    user meant was for Wget to request `/abc%20%20def', and this is
 294    where reencode_string kicks in.
 295
 296    Wget used to solve this by first decoding %-quotes, and then
 297    encoding all the "unsafe" characters found in the resulting string.
 298    This was wrong because it didn't preserve certain URL special
 299    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 300    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 301    whether we considered `+' reserved (it is).  One of these results
 302    is inevitable because by the second step we would lose information
 303    on whether the `+' was originally encoded or not.  Both results
 304    were wrong because in CGI parameters + means space, while %2B means
 305    literal plus.  reencode_string correctly translates the above to
 306    "a%2B+b", i.e. returns the original string.
 307
 308    This function uses an algorithm proposed by Anon Sricharoenchai:
 309
 310    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 311       hexdigits.
 312
 313    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 314       "+".
 315
 316    ...except that this code conflates the two steps, and decides
 317    whether to encode, decode, or pass through each character in turn.
 318    The function still uses two passes, but their logic is the same --
 319    the first pass exists merely for the sake of allocation.  Another
 320    small difference is that we include `+' to URL_RESERVED.
 321
 322    Anon's test case:
 323
 324    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 325    ->
 326    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 327
 328    Simpler test cases:
 329
 330    "foo bar"         -> "foo%20bar"
 331    "foo%20bar"       -> "foo%20bar"
 332    "foo %20bar"      -> "foo%20%20bar"
 333    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 334    "foo%25%20bar"    -> "foo%25%20bar"
 335    "foo%2%20bar"     -> "foo%252%20bar"
 336    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 337    "foo%2b+bar"      -> "foo%2b+bar"  */
 338
 339 static char *
 340 reencode_string (const char *s)
 341 {
 342   const char *p1;
 343   char *newstr, *p2;
 344   int oldlen, newlen;
 345
 346   int encode_count = 0;
 347   int decode_count = 0;
 348
 349   /* First, pass through the string to see if there's anything to do,
 350      and to calculate the new length.  */
 351   for (p1 = s; *p1; p1++)
 352     {
 353       switch (decide_copy_method (p1))
 354         {
 355         case CM_ENCODE:
 356           ++encode_count;
 357           break;
 358         case CM_DECODE:
 359           ++decode_count;
 360           break;
 361         case CM_PASSTHROUGH:
 362           break;
 363         }
 364     }
 365
 366   if (!encode_count && !decode_count)
 367     /* The string is good as it is. */
 368     return (char *)s;           /* C const model sucks. */
 369
 370   oldlen = p1 - s;
 371   /* Each encoding adds two characters (hex digits), while each
 372      decoding removes two characters.  */
 373   newlen = oldlen + 2 * (encode_count - decode_count);
 374   newstr = xmalloc (newlen + 1);
 375
 376   p1 = s;
 377   p2 = newstr;
 378
 379   while (*p1)
 380     {
 381       switch (decide_copy_method (p1))
 382         {
 383         case CM_ENCODE:
 384           {
 385             unsigned char c = *p1++;
 386             *p2++ = '%';
 387             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 388             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 389           }
 390           break;
 391         case CM_DECODE:
 392           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 393                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 394           p1 += 3;              /* skip %xx */
 395           break;
 396         case CM_PASSTHROUGH:
 397           *p2++ = *p1++;
 398         }
 399     }
 400   *p2 = '\0';
 401   assert (p2 - newstr == newlen);
 402   return newstr;
 403 }
 404
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue.

   PTR_VAR is expanded several times, so it must not have side
   effects, and its old value must be something xfree accepts.  The
   expansion declares a local named `rf_new', which would shadow an
   argument of the same name.  */

#define REENCODE(ptr_var) do {                  \
  char *rf_new = reencode_string (ptr_var);     \
  if (rf_new != ptr_var)                        \
    {                                           \
      xfree (ptr_var);                          \
      ptr_var = rf_new;                         \
    }                                           \
} while (0)
 417 \f
 418 /* Returns the scheme type if the scheme is supported, or
 419    SCHEME_INVALID if not.  */
 420 enum url_scheme
 421 url_scheme (const char *url)
 422 {
 423   int i;
 424
 425   for (i = 0; supported_schemes[i].leading_string; i++)
 426     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 427                           strlen (supported_schemes[i].leading_string)))
 428       {
 429         if (supported_schemes[i].enabled)
 430           return (enum url_scheme) i;
 431         else
 432           return SCHEME_INVALID;
 433       }
 434
 435   return SCHEME_INVALID;
 436 }
 437
 438 /* Return the number of characters needed to skip the scheme part of
 439    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 440 int
 441 url_skip_scheme (const char *url)
 442 {
 443   const char *p = url;
 444
 445   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 446      etc. */
 447   while (ISALNUM (*p) || *p == '-' || *p == '+')
 448     ++p;
 449   if (*p != ':')
 450     return 0;
 451   /* Skip ':'. */
 452   ++p;
 453
 454   /* Skip "//" if found. */
 455   if (*p == '/' && *(p + 1) == '/')
 456     p += 2;
 457
 458   return p - url;
 459 }
 460
 461 /* Returns 1 if the URL begins with a scheme (supported or
 462    unsupported), 0 otherwise.  */
 463 int
 464 url_has_scheme (const char *url)
 465 {
 466   const char *p = url;
 467   while (ISALNUM (*p) || *p == '-' || *p == '+')
 468     ++p;
 469   return *p == ':';
 470 }
 471
 472 int
 473 scheme_default_port (enum url_scheme scheme)
 474 {
 475   return supported_schemes[scheme].default_port;
 476 }
 477
 478 void
 479 scheme_disable (enum url_scheme scheme)
 480 {
 481   supported_schemes[scheme].enabled = 0;
 482 }
 483
 484 /* Skip the username and password, if present here.  The function
 485    should be called *not* with the complete URL, but with the part
 486    right after the scheme.
 487
 488    If no username and password are found, return 0.  */
 489 int
 490 url_skip_uname (const char *url)
 491 {
 492   const char *p;
 493
 494   /* Look for '@' that comes before '/' or '?'. */
 495   p = (const char *)strpbrk (url, "/?@");
 496   if (!p || *p != '@')
 497     return 0;
 498
 499   return p - url + 1;
 500 }
 501
 502 static int
 503 parse_uname (const char *str, int len, char **user, char **passwd)
 504 {
 505   char *colon;
 506
 507   if (len == 0)
 508     /* Empty user name not allowed. */
 509     return 0;
 510
 511   colon = memchr (str, ':', len);
 512   if (colon == str)
 513     /* Empty user name again. */
 514     return 0;
 515
 516   if (colon)
 517     {
 518       int pwlen = len - (colon + 1 - str);
 519       *passwd = xmalloc (pwlen + 1);
 520       memcpy (*passwd, colon + 1, pwlen);
 521       (*passwd)[pwlen] = '\0';
 522       len -= pwlen + 1;
 523     }
 524   else
 525     *passwd = NULL;
 526
 527   *user = xmalloc (len + 1);
 528   memcpy (*user, str, len);
 529   (*user)[len] = '\0';
 530
 531   if (*user)
 532     decode_string (*user);
 533   if (*passwd)
 534     decode_string (*passwd);
 535
 536   return 1;
 537 }
 538
 539 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 540    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 541
 542    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 543    www.foo.com[:port]            -> http://www.foo.com[:port]
 544
 545    FTP shorthands look like this:
 546
 547    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 548    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 549
 550    If the URL needs not or cannot be rewritten, return NULL.  */
 551 char *
 552 rewrite_shorthand_url (const char *url)
 553 {
 554   const char *p;
 555
 556   if (url_has_scheme (url))
 557     return NULL;
 558
 559   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 560      latter Netscape.  */
 561   for (p = url; *p && *p != ':' && *p != '/'; p++)
 562     ;
 563
 564   if (p == url)
 565     return NULL;
 566
 567   if (*p == ':')
 568     {
 569       const char *pp;
 570       char *res;
 571       /* If the characters after the colon and before the next slash
 572          or end of string are all digits, it's HTTP.  */
 573       int digits = 0;
 574       for (pp = p + 1; ISDIGIT (*pp); pp++)
 575         ++digits;
 576       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 577         goto http;
 578
 579       /* Prepend "ftp://" to the entire URL... */
 580       res = xmalloc (6 + strlen (url) + 1);
 581       sprintf (res, "ftp://%s", url);
 582       /* ...and replace ':' with '/'. */
 583       res[6 + (p - url)] = '/';
 584       return res;
 585     }
 586   else
 587     {
 588       char *res;
 589     http:
 590       /* Just prepend "http://" to what we have. */
 591       res = xmalloc (7 + strlen (url) + 1);
 592       sprintf (res, "http://%s", url);
 593       return res;
 594     }
 595 }
 596 \f
/* Defined further down; declared here because url_parse calls it.  */
static void parse_path PARAMS ((const char *, char **, char **));
 598
 599 static char *
 600 strpbrk_or_eos (const char *s, const char *accept)
 601 {
 602   char *p = strpbrk (s, accept);
 603   if (!p)
 604     p = (char *)s + strlen (s);
 605   return p;
 606 }
 607
 608 /* Turn STR into lowercase; return non-zero if a character was
 609    actually changed. */
 610
 611 static int
 612 lowercase_str (char *str)
 613 {
 614   int change = 0;
 615   for (; *str; str++)
 616     if (ISUPPER (*str))
 617       {
 618         change = 1;
 619         *str = TOLOWER (*str);
 620       }
 621   return change;
 622 }
 623
 624 static char *parse_errors[] = {
 625 #define PE_NO_ERROR                     0
 626   "No error",
 627 #define PE_UNSUPPORTED_SCHEME           1
 628   "Unsupported scheme",
 629 #define PE_EMPTY_HOST                   2
 630   "Empty host",
 631 #define PE_BAD_PORT_NUMBER              3
 632   "Bad port number",
 633 #define PE_INVALID_USER_NAME            4
 634   "Invalid user name",
 635 #define PE_UNTERMINATED_IPV6_ADDRESS    5
 636   "Unterminated IPv6 numeric address",
 637 #define PE_INVALID_IPV6_ADDRESS         6
 638   "Invalid char in IPv6 numeric address"
 639 };
 640
/* Store V through the pointer P, unless P is NULL.  P is expanded
   twice, so it should not have side effects.  The do/while wrapper
   makes the expansion behave as a single statement.  */
#define SETERR(p, v) do {                       \
  if (p)                                        \
    *(p) = (v);                                 \
} while (0)
 645
 646 /* Parse a URL.
 647
 648    Return a new struct url if successful, NULL on error.  In case of
 649    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 650    error code. */
 651 struct url *
 652 url_parse (const char *url, int *error)
 653 {
 654   struct url *u;
 655   const char *p;
 656   int path_modified, host_modified;
 657
 658   enum url_scheme scheme;
 659
 660   const char *uname_b,     *uname_e;
 661   const char *host_b,      *host_e;
 662   const char *path_b,      *path_e;
 663   const char *params_b,    *params_e;
 664   const char *query_b,     *query_e;
 665   const char *fragment_b,  *fragment_e;
 666
 667   int port;
 668   char *user = NULL, *passwd = NULL;
 669
 670   char *url_encoded;
 671
 672   scheme = url_scheme (url);
 673   if (scheme == SCHEME_INVALID)
 674     {
 675       SETERR (error, PE_UNSUPPORTED_SCHEME);
 676       return NULL;
 677     }
 678
 679   url_encoded = reencode_string (url);
 680   p = url_encoded;
 681
 682   p += strlen (supported_schemes[scheme].leading_string);
 683   uname_b = p;
 684   p += url_skip_uname (p);
 685   uname_e = p;
 686
 687   /* scheme://user:pass@host[:port]... */
 688   /*                    ^              */
 689
 690   /* We attempt to break down the URL into the components path,
 691      params, query, and fragment.  They are ordered like this:
 692
 693        scheme://host[:port][/path][;params][?query][#fragment]  */
 694
 695   params_b   = params_e   = NULL;
 696   query_b    = query_e    = NULL;
 697   fragment_b = fragment_e = NULL;
 698
 699   host_b = p;
 700
 701   if (*p == '[')
 702     {
 703       /* Support http://[::1]/ used by IPv6. */
 704       int invalid = 0;
 705       ++p;
 706       while (1)
 707         switch (*p++)
 708           {
 709           case ']':
 710             goto out;
 711           case '\0':
 712             SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
 713             return NULL;
 714           case '0': case '1': case '2': case '3': case '4':
 715           case '5': case '6': case '7': case '8': case '9':
 716           case ':': case '.':
 717             break;
 718           default:
 719             invalid = 1;
 720           }
 721     out:
 722       if (invalid)
 723         {
 724           SETERR (error, PE_INVALID_IPV6_ADDRESS);
 725           return NULL;
 726         }
 727       /* Don't include brackets in [host_b, host_p). */
 728       ++host_b;
 729       host_e = p - 1;
 730     }
 731   else
 732     {
 733       p = strpbrk_or_eos (p, ":/;?#");
 734       host_e = p;
 735     }
 736
 737   if (host_b == host_e)
 738     {
 739       SETERR (error, PE_EMPTY_HOST);
 740       return NULL;
 741     }
 742
 743   port = scheme_default_port (scheme);
 744   if (*p == ':')
 745     {
 746       const char *port_b, *port_e, *pp;
 747
 748       /* scheme://host:port/tralala */
 749       /*              ^             */
 750       ++p;
 751       port_b = p;
 752       p = strpbrk_or_eos (p, "/;?#");
 753       port_e = p;
 754
 755       if (port_b == port_e)
 756         {
 757           /* http://host:/whatever */
 758           /*             ^         */
 759           SETERR (error, PE_BAD_PORT_NUMBER);
 760           return NULL;
 761         }
 762
 763       for (port = 0, pp = port_b; pp < port_e; pp++)
 764         {
 765           if (!ISDIGIT (*pp))
 766             {
 767               /* http://host:12randomgarbage/blah */
 768               /*               ^                  */
 769               SETERR (error, PE_BAD_PORT_NUMBER);
 770               return NULL;
 771             }
 772           port = 10 * port + (*pp - '0');
 773         }
 774     }
 775
 776   if (*p == '/')
 777     {
 778       ++p;
 779       path_b = p;
 780       p = strpbrk_or_eos (p, ";?#");
 781       path_e = p;
 782     }
 783   else
 784     {
 785       /* Path is not allowed not to exist. */
 786       path_b = path_e = p;
 787     }
 788
 789   if (*p == ';')
 790     {
 791       ++p;
 792       params_b = p;
 793       p = strpbrk_or_eos (p, "?#");
 794       params_e = p;
 795     }
 796   if (*p == '?')
 797     {
 798       ++p;
 799       query_b = p;
 800       p = strpbrk_or_eos (p, "#");
 801       query_e = p;
 802     }
 803   if (*p == '#')
 804     {
 805       ++p;
 806       fragment_b = p;
 807       p += strlen (p);
 808       fragment_e = p;
 809     }
 810   assert (*p == 0);
 811
 812   if (uname_b != uname_e)
 813     {
 814       /* http://user:pass@host */
 815       /*        ^         ^    */
 816       /*     uname_b   uname_e */
 817       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 818         {
 819           SETERR (error, PE_INVALID_USER_NAME);
 820           return NULL;
 821         }
 822     }
 823
 824   u = (struct url *)xmalloc (sizeof (struct url));
 825   memset (u, 0, sizeof (*u));
 826
 827   u->scheme = scheme;
 828   u->host   = strdupdelim (host_b, host_e);
 829   u->port   = port;
 830   u->user   = user;
 831   u->passwd = passwd;
 832
 833   u->path = strdupdelim (path_b, path_e);
 834   path_modified = path_simplify (u->path);
 835   parse_path (u->path, &u->dir, &u->file);
 836
 837   host_modified = lowercase_str (u->host);
 838
 839   if (params_b)
 840     u->params = strdupdelim (params_b, params_e);
 841   if (query_b)
 842     u->query = strdupdelim (query_b, query_e);
 843   if (fragment_b)
 844     u->fragment = strdupdelim (fragment_b, fragment_e);
 845
 846   if (path_modified || u->fragment || host_modified || path_b == path_e)
 847     {
 848       /* If we suspect that a transformation has rendered what
 849          url_string might return different from URL_ENCODED, rebuild
 850          u->url using url_string.  */
 851       u->url = url_string (u, 0);
 852
 853       if (url_encoded != url)
 854         xfree ((char *) url_encoded);
 855     }
 856   else
 857     {
 858       if (url_encoded == url)
 859         u->url    = xstrdup (url);
 860       else
 861         u->url    = url_encoded;
 862     }
 863   url_encoded = NULL;
 864
 865   return u;
 866 }
 867
 868 const char *
 869 url_error (int error_code)
 870 {
 871   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 872   return parse_errors[error_code];
 873 }
 874
 875 static void
 876 parse_path (const char *quoted_path, char **dir, char **file)
 877 {
 878   char *path, *last_slash;
 879
 880   STRDUP_ALLOCA (path, quoted_path);
 881   decode_string (path);
 882
 883   last_slash = strrchr (path, '/');
 884   if (!last_slash)
 885     {
 886       *dir = xstrdup ("");
 887       *file = xstrdup (path);
 888     }
 889   else
 890     {
 891       *dir = strdupdelim (path, last_slash);
 892       *file = xstrdup (last_slash + 1);
 893     }
 894 }
 895
 896 /* Note: URL's "full path" is the path with the query string and
 897    params appended.  The "fragment" (#foo) is intentionally ignored,
 898    but that might be changed.  For example, if the original URL was
 899    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 900    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 901
 902 /* Return the length of the full path, without the terminating
 903    zero.  */
 904
 905 static int
 906 full_path_length (const struct url *url)
 907 {
 908   int len = 0;
 909
 910 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 911
 912   FROB (path);
 913   FROB (params);
 914   FROB (query);
 915
 916 #undef FROB
 917
 918   return len;
 919 }
 920
 921 /* Write out the full path. */
 922
 923 static void
 924 full_path_write (const struct url *url, char *where)
 925 {
 926 #define FROB(el, chr) do {                      \
 927   char *f_el = url->el;                         \
 928   if (f_el) {                                   \
 929     int l = strlen (f_el);                      \
 930     *where++ = chr;                             \
 931     memcpy (where, f_el, l);                    \
 932     where += l;                                 \
 933   }                                             \
 934 } while (0)
 935
 936   FROB (path, '/');
 937   FROB (params, ';');
 938   FROB (query, '?');
 939
 940 #undef FROB
 941 }
 942
 943 /* Public function for getting the "full path".  E.g. if u->path is
 944    "foo/bar" and u->query is "param=value", full_path will be
 945    "/foo/bar?param=value". */
 946
 947 char *
 948 url_full_path (const struct url *url)
 949 {
 950   int length = full_path_length (url);
 951   char *full_path = (char *)xmalloc(length + 1);
 952
 953   full_path_write (url, full_path);
 954   full_path[length] = '\0';
 955
 956   return full_path;
 957 }
 958
 959 /* Sync u->path and u->url with u->dir and u->file. */
 960
 961 static void
 962 sync_path (struct url *url)
 963 {
 964   char *newpath;
 965
 966   xfree (url->path);
 967
 968   if (!*url->dir)
 969     {
 970       newpath = xstrdup (url->file);
 971       REENCODE (newpath);
 972     }
 973   else
 974     {
 975       int dirlen = strlen (url->dir);
 976       int filelen = strlen (url->file);
 977
 978       newpath = xmalloc (dirlen + 1 + filelen + 1);
 979       memcpy (newpath, url->dir, dirlen);
 980       newpath[dirlen] = '/';
 981       memcpy (newpath + dirlen + 1, url->file, filelen);
 982       newpath[dirlen + 1 + filelen] = '\0';
 983       REENCODE (newpath);
 984     }
 985
 986   url->path = newpath;
 987
 988   /* Synchronize u->url. */
 989   xfree (url->url);
 990   url->url = url_string (url, 0);
 991 }
 992
 993 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
 994    This way we can sync u->path and u->url when they get changed.  */
 995
 996 void
 997 url_set_dir (struct url *url, const char *newdir)
 998 {
 999   xfree (url->dir);
1000   url->dir = xstrdup (newdir);
1001   sync_path (url);
1002 }
1003
1004 void
1005 url_set_file (struct url *url, const char *newfile)
1006 {
1007   xfree (url->file);
1008   url->file = xstrdup (newfile);
1009   sync_path (url);
1010 }
1011
1012 void
1013 url_free (struct url *url)
1014 {
1015   xfree (url->host);
1016   xfree (url->path);
1017   xfree (url->url);
1018
1019   FREE_MAYBE (url->params);
1020   FREE_MAYBE (url->query);
1021   FREE_MAYBE (url->fragment);
1022   FREE_MAYBE (url->user);
1023   FREE_MAYBE (url->passwd);
1024
1025   xfree (url->dir);
1026   xfree (url->file);
1027
1028   xfree (url);
1029 }
1030 \f
1031 struct urlpos *
1032 get_urls_file (const char *file)
1033 {
1034   struct file_memory *fm;
1035   struct urlpos *head, *tail;
1036   const char *text, *text_end;
1037
1038   /* Load the file.  */
1039   fm = read_file (file);
1040   if (!fm)
1041     {
1042       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1043       return NULL;
1044     }
1045   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1046
1047   head = tail = NULL;
1048   text = fm->content;
1049   text_end = fm->content + fm->length;
1050   while (text < text_end)
1051     {
1052       const char *line_beg = text;
1053       const char *line_end = memchr (text, '\n', text_end - text);
1054       if (!line_end)
1055         line_end = text_end;
1056       else
1057         ++line_end;
1058       text = line_end;
1059
1060       /* Strip whitespace from the beginning and end of line. */
1061       while (line_beg < line_end && ISSPACE (*line_beg))
1062         ++line_beg;
1063       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1064         --line_end;
1065
1066       if (line_end > line_beg)
1067         {
1068           /* URL is in the [line_beg, line_end) region. */
1069
1070           int up_error_code;
1071           char *url_text;
1072           struct urlpos *entry;
1073           struct url *url;
1074
1075           /* We must copy the URL to a zero-terminated string, and we
1076              can't use alloca because we're in a loop.  *sigh*.  */
1077           url_text = strdupdelim (line_beg, line_end);
1078
1079           if (opt.base_href)
1080             {
1081               /* Merge opt.base_href with URL. */
1082               char *merged = uri_merge (opt.base_href, url_text);
1083               xfree (url_text);
1084               url_text = merged;
1085             }
1086
1087           url = url_parse (url_text, &up_error_code);
1088           if (!url)
1089             {
1090               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1091                          file, url_text, url_error (up_error_code));
1092               xfree (url_text);
1093               continue;
1094             }
1095           xfree (url_text);
1096
1097           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1098           memset (entry, 0, sizeof (*entry));
1099           entry->next = NULL;
1100           entry->url = url;
1101
1102           if (!head)
1103             head = entry;
1104           else
1105             tail->next = entry;
1106           tail = entry;
1107         }
1108     }
1109   read_file_free (fm);
1110   return head;
1111 }
1112 \f
1113 /* Free the linked list of urlpos.  */
1114 void
1115 free_urlpos (struct urlpos *l)
1116 {
1117   while (l)
1118     {
1119       struct urlpos *next = l->next;
1120       if (l->url)
1121         url_free (l->url);
1122       FREE_MAYBE (l->local_name);
1123       xfree (l);
1124       l = next;
1125     }
1126 }
1127
1128 /* Rotate FNAME opt.backups times */
1129 void
1130 rotate_backups(const char *fname)
1131 {
1132   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1133   char *from = (char *)alloca (maxlen);
1134   char *to = (char *)alloca (maxlen);
1135   struct stat sb;
1136   int i;
1137
1138   if (stat (fname, &sb) == 0)
1139     if (S_ISREG (sb.st_mode) == 0)
1140       return;
1141
1142   for (i = opt.backups; i > 1; i--)
1143     {
1144       sprintf (from, "%s.%d", fname, i - 1);
1145       sprintf (to, "%s.%d", fname, i);
1146       /* #### This will fail on machines without the rename() system
1147          call.  */
1148       rename (from, to);
1149     }
1150
1151   sprintf (to, "%s.%d", fname, 1);
1152   rename(fname, to);
1153 }
1154
1155 /* Create all the necessary directories for PATH (a file).  Calls
1156    mkdirhier() internally.  */
1157 int
1158 mkalldirs (const char *path)
1159 {
1160   const char *p;
1161   char *t;
1162   struct stat st;
1163   int res;
1164
1165   p = path + strlen (path);
1166   for (; *p != '/' && p != path; p--);
1167   /* Don't create if it's just a file.  */
1168   if ((p == path) && (*p != '/'))
1169     return 0;
1170   t = strdupdelim (path, p);
1171   /* Check whether the directory exists.  */
1172   if ((stat (t, &st) == 0))
1173     {
1174       if (S_ISDIR (st.st_mode))
1175         {
1176           xfree (t);
1177           return 0;
1178         }
1179       else
1180         {
1181           /* If the dir exists as a file name, remove it first.  This
1182              is *only* for Wget to work with buggy old CERN http
1183              servers.  Here is the scenario: When Wget tries to
1184              retrieve a directory without a slash, e.g.
1185              http://foo/bar (bar being a directory), CERN server will
1186              not redirect it too http://foo/bar/ -- it will generate a
1187              directory listing containing links to bar/file1,
1188              bar/file2, etc.  Wget will lose because it saves this
1189              HTML listing to a file `bar', so it cannot create the
1190              directory.  To work around this, if the file of the same
1191              name exists, we just remove it and create the directory
1192              anyway.  */
1193           DEBUGP (("Removing %s because of directory danger!\n", t));
1194           unlink (t);
1195         }
1196     }
1197   res = make_directory (t);
1198   if (res != 0)
1199     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1200   xfree (t);
1201   return res;
1202 }
1203
/* Return the number of '/' characters in the zero-terminated string
   S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *p;

  for (p = s; *p != '\0'; p++)
    {
      if (*p == '/')
        total++;
    }
  return total;
}
1213
1214 /* Return the path name of the URL-equivalent file name, with a
1215    remote-like structure of directories.  */
1216 static char *
1217 mkstruct (const struct url *u)
1218 {
1219   char *dir, *dir_preencoding;
1220   char *file, *res, *dirpref;
1221   char *query = u->query && *u->query ? u->query : NULL;
1222   int l;
1223
1224   if (opt.cut_dirs)
1225     {
1226       char *ptr = u->dir + (*u->dir == '/');
1227       int slash_count = 1 + count_slashes (ptr);
1228       int cut = MINVAL (opt.cut_dirs, slash_count);
1229       for (; cut && *ptr; ptr++)
1230         if (*ptr == '/')
1231           --cut;
1232       STRDUP_ALLOCA (dir, ptr);
1233     }
1234   else
1235     dir = u->dir + (*u->dir == '/');
1236
1237   /* Check for the true name (or at least a consistent name for saving
1238      to directory) of HOST, reusing the hlist if possible.  */
1239   if (opt.add_hostdir)
1240     {
1241       /* Add dir_prefix and hostname (if required) to the beginning of
1242          dir.  */
1243       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1244                                 + strlen (u->host)
1245                                 + 1 + numdigit (u->port)
1246                                 + 1);
1247       if (!DOTP (opt.dir_prefix))
1248         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1249       else
1250         strcpy (dirpref, u->host);
1251
1252       if (u->port != scheme_default_port (u->scheme))
1253         {
1254           int len = strlen (dirpref);
1255           dirpref[len] = ':';
1256           number_to_string (dirpref + len + 1, u->port);
1257         }
1258     }
1259   else                          /* not add_hostdir */
1260     {
1261       if (!DOTP (opt.dir_prefix))
1262         dirpref = opt.dir_prefix;
1263       else
1264         dirpref = "";
1265     }
1266
1267   /* If there is a prefix, prepend it.  */
1268   if (*dirpref)
1269     {
1270       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1271       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1272       dir = newdir;
1273     }
1274
1275   dir_preencoding = dir;
1276   dir = reencode_string (dir_preencoding);
1277
1278   l = strlen (dir);
1279   if (l && dir[l - 1] == '/')
1280     dir[l - 1] = '\0';
1281
1282   if (!*u->file)
1283     file = "index.html";
1284   else
1285     file = u->file;
1286
1287   /* Finally, construct the full name.  */
1288   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1289                          + (query ? (1 + strlen (query)) : 0)
1290                          + 1);
1291   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1292   if (query)
1293     {
1294       strcat (res, "?");
1295       strcat (res, query);
1296     }
1297   if (dir != dir_preencoding)
1298     xfree (dir);
1299   return res;
1300 }
1301
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string (may be NULL).  Unsafe characters in BASE
   are written as %XX, and slashes in QUERY are written as %2F.

   The name is assembled in a fixed-size buffer and is silently
   truncated to fit, presumably because of a huge query string.  Every
   write is checked against the space that is left, and a %XX escape
   is either stored whole or not at all, so the result never ends in a
   partial escape.  Returns a freshly allocated copy of the name.  */

static char *
compose_file_name (char *base, char *query)
{
  char result[256];
  char *to = result;
  /* Characters may be stored strictly below LIMIT; the last slot is
     reserved for the terminating zero.  */
  char *const limit = result + sizeof (result) - 1;
  const char *from;
  int full = 0;                 /* set once something did not fit */

  /* Copy BASE to RESULT and encode all unsafe characters.  */
  for (from = base; *from && !full; from++)
    {
      if (UNSAFE_CHAR (*from))
        {
          unsigned char c = *from;
          if (limit - to < 3)
            full = 1;
          else
            {
              *to++ = '%';
              *to++ = XDIGIT_TO_XCHAR (c >> 4);
              *to++ = XDIGIT_TO_XCHAR (c & 0xf);
            }
        }
      else if (to < limit)
        *to++ = *from;
      else
        full = 1;
    }

  if (query && !full && to < limit)
    {
      *to++ = '?';

      /* Copy QUERY to RESULT and encode all '/' characters. */
      for (from = query; *from && !full; from++)
        {
          if (*from == '/')
            {
              if (limit - to < 3)
                full = 1;
              else
                {
                  *to++ = '%';
                  *to++ = '2';
                  *to++ = 'F';
                }
            }
          else if (to < limit)
            *to++ = *from;
          else
            full = 1;
        }
    }

  /* TO never moves past LIMIT, so this store is always in bounds.  */
  *to = '\0';

  return xstrdup (result);
}
1358
1359 /* Create a unique filename, corresponding to a given URL.  Calls
1360    mkstruct if necessary.  Does *not* actually create any directories.  */
1361 char *
1362 url_filename (const struct url *u)
1363 {
1364   char *file, *name;
1365   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1366
1367   if (opt.dirstruct)
1368     {
1369       file = mkstruct (u);
1370       have_prefix = 1;
1371     }
1372   else
1373     {
1374       char *base = *u->file ? u->file : "index.html";
1375       char *query = u->query && *u->query ? u->query : NULL;
1376       file = compose_file_name (base, query);
1377     }
1378
1379   if (!have_prefix)
1380     {
1381       /* Check whether the prefix directory is something other than "."
1382          before prepending it.  */
1383       if (!DOTP (opt.dir_prefix))
1384         {
1385           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1386                                          + 1 + strlen (file) + 1);
1387           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1388           xfree (file);
1389           file = nfile;
1390         }
1391     }
1392   /* DOS-ish file systems don't like `%' signs in them; we change it
1393      to `@'.  */
1394 #ifdef WINDOWS
1395   {
1396     char *p = file;
1397     for (p = file; *p; p++)
1398       if (*p == '%')
1399         *p = '@';
1400   }
1401 #endif /* WINDOWS */
1402
1403   /* Check the cases in which the unique extensions are not used:
1404      1) Clobbering is turned off (-nc).
1405      2) Retrieval with regetting.
1406      3) Timestamping is used.
1407      4) Hierarchy is built.
1408
1409      The exception is the case when file does exist and is a
1410      directory (actually support for bad httpd-s).  */
1411   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1412       && !(file_exists_p (file) && !file_non_directory_p (file)))
1413     return file;
1414
1415   /* Find a unique name.  */
1416   name = unique_name (file);
1417   xfree (file);
1418   return name;
1419 }
1420
/* Return the length of URL's path.  The path is considered to end at
   the first '?', ';' or '#', or at the end of the string when none
   of them is present.  */
static int
path_length (const char *url)
{
  const char *stop = strpbrk_or_eos (url, "?;#");

  return stop - url;
}
1430
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?

   E is exclusive: the character at E is never examined, while the
   one at B is.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back first and test afterwards, so that the positions
     visited are exactly e-1, e-2, ..., b.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1443 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Empty path
   elements (runs of slashes) are collapsed as well.  Leading and
   trailing slashes are preserved.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */

static int
path_simplify (char *path)
{
  int change = 0;
  char *p, *end;

  if (path[0] == '/')
    ++path;                     /* preserve the leading '/'. */

  p = path;
  end = p + strlen (p) + 1;     /* position past the terminating zero. */

  /* Each memmove below copies the tail of the string *including* its
     terminating zero, i.e. exactly the bytes from the source position
     up to END, and never anything beyond END.  */

  while (1)
    {
      /* P points to the beginning of a path element. */

      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
        {
          change = 1;
          if (*(p + 1) == '\0')
            {
              /* A final "." simply disappears.  */
              *p = '\0';
              break;
            }

          /* Handle "./foo" by moving "foo" two characters to the
             left. */
          memmove (p, p + 2, end - (p + 2));
          end -= 2;
          continue;             /* re-examine the element now at P */
        }
      else if (*p == '.' && *(p + 1) == '.'
               && (*(p + 2) == '/' || *(p + 2) == '\0'))
        {
          /* Handle "../foo" by moving "foo" one path element to the
             left.  */
          char *b = p;          /* not p-1 because P can equal PATH */

          /* Backtrack by one path element, but not past the beginning
             of PATH. */

          /* foo/bar/../baz */
          /*         ^ p    */
          /*     ^ b        */

          if (b > path)
            {
              /* Move backwards until B hits the beginning of the
                 previous path element or the beginning of path. */
              for (--b; b > path && *(b - 1) != '/'; b--)
                ;
            }

          change = 1;
          if (*(p + 2) == '\0')
            {
              /* A final ".." removes the previous element and ends
                 the string there.  */
              *b = '\0';
              break;
            }

          memmove (b, p + 3, end - (p + 3));
          end -= (p + 3) - b;
          p = b;
          continue;             /* re-examine the element now at P */
        }
      else if (*p == '/')
        {
          /* Remove empty path elements.  Not mandated by rfc1808 et
             al, but empty path elements are not all that useful, and
             the rest of Wget might not deal with them well. */
          char *q = p;
          while (*q == '/')
            ++q;
          change = 1;
          if (*q == '\0')
            {
              *p = '\0';
              break;
            }
          memmove (p, q, end - q);
          end -= q - p;
          continue;             /* re-examine the element now at P */
        }

      /* Skip to the next path element. */
      while (*p && *p != '/')
        ++p;
      if (*p == '\0')
        break;

      /* Make sure P points to the beginning of the next path element,
         which is location after the slash. */
      ++p;
    }

  return change;
}
1563 \f
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   When NO_SCHEME is zero, LINK is taken to be complete and a copy of
   it is returned as is.  See uri_merge for a gentler interface to
   this functionality.

   Every relative case boils down to the same operation: keep a
   leading part of BASE and append LINK to it.  The branches below
   only decide where BASE gets cut; the result is assembled once, at
   the end.  The returned string is freshly allocated.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  const char *end;              /* end of the path part of BASE */
  const char *cut;              /* BASE is kept up to, not including, CUT */
  int slash_needed = 0;         /* last kept character becomes '/' */
  int keep;
  char *merged;

  if (!no_scheme)
    return strdupdelim (link, link + linklength);

  if (!*link)
    /* Empty LINK points back to BASE, query string and all. */
    return xstrdup (base);

  end = base + path_length (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      cut = end;
    }
  else if (*link == '#')
    {
      /* LINK only changes the fragment.  Examples: */
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      cut = strchr (base, '#');
      if (!cut)
        cut = base + strlen (base);
    }
  else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* Cut at the first slash if it starts a double slash, else
         replace BASE from its beginning.  */
      const char *slash = memchr (base, '/', end - base);
      cut = (slash && *(slash + 1) == '/') ? slash : base;
    }
  else if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.  A leading
         double slash in BASE is skipped when looking for it.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        {
          /* Look for the first slash after the "//".  */
          slash = memchr (slash + 2, '/', end - (slash + 2));

          /* example: "http://something/"   cut at that slash    */
          /* example: "http://foo"          cut at end of path   */
          cut = slash ? slash : end;
        }
      else
        {
          /* example: "foo" or "foo/bar"    LINK replaces it all */
          cut = base;
        }
    }
  else
    {
      /* LINK is a relative URL: we need to replace everything
         after last slash (possibly empty) with LINK.

         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      const char *last_slash = find_last_char (base, end, '/');

      if (!last_slash
          || (last_slash != base && *(last_slash - 1) == '/'))
        {
          /* Either there is no slash at all (example: "foo?..."), or
             the last one belongs to a double slash (example:
             "http://host").  Appending LINK directly would yield
             "fooqux/xyzzy" where "foo/qux/xyzzy" is wanted, so a
             separating slash is needed.

             To make room for it, one character past END is kept and
             then overwritten with '/'.  Accessing that character is
             fine, since it is the delimiter that ended the path,
             '\0' or '?'.  */
          cut = end + 1;
          slash_needed = 1;
        }
      else
        {
          /* example: "whatever/foo/bar" */
          /*                        ^    */
          cut = last_slash + 1;
        }
    }

  /* Assemble: the kept part of BASE followed by LINK.  */
  keep = cut - base;
  merged = (char *)xmalloc (keep + linklength + 1);
  if (keep)
    memcpy (merged, base, keep);
  if (slash_needed)
    merged[keep - 1] = '/';
  if (linklength)
    memcpy (merged + keep, link, linklength);
  merged[keep + linklength] = '\0';

  return merged;
}
1771
/* Merge BASE with LINK and return the resulting URI.  This is the
   convenience front end to uri_merge_1 for a zero-terminated LINK:
   the length is taken with strlen, and LINK is treated as relative
   exactly when url_has_scheme says it carries no scheme.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  int relative = !url_has_scheme (link);

  return uri_merge_1 (base, link, link_len, relative);
}
1780 \f
/* Copy the zero-terminated string S to P, without the terminating
   zero, and advance P past the copied characters.  P must be a
   modifiable pointer lvalue.  S is evaluated twice, so it must not
   have side effects.  The arguments are parenthesized and the local
   carries a macro-specific name, so that neither an expression
   argument nor a caller's own variable named `len' can be
   misinterpreted.  */
#define APPEND(p, s) do {                       \
  size_t APPEND_len = strlen (s);               \
  memcpy ((p), (s), APPEND_len);                \
  (p) += APPEND_len;                            \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1792
1793 /* Recreate the URL string from the data in URL.
1794
1795    If HIDE is non-zero (as it is when we're calling this on a URL we
1796    plan to print, but not when calling it to canonicalize a URL for
1797    use within the program), password will be hidden.  Unsafe
1798    characters in the URL will be quoted.  */
1799
1800 char *
1801 url_string (const struct url *url, int hide_password)
1802 {
1803   int size;
1804   char *result, *p;
1805   char *quoted_user = NULL, *quoted_passwd = NULL;
1806
1807   int scheme_port  = supported_schemes[url->scheme].default_port;
1808   char *scheme_str = supported_schemes[url->scheme].leading_string;
1809   int fplen = full_path_length (url);
1810
1811   int brackets_around_host = 0;
1812
1813   assert (scheme_str != NULL);
1814
1815   /* Make sure the user name and password are quoted. */
1816   if (url->user)
1817     {
1818       quoted_user = encode_string_maybe (url->user);
1819       if (url->passwd)
1820         {
1821           if (hide_password)
1822             quoted_passwd = HIDDEN_PASSWORD;
1823           else
1824             quoted_passwd = encode_string_maybe (url->passwd);
1825         }
1826     }
1827
1828   if (strchr (url->host, ':'))
1829     brackets_around_host = 1;
1830
1831   size = (strlen (scheme_str)
1832           + strlen (url->host)
1833           + (brackets_around_host ? 2 : 0)
1834           + fplen
1835           + 1);
1836   if (url->port != scheme_port)
1837     size += 1 + numdigit (url->port);
1838   if (quoted_user)
1839     {
1840       size += 1 + strlen (quoted_user);
1841       if (quoted_passwd)
1842         size += 1 + strlen (quoted_passwd);
1843     }
1844
1845   p = result = xmalloc (size);
1846
1847   APPEND (p, scheme_str);
1848   if (quoted_user)
1849     {
1850       APPEND (p, quoted_user);
1851       if (quoted_passwd)
1852         {
1853           *p++ = ':';
1854           APPEND (p, quoted_passwd);
1855         }
1856       *p++ = '@';
1857     }
1858
1859   if (brackets_around_host)
1860     *p++ = '[';
1861   APPEND (p, url->host);
1862   if (brackets_around_host)
1863     *p++ = ']';
1864   if (url->port != scheme_port)
1865     {
1866       *p++ = ':';
1867       p = number_to_string (p, url->port);
1868     }
1869
1870   full_path_write (url, p);
1871   p += fplen;
1872   *p++ = '\0';
1873
1874   assert (p - result == size);
1875
1876   if (quoted_user && quoted_user != url->user)
1877     xfree (quoted_user);
1878   if (quoted_passwd && !hide_password
1879       && quoted_passwd != url->passwd)
1880     xfree (quoted_passwd);
1881
1882   return result;
1883 }
1884 \f
1885 /* Returns proxy host address, in accordance with SCHEME.  */
1886 char *
1887 getproxy (enum url_scheme scheme)
1888 {
1889   char *proxy = NULL;
1890   char *rewritten_url;
1891   static char rewritten_storage[1024];
1892
1893   switch (scheme)
1894     {
1895     case SCHEME_HTTP:
1896       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1897       break;
1898 #ifdef HAVE_SSL
1899     case SCHEME_HTTPS:
1900       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1901       break;
1902 #endif
1903     case SCHEME_FTP:
1904       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1905       break;
1906     case SCHEME_INVALID:
1907       break;
1908     }
1909   if (!proxy || !*proxy)
1910     return NULL;
1911
1912   /* Handle shorthands. */
1913   rewritten_url = rewrite_shorthand_url (proxy);
1914   if (rewritten_url)
1915     {
1916       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1917       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1918       proxy = rewritten_storage;
1919     }
1920
1921   return proxy;
1922 }
1923
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns 1 when NO_PROXY is NULL or when sufmatch reports no match
   of HOST against NO_PROXY, and 0 otherwise.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  /* Without a no_proxy list nothing is exempt from proxying.  */
  return !no_proxy || !sufmatch (no_proxy, host);
}
1933 \f
1934 /* Support for converting links for local viewing in downloaded HTML
1935    files.  This should be moved to another file, because it has
1936    nothing to do with processing URLs.  */
1937
1938 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1939 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1940                                          const char *));
1941 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1942                                                       const char *, int));
1943 static char *local_quote_string PARAMS ((const char *));
1944
1945 /* Change the links in one HTML file.  LINKS is a list of links in the
1946    document, along with their positions and the desired direction of
1947    the conversion.  */
1948 void
1949 convert_links (const char *file, struct urlpos *links)
1950 {
1951   struct file_memory *fm;
1952   FILE *fp;
1953   const char *p;
1954   downloaded_file_t downloaded_file_return;
1955
1956   struct urlpos *link;
1957   int to_url_count = 0, to_file_count = 0;
1958
1959   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1960
1961   {
1962     /* First we do a "dry run": go through the list L and see whether
1963        any URL needs to be converted in the first place.  If not, just
1964        leave the file alone.  */
1965     int dry_count = 0;
1966     struct urlpos *dry = links;
1967     for (dry = links; dry; dry = dry->next)
1968       if (dry->convert != CO_NOCONVERT)
1969         ++dry_count;
1970     if (!dry_count)
1971       {
1972         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1973         return;
1974       }
1975   }
1976
1977   fm = read_file (file);
1978   if (!fm)
1979     {
1980       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1981                  file, strerror (errno));
1982       return;
1983     }
1984
1985   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1986   if (opt.backup_converted && downloaded_file_return)
1987     write_backup_file (file, downloaded_file_return);
1988
1989   /* Before opening the file for writing, unlink the file.  This is
1990      important if the data in FM is mmaped.  In such case, nulling the
1991      file, which is what fopen() below does, would make us read all
1992      zeroes from the mmaped region.  */
1993   if (unlink (file) < 0 && errno != ENOENT)
1994     {
1995       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1996                  file, strerror (errno));
1997       read_file_free (fm);
1998       return;
1999     }
2000   /* Now open the file for writing.  */
2001   fp = fopen (file, "wb");
2002   if (!fp)
2003     {
2004       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2005                  file, strerror (errno));
2006       read_file_free (fm);
2007       return;
2008     }
2009
2010   /* Here we loop through all the URLs in file, replacing those of
2011      them that are downloaded with relative references.  */
2012   p = fm->content;
2013   for (link = links; link; link = link->next)
2014     {
2015       char *url_start = fm->content + link->pos;
2016
2017       if (link->pos >= fm->length)
2018         {
2019           DEBUGP (("Something strange is going on.  Please investigate."));
2020           break;
2021         }
2022       /* If the URL is not to be converted, skip it.  */
2023       if (link->convert == CO_NOCONVERT)
2024         {
2025           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2026           continue;
2027         }
2028
2029       /* Echo the file contents, up to the offending URL's opening
2030          quote, to the outfile.  */
2031       fwrite (p, 1, url_start - p, fp);
2032       p = url_start;
2033
2034       switch (link->convert)
2035         {
2036         case CO_CONVERT_TO_RELATIVE:
2037           /* Convert absolute URL to relative. */
2038           {
2039             char *newname = construct_relative (file, link->local_name);
2040             char *quoted_newname = local_quote_string (newname);
2041
2042             if (!link->link_refresh_p)
2043               p = replace_attr (p, link->size, fp, quoted_newname);
2044             else
2045               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2046                                              link->refresh_timeout);
2047
2048             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2049                      link->url->url, newname, link->pos, file));
2050             xfree (newname);
2051             xfree (quoted_newname);
2052             ++to_file_count;
2053             break;
2054           }
2055         case CO_CONVERT_TO_COMPLETE:
2056           /* Convert the link to absolute URL. */
2057           {
2058             char *newlink = link->url->url;
2059             char *quoted_newlink = html_quote_string (newlink);
2060
2061             if (!link->link_refresh_p)
2062               p = replace_attr (p, link->size, fp, quoted_newlink);
2063             else
2064               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2065                                              link->refresh_timeout);
2066
2067             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2068                      newlink, link->pos, file));
2069             xfree (quoted_newlink);
2070             ++to_url_count;
2071             break;
2072           }
2073         case CO_NULLIFY_BASE:
2074           /* Change the base href to "". */
2075           p = replace_attr (p, link->size, fp, "");
2076           break;
2077         case CO_NOCONVERT:
2078           abort ();
2079           break;
2080         }
2081     }
2082
2083   /* Output the rest of the file. */
2084   if (p - fm->content < fm->length)
2085     fwrite (p, 1, fm->length - (p - fm->content), fp);
2086   fclose (fp);
2087   read_file_free (fm);
2088
2089   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2090 }
2091
/* Build a freshly allocated relative link that leads from the file
   named S1 (the referring file) to the file named S2 (the referred
   file).  The caller owns the returned string.

   Examples:
     S1 = "jagor.srce.hr/index.html", S2 = "jagor.srce.hr/images/news.gif"
       => "images/news.gif"
     S1 = "fly.cc.fer.hr/ioccc/index.html", S2 = "fly.cc.fer.hr/images/fly.gif"
       => "../images/fly.gif"

   If S2 begins with `/', a plain copy of S2 is returned.

   Caveats: S1 must not begin with `/' unless S2 does too (this is
   asserted).  S1 should not contain ".." and the like --
   construct_relative ("fly/ioccc/../index.html", "fly/images/fly.gif")
   will fail; running something like path_simplify() on S1 first is a
   workaround.  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos, common, updirs, k;
  char *rel, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Advance over the leading directory components the two names
     share.  COMMON ends up just past the last shared `/'.  */
  pos = common = 0;
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every remaining `/' in S1 is one directory level we must climb.  */
  updirs = 0;
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++updirs;

  /* The result is "../" repeated UPDIRS times, followed by the part
     of S2 that is not shared with S1.  */
  rel = (char *)xmalloc (3 * updirs + strlen (s2 + common) + 1);
  out = rel;
  for (k = 0; k < updirs; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return rel;
}
2145 \f
2146 static void
2147 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2148 {
2149   /* Rather than just writing over the original .html file with the
2150      converted version, save the former to *.orig.  Note we only do
2151      this for files we've _successfully_ downloaded, so we don't
2152      clobber .orig files sitting around from previous invocations. */
2153
2154   /* Construct the backup filename as the original name plus ".orig". */
2155   size_t         filename_len = strlen(file);
2156   char*          filename_plus_orig_suffix;
2157   boolean        already_wrote_backup_file = FALSE;
2158   slist*         converted_file_ptr;
2159   static slist*  converted_files = NULL;
2160
2161   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2162     {
2163       /* Just write "orig" over "html".  We need to do it this way
2164          because when we're checking to see if we've downloaded the
2165          file before (to see if we can skip downloading it), we don't
2166          know if it's a text/html file.  Therefore we don't know yet
2167          at that stage that -E is going to cause us to tack on
2168          ".html", so we need to compare vs. the original URL plus
2169          ".orig", not the original URL plus ".html.orig". */
2170       filename_plus_orig_suffix = alloca (filename_len + 1);
2171       strcpy(filename_plus_orig_suffix, file);
2172       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2173     }
2174   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2175     {
2176       /* Append ".orig" to the name. */
2177       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2178       strcpy(filename_plus_orig_suffix, file);
2179       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2180     }
2181
2182   /* We can get called twice on the same URL thanks to the
2183      convert_all_links() call in main().  If we write the .orig file
2184      each time in such a case, it'll end up containing the first-pass
2185      conversion, not the original file.  So, see if we've already been
2186      called on this file. */
2187   converted_file_ptr = converted_files;
2188   while (converted_file_ptr != NULL)
2189     if (strcmp(converted_file_ptr->string, file) == 0)
2190       {
2191         already_wrote_backup_file = TRUE;
2192         break;
2193       }
2194     else
2195       converted_file_ptr = converted_file_ptr->next;
2196
2197   if (!already_wrote_backup_file)
2198     {
2199       /* Rename <file> to <file>.orig before former gets written over. */
2200       if (rename(file, filename_plus_orig_suffix) != 0)
2201         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2202                    file, filename_plus_orig_suffix, strerror (errno));
2203
2204       /* Remember that we've already written a .orig backup for this file.
2205          Note that we never free this memory since we need it till the
2206          convert_all_links() call, which is one of the last things the
2207          program does before terminating.  BTW, I'm not sure if it would be
2208          safe to just set 'converted_file_ptr->string' to 'file' below,
2209          rather than making a copy of the string...  Another note is that I
2210          thought I could just add a field to the urlpos structure saying
2211          that we'd written a .orig file for this URL, but that didn't work,
2212          so I had to make this separate list.
2213          -- Dan Harkless <wget@harkless.org>
2214
2215          This [adding a field to the urlpos structure] didn't work
2216          because convert_file() is called from convert_all_links at
2217          the end of the retrieval with a freshly built new urlpos
2218          list.
2219          -- Hrvoje Niksic <hniksic@arsdigita.com>
2220       */
2221       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2222       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2223       converted_file_ptr->next = converted_files;
2224       converted_files = converted_file_ptr;
2225     }
2226 }
2227
2228 static int find_fragment PARAMS ((const char *, int, const char **,
2229                                   const char **));
2230
/* Write NEW_TEXT to FP in place of the attribute value found at P.

   P points at an attribute value occupying SIZE bytes:
       "...old-contents..."      (SIZE includes both quotes)
   or
       ...old-contents...        (no quotes)

   The output is always quoted: with the original quote character if
   the value was quoted, with `"' otherwise.  If the old value holds a
   fragment identifier (as located by find_fragment), it is appended
   after NEW_TEXT.  Returns a pointer just past the old value,
   including its closing quote if it had one.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  const int was_quoted = (*p == '\"' || *p == '\'');
  const char delim = was_quoted ? *p : '\"';
  /* The value proper, with the surrounding quotes (if any) left out.  */
  const char *value = was_quoted ? p + 1 : p;
  const int value_len = was_quoted ? size - 2 : size;
  const char *frag_start, *frag_stop;

  putc (delim, fp);
  fputs (new_text, fp);

  /* Carry over the fragment identifier, if any. */
  if (find_fragment (value, value_len, &frag_start, &frag_stop))
    fwrite (frag_start, 1, frag_stop - frag_start, fp);

  putc (delim, fp);

  /* Step over the old value and, when present, its closing quote.  */
  return value + value_len + (was_quoted ? 1 : 0);
}
2269
/* Variant of replace_attr for <meta http-equiv=refresh content="...">.
   The replacement written is "TIMEOUT; URL=NEW_TEXT" rather than just
   NEW_TEXT.  The return value is whatever replace_attr returns.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Room for the digits of TIMEOUT, the text, and "; URL=" -- whose
     sizeof covers its six characters plus the terminating NUL.  */
  char *content = (char *)alloca (numdigit (timeout)
                                  + strlen (new_text)
                                  + sizeof ("; URL="));

  /* E.g. "0; URL=..." */
  sprintf (content, "%d; URL=%s", timeout, new_text);
  return replace_attr (p, size, fp, content);
}
2287
/* Locate the first '#' in [BEG, BEG+SIZE) that is not immediately
   preceded by '&'.  On success, store the address of that '#' in *BP
   and BEG+SIZE in *EP, and return 1.  If there is no such character,
   return 0 and leave *BP and *EP untouched.

   This is used for finding the fragment identifiers in URLs; the '&'
   rule keeps "&#" sequences from being taken for a fragment.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *const limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      after_amp = (*cur == '&');
    }
  return 0;
}
2321
2322 /* Quote FILE for use as local reference to an HTML file.
2323
2324    We quote ? as %3F to avoid passing part of the file name as the
2325    parameter when browsing the converted file through HTTP.  However,
2326    it is safe to do this only when `--html-extension' is turned on.
2327    This is because converting "index.html?foo=bar" to
2328    "index.html%3Ffoo=bar" would break local browsing, as the latter
2329    isn't even recognized as an HTML file!  However, converting
2330    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2331    safe for both local and HTTP-served browsing.  */
2332
2333 static char *
2334 local_quote_string (const char *file)
2335 {
2336   const char *file_sans_qmark;
2337   int qm;
2338
2339   if (!opt.html_extension)
2340     return html_quote_string (file);
2341
2342   qm = count_char (file, '?');
2343
2344   if (qm)
2345     {
2346       const char *from = file;
2347       char *to, *newname;
2348
2349       /* qm * 2 because we replace each question mark with "%3F",
2350          i.e. replace one char with three, hence two more.  */
2351       int fsqlen = strlen (file) + qm * 2;
2352
2353       to = newname = (char *)alloca (fsqlen + 1);
2354       for (; *from; from++)
2355         {
2356           if (*from != '?')
2357             *to++ = *from;
2358           else
2359             {
2360               *to++ = '%';
2361               *to++ = '3';
2362               *to++ = 'F';
2363             }
2364         }
2365       assert (to - newname == fsqlen);
2366       *to = '\0';
2367
2368       file_sans_qmark = newname;
2369     }
2370   else
2371     file_sans_qmark = file;
2372
2373   return html_quote_string (file_sans_qmark);
2374 }
2375
2376 /* We're storing "modes" of type downloaded_file_t in the hash table.
2377    However, our hash tables only accept pointers for keys and values.
2378    So when we need a pointer, we use the address of a
2379    downloaded_file_t variable of static storage.  */
2380
2381 static downloaded_file_t *
2382 downloaded_mode_to_ptr (downloaded_file_t mode)
2383 {
2384   static downloaded_file_t
2385     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2386     v2 = FILE_DOWNLOADED_NORMALLY,
2387     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2388     v4 = CHECK_FOR_FILE;
2389
2390   switch (mode)
2391     {
2392     case FILE_NOT_ALREADY_DOWNLOADED:
2393       return &v1;
2394     case FILE_DOWNLOADED_NORMALLY:
2395       return &v2;
2396     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2397       return &v3;
2398     case CHECK_FOR_FILE:
2399       return &v4;
2400     }
2401   return NULL;
2402 }
2403
2404 /* This should really be merged with dl_file_url_map and
2405    downloaded_html_files in recur.c.  This was originally a list, but
2406    I changed it to a hash table because it was actually taking a lot of
2407    time to find things in it.  */
2408
2409 static struct hash_table *downloaded_files_hash;
2410
2411 /* Remembers which files have been downloaded.  In the standard case, should be
2412    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2413    download successfully (i.e. not for ones we have failures on or that we skip
2414    due to -N).
2415
2416    When we've downloaded a file and tacked on a ".html" extension due to -E,
2417    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2418    FILE_DOWNLOADED_NORMALLY.
2419
2420    If you just want to check if a file has been previously added without adding
2421    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2422    with local filenames, not remote URLs. */
2423 downloaded_file_t
2424 downloaded_file (downloaded_file_t mode, const char *file)
2425 {
2426   downloaded_file_t *ptr;
2427
2428   if (mode == CHECK_FOR_FILE)
2429     {
2430       if (!downloaded_files_hash)
2431         return FILE_NOT_ALREADY_DOWNLOADED;
2432       ptr = hash_table_get (downloaded_files_hash, file);
2433       if (!ptr)
2434         return FILE_NOT_ALREADY_DOWNLOADED;
2435       return *ptr;
2436     }
2437
2438   if (!downloaded_files_hash)
2439     downloaded_files_hash = make_string_hash_table (0);
2440
2441   ptr = hash_table_get (downloaded_files_hash, file);
2442   if (ptr)
2443     return *ptr;
2444
2445   ptr = downloaded_mode_to_ptr (mode);
2446   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2447
2448   return FILE_NOT_ALREADY_DOWNLOADED;
2449 }
2450
/* Callback handed to hash_table_map by downloaded_files_free.  Frees
   KEY and always returns 0; VALUE and IGNORED are not used.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  (void) value;
  (void) ignored;

  xfree (key);
  return 0;
}
2457
2458 void
2459 downloaded_files_free (void)
2460 {
2461   if (downloaded_files_hash)
2462     {
2463       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2464       hash_table_destroy (downloaded_files_hash);
2465       downloaded_files_hash = NULL;
2466     }
2467 }
2468 \f
2469 #if 0
2470 /* Debugging and testing support for path_simplify. */
2471
/* Debug helper: duplicate PATH, run path_simplify on the duplicate,
   and return it.  PATH itself is left alone.  Handy for calling from
   the debugger.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
2481
/* Run path_simplify on a copy of TEST and print a diagnostic if the
   outcome is not as expected.

   EXPECTED_RESULT is the string the copy should hold afterwards.
   EXPECTED_CHANGE is compared against path_simplify's return value;
   the test table passes 1 where the path should be modified and 0
   where it should not.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Report what was *expected*: a modification when
         EXPECTED_CHANGE is 1, no modification otherwise.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2504
/* Feed a fixed table of paths through run_test, twice: first as
   written, then with a `/' put in front of both the input and the
   expected result.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "",               "",             0 },
    { ".",              "",             1 },
    { "..",             "",             1 },
    { "foo",            "foo",          0 },
    { "foo/bar",        "foo/bar",      0 },
    { "foo///bar",      "foo/bar",      1 },
    { "foo/.",          "foo/",         1 },
    { "foo/./",         "foo/",         1 },
    { "foo./",          "foo./",        0 },
    { "foo/../bar",     "bar",          1 },
    { "foo/../bar/",    "bar/",         1 },
    { "foo/bar/..",     "foo/",         1 },
    { "foo/bar/../x",   "foo/x",        1 },
    { "foo/bar/../x/",  "foo/x/",       1 },
    { "foo/..",         "",             1 },
    { "foo/../..",      "",             1 },
    { "a/b/../../c",    "c",            1 },
    { "./a/../b",       "b",            1 }
  };
  int i;

  /* Pass 1: the table entries exactly as listed.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
    run_test (tests[i].test, tests[i].result, tests[i].should_modify);

  /* Pass 2: the same cases with a leading slash, to prove that the
     slash is being preserved.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
    {
      /* One byte for the slash, one for the terminating NUL.  */
      char *slash_test = xmalloc (1 + strlen (tests[i].test) + 1);
      char *slash_result = xmalloc (1 + strlen (tests[i].result) + 1);

      sprintf (slash_test, "/%s", tests[i].test);
      sprintf (slash_result, "/%s", tests[i].result);

      run_test (slash_test, slash_result, tests[i].should_modify);

      xfree (slash_test);
      xfree (slash_result);
    }
}
2560 #endif